6 files changed, 178 insertions, 40 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
deleted file mode 100644
index ae080539..00000000
--- a/pdf_report_first_pages.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import gzip
-import glob
-import json
-import click
-from util import *
-
-PDF_DIR = 'datasets/s2/pdf'
-FIRST_PAGES_KEYS = ''
-
-@click.command()
-def pdf_report_first_pages():
-  rows = []
-  for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-    row = process_paper(fn)
-    rows.append(row)
-  write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows)
-  print("Wrote {} rows".format(len(rows)))
-
-def process_paper(fn):
-  index = fn.replace(PDF_DIR, '').split('/')[2]
-  with open(fn, 'r') as f:
-    lines = ''
-    for line in f.readlines():
-      if 'abstract' in line.lower():
-        break
-      if len(line) < 3:
-        continue
-      lines += line + '<br>'
-    return [
-      index,
-      lines
-    ]
-
-def paper_path(paper_id):
-  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-  
-if __name__ == '__main__':
-  pdf_report_first_pages()
diff --git a/reports/first_pages.html b/reports/first_pages.html
new file mode 100644
index 00000000..24fc6e5e
--- /dev/null
+++ b/reports/first_pages.html
@@ -0,0 +1,39 @@
+<!doctype html><html><head><title>First pages</title><link rel='stylesheet' href='reports.css'></head><body><h2>First pages</h2><table border='1' cellpadding='3' cellspacing='3'><tr><td>396a19e29853f31736ca171a3f40c506ef418a9f</td><td>Real World Real-time Automatic Recognition of Facial Expressions
+<br/><b>Exploratory Computer Vision Group, IBM T. J. Watson Research Center</b><br/>PO Box 704, Yorktown Heights, NY 10598
+</td><td>('8193125', 'Ying-li Tian', 'ying-li tian')<br/>('1773140', 'Ruud Bolle', 'ruud bolle')</td><td>{yltian,lisabr,arunh,sharat,aws,bolle}@us.ibm.com
+</td></tr><tr><td>392d35bb359a3b61cca1360272a65690a97a2b3f</td><td>YAN, YAP, MORI: ONE-SHOT MULTI-TASK LEARNING FOR VIDEO EVENT DETECTION 1
+<br/>Multi-Task Transfer Methods to Improve
+<br/>One-Shot Learning for Multimedia Event
+<br/>Detection
+<br/>School of Computing Science
+<br/><b>Simon Fraser University</b><br/>Burnaby, BC, CANADA
+</td><td>('34289418', 'Wang Yan', 'wang yan')<br/>('32874186', 'Jordan Yap', 'jordan yap')<br/>('10771328', 'Greg Mori', 'greg mori')</td><td>wyan@sfu.ca
+<br/>jjyap@sfu.ca
+<br/>mori@cs.sfu.ca
+</td></tr><tr><td>392425be1c9d9c2ee6da45de9df7bef0d278e85f</td><td></td><td></td><td></td></tr><tr><td>3946b8f862ecae64582ef0912ca2aa6d3f6f84dc</td><td>Who and Where: People and Location Co-Clustering
+<br/>Electrical Engineering
+<br/><b>Stanford University</b></td><td>('8491578', 'Zixuan Wang', 'zixuan wang')</td><td>zxwang@stanford.edu
+</td></tr><tr><td>3933416f88c36023a0cba63940eb92f5cef8001a</td><td>Learning Robust Subspace Clustering
+<br/>Department of Electrical and Computer Engineering
+<br/><b>Duke University</b><br/>Durham, NC, 27708
+<br/>May 11, 2014
+</td><td>('2077648', 'Qiang Qiu', 'qiang qiu')<br/>('1699339', 'Guillermo Sapiro', 'guillermo sapiro')</td><td>{qiang.qiu, guillermo.sapiro}@duke.edu
+</td></tr><tr><td>39150acac6ce7fba56d54248f9c0badbfaeef0ea</td><td>Proceedings, Digital Signal Processing for in-Vehicle and mobile systems, Istanbul, Turkey, June 2007. 
+<br/><b>Sabanci University</b><br/>Faculty of 
+<br/>Engineering and Natural Sciences 
+<br/>Orhanli, Istanbul 
+</td><td>('40322754', 'Esra Vural', 'esra vural')<br/>('21691177', 'Mujdat Cetin', 'mujdat cetin')<br/>('31849282', 'Aytul Ercil', 'aytul ercil')<br/>('2724380', 'Gwen Littlewort', 'gwen littlewort')<br/>('1858421', 'Marian Bartlett', 'marian bartlett')<br/>('29794862', 'Javier Movellan', 'javier movellan')</td><td></td></tr><tr><td>39f03d1dfd94e6f06c1565d7d1bb14ab0eee03bc</td><td>Simultaneous Local Binary Feature Learning and Encoding for Face Recognition
+<br/><b>1Department of Automation, Tsinghua University, Beijing, China</b><br/>2Rapid-Rich Object Search (ROSE) Lab, Interdisciplinary Graduate School,
+<br/><b>Nanyang Technological University, Singapore</b></td><td>('1697700', 'Jiwen Lu', 'jiwen lu')<br/>('1754854', 'Venice Erin Liong', 'venice erin liong')<br/>('39491387', 'Jie Zhou', 'jie zhou')</td><td>elujiwen@gmail.com; veniceer001@e.ntu.edu.sg; jzhou@tsinghua.edu.cn
+</td></tr><tr><td>3983637022992a329f1d721bed246ae76bc934f7</td><td>Wide-Baseline Stereo for Face Recognition with Large Pose Variation
+<br/>Computer Science Department
+<br/><b>University of Maryland, College Park</b></td><td>('38171682', 'Carlos D. Castillo', 'carlos d. castillo')<br/>('34734622', 'David W. Jacobs', 'david w. jacobs')</td><td>{carlos,djacobs}@cs.umd.edu
+</td></tr><tr><td>39ecdbad173e45964ffe589b9ced9f1ebfe2d44e</td><td>Automatic Recognition of Lower Facial Action Units 
+<br/>Joint Research Group on Audio Visual Signal Processing (AVSP),  
+<br/><b>Vrije Universiteit Brussel, Department ETRO,</b><br/>Pleinlaan 2, 1050 Brussels 
+<br/>lower 
+<br/>recognize 
+</td><td>('1802474', 'Werner Verhelst', 'werner verhelst')<br/>('34068333', 'Isabel Gonzalez', 'isabel gonzalez')<br/>('1970907', 'Hichem Sahli', 'hichem sahli')</td><td>igonzale@etro.vub.ac.be 
+<br/>hichem.sahli@etro.vub.ac.be 
+<br/>wverhels@etro.vub.ac.be 
+</td></tr></table></body></html>
+\ No newline at end of file
diff --git a/reports/reports.css b/reports/reports.css
new file mode 100644
index 00000000..69372951
--- /dev/null
+++ b/reports/reports.css
@@ -0,0 +1,2 @@
+body { font-size: smaller; }  /* render all report text one step smaller than the inherited size */
+td,th { vertical-align: top; }  /* align cell content to the top of its row */
+\ No newline at end of file
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
new file mode 100644
index 00000000..6ef5c0f7
--- /dev/null
+++ b/s2-pdf-report.py
@@ -0,0 +1,90 @@
+import re
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'  # searched recursively for *.txt; process_paper reads the paper id from the second directory level below it
+
+@click.command()
+def pdf_report_first_pages():  # CLI command: collect one report row per text file under PDF_DIR and write the HTML report
+  rows = []  # each row is the [paper_id, lines, found_authors, emails] list returned by process_paper
+  for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
+    row = process_paper(fn)  # None when load_paper finds no metadata for the paper
+    print(row)
+    if row is not None: rows.append(row)  # write_report iterates the cells of every row; a None row would raise TypeError
+  write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
+  print("Wrote {} rows".format(len(rows)))
+
+def process_paper(fn):
+  """Sort the lines that precede the abstract of one extracted text file into report cells.
+
+  Returns [paper_id, lines, found_authors, emails], or None when load_paper
+  finds no metadata for the paper id taken from the path of fn.
+  """
+  paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+  paper = load_paper(paper_id)
+  if paper is None:
+    print("{} no paper found!".format(paper_id))
+    return None
+  authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
+  # A null/empty journal name must not crash .lower(); '' disables the journal filter below.
+  journal = (paper.journal or '').lower()
+  lines = []
+  emails = []
+  found_authors = []
+  with open(fn, 'r') as f:
+    for line in f:
+      lowered = line.lower()
+      if 'abstract' in lowered:
+        break
+      if len(line) < 3:
+        continue
+      if journal and journal in lowered:
+        continue
+      if '@' in line:
+        emails.append(line)
+        continue
+      # One line may name several authors, separated by commas or " and ".
+      names = [s.strip() for s in re.split(',| and ', lowered)]
+      matches = [m for m in (find_authors(authors, name) for name in names) if m]
+      if matches:
+        # Author lines are left out of `lines`; only authors that carry an id are reported.
+        found_authors.extend(m for m in matches if m[0])
+        continue
+      if 'university' in lowered or 'universiteit' in lowered or 'research center' in lowered:
+        lines.append(BoldLine(line))
+        continue
+      lines.append(line)
+  return [
+    paper_id,
+    lines,
+    found_authors,
+    emails,
+  ]
+
+class NameLine(object):  # wraps a text line so that str() renders it inside <span class="name">...</span>
+  def __init__(self, s):
+    self.s = s.strip()  # kept without surrounding whitespace or the trailing newline
+  def __str__(self):
+    return '<span class="name">{}</span>'.format(self.s)
+
+class BoldLine(object):  # wraps a text line so that str() renders it inside <b>...</b>
+  def __init__(self, s):
+    self.s = s.strip()  # kept without surrounding whitespace or the trailing newline
+  def __str__(self):
+    return '<b>{}</b>'.format(self.s)
+
+def find_authors(authors, line):
+  """Return the first author tuple whose lowercase name (index 2) occurs in line, else None."""
+  # Blank names are skipped: '' is a substring of every string and would match any line.
+  matches = (a for a in authors if a[2] and a[2] in line)
+  return next(matches, None)
+
+def paper_path(paper_id):  # NOTE(review): appears to shadow the paper_path(key, paper_id) that `from util import *` brings in - confirm it is still needed
+  return '{0}/{1}/{2}'.format(DATA_DIR, paper_id[:2], paper_id)  # NOTE(review): DATA_DIR is not defined in this file - presumably supplied by util; confirm
+  
+if __name__ == '__main__':  # run the command only when this file is executed as a script, not when imported
+  pdf_report_first_pages()
diff --git a/s2.py b/s2.py
index ca03e22e..b6b3caef 100644
--- a/s2.py
+++ b/s2.py
@@ -1,3 +1,4 @@
+import os
 import requests
 
 class AuthorStub(object):
diff --git a/util.py b/util.py
index d5796c8e..400c7ee3 100644
--- a/util.py
+++ b/util.py
@@ -59,8 +59,53 @@ def write_report(fn, title=None, keys=None, rows=[]):
     for row in rows:
       f.write("<tr>")
       for cell in row:
-        f.write("<td>{}</td>".format(cell))
+        if isinstance(cell, list) or isinstance(cell, tuple):
+          f.write("<td>{}</td>".format('<br/>'.join(str(x) for x in cell)))
+        else:
+          f.write("<td>{}</td>".format(cell))
       f.write("</tr>")
     f.write("</table>")
     f.write("</body>")
     f.write("</html>")
+
+def paper_path(key='papers', paper_id=''):  # -> ./datasets/s2/<key>/<first two chars of paper_id>/<paper_id>/paper.json
+  return './datasets/s2/{}/{}/{}/paper.json'.format(key, paper_id[:2], paper_id)
+
+class DbPaper(object):  # read-only view over the JSON that paper_path('db_papers', paper_id) points at
+  def __init__(self, paper_id):
+    self.paper_id = paper_id
+    self.data = read_json(paper_path(key='db_papers', paper_id=paper_id))
+  @property
+  def title(self):  # value stored under 'title'
+    return self.data['title']
+  @property
+  def journal(self):  # value stored under 'journalName'
+    return self.data['journalName']
+  @property
+  def authors(self):  # list of (first id, name) tuples; '' stands in when an author has no ids
+    return [ (entry['ids'][0] if len(entry['ids']) > 0 else '', entry['name']) for entry in self.data['authors'] ]
+
+class RawPaper(object):  # read-only view over the 'paper' object of the JSON at paper_path('raw_papers', paper_id)
+  def __init__(self, paper_id):
+    self.paper_id = paper_id
+    self.data = read_json(paper_path('raw_papers', paper_id))['paper']
+  @property
+  def title(self):  # value stored under title.text
+    return self.data['title']['text']
+  @property
+  def journal(self):  # value stored under journal.name
+    return self.data['journal']['name']
+  @property
+  def authors(self):  # (first id, name) per author, read from item 0 of each entry; '' when ids is empty, matching DbPaper.authors
+    return [ (author[0]['ids'][0] if author[0]['ids'] else '', author[0]['name']) for author in self.data['authors'] ]
+
+def load_paper(paper_id):
+  """Return a DbPaper if its paper.json exists, else a RawPaper if that exists, else None."""
+  print('_______________')
+  # Sources are probed in this order, so a db_papers record wins over a raw_papers one.
+  for key, label, factory in (('db_papers', 'db paper', DbPaper), ('raw_papers', 'raw paper', RawPaper)):
+    if os.path.exists(paper_path(key, paper_id)):
+      print(label)
+      return factory(paper_id)
+  print('no paper')
+  return None