summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2018-11-06 01:30:46 +0100
committer Jules Laplace <julescarbon@gmail.com> 2018-11-06 01:30:46 +0100
commit acc16d8f35a3b10021ff75db06503851feb8efde (patch)
tree f4c347ed9fa1731b66bf3f1ee42b48ab7e702839
parent 4e7350603f294fa6eea31146f41711b79d9e1c64 (diff)
reports
-rw-r--r-- pdf_report_first_pages.py 39
-rw-r--r-- reports/first_pages.html 39
-rw-r--r-- reports/reports.css 2
-rw-r--r-- s2-pdf-report.py 90
-rw-r--r-- s2.py 1
-rw-r--r-- util.py 47
6 files changed, 178 insertions, 40 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
deleted file mode 100644
index ae080539..00000000
--- a/pdf_report_first_pages.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import gzip
-import glob
-import json
-import click
-from util import *
-
-PDF_DIR = 'datasets/s2/pdf'
-FIRST_PAGES_KEYS = ''
-
-@click.command()
-def pdf_report_first_pages():
- rows = []
- for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
- row = process_paper(fn)
- rows.append(row)
- write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows)
- print("Wrote {} rows".format(len(rows)))
-
-def process_paper(fn):
- index = fn.replace(PDF_DIR, '').split('/')[2]
- with open(fn, 'r') as f:
- lines = ''
- for line in f.readlines():
- if 'abstract' in line.lower():
- break
- if len(line) < 3:
- continue
- lines += line + '<br>'
- return [
- index,
- lines
- ]
-
-def paper_path(paper_id):
- return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
- pdf_report_first_pages()
diff --git a/reports/first_pages.html b/reports/first_pages.html
new file mode 100644
index 00000000..24fc6e5e
--- /dev/null
+++ b/reports/first_pages.html
@@ -0,0 +1,39 @@
+<!doctype html><html><head><title>First pages</title><link rel='stylesheet' href='reports.css'></head><body><h2>First pages</h2><table border='1' cellpadding='3' cellspacing='3'><tr><td>396a19e29853f31736ca171a3f40c506ef418a9f</td><td>Real World Real-time Automatic Recognition of Facial Expressions
+<br/><b>Exploratory Computer Vision Group, IBM T. J. Watson Research Center</b><br/>PO Box 704, Yorktown Heights, NY 10598
+</td><td>('8193125', 'Ying-li Tian', 'ying-li tian')<br/>('1773140', 'Ruud Bolle', 'ruud bolle')</td><td>{yltian,lisabr,arunh,sharat,aws,bolle}@us.ibm.com
+</td></tr><tr><td>392d35bb359a3b61cca1360272a65690a97a2b3f</td><td>YAN, YAP, MORI: ONE-SHOT MULTI-TASK LEARNING FOR VIDEO EVENT DETECTION 1
+<br/>Multi-Task Transfer Methods to Improve
+<br/>One-Shot Learning for Multimedia Event
+<br/>Detection
+<br/>School of Computing Science
+<br/><b>Simon Fraser University</b><br/>Burnaby, BC, CANADA
+</td><td>('34289418', 'Wang Yan', 'wang yan')<br/>('32874186', 'Jordan Yap', 'jordan yap')<br/>('10771328', 'Greg Mori', 'greg mori')</td><td>wyan@sfu.ca
+<br/>jjyap@sfu.ca
+<br/>mori@cs.sfu.ca
+</td></tr><tr><td>392425be1c9d9c2ee6da45de9df7bef0d278e85f</td><td></td><td></td><td></td></tr><tr><td>3946b8f862ecae64582ef0912ca2aa6d3f6f84dc</td><td>Who and Where: People and Location Co-Clustering
+<br/>Electrical Engineering
+<br/><b>Stanford University</b></td><td>('8491578', 'Zixuan Wang', 'zixuan wang')</td><td>zxwang@stanford.edu
+</td></tr><tr><td>3933416f88c36023a0cba63940eb92f5cef8001a</td><td>Learning Robust Subspace Clustering
+<br/>Department of Electrical and Computer Engineering
+<br/><b>Duke University</b><br/>Durham, NC, 27708
+<br/>May 11, 2014
+</td><td>('2077648', 'Qiang Qiu', 'qiang qiu')<br/>('1699339', 'Guillermo Sapiro', 'guillermo sapiro')</td><td>{qiang.qiu, guillermo.sapiro}@duke.edu
+</td></tr><tr><td>39150acac6ce7fba56d54248f9c0badbfaeef0ea</td><td>Proceedings, Digital Signal Processing for in-Vehicle and mobile systems, Istanbul, Turkey, June 2007.
+<br/><b>Sabanci University</b><br/>Faculty of
+<br/>Engineering and Natural Sciences
+<br/>Orhanli, Istanbul
+</td><td>('40322754', 'Esra Vural', 'esra vural')<br/>('21691177', 'Mujdat Cetin', 'mujdat cetin')<br/>('31849282', 'Aytul Ercil', 'aytul ercil')<br/>('2724380', 'Gwen Littlewort', 'gwen littlewort')<br/>('1858421', 'Marian Bartlett', 'marian bartlett')<br/>('29794862', 'Javier Movellan', 'javier movellan')</td><td></td></tr><tr><td>39f03d1dfd94e6f06c1565d7d1bb14ab0eee03bc</td><td>Simultaneous Local Binary Feature Learning and Encoding for Face Recognition
+<br/><b>1Department of Automation, Tsinghua University, Beijing, China</b><br/>2Rapid-Rich Object Search (ROSE) Lab, Interdisciplinary Graduate School,
+<br/><b>Nanyang Technological University, Singapore</b></td><td>('1697700', 'Jiwen Lu', 'jiwen lu')<br/>('1754854', 'Venice Erin Liong', 'venice erin liong')<br/>('39491387', 'Jie Zhou', 'jie zhou')</td><td>elujiwen@gmail.com; veniceer001@e.ntu.edu.sg; jzhou@tsinghua.edu.cn
+</td></tr><tr><td>3983637022992a329f1d721bed246ae76bc934f7</td><td>Wide-Baseline Stereo for Face Recognition with Large Pose Variation
+<br/>Computer Science Department
+<br/><b>University of Maryland, College Park</b></td><td>('38171682', 'Carlos D. Castillo', 'carlos d. castillo')<br/>('34734622', 'David W. Jacobs', 'david w. jacobs')</td><td>{carlos,djacobs}@cs.umd.edu
+</td></tr><tr><td>39ecdbad173e45964ffe589b9ced9f1ebfe2d44e</td><td>Automatic Recognition of Lower Facial Action Units
+<br/>Joint Research Group on Audio Visual Signal Processing (AVSP),
+<br/><b>Vrije Universiteit Brussel, Department ETRO,</b><br/>Pleinlaan 2, 1050 Brussels
+<br/>lower
+<br/>recognize
+</td><td>('1802474', 'Werner Verhelst', 'werner verhelst')<br/>('34068333', 'Isabel Gonzalez', 'isabel gonzalez')<br/>('1970907', 'Hichem Sahli', 'hichem sahli')</td><td>igonzale@etro.vub.ac.be
+<br/>hichem.sahli@etro.vub.ac.be
+<br/>wverhels@etro.vub.ac.be
+</td></tr></table></body></html>
\ No newline at end of file
diff --git a/reports/reports.css b/reports/reports.css
new file mode 100644
index 00000000..69372951
--- /dev/null
+++ b/reports/reports.css
@@ -0,0 +1,2 @@
+body { font-size: smaller; }
+td,th { vertical-align: top; }
\ No newline at end of file
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
new file mode 100644
index 00000000..6ef5c0f7
--- /dev/null
+++ b/s2-pdf-report.py
@@ -0,0 +1,90 @@
+import re
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'
+
+@click.command()
+def pdf_report_first_pages():
+ rows = []
+ for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
+ row = process_paper(fn)
+ print(row)
+ rows.append(row)
+ write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
+ print("Wrote {} rows".format(len(rows)))
+
+def process_paper(fn):
+ paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+ paper = load_paper(paper_id)
+ if paper is None:
+ print("{} no paper found!".format(paper_id))
+ return None
+ with open(fn, 'r') as f:
+ lines = []
+ emails = []
+ authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
+ journal = paper.journal.lower()
+ found_authors = []
+ for line in f.readlines():
+ l = line.lower()
+ if 'abstract' in l:
+ break
+ if len(line) < 3:
+ continue
+ if journal and journal in l:
+ continue
+ if '@' in line:
+ # print('email {}'.format(line))
+ emails.append(line)
+ continue
+ names = [s.strip() for s in re.split(',| and ', l)]
+ was_found = False
+ for name in names:
+ found = find_authors(authors, name)
+ if found:
+ was_found = True
+ # print("found {}".format(found[1]))
+ if found[0]:
+ found_authors.append(found)
+ if was_found:
+ # lines.append(NameLine(line))
+ continue
+ if 'university' in l or 'universiteit' in l or 'research center' in l:
+ lines.append(BoldLine(line))
+ continue
+ lines.append(line)
+ return [
+ paper_id,
+ lines,
+ found_authors,
+ emails,
+ ]
+
+class NameLine(object):
+ def __init__(self, s):
+ self.s = s.strip()
+ def __str__(self):
+ return '<span class="name">' + self.s + '</span>'
+
+class BoldLine(object):
+ def __init__(self, s):
+ self.s = s.strip()
+ def __str__(self):
+ return '<b>' + self.s + '</b>'
+
+def find_authors(authors, line):
+ for a in authors:
+ if a[2] in line:
+ return a
+ return None
+
+def paper_path(paper_id):
+ return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+ pdf_report_first_pages()
diff --git a/s2.py b/s2.py
index ca03e22e..b6b3caef 100644
--- a/s2.py
+++ b/s2.py
@@ -1,3 +1,4 @@
+import os
import requests
class AuthorStub(object):
diff --git a/util.py b/util.py
index d5796c8e..400c7ee3 100644
--- a/util.py
+++ b/util.py
@@ -59,8 +59,53 @@ def write_report(fn, title=None, keys=None, rows=[]):
for row in rows:
f.write("<tr>")
for cell in row:
- f.write("<td>{}</td>".format(cell))
+ if isinstance(cell, list) or isinstance(cell, tuple):
+ f.write("<td>{}</td>".format('<br/>'.join(str(x) for x in cell)))
+ else:
+ f.write("<td>{}</td>".format(cell))
f.write("</tr>")
f.write("</table>")
f.write("</body>")
f.write("</html>")
+
+def paper_path(key='papers', paper_id=''):
+ return '{}/{}/{}/{}/paper.json'.format('./datasets/s2', key, paper_id[0:2], paper_id)
+
+class DbPaper(object):
+ def __init__(self, paper_id):
+ self.paper_id = paper_id
+ self.data = read_json(paper_path('db_papers', paper_id))
+ @property
+ def title(self):
+ return self.data['title']
+ @property
+ def journal(self):
+ return self.data['journalName']
+ @property
+ def authors(self):
+ return [ (author['ids'][0] if len(author['ids']) else '', author['name']) for author in self.data['authors'] ]
+
+class RawPaper(object):
+ def __init__(self, paper_id):
+ self.paper_id = paper_id
+ self.data = read_json(paper_path('raw_papers', paper_id))['paper']
+ @property
+ def title(self):
+ return self.data['title']['text']
+ @property
+ def journal(self):
+ return self.data['journal']['name']
+ @property
+ def authors(self):
+ return [ (author[0]['ids'][0], author[0]['name']) for author in self.data['authors'] ]
+
+def load_paper(paper_id):
+ print('_______________')
+ if os.path.exists(paper_path('db_papers', paper_id)):
+ print('db paper')
+ return DbPaper(paper_id)
+ if os.path.exists(paper_path('raw_papers', paper_id)):
+ print('raw paper')
+ return RawPaper(paper_id)
+ print('no paper')
+ return None