diff options
| -rw-r--r-- | pdf_report_first_pages.py | 39 | ||||
| -rw-r--r-- | reports/first_pages.html | 39 | ||||
| -rw-r--r-- | reports/reports.css | 2 | ||||
| -rw-r--r-- | s2-pdf-report.py | 90 | ||||
| -rw-r--r-- | s2.py | 1 | ||||
| -rw-r--r-- | util.py | 47 |
6 files changed, 178 insertions, 40 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py deleted file mode 100644 index ae080539..00000000 --- a/pdf_report_first_pages.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import gzip -import glob -import json -import click -from util import * - -PDF_DIR = 'datasets/s2/pdf' -FIRST_PAGES_KEYS = '' - -@click.command() -def pdf_report_first_pages(): - rows = [] - for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - row = process_paper(fn) - rows.append(row) - write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows) - print("Wrote {} rows".format(len(rows))) - -def process_paper(fn): - index = fn.replace(PDF_DIR, '').split('/')[2] - with open(fn, 'r') as f: - lines = '' - for line in f.readlines(): - if 'abstract' in line.lower(): - break - if len(line) < 3: - continue - lines += line + '<br>' - return [ - index, - lines - ] - -def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) - -if __name__ == '__main__': - pdf_report_first_pages() diff --git a/reports/first_pages.html b/reports/first_pages.html new file mode 100644 index 00000000..24fc6e5e --- /dev/null +++ b/reports/first_pages.html @@ -0,0 +1,39 @@ +<!doctype html><html><head><title>First pages</title><link rel='stylesheet' href='reports.css'></head><body><h2>First pages</h2><table border='1' cellpadding='3' cellspacing='3'><tr><td>396a19e29853f31736ca171a3f40c506ef418a9f</td><td>Real World Real-time Automatic Recognition of Facial Expressions +<br/><b>Exploratory Computer Vision Group, IBM T. J. Watson Research Center</b><br/>PO Box 704, Yorktown Heights, NY 10598 +</td><td>('8193125', 'Ying-li Tian', 'ying-li tian')<br/>('1773140', 'Ruud Bolle', 'ruud bolle')</td><td>{yltian,lisabr,arunh,sharat,aws,bolle}@us.ibm.com +</td></tr><tr><td>392d35bb359a3b61cca1360272a65690a97a2b3f</td><td>YAN, YAP, MORI: ONE-SHOT MULTI-TASK LEARNING FOR VIDEO EVENT DETECTION 1 +<br/>Multi-Task Transfer Methods to Improve +<br/>One-Shot Learning for Multimedia Event +<br/>Detection +<br/>School of Computing Science +<br/><b>Simon Fraser University</b><br/>Burnaby, BC, CANADA +</td><td>('34289418', 'Wang Yan', 'wang yan')<br/>('32874186', 'Jordan Yap', 'jordan yap')<br/>('10771328', 'Greg Mori', 'greg mori')</td><td>wyan@sfu.ca +<br/>jjyap@sfu.ca +<br/>mori@cs.sfu.ca +</td></tr><tr><td>392425be1c9d9c2ee6da45de9df7bef0d278e85f</td><td></td><td></td><td></td></tr><tr><td>3946b8f862ecae64582ef0912ca2aa6d3f6f84dc</td><td>Who and Where: People and Location Co-Clustering +<br/>Electrical Engineering +<br/><b>Stanford University</b></td><td>('8491578', 'Zixuan Wang', 'zixuan wang')</td><td>zxwang@stanford.edu +</td></tr><tr><td>3933416f88c36023a0cba63940eb92f5cef8001a</td><td>Learning Robust Subspace Clustering +<br/>Department of Electrical and Computer Engineering +<br/><b>Duke University</b><br/>Durham, NC, 27708 +<br/>May 11, 2014 +</td><td>('2077648', 'Qiang Qiu', 'qiang qiu')<br/>('1699339', 'Guillermo Sapiro', 'guillermo sapiro')</td><td>{qiang.qiu, guillermo.sapiro}@duke.edu +</td></tr><tr><td>39150acac6ce7fba56d54248f9c0badbfaeef0ea</td><td>Proceedings, Digital Signal Processing for in-Vehicle and mobile systems, Istanbul, Turkey, June 2007. +<br/><b>Sabanci University</b><br/>Faculty of +<br/>Engineering and Natural Sciences +<br/>Orhanli, Istanbul +</td><td>('40322754', 'Esra Vural', 'esra vural')<br/>('21691177', 'Mujdat Cetin', 'mujdat cetin')<br/>('31849282', 'Aytul Ercil', 'aytul ercil')<br/>('2724380', 'Gwen Littlewort', 'gwen littlewort')<br/>('1858421', 'Marian Bartlett', 'marian bartlett')<br/>('29794862', 'Javier Movellan', 'javier movellan')</td><td></td></tr><tr><td>39f03d1dfd94e6f06c1565d7d1bb14ab0eee03bc</td><td>Simultaneous Local Binary Feature Learning and Encoding for Face Recognition +<br/><b>1Department of Automation, Tsinghua University, Beijing, China</b><br/>2Rapid-Rich Object Search (ROSE) Lab, Interdisciplinary Graduate School, +<br/><b>Nanyang Technological University, Singapore</b></td><td>('1697700', 'Jiwen Lu', 'jiwen lu')<br/>('1754854', 'Venice Erin Liong', 'venice erin liong')<br/>('39491387', 'Jie Zhou', 'jie zhou')</td><td>elujiwen@gmail.com; veniceer001@e.ntu.edu.sg; jzhou@tsinghua.edu.cn +</td></tr><tr><td>3983637022992a329f1d721bed246ae76bc934f7</td><td>Wide-Baseline Stereo for Face Recognition with Large Pose Variation +<br/>Computer Science Department +<br/><b>University of Maryland, College Park</b></td><td>('38171682', 'Carlos D. Castillo', 'carlos d. castillo')<br/>('34734622', 'David W. Jacobs', 'david w. jacobs')</td><td>{carlos,djacobs}@cs.umd.edu +</td></tr><tr><td>39ecdbad173e45964ffe589b9ced9f1ebfe2d44e</td><td>Automatic Recognition of Lower Facial Action Units +<br/>Joint Research Group on Audio Visual Signal Processing (AVSP), +<br/><b>Vrije Universiteit Brussel, Department ETRO,</b><br/>Pleinlaan 2, 1050 Brussels +<br/>lower +<br/>recognize +</td><td>('1802474', 'Werner Verhelst', 'werner verhelst')<br/>('34068333', 'Isabel Gonzalez', 'isabel gonzalez')<br/>('1970907', 'Hichem Sahli', 'hichem sahli')</td><td>igonzale@etro.vub.ac.be +<br/>hichem.sahli@etro.vub.ac.be +<br/>wverhels@etro.vub.ac.be +</td></tr></table></body></html>
\ No newline at end of file diff --git a/reports/reports.css b/reports/reports.css new file mode 100644 index 00000000..69372951 --- /dev/null +++ b/reports/reports.css @@ -0,0 +1,2 @@ +body { font-size: smaller; } +td,th { vertical-align: top; }
\ No newline at end of file diff --git a/s2-pdf-report.py b/s2-pdf-report.py new file mode 100644 index 00000000..6ef5c0f7 --- /dev/null +++ b/s2-pdf-report.py @@ -0,0 +1,90 @@ +import re +import os +import gzip +import glob +import json +import click +from util import * + +PDF_DIR = 'datasets/s2/pdf' + +@click.command() +def pdf_report_first_pages(): + rows = [] + for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): + row = process_paper(fn) + print(row) + rows.append(row) + write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows) + print("Wrote {} rows".format(len(rows))) + +def process_paper(fn): + paper_id = fn.replace(PDF_DIR, '').split('/')[2] + paper = load_paper(paper_id) + if paper is None: + print("{} no paper found!".format(paper_id)) + return None + with open(fn, 'r') as f: + lines = [] + emails = [] + authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] + journal = paper.journal.lower() + found_authors = [] + for line in f.readlines(): + l = line.lower() + if 'abstract' in l: + break + if len(line) < 3: + continue + if journal and journal in l: + continue + if '@' in line: + # print('email {}'.format(line)) + emails.append(line) + continue + names = [s.strip() for s in re.split(',| and ', l)] + was_found = False + for name in names: + found = find_authors(authors, name) + if found: + was_found = True + # print("found {}".format(found[1])) + if found[0]: + found_authors.append(found) + if was_found: + # lines.append(NameLine(line)) + continue + if 'university' in l or 'universiteit' in l or 'research center' in l: + lines.append(BoldLine(line)) + continue + lines.append(line) + return [ + paper_id, + lines, + found_authors, + emails, + ] + +class NameLine(object): + def __init__(self, s): + self.s = s.strip() + def __str__(self): + return '<span class="name">' + self.s + '</span>' + +class BoldLine(object): + def __init__(self, s): + self.s = s.strip() + def __str__(self): + return '<b>' + self.s + '</b>' + +def find_authors(authors, line): + for a in authors: + if a[2] in line: + return a + return None + +def paper_path(paper_id): + return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) + +if __name__ == '__main__': + pdf_report_first_pages() @@ -1,3 +1,4 @@ +import os import requests class AuthorStub(object): @@ -59,8 +59,53 @@ def write_report(fn, title=None, keys=None, rows=[]): for row in rows: f.write("<tr>") for cell in row: - f.write("<td>{}</td>".format(cell)) + if isinstance(cell, list) or isinstance(cell, tuple): + f.write("<td>{}</td>".format('<br/>'.join(str(x) for x in cell))) + else: + f.write("<td>{}</td>".format(cell)) f.write("</tr>") f.write("</table>") f.write("</body>") f.write("</html>") + +def paper_path(key='papers', paper_id=''): + return '{}/{}/{}/{}/paper.json'.format('./datasets/s2', key, paper_id[0:2], paper_id) + +class DbPaper(object): + def __init__(self, paper_id): + self.paper_id = paper_id + self.data = read_json(paper_path('db_papers', paper_id)) + @property + def title(self): + return self.data['title'] + @property + def journal(self): + return self.data['journalName'] + @property + def authors(self): + return [ (author['ids'][0] if len(author['ids']) else '', author['name']) for author in self.data['authors'] ] + +class RawPaper(object): + def __init__(self, paper_id): + self.paper_id = paper_id + self.data = read_json(paper_path('raw_papers', paper_id))['paper'] + @property + def title(self): + return self.data['title']['text'] + @property + def journal(self): + return self.data['journal']['name'] + @property + def authors(self): + return [ (author[0]['ids'][0], author[0]['name']) for author in self.data['authors'] ] + +def load_paper(paper_id): + print('_______________') + if os.path.exists(paper_path('db_papers', paper_id)): + print('db paper') + return DbPaper(paper_id) + if os.path.exists(paper_path('raw_papers', paper_id)): + print('raw paper') + return RawPaper(paper_id) + print('no paper') + return None |
