From a7529c979563e87fe9c518a3b6a084772d4b63a4 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Mon, 5 Nov 2018 23:34:55 +0100 Subject: k --- util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'util.py') diff --git a/util.py b/util.py index f3cdb814..d5796c8e 100644 --- a/util.py +++ b/util.py @@ -47,12 +47,12 @@ def write_report(fn, title=None, keys=None, rows=[]): f.write("") if title is not None: f.write("{}".format(title)) - f.write("") + f.write("") f.write("") f.write("") if title is not None: f.write("

{}

".format(title)) - f.write("") + f.write("
") if keys is not None: for key in keys: f.write("".format(key)) -- cgit v1.2.3-70-g09d2 From acc16d8f35a3b10021ff75db06503851feb8efde Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Tue, 6 Nov 2018 01:30:46 +0100 Subject: reports --- pdf_report_first_pages.py | 39 -------------------- reports/first_pages.html | 39 ++++++++++++++++++++ reports/reports.css | 2 ++ s2-pdf-report.py | 90 +++++++++++++++++++++++++++++++++++++++++++++++ s2.py | 1 + util.py | 47 ++++++++++++++++++++++++- 6 files changed, 178 insertions(+), 40 deletions(-) delete mode 100644 pdf_report_first_pages.py create mode 100644 reports/first_pages.html create mode 100644 reports/reports.css create mode 100644 s2-pdf-report.py (limited to 'util.py') diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py deleted file mode 100644 index ae080539..00000000 --- a/pdf_report_first_pages.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import gzip -import glob -import json -import click -from util import * - -PDF_DIR = 'datasets/s2/pdf' -FIRST_PAGES_KEYS = '' - -@click.command() -def pdf_report_first_pages(): - rows = [] - for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - row = process_paper(fn) - rows.append(row) - write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows) - print("Wrote {} rows".format(len(rows))) - -def process_paper(fn): - index = fn.replace(PDF_DIR, '').split('/')[2] - with open(fn, 'r') as f: - lines = '' - for line in f.readlines(): - if 'abstract' in line.lower(): - break - if len(line) < 3: - continue - lines += line + '
' - return [ - index, - lines - ] - -def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) - -if __name__ == '__main__': - pdf_report_first_pages() diff --git a/reports/first_pages.html b/reports/first_pages.html new file mode 100644 index 00000000..24fc6e5e --- /dev/null +++ b/reports/first_pages.html @@ -0,0 +1,39 @@ +First pages

First pages

{}
396a19e29853f31736ca171a3f40c506ef418a9fReal World Real-time Automatic Recognition of Facial Expressions +
Exploratory Computer Vision Group, IBM T. J. Watson Research Center
PO Box 704, Yorktown Heights, NY 10598 +
('8193125', 'Ying-li Tian', 'ying-li tian')
('1773140', 'Ruud Bolle', 'ruud bolle')
{yltian,lisabr,arunh,sharat,aws,bolle}@us.ibm.com +
392d35bb359a3b61cca1360272a65690a97a2b3fYAN, YAP, MORI: ONE-SHOT MULTI-TASK LEARNING FOR VIDEO EVENT DETECTION 1 +
Multi-Task Transfer Methods to Improve +
One-Shot Learning for Multimedia Event +
Detection +
School of Computing Science +
Simon Fraser University
Burnaby, BC, CANADA +
('34289418', 'Wang Yan', 'wang yan')
('32874186', 'Jordan Yap', 'jordan yap')
('10771328', 'Greg Mori', 'greg mori')
wyan@sfu.ca +
jjyap@sfu.ca +
mori@cs.sfu.ca +
392425be1c9d9c2ee6da45de9df7bef0d278e85f
3946b8f862ecae64582ef0912ca2aa6d3f6f84dcWho and Where: People and Location Co-Clustering +
Electrical Engineering +
Stanford University
('8491578', 'Zixuan Wang', 'zixuan wang')zxwang@stanford.edu +
3933416f88c36023a0cba63940eb92f5cef8001aLearning Robust Subspace Clustering +
Department of Electrical and Computer Engineering +
Duke University
Durham, NC, 27708 +
May 11, 2014 +
('2077648', 'Qiang Qiu', 'qiang qiu')
('1699339', 'Guillermo Sapiro', 'guillermo sapiro')
{qiang.qiu, guillermo.sapiro}@duke.edu +
39150acac6ce7fba56d54248f9c0badbfaeef0eaProceedings, Digital Signal Processing for in-Vehicle and mobile systems, Istanbul, Turkey, June 2007. +
Sabanci University
Faculty of +
Engineering and Natural Sciences +
Orhanli, Istanbul +
('40322754', 'Esra Vural', 'esra vural')
('21691177', 'Mujdat Cetin', 'mujdat cetin')
('31849282', 'Aytul Ercil', 'aytul ercil')
('2724380', 'Gwen Littlewort', 'gwen littlewort')
('1858421', 'Marian Bartlett', 'marian bartlett')
('29794862', 'Javier Movellan', 'javier movellan')
39f03d1dfd94e6f06c1565d7d1bb14ab0eee03bcSimultaneous Local Binary Feature Learning and Encoding for Face Recognition +
1Department of Automation, Tsinghua University, Beijing, China
2Rapid-Rich Object Search (ROSE) Lab, Interdisciplinary Graduate School, +
Nanyang Technological University, Singapore
('1697700', 'Jiwen Lu', 'jiwen lu')
('1754854', 'Venice Erin Liong', 'venice erin liong')
('39491387', 'Jie Zhou', 'jie zhou')
elujiwen@gmail.com; veniceer001@e.ntu.edu.sg; jzhou@tsinghua.edu.cn +
3983637022992a329f1d721bed246ae76bc934f7Wide-Baseline Stereo for Face Recognition with Large Pose Variation +
Computer Science Department +
University of Maryland, College Park
('38171682', 'Carlos D. Castillo', 'carlos d. castillo')
('34734622', 'David W. Jacobs', 'david w. jacobs')
{carlos,djacobs}@cs.umd.edu +
39ecdbad173e45964ffe589b9ced9f1ebfe2d44eAutomatic Recognition of Lower Facial Action Units +
Joint Research Group on Audio Visual Signal Processing (AVSP), +
Vrije Universiteit Brussel, Department ETRO,
Pleinlaan 2, 1050 Brussels +
lower +
recognize +
('1802474', 'Werner Verhelst', 'werner verhelst')
('34068333', 'Isabel Gonzalez', 'isabel gonzalez')
('1970907', 'Hichem Sahli', 'hichem sahli')
igonzale@etro.vub.ac.be +
hichem.sahli@etro.vub.ac.be +
wverhels@etro.vub.ac.be +
\ No newline at end of file diff --git a/reports/reports.css b/reports/reports.css new file mode 100644 index 00000000..69372951 --- /dev/null +++ b/reports/reports.css @@ -0,0 +1,2 @@ +body { font-size: smaller; } +td,th { vertical-align: top; } \ No newline at end of file diff --git a/s2-pdf-report.py b/s2-pdf-report.py new file mode 100644 index 00000000..6ef5c0f7 --- /dev/null +++ b/s2-pdf-report.py @@ -0,0 +1,90 @@ +import re +import os +import gzip +import glob +import json +import click +from util import * + +PDF_DIR = 'datasets/s2/pdf' + +@click.command() +def pdf_report_first_pages(): + rows = [] + for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): + row = process_paper(fn) + print(row) + rows.append(row) + write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows) + print("Wrote {} rows".format(len(rows))) + +def process_paper(fn): + paper_id = fn.replace(PDF_DIR, '').split('/')[2] + paper = load_paper(paper_id) + if paper is None: + print("{} no paper found!".format(paper_id)) + return None + with open(fn, 'r') as f: + lines = [] + emails = [] + authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] + journal = paper.journal.lower() + found_authors = [] + for line in f.readlines(): + l = line.lower() + if 'abstract' in l: + break + if len(line) < 3: + continue + if journal and journal in l: + continue + if '@' in line: + # print('email {}'.format(line)) + emails.append(line) + continue + names = [s.strip() for s in re.split(',| and ', l)] + was_found = False + for name in names: + found = find_authors(authors, name) + if found: + was_found = True + # print("found {}".format(found[1])) + if found[0]: + found_authors.append(found) + if was_found: + # lines.append(NameLine(line)) + continue + if 'university' in l or 'universiteit' in l or 'research center' in l: + lines.append(BoldLine(line)) + continue + lines.append(line) + return [ + paper_id, + lines, + found_authors, + emails, + ] + +class NameLine(object): + def __init__(self, s): + self.s = s.strip() + def __str__(self): + return '' + self.s + '' + +class BoldLine(object): + def __init__(self, s): + self.s = s.strip() + def __str__(self): + return '' + self.s + '' + +def find_authors(authors, line): + for a in authors: + if a[2] in line: + return a + return None + +def paper_path(paper_id): + return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) + +if __name__ == '__main__': + pdf_report_first_pages() diff --git a/s2.py b/s2.py index ca03e22e..b6b3caef 100644 --- a/s2.py +++ b/s2.py @@ -1,3 +1,4 @@ +import os import requests class AuthorStub(object): diff --git a/util.py b/util.py index d5796c8e..400c7ee3 100644 --- a/util.py +++ b/util.py @@ -59,8 +59,53 @@ def write_report(fn, title=None, keys=None, rows=[]): for row in rows: f.write("") for cell in row: - f.write("{}".format(cell)) + if isinstance(cell, list) or isinstance(cell, tuple): + f.write("{}".format('
'.join(str(x) for x in cell))) + else: + f.write("{}".format(cell)) f.write("") f.write("") f.write("") f.write("") + +def paper_path(key='papers', paper_id=''): + return '{}/{}/{}/{}/paper.json'.format('./datasets/s2', key, paper_id[0:2], paper_id) + +class DbPaper(object): + def __init__(self, paper_id): + self.paper_id = paper_id + self.data = read_json(paper_path('db_papers', paper_id)) + @property + def title(self): + return self.data['title'] + @property + def journal(self): + return self.data['journalName'] + @property + def authors(self): + return [ (author['ids'][0] if len(author['ids']) else '', author['name']) for author in self.data['authors'] ] + +class RawPaper(object): + def __init__(self, paper_id): + self.paper_id = paper_id + self.data = read_json(paper_path('raw_papers', paper_id))['paper'] + @property + def title(self): + return self.data['title']['text'] + @property + def journal(self): + return self.data['journal']['name'] + @property + def authors(self): + return [ (author[0]['ids'][0], author[0]['name']) for author in self.data['authors'] ] + +def load_paper(paper_id): + print('_______________') + if os.path.exists(paper_path('db_papers', paper_id)): + print('db paper') + return DbPaper(paper_id) + if os.path.exists(paper_path('raw_papers', paper_id)): + print('raw paper') + return RawPaper(paper_id) + print('no paper') + return None -- cgit v1.2.3-70-g09d2