summaryrefslogtreecommitdiff
path: root/s2-pdf-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r--s2-pdf-report.py90
1 files changed, 90 insertions, 0 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
new file mode 100644
index 00000000..6ef5c0f7
--- /dev/null
+++ b/s2-pdf-report.py
@@ -0,0 +1,90 @@
+import re
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
# Root directory that is searched recursively for extracted ``*.txt`` files;
# process_paper() also strips this prefix from a file path to find the paper id.
PDF_DIR = 'datasets/s2/pdf'
+
@click.command()
def pdf_report_first_pages():
    """Build the "First pages" HTML report from every text file under PDF_DIR.

    Recursively globs ``PDF_DIR`` for ``*.txt`` files, runs ``process_paper``
    on each one, prints every resulting row, and passes the collected rows to
    ``write_report``.  Files for which ``process_paper`` returns ``None`` (no
    matching paper record) are skipped, so the report and the final row count
    only cover rows that actually carry data.
    """
    rows = []
    pattern = '{}/**/*.txt'.format(PDF_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        row = process_paper(fn)
        if row is None:
            # process_paper has already printed a "no paper found" notice;
            # a None entry would otherwise end up among the report rows.
            continue
        print(row)
        rows.append(row)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    print("Wrote {} rows".format(len(rows)))
+
def process_paper(fn):
    """Classify the lines of one extracted text file up to its abstract.

    The paper id is the third ``/``-separated component of ``fn`` once the
    ``PDF_DIR`` prefix has been removed.  If ``load_paper`` has no record for
    that id, a notice is printed and ``None`` is returned.

    Otherwise the file is read line by line until a line containing
    "abstract" (case-insensitive) is reached.  Along the way:

    * lines shorter than three characters, and lines containing the paper's
      lowercased journal name, are dropped;
    * lines containing ``@`` are collected as e-mail lines;
    * lines in which any comma- or " and "-separated piece matches a known
      author are dropped, and each matched author with a truthy first
      element is recorded;
    * lines mentioning a university / research center are kept wrapped in
      ``BoldLine``; every remaining line is kept as-is.

    Returns a list ``[paper_id, lines, found_authors, emails]``.
    """
    relative_path = fn.replace(PDF_DIR, '')
    paper_id = relative_path.split('/')[2]

    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None

    kept_lines = []
    email_lines = []
    matched_authors = []
    affiliation_markers = ('university', 'universiteit', 'research center')

    with open(fn, 'r') as f:
        # (first element, display name, lowercased name) for substring matching.
        authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
        journal = paper.journal.lower()

        for raw in f.readlines():
            lowered = raw.lower()

            # Everything from the abstract onwards is ignored.
            if 'abstract' in lowered:
                break
            if len(raw) < 3 or (journal and journal in lowered):
                continue
            if '@' in raw:
                email_lines.append(raw)
                continue

            candidates = [piece.strip() for piece in re.split(',| and ', lowered)]
            hits = [
                hit
                for hit in (find_authors(authors, candidate) for candidate in candidates)
                if hit
            ]
            if hits:
                # An author line is consumed; only authors whose first
                # element is truthy are recorded.
                matched_authors.extend(hit for hit in hits if hit[0])
                continue

            if any(marker in lowered for marker in affiliation_markers):
                kept_lines.append(BoldLine(raw))
            else:
                kept_lines.append(raw)

    return [paper_id, kept_lines, matched_authors, email_lines]
+
class NameLine:
    """A line of text that renders as an HTML ``<span class="name">``."""

    def __init__(self, s):
        # Leading/trailing whitespace is removed once, at construction.
        self.s = s.strip()

    def __str__(self):
        """Return the stored text wrapped in the ``name`` span markup."""
        return ''.join(('<span class="name">', self.s, '</span>'))
+
class BoldLine:
    """A line of text that renders wrapped in HTML ``<b>`` tags."""

    def __init__(self, s):
        # Leading/trailing whitespace is removed once, at construction.
        self.s = s.strip()

    def __str__(self):
        """Return the stored text wrapped in bold markup."""
        return ''.join(('<b>', self.s, '</b>'))
+
def find_authors(authors, line):
    """Return the first author whose name occurs as a substring of *line*.

    ``authors`` is a sequence of indexable records whose element at index 2
    is the text to search for.  Records are tried in order and the first one
    whose search text is contained in ``line`` is returned unchanged.

    Records with an empty search text are ignored: the empty string is a
    substring of every string, so such a record would otherwise "match"
    every line.

    Returns ``None`` when no author matches.
    """
    for author in authors:
        needle = author[2]
        if needle and needle in line:
            return author
    return None
+
def paper_path(paper_id):
    """Return ``DATA_DIR/<first two characters of paper_id>/<paper_id>``.

    NOTE(review): ``DATA_DIR`` is not defined in the visible part of this
    file; presumably it comes from ``from util import *`` - confirm.
    """
    shard = paper_id[:2]
    return '{}/{}/{}'.format(DATA_DIR, shard, paper_id)
+
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    pdf_report_first_pages()