summary | refs | log | tree | commit | diff
path: root/pdf_report_first_pages.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2018-11-05 23:14:56 +0100
committer Jules Laplace <julescarbon@gmail.com> 2018-11-05 23:14:56 +0100
commit f616775cd805ef991bae5f3058bb9c7857896d5a (patch)
tree cb0c5c020f1bf2a40c0625609a513ad735cb77ba /pdf_report_first_pages.py
parent d6f2c1a496fb478e6533730fef654b7aa8833f90 (diff)
dump first pages
Diffstat (limited to 'pdf_report_first_pages.py')
-rw-r--r-- pdf_report_first_pages.py 36
1 files changed, 36 insertions, 0 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
new file mode 100644
index 00000000..d7fd3061
--- /dev/null
+++ b/pdf_report_first_pages.py
@@ -0,0 +1,36 @@
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
# Root directory that is searched recursively for extracted ``*.txt`` files.
PDF_DIR = 'datasets/s2/pdf'
+
@click.command()
def pdf_report_first_pages():
    """Collect the pre-abstract text of every paper and write an HTML report.

    Searches ``PDF_DIR`` recursively for ``*.txt`` files, runs
    ``process_paper`` on each one, prints how many rows were collected and
    passes the rows to ``write_html``.
    """
    # ``process_paper`` takes ``ids`` as its second parameter, so one is still
    # supplied; the report itself is built from the values it returns.
    ids = {}
    pattern = '{}/**/*.txt'.format(PDF_DIR)
    # Sorted so the report order is reproducible between runs; the order in
    # which glob yields paths depends on the filesystem.
    first_pages = [
        process_paper(fn, ids)
        for fn in sorted(glob.iglob(pattern, recursive=True))
    ]
    print("Wrote {} ids".format(len(first_pages)))
    # NOTE(review): write_html is not defined in this file (presumably it
    # comes from the ``util`` wildcard import) and is assumed to accept a
    # list of rows -- confirm against its definition.
    write_html('reports/first_pages.html', first_pages)
+
def process_paper(fn, ids):
    """Return the text that precedes the abstract in an extracted paper.

    Reads ``fn`` line by line and stops at the first line that contains
    "abstract" (case-insensitive). Lines shorter than three characters,
    counting the trailing newline, are skipped; the rest are concatenated.

    Args:
        fn: Path of the text file to read.
        ids: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        A one-element list holding the collected lines joined into a single
        string (the empty string when nothing was collected).
    """
    lines = []
    with open(fn, 'r') as f:
        # Iterating the file object streams it line by line; each line keeps
        # its trailing newline, which is why the length check below counts it.
        for line in f:
            if 'abstract' in line.lower():
                break
            if len(line) < 3:
                continue
            lines.append(line)
    return [
        ''.join(lines),
    ]
+
def paper_path(paper_id):
    """Build the path for ``paper_id`` under ``DATA_DIR``.

    The result has the form ``<DATA_DIR>/<first two characters of id>/<id>``.
    """
    # NOTE(review): DATA_DIR is not defined in this file (only PDF_DIR is);
    # presumably it is supplied by ``from util import *`` -- confirm.
    template = '{}/{}/{}'
    return template.format(DATA_DIR, paper_id[:2], paper_id)
+
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    pdf_report_first_pages()