summaryrefslogtreecommitdiff
path: root/pdf_report_first_pages.py
diff options
context:
space:
mode:
Diffstat (limited to 'pdf_report_first_pages.py')
-rw-r--r--pdf_report_first_pages.py36
1 files changed, 36 insertions, 0 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
new file mode 100644
index 00000000..d7fd3061
--- /dev/null
+++ b/pdf_report_first_pages.py
@@ -0,0 +1,36 @@
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'
+
@click.command()
def pdf_report_first_pages():
    """Scan the extracted-text files under PDF_DIR and write the first-pages report.

    Every ``*.txt`` file found recursively below ``PDF_DIR`` is handed to
    ``process_paper`` together with a shared ``ids`` dict. The keys of that
    dict are then passed to ``write_html`` to produce
    ``reports/first_pages.html``.
    """
    ids = {}
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        # NOTE(review): the return value of process_paper is discarded, and as
        # far as this file shows process_paper never adds anything to `ids`,
        # so `first_pages` below stays empty -- confirm the intended data flow.
        process_paper(fn, ids)
    first_pages = list(ids.keys())
    # Count the list that is actually written; the previous code referenced
    # an undefined name (`id_list`) here and raised NameError.
    print("Wrote {} ids".format(len(first_pages)))
    write_html('reports/first_pages.html', first_pages)
+
def process_paper(fn, ids):
    """Collect the text that precedes the abstract in one extracted paper.

    The file is read line by line. Reading stops at the first line that
    contains the word "abstract" (case-insensitive); that line is not kept.
    Lines shorter than three characters (counting the trailing newline) are
    skipped.

    Args:
        fn: Path of the text file to read.
        ids: Unused by this function; kept so existing callers keep working.

    Returns:
        A one-element list holding the kept lines concatenated into a single
        string (an empty string when nothing was kept).
    """
    lines = []
    with open(fn, 'r') as f:
        # Iterate the file object directly: the previous code looped over the
        # bound method `f.readlines` without calling it, which raises TypeError.
        for line in f:
            if 'abstract' in line.lower():
                break
            if len(line) < 3:
                continue
            lines.append(line)
    return [
        # str.join is a method of the separator, not of the list.
        ''.join(lines),
    ]
+
def paper_path(paper_id):
    """Build the path for a paper: DATA_DIR / first two characters of the id / id."""
    # NOTE(review): DATA_DIR is not defined in the visible part of this file;
    # presumably it is provided by `from util import *` -- confirm.
    prefix = paper_id[:2]
    return '{root}/{prefix}/{paper}'.format(
        root=DATA_DIR,
        prefix=prefix,
        paper=paper_id,
    )
+
# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    pdf_report_first_pages()