summaryrefslogtreecommitdiff
path: root/pdf_report_first_pages.py
diff options
context:
space:
mode:
Diffstat (limited to 'pdf_report_first_pages.py')
-rw-r--r--pdf_report_first_pages.py39
1 files changed, 0 insertions, 39 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
deleted file mode 100644
index ae080539..00000000
--- a/pdf_report_first_pages.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import gzip
-import glob
-import json
-import click
-from util import *
-
-PDF_DIR = 'datasets/s2/pdf'
-FIRST_PAGES_KEYS = ''
-
-@click.command()
-def pdf_report_first_pages():
- rows = []
- for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
- row = process_paper(fn)
- rows.append(row)
- write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows)
- print("Wrote {} rows".format(len(rows)))
-
-def process_paper(fn):
- index = fn.replace(PDF_DIR, '').split('/')[2]
- with open(fn, 'r') as f:
- lines = ''
- for line in f.readlines():
- if 'abstract' in line.lower():
- break
- if len(line) < 3:
- continue
- lines += line + '<br>'
- return [
- index,
- lines
- ]
-
-def paper_path(paper_id):
- return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
- pdf_report_first_pages()