diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-05 23:14:56 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-05 23:14:56 +0100 |
| commit | f616775cd805ef991bae5f3058bb9c7857896d5a (patch) | |
| tree | cb0c5c020f1bf2a40c0625609a513ad735cb77ba /pdf_report_first_pages.py | |
| parent | d6f2c1a496fb478e6533730fef654b7aa8833f90 (diff) | |
dump first pages
Diffstat (limited to 'pdf_report_first_pages.py')
| -rw-r--r-- | pdf_report_first_pages.py | 36 |
1 files changed, 36 insertions, 0 deletions
"""Dump the first page of every extracted PDF text file into an HTML report.

For each ``*.txt`` file found under ``PDF_DIR``, the text that precedes the
first line mentioning "abstract" is collected and passed to ``write_html``.
"""
import os
import gzip
import glob
import json
import click
from util import *

PDF_DIR = 'datasets/s2/pdf'


@click.command()
def pdf_report_first_pages():
    """Build ``reports/first_pages.html`` from the text files under PDF_DIR."""
    # Maps each source text file path to the row extracted from it.
    ids = {}
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        process_paper(fn, ids)
    first_pages = list(ids.values())
    print("Wrote {} ids".format(len(ids)))
    # NOTE(review): write_html is not defined in this file (presumably it is
    # provided by `from util import *`); confirm that it accepts a list of
    # single-cell rows.
    write_html('reports/first_pages.html', first_pages)


def process_paper(fn, ids):
    """Extract the text preceding the abstract from one text file.

    Args:
        fn: Path of the text file to read.
        ids: Dict updated in place; the extracted row is stored under ``fn``.

    Returns:
        A one-element list holding the concatenated first-page lines.
    """
    lines = []
    with open(fn, 'r') as f:
        for line in f:
            # Stop at the first line that mentions the abstract
            # (case-insensitive); everything before it is kept.
            if 'abstract' in line.lower():
                break
            # Skip lines shorter than 3 characters, newline included.
            if len(line) < 3:
                continue
            lines.append(line)
    row = [
        ''.join(lines),
    ]
    ids[fn] = row
    return row


def paper_path(paper_id):
    """Return the directory path for *paper_id*, sharded by its first two characters."""
    # NOTE(review): DATA_DIR is not defined in this file; presumably it is
    # supplied by `from util import *`. Confirm, otherwise this raises
    # NameError when called.
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    pdf_report_first_pages()
