diff options
| author | Jules <jules@asdf.us> | 2018-11-05 19:39:22 -0500 |
|---|---|---|
| committer | Jules <jules@asdf.us> | 2018-11-05 19:39:22 -0500 |
| commit | 743159991f1bcf2080693424ebe5ad7001865583 (patch) | |
| tree | 836d49721266f8a19d7abf02212feee35074b44f /pdf_report_first_pages.py | |
| parent | d8244781971d2523a3a0343837efd180ab01228a (diff) | |
| parent | acc16d8f35a3b10021ff75db06503851feb8efde (diff) | |
Merge branch 'master' of asdf.us:megapixels_dev
Diffstat (limited to 'pdf_report_first_pages.py')
| -rw-r--r-- | pdf_report_first_pages.py | 36 |
1 files changed, 0 insertions, 36 deletions
```diff
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
deleted file mode 100644
index d7fd3061..00000000
--- a/pdf_report_first_pages.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import gzip
-import glob
-import json
-import click
-from util import *
-
-PDF_DIR = 'datasets/s2/pdf'
-
-@click.command()
-def pdf_report_first_pages():
-  ids = {}
-  for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-    process_paper(fn, ids)
-  first_pages = list(ids.keys())
-  print("Wrote {} ids".format(len(id_list)))
-  write_html('reports/first_pages.html', first_pages)
-
-def process_paper(fn, ids):
-  with open(fn, 'r') as f:
-    lines = []
-    for line in f.readlines:
-      if 'abstract' in line.lower():
-        break
-      if len(line) < 3:
-        continue
-      lines.append(line)
-    return [
-      lines.join(''),
-    ]
-
-def paper_path(paper_id):
-  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
-  pdf_report_first_pages()
```
