diff options
Diffstat (limited to 'pdf_report_first_pages.py')
| -rw-r--r-- | pdf_report_first_pages.py | 36 |
1 files changed, 0 insertions, 36 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py deleted file mode 100644 index d7fd3061..00000000 --- a/pdf_report_first_pages.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import gzip -import glob -import json -import click -from util import * - -PDF_DIR = 'datasets/s2/pdf' - -@click.command() -def pdf_report_first_pages(): - ids = {} - for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - process_paper(fn, ids) - first_pages = list(ids.keys()) - print("Wrote {} ids".format(len(id_list))) - write_html('reports/first_pages.html', first_pages) - -def process_paper(fn, ids): - with open(fn, 'r') as f: - lines = [] - for line in f.readlines: - if 'abstract' in line.lower(): - break - if len(line) < 3: - continue - lines.append(line) - return [ - lines.join(''), - ] - -def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) - -if __name__ == '__main__': - pdf_report_first_pages() |
