diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 01:30:46 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 01:30:46 +0100 |
| commit | acc16d8f35a3b10021ff75db06503851feb8efde (patch) | |
| tree | f4c347ed9fa1731b66bf3f1ee42b48ab7e702839 /pdf_report_first_pages.py | |
| parent | 4e7350603f294fa6eea31146f41711b79d9e1c64 (diff) | |
reports
Diffstat (limited to 'pdf_report_first_pages.py')
| -rw-r--r-- | pdf_report_first_pages.py | 39 |
1 files changed, 0 insertions, 39 deletions
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py deleted file mode 100644 index ae080539..00000000 --- a/pdf_report_first_pages.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import gzip -import glob -import json -import click -from util import * - -PDF_DIR = 'datasets/s2/pdf' -FIRST_PAGES_KEYS = '' - -@click.command() -def pdf_report_first_pages(): - rows = [] - for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - row = process_paper(fn) - rows.append(row) - write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows) - print("Wrote {} rows".format(len(rows))) - -def process_paper(fn): - index = fn.replace(PDF_DIR, '').split('/')[2] - with open(fn, 'r') as f: - lines = '' - for line in f.readlines(): - if 'abstract' in line.lower(): - break - if len(line) < 3: - continue - lines += line + '<br>' - return [ - index, - lines - ] - -def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) - -if __name__ == '__main__': - pdf_report_first_pages() |
