From a7529c979563e87fe9c518a3b6a084772d4b63a4 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Mon, 5 Nov 2018 23:34:55 +0100 Subject: k --- pdf_report_first_pages.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'pdf_report_first_pages.py') diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py index d7fd3061..ae080539 100644 --- a/pdf_report_first_pages.py +++ b/pdf_report_first_pages.py @@ -6,27 +6,30 @@ import click from util import * PDF_DIR = 'datasets/s2/pdf' +FIRST_PAGES_KEYS = '' @click.command() def pdf_report_first_pages(): - ids = {} + rows = [] for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - process_paper(fn, ids) - first_pages = list(ids.keys()) - print("Wrote {} ids".format(len(id_list))) - write_html('reports/first_pages.html', first_pages) + row = process_paper(fn) + rows.append(row) + write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows) + print("Wrote {} rows".format(len(rows))) -def process_paper(fn, ids): +def process_paper(fn): + index = fn.replace(PDF_DIR, '').split('/')[2] with open(fn, 'r') as f: - lines = [] - for line in f.readlines: + lines = '' + for line in f.readlines(): if 'abstract' in line.lower(): break if len(line) < 3: continue - lines.append(line) + lines += line + '
' return [ - lines.join(''), + index, + lines ] def paper_path(paper_id): -- cgit v1.2.3-70-g09d2