"""Build an HTML report of the text preceding the abstract of each paper.

Every ``*.txt`` file found under ``PDF_DIR`` contributes one report row made
of an index taken from the file's path and the text lines that come before
the first line mentioning "abstract".
"""
import os
import gzip
import glob
import json

import click

# NOTE(review): write_report (and DATA_DIR, used by paper_path) are not
# defined in this file; presumably they come from this star import -- confirm.
from util import *

PDF_DIR = 'datasets/s2/pdf'

# Passed as ``keys`` to write_report.
# NOTE(review): this is an empty string -- confirm what write_report expects.
FIRST_PAGES_KEYS = ''


@click.command()
def pdf_report_first_pages():
    """Write reports/first_pages.html with one row per .txt file in PDF_DIR."""
    # '**' with recursive=True matches .txt files at any depth below PDF_DIR.
    pattern = '{}/**/*.txt'.format(PDF_DIR)
    rows = [process_paper(fn) for fn in glob.iglob(pattern, recursive=True)]
    write_report(
        'reports/first_pages.html',
        title='First pages',
        keys=FIRST_PAGES_KEYS,
        rows=rows,
    )
    print("Wrote {} rows".format(len(rows)))


def process_paper(fn):
    """Return ``[index, text]`` for the text file at path *fn*.

    *index* is the third ``'/'``-separated component of *fn* once ``PDF_DIR``
    has been removed from it, e.g. ``PDF_DIR/ab/abcdef/paper.txt`` gives
    ``'abcdef'``.  An ``IndexError`` is raised when the remaining path has
    fewer than three components.

    *text* holds the lines of the file that precede the first line containing
    "abstract" (case-insensitive).  Lines shorter than three characters,
    counting the line ending, are dropped.  Each kept line is followed by an
    additional ``'\\n'`` on top of its own line ending.
    """
    # presumably files are laid out as PDF_DIR/<prefix>/<paper id>/<name>.txt,
    # which makes component 2 the paper id -- verify against the dataset.
    index = fn.replace(PDF_DIR, '').split('/')[2]

    kept = []
    with open(fn, 'r') as f:
        # Iterate the file lazily: reading stops at the "abstract" line, so
        # the remainder of the file never has to be loaded.
        for line in f:
            if 'abstract' in line.lower():
                break
            if len(line) < 3:
                continue
            kept.append(line + '\n')

    # Join once instead of growing a string inside the loop.
    return [index, ''.join(kept)]


def paper_path(paper_id):
    """Return ``DATA_DIR/<first two characters of paper_id>/<paper_id>``."""
    # NOTE(review): DATA_DIR is not defined in this file; see the star import.
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    pdf_report_first_pages()