import os
import gzip
import glob
import json

import click

from util import *

PDF_DIR = 'datasets/s2/pdf'


@click.command()
def pdf_report_first_pages():
    """Write an HTML report of the text that precedes each paper's abstract.

    Walks every ``*.txt`` file under ``PDF_DIR`` (recursively), extracts the
    leading lines of each file with ``process_paper`` and hands the collected
    rows to ``write_html``, which writes ``reports/first_pages.html``.
    """
    # NOTE(review): ``ids`` is still passed through for signature
    # compatibility, but ``process_paper`` does not populate it. The report
    # rows are therefore taken from its return value -- confirm this is the
    # intended data for the report.
    ids = {}
    first_pages = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        first_pages.extend(process_paper(fn, ids))

    # ``write_html`` is not defined in this file; presumably it is provided
    # by ``from util import *`` -- verify its expected row format.
    write_html('reports/first_pages.html', first_pages)
    # Report the count only after the file has actually been written.
    print("Wrote {} ids".format(len(first_pages)))


def process_paper(fn, ids):
    """Return the text of ``fn`` that appears before the abstract.

    Lines are read in order until the first line containing the word
    "abstract" (case-insensitive); that line and everything after it are
    dropped. Lines shorter than three characters (including the trailing
    newline) are skipped.

    Args:
        fn: Path of the text file to read.
        ids: Accepted for call compatibility; not read or modified here.

    Returns:
        A one-element list holding the kept lines concatenated into a
        single string (the empty string when no line is kept).
    """
    lines = []
    with open(fn, 'r') as f:
        # Iterate the file object directly instead of materialising every
        # line; we usually stop early at the abstract.
        for line in f:
            if 'abstract' in line.lower():
                break
            if len(line) < 3:
                continue
            lines.append(line)
    return [
        ''.join(lines),
    ]


def paper_path(paper_id):
    """Return the storage path for ``paper_id``.

    The path has the form ``<DATA_DIR>/<first two chars of id>/<id>``.
    """
    # NOTE(review): DATA_DIR is not defined in this file; presumably it comes
    # from ``from util import *`` -- confirm (PDF_DIR may have been intended).
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    pdf_report_first_pages()