From f616775cd805ef991bae5f3058bb9c7857896d5a Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Mon, 5 Nov 2018 23:14:56 +0100
Subject: dump first pages

---
 pdf_dump_first_page.sh    |  8 ++++++++
 pdf_report_first_pages.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 s2.py                     |  2 +-
 util.py                   | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 pdf_dump_first_page.sh
 create mode 100644 pdf_report_first_pages.py

diff --git a/pdf_dump_first_page.sh b/pdf_dump_first_page.sh
new file mode 100644
index 00000000..dd67c87d
--- /dev/null
+++ b/pdf_dump_first_page.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+for i in datasets/s2/pdf/*/*/*.pdf
+  do
+    OUTPUT="${i%.*}.txt"
+    pdf2txt.py -p 1 "$i" > "$OUTPUT"
+    echo "$OUTPUT"
+  done
\ No newline at end of file
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
new file mode 100644
index 00000000..d7fd3061
--- /dev/null
+++ b/pdf_report_first_pages.py
@@ -0,0 +1,44 @@
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'
+
+@click.command()
+def pdf_report_first_pages():
+  """Write reports/first_pages.html with one row per first-page text dump under PDF_DIR."""
+  ids = {}
+  rows = []
+  for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
+    rows.append(process_paper(fn, ids))
+  write_report('reports/first_pages.html', rows=rows)
+  print("Wrote {} ids".format(len(rows)))
+
+def process_paper(fn, ids):
+  """Return a one-cell report row holding the text that precedes the abstract.
+
+  Lines shorter than three characters (newline included) are dropped, and
+  reading stops at the first line containing "abstract" in any letter case.
+  `ids` is accepted for call compatibility and is not modified.
+  """
+  lines = []
+  with open(fn, 'r') as f:
+    for line in f:
+      if 'abstract' in line.lower():
+        break
+      if len(line) < 3:
+        continue
+      lines.append(line)
+  return [
+    ''.join(lines),
+  ]
+
+def paper_path(paper_id):
+  """Return PDF_DIR/<first two characters of paper_id>/<paper_id>."""
+  return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+  pdf_report_first_pages()
diff --git a/s2.py b/s2.py
index 696b1a45..f3b1176f 100644
--- a/s2.py
+++ b/s2.py
@@ -122,7 +122,7 @@ class SemanticScholarAPI(object):
 
   @staticmethod
   def fetch_file(url, fn, **kwargs):
-    resp = requests.get(url, params=kwargs, headers=SemanticScholarAPI.headers)
+    resp = requests.get(url, params=kwargs, headers=SemanticScholarAPI.headers, verify=False)
     if resp.status_code != 200:
       return None
     size = 0
diff --git a/util.py b/util.py
index 9f321465..fde0519f 100644
--- a/util.py
+++ b/util.py
@@ -39,3 +39,37 @@ def write_csv(fn, keys, rows):
     writer.writerow(keys)
     for row in rows:
       writer.writerow(row)
+
+def write_report(fn, title=None, keys=None, rows=()):
+  """Write rows as a single-table HTML page at fn.
+
+  title, when given, is used for both the <title> and the page heading.
+  keys, when given, become the header cells. Every value is HTML-escaped
+  because cells may carry text extracted from arbitrary PDFs.
+  """
+  import html
+  with open(fn, 'w', encoding='utf-8') as f:
+    f.write("<!doctype html>")
+    f.write("<html>")
+    f.write("<head>")
+    if title is not None:
+      f.write("<title>{}</title>".format(html.escape(str(title))))
+    f.write("<meta charset='utf-8'>")
+    f.write("</head>")
+    f.write("<body>")
+    if title is not None:
+      f.write("<h1>{}</h1>".format(html.escape(str(title))))
+    f.write("<table>")
+    if keys is not None:
+      f.write("<tr>")
+      for key in keys:
+        f.write("<th>{}</th>".format(html.escape(str(key))))
+      f.write("</tr>")
+    for row in rows:
+      f.write("<tr>")
+      for cell in row:
+        f.write("<td>{}</td>".format(html.escape(str(cell))))
+      f.write("</tr>")
+    f.write("</table>")
+    f.write("</body>")
+    f.write("</html>")
-- 
cgit v1.2.3-70-g09d2