From cf43a180bc42a677ffc33a8178c83546f2e4b2cd Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 7 Dec 2018 22:04:56 +0100 Subject: s2-dump-db-pdf-urls.py --- scraper/s2-dump-db-pdf-urls.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'scraper') diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index dbcb91d8..608248e9 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -6,8 +6,6 @@ from urllib.parse import urlparse import operator from util import * -PAPER_JSON_DIR = 'datasets/s2/db_papers' - @click.command() def s2_dump_pdf_urls(): # loop over all the papers in db_papers @@ -23,11 +21,11 @@ def s2_dump_pdf_urls(): domains = {} pdf = [] doi = [] - for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): - # if 'db_paper' in fn: - row = process_db_paper(fn) - # elif 'raw_paper' in fn: - # row = process_raw_paper(fn) + for fn in glob.iglob('./datasets/s2/*_paper/**/paper.json', recursive=True): + if 'db_paper' in fn: + row = process_db_paper(fn) + elif 'raw_paper' in fn: + row = process_raw_paper(fn) if row is not None: rows.append(row) if row[1] is not None: -- cgit v1.2.3-70-g09d2