summary | refs | log | tree | commit | diff
path: root/scraper
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2018-12-07 22:04:56 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-12-07 22:04:56 +0100
commit: cf43a180bc42a677ffc33a8178c83546f2e4b2cd (patch)
tree: 677ddbf6631443c692a826f3019ef98f16e86e33 /scraper
parent: 588c96ab6d38f30bbef3aa773163b36838538355 (diff)
s2-dump-db-pdf-urls.py
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/s2-dump-db-pdf-urls.py 12
1 files changed, 5 insertions, 7 deletions
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index dbcb91d8..608248e9 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -6,8 +6,6 @@ from urllib.parse import urlparse
import operator
from util import *
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
@click.command()
def s2_dump_pdf_urls():
# loop over all the papers in db_papers
@@ -23,11 +21,11 @@ def s2_dump_pdf_urls():
domains = {}
pdf = []
doi = []
- for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
- # if 'db_paper' in fn:
- row = process_db_paper(fn)
- # elif 'raw_paper' in fn:
- # row = process_raw_paper(fn)
+ for fn in glob.iglob('./datasets/s2/*_paper/**/paper.json', recursive=True):
+ if 'db_paper' in fn:
+ row = process_db_paper(fn)
+ elif 'raw_paper' in fn:
+ row = process_raw_paper(fn)
if row is not None:
rows.append(row)
if row[1] is not None: