diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 22:04:56 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-12-07 22:04:56 +0100 |
| commit | cf43a180bc42a677ffc33a8178c83546f2e4b2cd (patch) | |
| tree | 677ddbf6631443c692a826f3019ef98f16e86e33 /scraper | |
| parent | 588c96ab6d38f30bbef3aa773163b36838538355 (diff) | |
s2-dump-db-pdf-urls.py
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 12 |
1 files changed, 5 insertions, 7 deletions
```diff
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index dbcb91d8..608248e9 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -6,8 +6,6 @@ from urllib.parse import urlparse
 import operator
 from util import *
 
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
 @click.command()
 def s2_dump_pdf_urls():
   # loop over all the papers in db_papers
@@ -23,11 +21,11 @@ def s2_dump_pdf_urls():
   domains = {}
   pdf = []
   doi = []
-  for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-    # if 'db_paper' in fn:
-    row = process_db_paper(fn)
-    # elif 'raw_paper' in fn:
-    # row = process_raw_paper(fn)
+  for fn in glob.iglob('./datasets/s2/*_paper/**/paper.json', recursive=True):
+    if 'db_paper' in fn:
+      row = process_db_paper(fn)
+    elif 'raw_paper' in fn:
+      row = process_raw_paper(fn)
     if row is not None:
       rows.append(row)
       if row[1] is not None:
```
