diff options
Diffstat (limited to 's2-dump-db-pdf-urls.py')
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 18 |
1 files changed, 9 insertions, 9 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py index 406bfeea..dbcb91d8 100644 --- a/s2-dump-db-pdf-urls.py +++ b/s2-dump-db-pdf-urls.py @@ -6,7 +6,7 @@ from urllib.parse import urlparse import operator from util import * -PAPER_JSON_DIR = 'datasets/s2/raw_papers' +PAPER_JSON_DIR = 'datasets/s2/db_papers' @click.command() def s2_dump_pdf_urls(): @@ -25,9 +25,9 @@ def s2_dump_pdf_urls(): doi = [] for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): # if 'db_paper' in fn: - # row = process_db_paper(fn) + row = process_db_paper(fn) # elif 'raw_paper' in fn: - row = process_raw_paper(fn) + # row = process_raw_paper(fn) if row is not None: rows.append(row) if row[1] is not None: @@ -58,12 +58,12 @@ def s2_dump_pdf_urls(): for domain, count in sorted(domains.items(), key=operator.itemgetter(1)): print(" -- {} - {}".format(domain, count)) print("empty count: {}".format(empty_count)) - # write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) - # write_csv('db_paper_pdf.csv', keys=None, rows=pdf) - # write_csv('db_paper_doi.csv', keys=None, rows=doi) - write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) - write_csv('raw_paper_pdf.csv', keys=None, rows=pdf) - write_csv('raw_paper_doi.csv', keys=None, rows=doi) + write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) + write_csv('db_paper_pdf.csv', keys=None, rows=pdf) + write_csv('db_paper_doi.csv', keys=None, rows=doi) + # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) + # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf) + # write_csv('raw_paper_doi.csv', keys=None, rows=doi) def process_db_paper(fn): # print(fn) |
