| author | jules@lens <julescarbon@gmail.com> | 2018-11-04 12:35:06 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2018-11-04 12:35:06 +0100 |
| commit | b6eaaf784f42867fb0ab6ce274c564c69043600a | |
| tree | b3aa5ffe7782fbef7dd934a276c2b34d9790e892 | |
| parent | 0af672923232b53b2cdd3c41cb6768a64e200e68 | |
stats
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 41 |
1 file changed, 32 insertions, 9 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
index 473e90af..80dcb0bd 100644
--- a/s2-dump-db-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -2,6 +2,8 @@ import os
 import glob
 import simplejson as json
 import click
+from urllib.parse import urlparse
+import operator
 from util import *
 
 PAPER_JSON_DIR = 'datasets/s2/db_papers'
@@ -15,46 +17,67 @@ def s2_dump_pdf_urls():
     rows = []
     pdf_count = 0
     ieee_count = 0
-    extra_count = 0
-    empty_count += 1
+    url_count = 0
+    doi_count = 0
+    empty_count = 0
+    domains = {}
+    pdf = []
+    doi = []
     for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
         row = process_paper(fn)
         if row is not None:
             rows.append(row)
             if row[1] is not None:
+                pdf.append([row[0], row[1]])
                 pdf_count += 1
-            if row[2] is not None:
+            elif row[2] is not None:
                 ieee_count += 1
-            if row[3] is not None:
-                extra_count += 1
-            if row[1] is None and row[2] is None and row[3] is None:
+            elif row[3] is not None:
+                doi.append([row[0], row[3]])
+                doi_count += 1
+            elif row[4] is not None:
+                url_count += 1
+                domain = urlparse(row[4]).netloc
+                if domain in domains:
+                    domains[domain] += 1
+                else:
+                    domains[domain] = 1
+            else:
                 empty_count += 1
     print("Wrote {} rows".format(len(rows)))
     print("pdf count: {}".format(pdf_count))
     print("ieee count: {}".format(ieee_count))
+    print("doi count: {}".format(doi_count))
     print("url count: {}".format(url_count))
+    for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
+        print(" -- {} - {}".format(domain, count))
     print("empty count: {}".format(empty_count))
-    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows)
+    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+    write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
+    write_csv('db_paper_doi.csv', keys=None, rows=doi)
 
 def process_paper(fn):
-    print(fn)
+    # print(fn)
     paper = read_json(fn)
     if paper is None:
         return None
     paper_id = paper['id']
     pdf_url = None
     ieee_url = None
+    doi_url = None
     extra_url = None
     if paper['s2PdfUrl']:
         pdf_url = paper['s2PdfUrl']
     for url in paper['pdfUrls']:
         if 'ieeexplore.ieee.org' in url:
             ieee_url = url
+        elif 'doi.org' in url:
+            doi_url = url
         elif pdf_url is None and 'pdf' in url:
             pdf_url = url
         else:
             extra_url = url
-    return [paper_id, pdf_url, ieee_url, extra_url]
+    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
 
 if __name__ == '__main__':
     s2_dump_pdf_urls()
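For context on the new `domains` tally: the commit groups otherwise-unclassified URLs by hostname with `urlparse(...).netloc` and reports them in ascending order of frequency via `operator.itemgetter(1)`. Below is a minimal runnable sketch of that tally in isolation; the sample URLs are invented for illustration and are not from the dataset.

```python
# Sketch of the domain tally this commit adds; the URLs below are
# made up for illustration, not data from datasets/s2/db_papers.
from urllib.parse import urlparse
import operator

urls = [
    'http://arxiv.org/pdf/1703.00001v1.pdf',
    'http://dl.acm.org/citation.cfm?id=12345',
    'http://arxiv.org/pdf/1703.00002v1.pdf',
]

domains = {}
for url in urls:
    domain = urlparse(url).netloc  # hostname part, e.g. 'arxiv.org'
    if domain in domains:          # same if/else tally as the patch
        domains[domain] += 1
    else:
        domains[domain] = 1

# itemgetter(1) keys the sort on the count, ascending, so the most
# common domains print last, matching the patch's report order.
for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
    print(" -- {} - {}".format(domain, count))
```

Running this prints `dl.acm.org - 1` before `arxiv.org - 2`, since ties aside, rarer domains come first under an ascending sort.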

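The switch from independent `if`s to an `elif` chain in `s2_dump_pdf_urls` also means each paper now lands in exactly one bucket (pdf, ieee, doi, url, or empty) instead of possibly incrementing several counters. Inside `process_paper`, each entry of `paper['pdfUrls']` is classified with a similar precedence. The sketch below restates that per-URL precedence as a standalone function; `classify` is a hypothetical helper written for illustration, not a function from this repo.

```python
# Hypothetical helper mirroring the elif order process_paper applies to
# each entry in paper['pdfUrls'] after this commit; not part of the repo.
def classify(url, have_pdf=False):
    if 'ieeexplore.ieee.org' in url:
        return 'ieee'
    elif 'doi.org' in url:
        return 'doi'
    elif not have_pdf and 'pdf' in url:
        return 'pdf'   # only taken while no PDF URL has been found yet
    else:
        return 'extra'

print(classify('https://ieeexplore.ieee.org/document/123'))     # ieee
print(classify('https://doi.org/10.1000/example'))              # doi
print(classify('http://example.org/paper.pdf'))                 # pdf
print(classify('http://example.org/paper.pdf', have_pdf=True))  # extra
```

Note that the `doi.org` test precedes the generic `'pdf' in url` test, so a DOI link whose path happens to contain "pdf" is recorded as a DOI URL rather than a PDF URL.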