| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 16:14:36 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 16:14:36 +0100 |
| commit | 6fa2b98685fafc63385bc5618c043c120933a811 | |
| tree | a85c939a836b9eeb742e7dd7c3c38c1bfdf1f2ab /s2-dump-db-pdf-urls.py | |
| parent | e6a19cb5c9db39f00eb83cf0ae48edc85878e08e | |
raw paper scrapes
Diffstat (limited to 's2-dump-db-pdf-urls.py')
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 53 |
1 file changed, 47 insertions, 6 deletions
```diff
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
index 80dcb0bd..406bfeea 100644
--- a/s2-dump-db-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -6,7 +6,7 @@ from urllib.parse import urlparse
 import operator
 from util import *
 
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
+PAPER_JSON_DIR = 'datasets/s2/raw_papers'
 
 @click.command()
 def s2_dump_pdf_urls():
@@ -24,18 +24,24 @@ def s2_dump_pdf_urls():
   pdf = []
   doi = []
   for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-    row = process_paper(fn)
+    # if 'db_paper' in fn:
+    #   row = process_db_paper(fn)
+    # elif 'raw_paper' in fn:
+    row = process_raw_paper(fn)
     if row is not None:
       rows.append(row)
       if row[1] is not None:
         pdf.append([row[0], row[1]])
         pdf_count += 1
       elif row[2] is not None:
+        doi.append([row[0], row[2]])
         ieee_count += 1
       elif row[3] is not None:
         doi.append([row[0], row[3]])
         doi_count += 1
       elif row[4] is not None:
+        if 'pdf' not in row[4]:
+          doi.append([row[0], row[4]])
         url_count += 1
         domain = urlparse(row[4]).netloc
         if domain in domains:
@@ -52,11 +58,14 @@ def s2_dump_pdf_urls():
   for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
     print(" -- {} - {}".format(domain, count))
   print("empty count: {}".format(empty_count))
-  write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
-  write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
-  write_csv('db_paper_doi.csv', keys=None, rows=doi)
+  # write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+  # write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
+  # write_csv('db_paper_doi.csv', keys=None, rows=doi)
+  write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+  write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
+  write_csv('raw_paper_doi.csv', keys=None, rows=doi)
 
-def process_paper(fn):
+def process_db_paper(fn):
   # print(fn)
   paper = read_json(fn)
   if paper is None:
@@ -79,5 +88,37 @@ def process_paper(fn):
       extra_url = url
   return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
 
+def process_raw_paper(fn):
+  print(fn)
+  data = read_json(fn)
+  if 'paper' not in data:
+    print(data)
+    return
+  paper = data['paper']
+  if paper is None:
+    return None
+  paper_id = paper['id']
+  pdf_url = None
+  ieee_url = None
+  doi_url = None
+  extra_url = None
+  if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
+    primary_url = paper['primaryPaperLink']['url']
+    if 'pdf' in primary_url:
+      pdf_url = primary_url
+    elif 'doi' in primary_url:
+      doi_url = primary_url
+  for link in paper['links']:
+    url = link['url']
+    if 'ieeexplore.ieee.org' in url:
+      ieee_url = url
+    elif 'doi.org' in url:
+      doi_url = url
+    elif pdf_url is None and 'pdf' in url:
+      pdf_url = url
+    else:
+      extra_url = url
+  return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
+
 if __name__ == '__main__':
   s2_dump_pdf_urls()
```
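For context, the new `process_raw_paper()` path expects each scraped `paper.json` to wrap the record in a top-level `paper` object carrying `id`, an optional `primaryPaperLink`, and a `links` list. Below is a minimal sketch of that assumed shape and of the URL classification the hunks above perform; the field names are taken from the parsing code, but the sample values and paper ID are hypothetical, not from an actual scrape.

```python
# Hypothetical paper.json structure assumed by process_raw_paper();
# only the field names are inferred from the diff, the values are made up.
sample = {
    "paper": {
        "id": "abc123",
        "primaryPaperLink": {"url": "https://example.org/paper/abc123.pdf"},
        "links": [
            {"url": "https://ieeexplore.ieee.org/document/123456"},
            {"url": "https://doi.org/10.1000/example"},
            {"url": "https://example.org/paper/abc123"},
        ],
    }
}

paper = sample["paper"]
pdf_url = ieee_url = doi_url = extra_url = None

# Same precedence as the diff: the primary link is checked first,
# then each entry in the links list is classified by substring.
primary = paper.get("primaryPaperLink", {}).get("url")
if primary and "pdf" in primary:
    pdf_url = primary
elif primary and "doi" in primary:
    doi_url = primary

for link in paper["links"]:
    url = link["url"]
    if "ieeexplore.ieee.org" in url:
        ieee_url = url
    elif "doi.org" in url:
        doi_url = url
    elif pdf_url is None and "pdf" in url:
        pdf_url = url
    else:
        extra_url = url

# Row layout written to raw_paper_pdf_list.csv:
# [Paper ID, PDF URL, IEEE URL, DOI URL, Extra URL]
print([paper["id"], pdf_url, ieee_url, doi_url, extra_url])
```

Note also that in the reworked loop, rows whose best link is an IEEE page or a non-PDF extra URL are now appended to the `doi` list as well, so `raw_paper_doi.csv` effectively becomes the catch-all for papers that still need to be resolved to a PDF.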
