From ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Sun, 25 Nov 2018 22:19:15 +0100 Subject: moving stuff --- scraper/s2-dump-db-pdf-urls.py | 124 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 scraper/s2-dump-db-pdf-urls.py (limited to 'scraper/s2-dump-db-pdf-urls.py') diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py new file mode 100644 index 00000000..dbcb91d8 --- /dev/null +++ b/scraper/s2-dump-db-pdf-urls.py @@ -0,0 +1,124 @@ +import os +import glob +import simplejson as json +import click +from urllib.parse import urlparse +import operator +from util import * + +PAPER_JSON_DIR = 'datasets/s2/db_papers' + +@click.command() +def s2_dump_pdf_urls(): + # loop over all the papers in db_papers + # get all the PDF urls, pick the best one + # store it and the paper id + # another script will fetch the urls from this process + rows = [] + pdf_count = 0 + ieee_count = 0 + url_count = 0 + doi_count = 0 + empty_count = 0 + domains = {} + pdf = [] + doi = [] + for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): + # if 'db_paper' in fn: + row = process_db_paper(fn) + # elif 'raw_paper' in fn: + # row = process_raw_paper(fn) + if row is not None: + rows.append(row) + if row[1] is not None: + pdf.append([row[0], row[1]]) + pdf_count += 1 + elif row[2] is not None: + doi.append([row[0], row[2]]) + ieee_count += 1 + elif row[3] is not None: + doi.append([row[0], row[3]]) + doi_count += 1 + elif row[4] is not None: + if 'pdf' not in row[4]: + doi.append([row[0], row[4]]) + url_count += 1 + domain = urlparse(row[4]).netloc + if domain in domains: + domains[domain] += 1 + else: + domains[domain] = 1 + else: + empty_count += 1 + print("Wrote {} rows".format(len(rows))) + print("pdf count: {}".format(pdf_count)) + print("ieee count: {}".format(ieee_count)) + print("doi count: {}".format(doi_count)) + print("url count: {}".format(url_count)) + for domain, count in sorted(domains.items(), key=operator.itemgetter(1)): + print(" -- {} - {}".format(domain, count)) + print("empty count: {}".format(empty_count)) + write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) + write_csv('db_paper_pdf.csv', keys=None, rows=pdf) + write_csv('db_paper_doi.csv', keys=None, rows=doi) + # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows) + # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf) + # write_csv('raw_paper_doi.csv', keys=None, rows=doi) + +def process_db_paper(fn): + # print(fn) + paper = read_json(fn) + if paper is None: + return None + paper_id = paper['id'] + pdf_url = None + ieee_url = None + doi_url = None + extra_url = None + if paper['s2PdfUrl']: + pdf_url = paper['s2PdfUrl'] + for url in paper['pdfUrls']: + if 'ieeexplore.ieee.org' in url: + ieee_url = url + elif 'doi.org' in url: + doi_url = url + elif pdf_url is None and 'pdf' in url: + pdf_url = url + else: + extra_url = url + return [paper_id, pdf_url, ieee_url, doi_url, extra_url] + +def process_raw_paper(fn): + print(fn) + data = read_json(fn) + if 'paper' not in data: + print(data) + return + paper = data['paper'] + if paper is None: + return None + paper_id = paper['id'] + pdf_url = None + ieee_url = None + doi_url = None + extra_url = None + if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']: + primary_url = paper['primaryPaperLink']['url'] + if 'pdf' in 
primary_url: + pdf_url = primary_url + elif 'doi' in primary_url: + doi_url = primary_url + for link in paper['links']: + url = link['url'] + if 'ieeexplore.ieee.org' in url: + ieee_url = url + elif 'doi.org' in url: + doi_url = url + elif pdf_url is None and 'pdf' in url: + pdf_url = url + else: + extra_url = url + return [paper_id, pdf_url, ieee_url, doi_url, extra_url] + +if __name__ == '__main__': + s2_dump_pdf_urls() -- cgit v1.2.3-70-g09d2 From cf43a180bc42a677ffc33a8178c83546f2e4b2cd Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 7 Dec 2018 22:04:56 +0100 Subject: s2-dump-db-pdf-urls.py --- scraper/s2-dump-db-pdf-urls.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'scraper/s2-dump-db-pdf-urls.py') diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index dbcb91d8..608248e9 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -6,8 +6,6 @@ from urllib.parse import urlparse import operator from util import * -PAPER_JSON_DIR = 'datasets/s2/db_papers' - @click.command() def s2_dump_pdf_urls(): # loop over all the papers in db_papers @@ -23,11 +21,11 @@ def s2_dump_pdf_urls(): domains = {} pdf = [] doi = [] - for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): - # if 'db_paper' in fn: - row = process_db_paper(fn) - # elif 'raw_paper' in fn: - # row = process_raw_paper(fn) + for fn in glob.iglob('./datasets/s2/*_paper/**/paper.json', recursive=True): + if 'db_paper' in fn: + row = process_db_paper(fn) + elif 'raw_paper' in fn: + row = process_raw_paper(fn) if row is not None: rows.append(row) if row[1] is not None: -- cgit v1.2.3-70-g09d2 From 61c2add5fe0cb914f09ab33e63ce801002d59c86 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 7 Dec 2018 22:05:55 +0100 Subject: s2-dump-db-pdf-urls.py --- scraper/s2-dump-db-pdf-urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'scraper/s2-dump-db-pdf-urls.py') diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index 608248e9..ae8faede 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -21,7 +21,7 @@ def s2_dump_pdf_urls(): domains = {} pdf = [] doi = [] - for fn in glob.iglob('./datasets/s2/*_paper/**/paper.json', recursive=True): + for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True): if 'db_paper' in fn: row = process_db_paper(fn) elif 'raw_paper' in fn: -- cgit v1.2.3-70-g09d2 From 774be2b5323e4bc4b0a61f1ff998fd910627d23b Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 7 Dec 2018 22:08:05 +0100 Subject: s2-dump-db-pdf-urls.py --- scraper/s2-dump-db-pdf-urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'scraper/s2-dump-db-pdf-urls.py') diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index ae8faede..bc702e09 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -87,7 +87,7 @@ def process_db_paper(fn): return [paper_id, pdf_url, ieee_url, doi_url, extra_url] def process_raw_paper(fn): - print(fn) + # print(fn) data = read_json(fn) if 'paper' not in data: print(data) -- cgit v1.2.3-70-g09d2
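
A note on the local `util` dependency: the script added and revised in the patches above does `from util import *` and relies on two helpers, read_json and write_csv, that are not part of this patch series. The sketch below is a hypothetical reconstruction, inferred only from the call sites (read_json(fn) returning parsed JSON or None, and write_csv(filename, keys=..., rows=...) where keys may be None); the real module in the repository may differ.

# Hypothetical sketch of the `util` helpers assumed by s2-dump-db-pdf-urls.py.
# Signatures are inferred from the call sites in the patches above; the real
# implementations in the repository's util module may differ.
import csv
import simplejson as json

def read_json(fn):
    # Parse a paper.json file, returning None on missing or malformed input
    # so callers can skip the record (the script checks for None explicitly).
    try:
        with open(fn, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (IOError, ValueError):
        return None

def write_csv(fn, keys=None, rows=None):
    # Write rows to a CSV file; when `keys` is provided, emit it as a header row.
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if keys is not None:
            writer.writerow(keys)
        writer.writerows(rows or [])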