| field | value | date |
|---|---|---|
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-dump-db-pdf-urls.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-dump-db-pdf-urls.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 124 |

1 file changed, 0 insertions, 124 deletions
```diff
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
deleted file mode 100644
index dbcb91d8..00000000
--- a/s2-dump-db-pdf-urls.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import os
-import glob
-import simplejson as json
-import click
-from urllib.parse import urlparse
-import operator
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
-@click.command()
-def s2_dump_pdf_urls():
-    # loop over all the papers in db_papers
-    # get all the PDF urls, pick the best one
-    # store it and the paper id
-    # another script will fetch the urls from this process
-    rows = []
-    pdf_count = 0
-    ieee_count = 0
-    url_count = 0
-    doi_count = 0
-    empty_count = 0
-    domains = {}
-    pdf = []
-    doi = []
-    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-        # if 'db_paper' in fn:
-        row = process_db_paper(fn)
-        # elif 'raw_paper' in fn:
-        #     row = process_raw_paper(fn)
-        if row is not None:
-            rows.append(row)
-            if row[1] is not None:
-                pdf.append([row[0], row[1]])
-                pdf_count += 1
-            elif row[2] is not None:
-                doi.append([row[0], row[2]])
-                ieee_count += 1
-            elif row[3] is not None:
-                doi.append([row[0], row[3]])
-                doi_count += 1
-            elif row[4] is not None:
-                if 'pdf' not in row[4]:
-                    doi.append([row[0], row[4]])
-                url_count += 1
-                domain = urlparse(row[4]).netloc
-                if domain in domains:
-                    domains[domain] += 1
-                else:
-                    domains[domain] = 1
-            else:
-                empty_count += 1
-    print("Wrote {} rows".format(len(rows)))
-    print("pdf count: {}".format(pdf_count))
-    print("ieee count: {}".format(ieee_count))
-    print("doi count: {}".format(doi_count))
-    print("url count: {}".format(url_count))
-    for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
-        print(" -- {} - {}".format(domain, count))
-    print("empty count: {}".format(empty_count))
-    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
-    write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
-    write_csv('db_paper_doi.csv', keys=None, rows=doi)
-    # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
-    # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
-    # write_csv('raw_paper_doi.csv', keys=None, rows=doi)
-
-def process_db_paper(fn):
-    # print(fn)
-    paper = read_json(fn)
-    if paper is None:
-        return None
-    paper_id = paper['id']
-    pdf_url = None
-    ieee_url = None
-    doi_url = None
-    extra_url = None
-    if paper['s2PdfUrl']:
-        pdf_url = paper['s2PdfUrl']
-    for url in paper['pdfUrls']:
-        if 'ieeexplore.ieee.org' in url:
-            ieee_url = url
-        elif 'doi.org' in url:
-            doi_url = url
-        elif pdf_url is None and 'pdf' in url:
-            pdf_url = url
-        else:
-            extra_url = url
-    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
-
-def process_raw_paper(fn):
-    print(fn)
-    data = read_json(fn)
-    if 'paper' not in data:
-        print(data)
-        return
-    paper = data['paper']
-    if paper is None:
-        return None
-    paper_id = paper['id']
-    pdf_url = None
-    ieee_url = None
-    doi_url = None
-    extra_url = None
-    if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
-        primary_url = paper['primaryPaperLink']['url']
-        if 'pdf' in primary_url:
-            pdf_url = primary_url
-        elif 'doi' in primary_url:
-            doi_url = primary_url
-    for link in paper['links']:
-        url = link['url']
-        if 'ieeexplore.ieee.org' in url:
-            ieee_url = url
-        elif 'doi.org' in url:
-            doi_url = url
-        elif pdf_url is None and 'pdf' in url:
-            pdf_url = url
-        else:
-            extra_url = url
-    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
-
-if __name__ == '__main__':
-    s2_dump_pdf_urls()
```
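The deleted script star-imports `read_json` and `write_csv` from a local `util` module that this diff doesn't touch. The helpers below are a minimal sketch inferred purely from the call sites above, not the repo's actual `util.py`: `read_json(fn)` must return a parsed dict or `None`, and `write_csv(fn, keys=..., rows=...)` must write an optional header row followed by the data rows.

```python
# Hypothetical stand-ins for the util helpers; signatures inferred
# from how the deleted script calls them.
import csv
import simplejson as json  # matches the script's own JSON library

def read_json(fn):
    """Parse a JSON file, returning None when it is missing or malformed."""
    try:
        with open(fn) as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return None

def write_csv(fn, keys=None, rows=None):
    """Write rows to a CSV file, preceded by a header row when keys is given."""
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        if keys is not None:
            writer.writerow(keys)
        writer.writerows(rows or [])
```

With those in place the script ran directly as `python s2-dump-db-pdf-urls.py`; the `@click.command()` decorator only supplies the CLI wrapper.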

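The comment at the top of `s2_dump_pdf_urls` says "another script will fetch the urls from this process". That fetcher is not part of this diff; the sketch below is only a hypothetical consumer of the two-column (paper id, URL) `db_paper_pdf.csv` output, with the destination directory `datasets/s2/pdfs` assumed.

```python
# Hypothetical downstream fetcher for the db_paper_pdf.csv dump.
import csv
import os
import urllib.request

PDF_DIR = 'datasets/s2/pdfs'  # assumed destination; not named anywhere in this diff

def fetch_pdfs(csv_path='db_paper_pdf.csv'):
    os.makedirs(PDF_DIR, exist_ok=True)
    with open(csv_path) as f:
        # db_paper_pdf.csv is written with keys=None, so there is no header row
        for paper_id, url in csv.reader(f):
            dest = os.path.join(PDF_DIR, '{}.pdf'.format(paper_id))
            if os.path.exists(dest):
                continue  # resume-friendly: skip files already on disk
            try:
                urllib.request.urlretrieve(url, dest)
            except Exception as exc:
                print('failed {}: {}'.format(url, exc))

if __name__ == '__main__':
    fetch_pdfs()
```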