author    Jules Laplace <julescarbon@gmail.com>    2018-11-25 22:19:15 +0100
committer Jules Laplace <julescarbon@gmail.com>    2018-11-25 22:19:15 +0100
commit    ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree      41372528e78d4328bc2a47bbbabac7e809c58894 /s2-dump-db-pdf-urls.py
parent    255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
moving stuff
Diffstat (limited to 's2-dump-db-pdf-urls.py')
-rw-r--r--    s2-dump-db-pdf-urls.py    124
1 file changed, 0 insertions, 124 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
deleted file mode 100644
index dbcb91d8..00000000
--- a/s2-dump-db-pdf-urls.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import os
-import glob
-import simplejson as json
-import click
-from urllib.parse import urlparse
-import operator
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
-@click.command()
-def s2_dump_pdf_urls():
-    # loop over all the papers in db_papers
-    # get all the PDF urls, pick the best one
-    # store it and the paper id
-    # another script will fetch the urls from this process
-    rows = []
-    pdf_count = 0
-    ieee_count = 0
-    url_count = 0
-    doi_count = 0
-    empty_count = 0
-    domains = {}
-    pdf = []
-    doi = []
-    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-        # if 'db_paper' in fn:
-        row = process_db_paper(fn)
-        # elif 'raw_paper' in fn:
-        #     row = process_raw_paper(fn)
-        if row is not None:
-            rows.append(row)
-            if row[1] is not None:
-                pdf.append([row[0], row[1]])
-                pdf_count += 1
-            elif row[2] is not None:
-                doi.append([row[0], row[2]])
-                ieee_count += 1
-            elif row[3] is not None:
-                doi.append([row[0], row[3]])
-                doi_count += 1
-            elif row[4] is not None:
-                if 'pdf' not in row[4]:
-                    doi.append([row[0], row[4]])
-                    url_count += 1
-                domain = urlparse(row[4]).netloc
-                if domain in domains:
-                    domains[domain] += 1
-                else:
-                    domains[domain] = 1
-            else:
-                empty_count += 1
-    print("Wrote {} rows".format(len(rows)))
-    print("pdf count: {}".format(pdf_count))
-    print("ieee count: {}".format(ieee_count))
-    print("doi count: {}".format(doi_count))
-    print("url count: {}".format(url_count))
-    for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
-        print(" -- {} - {}".format(domain, count))
-    print("empty count: {}".format(empty_count))
-    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
-    write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
-    write_csv('db_paper_doi.csv', keys=None, rows=doi)
-    # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
-    # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
-    # write_csv('raw_paper_doi.csv', keys=None, rows=doi)
-
-def process_db_paper(fn):
-    # print(fn)
-    paper = read_json(fn)
-    if paper is None:
-        return None
-    paper_id = paper['id']
-    pdf_url = None
-    ieee_url = None
-    doi_url = None
-    extra_url = None
-    if paper['s2PdfUrl']:
-        pdf_url = paper['s2PdfUrl']
-    for url in paper['pdfUrls']:
-        if 'ieeexplore.ieee.org' in url:
-            ieee_url = url
-        elif 'doi.org' in url:
-            doi_url = url
-        elif pdf_url is None and 'pdf' in url:
-            pdf_url = url
-        else:
-            extra_url = url
-    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
-
-def process_raw_paper(fn):
-    print(fn)
-    data = read_json(fn)
-    if data is None or 'paper' not in data:
-        print(data)
-        return None
-    paper = data['paper']
-    if paper is None:
-        return None
-    paper_id = paper['id']
-    pdf_url = None
-    ieee_url = None
-    doi_url = None
-    extra_url = None
-    if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
-        primary_url = paper['primaryPaperLink']['url']
-        if 'pdf' in primary_url:
-            pdf_url = primary_url
-        elif 'doi' in primary_url:
-            doi_url = primary_url
-    for link in paper['links']:
-        url = link['url']
-        if 'ieeexplore.ieee.org' in url:
-            ieee_url = url
-        elif 'doi.org' in url:
-            doi_url = url
-        elif pdf_url is None and 'pdf' in url:
-            pdf_url = url
-        else:
-            extra_url = url
-    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
-
-if __name__ == '__main__':
-    s2_dump_pdf_urls()
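
The deleted script leans on two helpers pulled in by `from util import *`: read_json and write_csv. The project's util module is not part of this diff, so the following is only a minimal sketch of what those helpers would need to look like for the script to run; the exact signatures and error handling are assumptions, inferred from the `if paper is None` checks and the keys/rows call sites above.

# Hypothetical reconstruction of the util helpers imported above;
# the real util.py is not shown in this diff.
import csv
import simplejson as json

def read_json(fn):
    # Parse a JSON file, returning None if it is missing or malformed.
    try:
        with open(fn) as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return None

def write_csv(fn, keys=None, rows=None):
    # Write rows to a CSV file, prepending a header row when keys is given.
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        if keys is not None:
            writer.writerow(keys)
        writer.writerows(rows or [])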
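
The comment at the top of s2_dump_pdf_urls says another script fetches the dumped URLs. That fetcher is not in this diff either; below is a minimal sketch of such a consumer, assuming the two-column id/URL layout of db_paper_pdf.csv (written with keys=None, so there is no header row). The output directory and the use of requests are illustrative, not taken from this repo.

# Hypothetical downstream fetcher for db_paper_pdf.csv; not part of this commit.
import csv
import os
import requests

def fetch_pdfs(csv_fn='db_paper_pdf.csv', out_dir='datasets/s2/pdfs'):
    os.makedirs(out_dir, exist_ok=True)
    with open(csv_fn) as f:
        for paper_id, url in csv.reader(f):
            out_fn = os.path.join(out_dir, '{}.pdf'.format(paper_id))
            if os.path.exists(out_fn):
                continue  # skip PDFs we already fetched
            resp = requests.get(url, timeout=30)
            if resp.ok:
                with open(out_fn, 'wb') as out:
                    out.write(resp.content)

if __name__ == '__main__':
    fetch_pdfs()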