summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jules@lens <julescarbon@gmail.com>, 2018-11-04 12:35:06 +0100
committer: jules@lens <julescarbon@gmail.com>, 2018-11-04 12:35:06 +0100
commit: b6eaaf784f42867fb0ab6ce274c564c69043600a (patch)
tree: b3aa5ffe7782fbef7dd934a276c2b34d9790e892
parent: 0af672923232b53b2cdd3c41cb6768a64e200e68 (diff)
stats
-rw-r--r-- s2-dump-db-pdf-urls.py | 41
1 files changed, 32 insertions, 9 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
index 473e90af..80dcb0bd 100644
--- a/s2-dump-db-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -2,6 +2,8 @@ import os
import glob
import simplejson as json
import click
+from urllib.parse import urlparse
+import operator
from util import *
PAPER_JSON_DIR = 'datasets/s2/db_papers'
@@ -15,46 +17,67 @@ def s2_dump_pdf_urls():
rows = []
pdf_count = 0
ieee_count = 0
- extra_count = 0
- empty_count += 1
+ url_count = 0
+ doi_count = 0
+ empty_count = 0
+ domains = {}
+ pdf = []
+ doi = []
for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
row = process_paper(fn)
if row is not None:
rows.append(row)
if row[1] is not None:
+ pdf.append([row[0], row[1]])
pdf_count += 1
- if row[2] is not None:
+ elif row[2] is not None:
ieee_count += 1
- if row[3] is not None:
- extra_count += 1
- if row[1] is None and row[2] is None and row[3] is None:
+ elif row[3] is not None:
+ doi.append([row[0], row[3]])
+ doi_count += 1
+ elif row[4] is not None:
+ url_count += 1
+ domain = urlparse(row[4]).netloc
+ if domain in domains:
+ domains[domain] += 1
+ else:
+ domains[domain] = 1
+ else:
empty_count += 1
print("Wrote {} rows".format(len(rows)))
print("pdf count: {}".format(pdf_count))
print("ieee count: {}".format(ieee_count))
+ print("doi count: {}".format(doi_count))
print("url count: {}".format(url_count))
+ for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
+ print(" -- {} - {}".format(domain, count))
print("empty count: {}".format(empty_count))
- write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows)
+ write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+ write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
+ write_csv('db_paper_doi.csv', keys=None, rows=doi)
def process_paper(fn):
- print(fn)
+ # print(fn)
paper = read_json(fn)
if paper is None:
return None
paper_id = paper['id']
pdf_url = None
ieee_url = None
+ doi_url = None
extra_url = None
if paper['s2PdfUrl']:
pdf_url = paper['s2PdfUrl']
for url in paper['pdfUrls']:
if 'ieeexplore.ieee.org' in url:
ieee_url = url
+ elif 'doi.org' in url:
+ doi_url = url
elif pdf_url is None and 'pdf' in url:
pdf_url = url
else:
extra_url = url
- return [paper_id, pdf_url, ieee_url, extra_url]
+ return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
if __name__ == '__main__':
s2_dump_pdf_urls()