Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
-rw-r--r--  scraper/s2-dump-db-pdf-urls.py  124
1 file changed, 124 insertions(+), 0 deletions(-)
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
new file mode 100644
index 00000000..dbcb91d8
--- /dev/null
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -0,0 +1,124 @@
+import glob
+import click
+from urllib.parse import urlparse
+import operator
+from util import *
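+# util (not shown in this diff) is assumed to provide read_json(fn), which
+# loads a JSON file and appears to return None on failure, and
+# write_csv(filename, keys, rows), where keys=None seems to mean "no header
+# row"; both are inferred from how they are used below.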
+
+PAPER_JSON_DIR = 'datasets/s2/db_papers'
+
+@click.command()
+def s2_dump_pdf_urls():
+    # Loop over all the papers in db_papers, collect each paper's URLs,
+    # pick the best one, and store it alongside the paper id.
+    # A separate script will fetch the URLs written out here.
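+    # Each row from process_db_paper is
+    # [paper_id, pdf_url, ieee_url, doi_url, extra_url].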
+ rows = []
+ pdf_count = 0
+ ieee_count = 0
+ url_count = 0
+ doi_count = 0
+ empty_count = 0
+ domains = {}
+ pdf = []
+ doi = []
+ for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
+ # if 'db_paper' in fn:
+ row = process_db_paper(fn)
+ # elif 'raw_paper' in fn:
+ # row = process_raw_paper(fn)
+ if row is not None:
+ rows.append(row)
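+            # direct PDF links go to the pdf list; IEEE, DOI and non-PDF
+            # extra URLs all end up in the single doi list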
+ if row[1] is not None:
+ pdf.append([row[0], row[1]])
+ pdf_count += 1
+ elif row[2] is not None:
+ doi.append([row[0], row[2]])
+ ieee_count += 1
+ elif row[3] is not None:
+ doi.append([row[0], row[3]])
+ doi_count += 1
+ elif row[4] is not None:
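+                # only extras without 'pdf' in the URL are written out;
+                # every extra is counted and has its domain tallied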
+ if 'pdf' not in row[4]:
+ doi.append([row[0], row[4]])
+ url_count += 1
+ domain = urlparse(row[4]).netloc
+ if domain in domains:
+ domains[domain] += 1
+ else:
+ domains[domain] = 1
+ else:
+ empty_count += 1
+ print("Wrote {} rows".format(len(rows)))
+ print("pdf count: {}".format(pdf_count))
+ print("ieee count: {}".format(ieee_count))
+ print("doi count: {}".format(doi_count))
+ print("url count: {}".format(url_count))
+ for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
+ print(" -- {} - {}".format(domain, count))
+ print("empty count: {}".format(empty_count))
+ write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+ write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
+ write_csv('db_paper_doi.csv', keys=None, rows=doi)
+ # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
+ # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
+ # write_csv('raw_paper_doi.csv', keys=None, rows=doi)
+
+def process_db_paper(fn):
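+    # A db paper.json appears to carry at least "id", "s2PdfUrl" and
+    # "pdfUrls" (inferred from the accesses below), e.g.:
+    #   {"id": "...", "s2PdfUrl": "https://...", "pdfUrls": ["https://..."]}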
+ # print(fn)
+ paper = read_json(fn)
+ if paper is None:
+ return None
+ paper_id = paper['id']
+ pdf_url = None
+ ieee_url = None
+ doi_url = None
+ extra_url = None
+ if paper['s2PdfUrl']:
+ pdf_url = paper['s2PdfUrl']
+ for url in paper['pdfUrls']:
+ if 'ieeexplore.ieee.org' in url:
+ ieee_url = url
+ elif 'doi.org' in url:
+ doi_url = url
+ elif pdf_url is None and 'pdf' in url:
+ pdf_url = url
+ else:
+ extra_url = url
+ return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
+
+def process_raw_paper(fn):
+ print(fn)
+ data = read_json(fn)
+    # read_json may return None on a bad file; guard before the membership test
+    if data is None or 'paper' not in data:
+        print(data)
+        return None
+ paper = data['paper']
+ if paper is None:
+ return None
+ paper_id = paper['id']
+ pdf_url = None
+ ieee_url = None
+ doi_url = None
+ extra_url = None
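+    # a primary paper link, when present, seeds pdf_url or doi_url before
+    # the remaining links are scanned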
+ if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
+ primary_url = paper['primaryPaperLink']['url']
+ if 'pdf' in primary_url:
+ pdf_url = primary_url
+ elif 'doi' in primary_url:
+ doi_url = primary_url
+ for link in paper['links']:
+ url = link['url']
+ if 'ieeexplore.ieee.org' in url:
+ ieee_url = url
+ elif 'doi.org' in url:
+ doi_url = url
+ elif pdf_url is None and 'pdf' in url:
+ pdf_url = url
+ else:
+ extra_url = url
+ return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
+
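+# Run directly, e.g. `python s2-dump-db-pdf-urls.py`; click also provides
+# a --help flag for free.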
+if __name__ == '__main__':
+ s2_dump_pdf_urls()