import glob
import operator
from urllib.parse import urlparse

import click

from util import *

PAPER_JSON_DIR = 'datasets/s2/db_papers'


@click.command()
def s2_dump_pdf_urls():
    # loop over all the papers in db_papers
    # get all the PDF urls, pick the best one
    # store it and the paper id
    # another script will fetch the urls from this output
    rows = []
    pdf_count = 0
    ieee_count = 0
    url_count = 0
    doi_count = 0
    empty_count = 0
    domains = {}
    pdf = []
    doi = []

    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
        # if 'db_paper' in fn:
        row = process_db_paper(fn)
        # elif 'raw_paper' in fn:
        #     row = process_raw_paper(fn)

        if row is not None:
            rows.append(row)
            if row[1] is not None:        # direct PDF url
                pdf.append([row[0], row[1]])
                pdf_count += 1
            elif row[2] is not None:      # IEEE Xplore url
                doi.append([row[0], row[2]])
                ieee_count += 1
            elif row[3] is not None:      # DOI url
                doi.append([row[0], row[3]])
                doi_count += 1
            elif row[4] is not None:      # extra / landing-page url
                if 'pdf' not in row[4]:
                    doi.append([row[0], row[4]])
                url_count += 1
                domain = urlparse(row[4]).netloc
                if domain in domains:
                    domains[domain] += 1
                else:
                    domains[domain] = 1
            else:
                empty_count += 1

    print("Wrote {} rows".format(len(rows)))
    print("pdf count: {}".format(pdf_count))
    print("ieee count: {}".format(ieee_count))
    print("doi count: {}".format(doi_count))
    print("url count: {}".format(url_count))
    for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
        print("  -- {} - {}".format(domain, count))
    print("empty count: {}".format(empty_count))

    write_csv('db_paper_pdf_list.csv',
              keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'],
              rows=rows)
    write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
    write_csv('db_paper_doi.csv', keys=None, rows=doi)
    # write_csv('raw_paper_pdf_list.csv',
    #           keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'],
    #           rows=rows)
    # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
    # write_csv('raw_paper_doi.csv', keys=None, rows=doi)


def process_db_paper(fn):
    """Extract [paper_id, pdf_url, ieee_url, doi_url, extra_url] from a db_papers paper.json."""
    # print(fn)
    paper = read_json(fn)
    if paper is None:
        return None

    paper_id = paper['id']
    pdf_url = None
    ieee_url = None
    doi_url = None
    extra_url = None

    if paper['s2PdfUrl']:
        pdf_url = paper['s2PdfUrl']

    for url in paper['pdfUrls']:
        if 'ieeexplore.ieee.org' in url:
            ieee_url = url
        elif 'doi.org' in url:
            doi_url = url
        elif pdf_url is None and 'pdf' in url:
            pdf_url = url
        else:
            extra_url = url

    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]


def process_raw_paper(fn):
    """Extract [paper_id, pdf_url, ieee_url, doi_url, extra_url] from a raw_papers record."""
    print(fn)
    data = read_json(fn)
    if data is None or 'paper' not in data:
        print(data)
        return None

    paper = data['paper']
    if paper is None:
        return None

    paper_id = paper['id']
    pdf_url = None
    ieee_url = None
    doi_url = None
    extra_url = None

    if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
        primary_url = paper['primaryPaperLink']['url']
        if 'pdf' in primary_url:
            pdf_url = primary_url
        elif 'doi' in primary_url:
            doi_url = primary_url

    for link in paper['links']:
        url = link['url']
        if 'ieeexplore.ieee.org' in url:
            ieee_url = url
        elif 'doi.org' in url:
            doi_url = url
        elif pdf_url is None and 'pdf' in url:
            pdf_url = url
        else:
            extra_url = url

    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]


if __name__ == '__main__':
    s2_dump_pdf_urls()
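
# ----------------------------------------------------------------------------
# The helpers read_json() and write_csv() are imported from the local `util`
# module, which is not shown in this file. The commented-out sketch below only
# illustrates the behaviour this script relies on, inferred from the call
# sites above (read_json returns a parsed dict or None; write_csv writes an
# optional header row followed by the data rows). It is an assumption about
# util, not its actual implementation.
#
# import csv
# import simplejson as json
#
# def read_json(fn):
#     # Parse a JSON file, returning None when it is missing or malformed.
#     try:
#         with open(fn) as f:
#             return json.load(f)
#     except (IOError, ValueError):
#         return None
#
# def write_csv(path, keys=None, rows=None):
#     # Write rows to a CSV file, with a header row when keys is given.
#     with open(path, 'w', newline='') as f:
#         writer = csv.writer(f)
#         if keys is not None:
#             writer.writerow(keys)
#         writer.writerows(rows or [])
# ----------------------------------------------------------------------------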