Diffstat (limited to 'scraper/s2-doi-report.py')
-rw-r--r--  scraper/s2-doi-report.py  249
1 file changed, 249 insertions(+), 0 deletions(-)
diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
new file mode 100644
index 00000000..b10b5da1
--- /dev/null
+++ b/scraper/s2-doi-report.py
@@ -0,0 +1,249 @@
+import os
+import glob
+import simplejson as json
+import click
+import operator
+from util import *
+from bs4 import BeautifulSoup
+from importlib import import_module
+from urllib.parse import unquote
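+# the fetcher module's filename contains a hyphen, so import it via importlib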
+doi = import_module('s2-fetch-doi')
+
+DOI_DIR = 'datasets/s2/doi'
+
+@click.command()
+def doi_report():
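+    """Walk the cached DOI pages, tally domains and author affiliations per
+    publisher, and write the HTML/CSV reports under reports/."""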
+    rows = []
+    domains = {}
+    institutions = {}
+    # geocode_lookup = load_geocode_lookup()
+    addresses = AddressBook()
+
+    geocoded_papers = []
+    unknown_papers = []
+    unattributed_papers = []
+    paper_count = 0
+    ieee_count = 0
+    springer_count = 0
+    sciencedirect_count = 0
+    acm_count = 0
+    computerorg_count = 0
+    elsevier_count = 0
+    unparsed_count = 0
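+    # Each cached DOI lookup is a pair of files: <id>.url (JSON with the
+    # resolved domain and paper id) and <id>.doi (the fetched HTML page).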
+    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
+        paper_count += 1
+        url_info = read_json(fn)
+        domain = url_info['domain']
+        paper_id = url_info['paper_id']
+        paper = load_paper(paper_id)
+        doi_fn = fn.replace('.url', '.doi')
+        address = None
+        domains[domain] = domains.get(domain, 0) + 1
+        affiliations = None
+        paper_affiliation_count = 0
+
+        if 'ieee.org' in domain:
+            ieee_count += 1
+            affiliations = load_ieee(paper, doi_fn)
+        elif 'link.springer.com' in domain:
+            springer_count += 1
+            affiliations = load_springer(paper, doi_fn)
+        elif 'sciencedirect.com' in domain:
+            sciencedirect_count += 1
+            affiliations = load_sciencedirect(paper, doi_fn)
+        elif 'acm.org' in domain:
+            acm_count += 1
+            affiliations = load_acm(paper, doi_fn)
+        elif 'computer.org' in domain:
+            computerorg_count += 1
+            affiliations = load_computerorg(paper, doi_fn)
+        elif 'elsevier.com' in domain:
+            elsevier_count += 1
+            affiliations = load_elsevier(paper, doi_fn)
+        else:
+            unparsed_count += 1
+
+        if affiliations:
+            for affiliation in affiliations:
+                if affiliation:
+                    paper_affiliation_count += 1
+                    institutions[affiliation] = institutions.get(affiliation, 0) + 1
+                    found = addresses.find(affiliation)
+                    if found:
+                        # Remember the most recently resolved address for the
+                        # geocoded report below.
+                        address = found
+                    else:
+                        unknown_papers.append([paper.paper_id, paper.title, affiliation])
+        if paper_affiliation_count == 0:
+            unattributed_papers.append([paper.paper_id, paper.title])
+        if address:
+            geocoded_papers.append([paper.paper_id, paper.title] + address)
+
+    domain_list = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
+    # for domain, count in domain_list:
+    #     print('{}\t{}'.format(count, domain))
+    institution_list = sorted(institutions.items(), key=operator.itemgetter(1), reverse=True)
+    # for institution, count in institution_list:
+    #     print('{}\t{}'.format(count, institution))
+    display_institution_list = []
+    unknown_institution_list = []
+    for name, count in institution_list:
+        addr = addresses.find(name)
+        if addr:
+            # Bold the institutions that geocode successfully.
+            display_institution_list.append((BoldLine(name), count))
+        elif len(name) > 1:
+            # Skip one-character junk strings; everything else is listed both
+            # in the main report and in the unknown-institutions report.
+            display_institution_list.append((name, count))
+            unknown_institution_list.append((name, count))
+    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
+    write_report('reports/doi_institutions.html', title='Institutions from DOI', keys=None, rows=display_institution_list)
+    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI', keys=None, rows=unknown_institution_list)
+    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
+    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
+    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
+    print("total papers: {}".format(paper_count))
+    print(".. ieee: {}".format(ieee_count))
+    print(".. springer: {}".format(springer_count))
+    print(".. acm: {}".format(acm_count))
+    print(".. computerorg: {}".format(computerorg_count))
+    print(".. sciencedirect: {}".format(sciencedirect_count))
+    print(".. elsevier: {}".format(elsevier_count))
+    print(".. unparsed: {}".format(unparsed_count))
+    print("geocoded papers: {}".format(len(geocoded_papers)))
+    print("unknown papers: {}".format(len(unknown_papers)))
+    print("unattributed papers: {}".format(len(unattributed_papers)))
+
+def load_ieee(paper, fn):
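+    # IEEE pages embed the article metadata as a JSON literal assigned to
+    # global.document.metadata inside a <script> tag; slice it out and parse.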
+    with open(fn, 'r') as f:
+        try:
+            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
+            data = json.loads(data)
+            write_json(fn.replace('paper.doi', 'ieee.json'), data)
+            # print(data)
+        except Exception:
+            print('ieee: could not read data')
+            return None
+    affiliations = [ author['affiliation'] for author in data['authors'] ]
+    institutions = [ [ paper.paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] ]
+    # print(affiliations)
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_springer(paper, fn):
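+    # Springer marks each author affiliation with the 'affiliation__item'
+    # class; join the nested strings of each element into a single line.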
+    # print('springer: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('springer: could not read data')
+            return None
+    items = soup.find_all(class_='affiliation__item')
+    affiliations = [ ', '.join(item.strings) for item in items ]
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_sciencedirect(paper, fn):
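+    # ScienceDirect embeds its page state as the first application/json
+    # <script> blob; affiliations live under data['authors']['affiliations'].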
+    # print('sciencedirect: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('sciencedirect: could not read data')
+            return None
+
+    items = soup.find_all("script", type='application/json', limit=1)
+    if len(items) == 0:
+        return None
+
+    try:
+        data = json.loads(items[0].string)
+        write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
+        # print(data)
+    except Exception:
+        print('sciencedirect: json error')
+        return None
+
+    affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()]
+
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_acm(paper, fn):
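+    # ACM links each affiliation to an institutional profile page, so the
+    # text of those anchors yields the affiliation strings directly.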
+    # print('acm: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('acm: could not read data')
+            return None
+    items = soup.find_all("a", title='Institutional Profile Page')
+    affiliations = [ item.string for item in items ]
+    # print(affiliations)
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_computerorg(paper, fn):
+    # Stub: affiliation parsing for computer.org pages is not implemented.
+    # (A commented-out draft mirroring load_acm's 'Institutional Profile
+    # Page' scrape was never enabled.)
+    return None
+
+def load_elsevier(paper, fn):
+    print('elsevier: {}'.format(paper.paper_id))
+    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
+        with open(fn, 'r') as f:
+            try:
+                soup = BeautifulSoup(f.read(), 'html.parser')
+            except Exception:
+                print('elsevier: could not read data')
+                return None
+        # elsevier.com serves an interstitial page; recover the real article
+        # URL from the redirectURL form field and re-fetch the DOI target.
+        item = soup.find("input", attrs={"name": 'redirectURL'})
+        new_url = unquote(item['value']) if item else None
+        if new_url:
+            print(new_url)
+            doi.fetch_doi(paper.paper_id, new_url, replace=True)
+        else:
+            print("missing redirect url: {}".format(paper.paper_id))
+    # Nothing can be parsed until the redirect target has been fetched.
+    return None
+
+def find_authors(authors, line):
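+    # Return the first author row whose affiliation field (index 2) occurs in
+    # the given line; matches the three-field rows built in load_ieee.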
+    for a in authors:
+        if a[2] in line:
+            return a
+    return None
+
+def paper_path(paper_id):
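+    # Cache paths are sharded by the first two characters of the paper id.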
+    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+    doi_report()