path: root/s2-doi-report.py
author    Jules Laplace <julescarbon@gmail.com>  2018-11-25 22:19:15 +0100
committer Jules Laplace <julescarbon@gmail.com>  2018-11-25 22:19:15 +0100
commit    ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree      41372528e78d4328bc2a47bbbabac7e809c58894 /s2-doi-report.py
parent    255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
moving stuff
Diffstat (limited to 's2-doi-report.py')
-rw-r--r--  s2-doi-report.py  249
1 file changed, 0 insertions(+), 249 deletions(-)
diff --git a/s2-doi-report.py b/s2-doi-report.py
deleted file mode 100644
index 3f13021f..00000000
--- a/s2-doi-report.py
+++ /dev/null
@@ -1,249 +0,0 @@
-import re
-import os
-import gzip
-import glob
-import json
-import click
-import operator
-from util import *
-from bs4 import BeautifulSoup
-from importlib import import_module
-from urllib.parse import unquote
-doi = import_module('s2-fetch-doi')
-
-DOI_DIR = 'datasets/s2/doi'
-
-@click.command()
-def doi_report():
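-    """Walk the cached DOI lookups under DOI_DIR, tally publisher domains,
-    and write HTML/CSV reports of geocoded, unknown, and unattributed
-    paper affiliations."""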
-    rows = []
-    domains = {}
-    institutions = {}
-    # geocode_lookup = load_geocode_lookup()
-    addresses = AddressBook()
-
-    geocoded_papers = []
-    unknown_papers = []
-    unattributed_papers = []
-    paper_count = 0
-    ieee_count = 0
-    springer_count = 0
-    sciencedirect_count = 0
-    acm_count = 0
-    computerorg_count = 0
-    elsevier_count = 0
-    unparsed_count = 0
-    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
-        paper_count += 1
-        url_info = read_json(fn)
-        domain = url_info['domain']
-        paper_id = url_info['paper_id']
-        paper = load_paper(paper_id)
-        doi_fn = fn.replace('.url', '.doi')
-        address = None
-        if domain in domains:
-            domains[domain] += 1
-        else:
-            domains[domain] = 1
-        affiliations = None
-        paper_affiliation_count = 0
-
-        if 'ieee.org' in domain:
-            ieee_count += 1
-            affiliations = load_ieee(paper, doi_fn)
-        elif 'link.springer.com' in domain:
-            springer_count += 1
-            affiliations = load_springer(paper, doi_fn)
-        elif 'sciencedirect.com' in domain:
-            sciencedirect_count += 1
-            affiliations = load_sciencedirect(paper, doi_fn)
-        elif 'acm.org' in domain:
-            acm_count += 1
-            affiliations = load_acm(paper, doi_fn)
-        elif 'computer.org' in domain:
-            computerorg_count += 1
-            affiliations = load_computerorg(paper, doi_fn)
-        elif 'elsevier.com' in domain:
-            elsevier_count += 1
-            affiliations = load_elsevier(paper, doi_fn)
-        else:
-            unparsed_count += 1
-
-        if affiliations:
-            for affiliation in affiliations:
-                if affiliation:
-                    paper_affiliation_count += 1
-                    if affiliation in institutions:
-                        institutions[affiliation] += 1
-                    else:
-                        institutions[affiliation] = 1
-                    address = addresses.find(affiliation)
-                    if not address:
-                        unknown_papers.append([paper.paper_id, paper.title, affiliation])
-        if paper_affiliation_count == 0:
-            unattributed_papers.append([paper.paper_id, paper.title])
-        if address:
-            geocoded_papers.append([paper.paper_id, paper.title] + address)
-
-    domain_list = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
-    # for domain, count in domain_list:
-    #     print('{}\t{}'.format(count, domain))
-    institution_list = sorted(institutions.items(), key=operator.itemgetter(1), reverse=True)
-    # for institution, count in institution_list:
-    #     print('{}\t{}'.format(count, institution))
-    display_institution_list = []
-    unknown_institution_list = []
-    for inst in institution_list:
-        addr = addresses.find(inst[0])
-        if addr:
-            display_institution_list.append((BoldLine(inst[0]), inst[1]))
-        elif len(inst[0]) > 1:
-            display_institution_list.append(inst)
-            unknown_institution_list.append(inst)
-    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
-    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
-    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI', keys=None, rows=unknown_institution_list)
-    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
-    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
-    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
-    print("total papers: {}".format(paper_count))
-    print(".. ieee: {}".format(ieee_count))
-    print(".. springer: {}".format(springer_count))
-    print(".. acm: {}".format(acm_count))
-    print(".. computerorg: {}".format(computerorg_count))
-    print(".. sciencedirect: {}".format(sciencedirect_count))
-    print(".. elsevier: {}".format(elsevier_count))
-    print(".. unparsed: {}".format(unparsed_count))
-    print("geocoded papers: {}".format(len(geocoded_papers)))
-    print("unknown papers: {}".format(len(unknown_papers)))
-    print("unattributed papers: {}".format(len(unattributed_papers)))
-
-def load_ieee(paper, fn):
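-    """Parse a cached IEEE Xplore page: the paper metadata is embedded as
-    a JSON blob assigned to global.document.metadata in a <script> tag."""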
-    with open(fn, 'r') as f:
-        try:
-            # Slice out the JSON between the assignment and the closing
-            # </script>; the trailing [:-1] drops the final semicolon.
-            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
-            data = json.loads(data)
-            write_json(fn.replace('paper.doi', 'ieee.json'), data)
-            # print(data)
-        except Exception:
-            print('ieee: could not read data')
-            return None
-    affiliations = [author['affiliation'] for author in data['authors']]
-    institutions = [[paper.paper_id, author['affiliation'], author['affiliation']] for author in data['authors']]
-    # print(affiliations)
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    return affiliations
-
-def load_springer(paper, fn):
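-    """Parse a cached SpringerLink page: affiliations live in elements
-    with the affiliation__item class."""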
-    # print('springer: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except Exception:
-            print('springer: could not read data')
-            return None
-    items = soup.find_all(class_='affiliation__item')
-    affiliations = [', '.join(item.strings) for item in items]
-    institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    return affiliations
-
-def load_sciencedirect(paper, fn):
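-    """Parse a cached ScienceDirect page: the first application/json
-    <script> tag holds the article data, including author affiliations."""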
-    # print('sciencedirect: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except Exception:
-            print('sciencedirect: could not read data')
-            return None
-
-    items = soup.find_all("script", type='application/json', limit=1)
-    if len(items) == 0:
-        return None
-
-    try:
-        data = json.loads(items[0].string)
-        write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
-        # print(data)
-    except Exception:
-        print('sciencedirect: json error')
-        return None
-
-    affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()]
-
-    institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    return affiliations
-
-def load_acm(paper, fn):
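-    """Parse a cached ACM DL page: affiliations are the links titled
-    'Institutional Profile Page'."""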
-    # print('acm: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except Exception:
-            print('acm: could not read data')
-            return None
-    items = soup.find_all("a", title='Institutional Profile Page')
-    affiliations = [item.string for item in items]
-    # print(affiliations)
-    institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    return affiliations
-
-def load_computerorg(paper, fn):
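-    """Stub: computer.org pages are not parsed yet; always returns None."""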
-    # print('computerorg: {}'.format(paper.paper_id))
-    # if not os.path.exists(doi.old_doi_fn(fn)):
-    pass
-    # with open(fn, 'r') as f:
-    #     try:
-    #         soup = BeautifulSoup(f.read(), 'html.parser')
-    #     except Exception:
-    #         print('computerorg: could not read data')
-    #         return None
-    # items = soup.find_all("a", title='Institutional Profile Page')
-    # affiliations = [item.string for item in items]
-    # print(affiliations)
-    # institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
-    # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    # return affiliations
-
-def load_elsevier(paper, fn):
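-    """Handle an elsevier.com interstitial: pull out its redirectURL and
-    re-fetch the DOI so the real article page gets cached. Returns None;
-    affiliations are picked up on a later pass."""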
-    print('elsevier: {}'.format(paper.paper_id))
-    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
-        with open(fn, 'r') as f:
-            try:
-                soup = BeautifulSoup(f.read(), 'html.parser')
-            except Exception:
-                print('elsevier: could not read data')
-                return None
-        items = soup.find_all("input", attrs={"name": 'redirectURL'})
-        new_url = unquote(items[0]['value']) if items else None
-        if new_url:
-            print(new_url)
-            doi.fetch_doi(paper.paper_id, new_url, replace=True)
-        else:
-            print("missing redirect url: {}".format(paper.paper_id))
-    # print('elsevier: {}'.format(paper.paper_id))
-    # with open(fn, 'r') as f:
-    #     try:
-    #         soup = BeautifulSoup(f.read(), 'html.parser')
-    #     except Exception:
-    #         print('elsevier: could not read data')
-    #         return None
-    # items = soup.find_all("a", title='Institutional Profile Page')
-    # affiliations = [item.string for item in items]
-    # # print(affiliations)
-    # institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
-    # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), {'institutions': institutions})
-    # return affiliations
-
-def find_authors(authors, line):
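-    """Return the first author entry whose third field occurs in line, else None."""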
-    for a in authors:
-        if a[2] in line:
-            return a
-    return None
-
-def paper_path(paper_id):
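-    """Cache directory for a paper, sharded by the first two characters of its id."""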
-    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
-    doi_report()