| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-doi-report.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-doi-report.py')
| -rw-r--r-- | s2-doi-report.py | 249 |
|---|---|---|

1 file changed, 0 insertions, 249 deletions
diff --git a/s2-doi-report.py b/s2-doi-report.py
deleted file mode 100644
index 3f13021f..00000000
--- a/s2-doi-report.py
+++ /dev/null
@@ -1,249 +0,0 @@
-import re
-import os
-import gzip
-import glob
-import json
-import click
-import operator
-from util import *
-from bs4 import BeautifulSoup
-from importlib import import_module
-from urllib.parse import unquote
-doi = import_module('s2-fetch-doi')
-
-DOI_DIR = 'datasets/s2/doi'
-
-@click.command()
-def doi_report():
-    rows = []
-    domains = {}
-    institutions = {}
-    # geocode_lookup = load_geocode_lookup()
-    addresses = AddressBook()
-
-    geocoded_papers = []
-    unknown_papers = []
-    unattributed_papers = []
-    paper_count = 0
-    ieee_count = 0
-    springer_count = 0
-    sciencedirect_count = 0
-    acm_count = 0
-    computerorg_count = 0
-    elsevier_count = 0
-    unparsed_count = 0
-    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
-        paper_count += 1
-        url_info = read_json(fn)
-        domain = url_info['domain']
-        paper_id = url_info['paper_id']
-        paper = load_paper(paper_id)
-        doi_fn = fn.replace('.url', '.doi')
-        address = None
-        if domain in domains:
-            domains[domain] += 1
-        else:
-            domains[domain] = 1
-        affiliations = None
-        paper_affiliation_count = 0
-
-        if 'ieee.org' in domain:
-            ieee_count += 1
-            affiliations = load_ieee(paper, doi_fn)
-        elif 'link.springer.com' in domain:
-            springer_count += 1
-            affiliations = load_springer(paper, doi_fn)
-        elif 'sciencedirect.com' in domain:
-            sciencedirect_count += 1
-            affiliations = load_sciencedirect(paper, doi_fn)
-        elif 'acm.org' in domain:
-            acm_count += 1
-            affiliations = load_acm(paper, doi_fn)
-        elif 'computer.org' in domain:
-            computerorg_count += 1
-            affiliations = load_computerorg(paper, doi_fn)
-        elif 'elsevier.com' in domain:
-            elsevier_count += 1
-            affiliations = load_elsevier(paper, doi_fn)
-        else:
-            unparsed_count += 1
-
-        if affiliations:
-            for affiliation in affiliations:
-                if affiliation:
-                    paper_affiliation_count += 1
-                    if affiliation in institutions:
-                        institutions[affiliation] += 1
-                    else:
-                        institutions[affiliation] = 1
-                    address = addresses.find(affiliation)
-                    if not address:
-                        unknown_papers.append([paper.paper_id, paper.title, affiliation])
-        if paper_affiliation_count == 0:
-            unattributed_papers.append([paper.paper_id, paper.title])
-        if address:
-            geocoded_papers.append([paper.paper_id, paper.title] + address)
-
-    domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
-    # for domain, count in domain_list:
-    #     print('{}\t{}'.format(count, domain))
-    institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))
-    # for institution, count in institution_list:
-    #     print('{}\t{}'.format(count, institution))
-    display_institution_list = []
-    unknown_institution_list = []
-    for inst in institution_list:
-        addr = addresses.find(inst[0])
-        if addr:
-            display_institution_list.append((BoldLine(inst[0]), inst[1],))
-        elif len(inst[0]) > 1:
-            display_institution_list.append(inst)
-            unknown_institution_list.append(inst)
-    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
-    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
-    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI', keys=None, rows=unknown_institution_list)
-    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
-    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
-    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
-    print("total papers: {}".format(paper_count))
-    print(".. ieee: {}".format(ieee_count))
-    print(".. springer: {}".format(springer_count))
-    print(".. acm: {}".format(acm_count))
-    print(".. computerorg: {}".format(computerorg_count))
-    print(".. sciencedirect: {}".format(sciencedirect_count))
-    print(".. elsevier: {}".format(elsevier_count))
-    print(".. unparsed: {}".format(unparsed_count))
-    print("geocoded papers: {}".format(len(geocoded_papers)))
-    print("unknown papers: {}".format(len(unknown_papers)))
-    print("unattributed papers: {}".format(len(unattributed_papers)))
-
-def load_ieee(paper, fn):
-    with open(fn, 'r') as f:
-        try:
-            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
-            data = json.loads(data)
-            write_json(fn.replace('paper.doi', 'ieee.json'), data)
-            # print(data)
-        except:
-            print('ieee: could not read data')
-            return None
-    affiliations = [ author['affiliation'] for author in data['authors'] ]
-    institutions = [ [ paper.paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] ]
-    # print(affiliations)
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    return affiliations
-
-def load_springer(paper, fn):
-    # print('springer: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except:
-            print('springer: could not read data')
-            return None
-    items = soup.find_all(class_='affiliation__item')
-    affiliations = [ ', '.join(item.strings) for item in items ]
-    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    return affiliations
-
-def load_sciencedirect(paper, fn):
-    # print('sciencedirect: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except:
-            print('sciencedirect: could not read data')
-            return None
-
-    items = soup.find_all("script", type='application/json', limit=1)
-    if len(items) == 0:
-        return None
-
-    try:
-        data = json.loads(items[0].string)
-        write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
-        # print(data)
-    except:
-        print('sciencedirect: json error')
-        return None
-
-    affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()]
-
-    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    return affiliations
-
-def load_acm(paper, fn):
-    # print('acm: {}'.format(paper.paper_id))
-    with open(fn, 'r') as f:
-        try:
-            soup = BeautifulSoup(f.read(), 'html.parser')
-        except:
-            print('acm: could not read data')
-            return None
-    items = soup.find_all("a", title='Institutional Profile Page')
-    affiliations = [ item.string for item in items ]
-    # print(affiliations)
-    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
-    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    return affiliations
-
-def load_computerorg(paper, fn):
-    # print('computerorg: {}'.format(paper.paper_id))
-    # if not os.path.exists(doi.old_doi_fn(fn)):
-    pass
-    # with open(fn, 'r') as f:
-    #     try:
-    #         soup = BeautifulSoup(f.read(), 'html.parser')
-    #     except:
-    #         print('computerorg: could not read data')
-    #         return None
-    #     items = soup.find_all("a", title='Institutional Profile Page')
-    #     affiliations = [ item.string for item in items ]
-    #     print(affiliations)
-    #     institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
-    #     write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    #     return affiliations
-
-def load_elsevier(paper, fn):
-    print('elsevier: {}'.format(paper.paper_id))
-    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
-        with open(fn, 'r') as f:
-            try:
-                soup = BeautifulSoup(f.read(), 'html.parser')
-            except:
-                print('elsevier: could not read data')
-                return None
-        item = soup.find_all("input", attrs={"name": 'redirectURL'})[0]
-        new_url = unquote(item['value'])
-        if new_url:
-            print(new_url)
-            doi.fetch_doi(paper.paper_id, new_url, replace=True)
-        else:
-            print("missing redirect url: {}".format(paper.paper_id))
-    # print('elsevier: {}'.format(paper.paper_id))
-    # with open(fn, 'r') as f:
-    #     try:
-    #         soup = BeautifulSoup(f.read(), 'html.parser')
-    #     except:
-    #         print('elsevier: could not read data')
-    #         return None
-    #     items = soup.find_all("a", title='Institutional Profile Page')
-    #     affiliations = [ item.string for item in items ]
-    #     # print(affiliations)
-    #     institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
-    #     write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
-    #     return affiliations
-
-def find_authors(authors, line):
-    for a in authors:
-        if a[2] in line:
-            return a
-    return None
-
-def paper_path(paper_id):
-    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
-    doi_report()
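For reference, the trick at the heart of the deleted IEEE path is worth noting: IEEE Xplore embeds the article metadata as a JavaScript assignment (`global.document.metadata=...`) inside a `<script>` tag, and `load_ieee` above cut it out with plain string splits before parsing it as JSON. Below is a minimal standalone sketch of that pattern; the function name `extract_ieee_metadata` and the sample path are hypothetical, and only the split-and-parse logic comes from the deleted code.

```python
import json

def extract_ieee_metadata(html):
    """Slice the global.document.metadata JSON blob out of an IEEE
    Xplore page. Returns the parsed dict, or None on any failure."""
    try:
        # Take everything between 'global.document.metadata=' and the
        # closing </script>, then drop the trailing semicolon.
        blob = html.split('global.document.metadata=')[1]
        blob = blob.split('</script>')[0].strip()[:-1]
        return json.loads(blob)
    except (IndexError, ValueError):
        # IndexError: marker not found; ValueError: blob is not valid JSON.
        return None

# Hypothetical usage against a fetched page saved on disk:
# with open('datasets/s2/doi/ab/abcd1234/paper.doi') as f:
#     meta = extract_ieee_metadata(f.read())
# if meta:
#     print([author['affiliation'] for author in meta['authors']])
```

Unlike the bare `except:` in the original, the sketch narrows the handler to the two exceptions the slicing and parsing can actually raise.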
