| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:18:01 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:18:01 +0100 |
| commit | 255b8178af1e25a71fd23703d30c0d1f74911f47 (patch) | |
| tree | 405e23266e93501d67465e3e868007d36a50eaca /s2-doi-report.py | |
| parent | 4145a02558e33699ca754c143c547cbe68fafa23 (diff) | |
doi institutions unknown report
Diffstat (limited to 's2-doi-report.py')
| -rw-r--r-- | s2-doi-report.py | 176 |
1 file changed, 153 insertions, 23 deletions
```diff
diff --git a/s2-doi-report.py b/s2-doi-report.py
index 74e388e3..3f13021f 100644
--- a/s2-doi-report.py
+++ b/s2-doi-report.py
@@ -6,6 +6,10 @@ import json
 import click
 import operator
 from util import *
+from bs4 import BeautifulSoup
+from importlib import import_module
+from urllib.parse import unquote
+doi = import_module('s2-fetch-doi')
 
 DOI_DIR = 'datasets/s2/doi'
 
@@ -22,6 +26,11 @@ def doi_report():
     unattributed_papers = []
     paper_count = 0
     ieee_count = 0
+    springer_count = 0
+    sciencedirect_count = 0
+    acm_count = 0
+    computerorg_count = 0
+    elsevier_count = 0
     unparsed_count = 0
     for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
         paper_count += 1
@@ -30,18 +39,36 @@ def doi_report():
         paper_id = url_info['paper_id']
         paper = load_paper(paper_id)
         doi_fn = fn.replace('.url', '.doi')
-        # institutions_fn = fn.replace('paper.url', 'institutions.json')
         address = None
         if domain in domains:
             domains[domain] += 1
         else:
             domains[domain] = 1
-        # if not os.path.exists(institutions_fn):
-        # continue
+        affiliations = None
         paper_affiliation_count = 0
-        if 'ieee' in domain:
+
+        if 'ieee.org' in domain:
             ieee_count += 1
-            affiliations = load_ieee(paper_id, doi_fn)
+            affiliations = load_ieee(paper, doi_fn)
+        elif 'link.springer.com' in domain:
+            springer_count += 1
+            affiliations = load_springer(paper, doi_fn)
+        elif 'sciencedirect.com' in domain:
+            sciencedirect_count += 1
+            affiliations = load_sciencedirect(paper, doi_fn)
+        elif 'acm.org' in domain:
+            acm_count += 1
+            affiliations = load_acm(paper, doi_fn)
+        elif 'computer.org' in domain:
+            computerorg_count += 1
+            affiliations = load_computerorg(paper, doi_fn)
+        elif 'elsevier.com' in domain:
+            elsevier_count += 1
+            affiliations = load_elsevier(paper, doi_fn)
+        else:
+            unparsed_count += 1
+
+        if affiliations:
             for affiliation in affiliations:
                 if affiliation:
                     paper_affiliation_count += 1
@@ -54,10 +81,8 @@ def doi_report():
                     unknown_papers.append([paper.paper_id, paper.title, affiliation])
             if paper_affiliation_count == 0:
                 unattributed_papers.append([paper.paper_id, paper.title])
-        else:
-            unparsed_count += 1
-        if address:
-            geocoded_papers.append([paper.paper_id, paper.title] + address)
+            if address:
+                geocoded_papers.append([paper.paper_id, paper.title] + address)
 
     domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
     # for domain, count in domain_list:
@@ -66,46 +91,151 @@ def doi_report():
     # for institution, count in institution_list:
     #     print('{}\t{}'.format(count, institution))
     display_institution_list = []
-    raw_institution_list = []
+    unknown_institution_list = []
     for inst in institution_list:
         addr = addresses.find(inst[0])
         if addr:
             display_institution_list.append((BoldLine(inst[0]), inst[1],))
-        else:
+        elif len(inst[0]) > 1:
             display_institution_list.append(inst)
+            unknown_institution_list.append(inst)
 
     write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
     write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
+    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI', keys=None, rows=unknown_institution_list)
     write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
     write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
     write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
 
     print("total papers: {}".format(paper_count))
-    print("ieee papers: {}".format(ieee_count))
-    print("unparsed papers: {}".format(unparsed_count))
+    print(".. ieee: {}".format(ieee_count))
+    print(".. springer: {}".format(springer_count))
+    print(".. acm: {}".format(acm_count))
+    print(".. computerorg: {}".format(computerorg_count))
+    print(".. sciencedirect: {}".format(sciencedirect_count))
+    print(".. elsevier: {}".format(elsevier_count))
+    print(".. unparsed: {}".format(unparsed_count))
     print("geocoded papers: {}".format(len(geocoded_papers)))
     print("unknown papers: {}".format(len(unknown_papers)))
     print("unattributed papers: {}".format(len(unattributed_papers)))
 
-def load_ieee(paper_id, fn):
-    paper = load_paper(paper_id)
-    if paper is None:
-        print("{} no paper found!".format(paper_id))
-        return []
+def load_ieee(paper, fn):
     with open(fn, 'r') as f:
-        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
         try:
             data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
             data = json.loads(data)
             write_json(fn.replace('paper.doi', 'ieee.json'), data)
             # print(data)
         except:
-            print('could not read data')
-            return []
+            print('ieee: could not read data')
+            return None
     affiliations = [ author['affiliation'] for author in data['authors'] ]
-    institutions = [ [ paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] ]
+    institutions = [ [ paper.paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] ]
+    # print(affiliations)
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_springer(paper, fn):
+    # print('springer: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except:
+            print('springer: could not read data')
+            return None
+    items = soup.find_all(class_='affiliation__item')
+    affiliations = [ ', '.join(item.strings) for item in items ]
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_sciencedirect(paper, fn):
+    # print('sciencedirect: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except:
+            print('sciencedirect: could not read data')
+            return None
+
+    items = soup.find_all("script", type='application/json', limit=1)
+    if len(items) == 0:
+        return None
+
+    try:
+        data = json.loads(items[0].string)
+        write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
+        # print(data)
+    except:
+        print('sciencedirect: json error')
+        return None
+
+    affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()]
+
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_acm(paper, fn):
+    # print('acm: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except:
+            print('acm: could not read data')
+            return None
+    items = soup.find_all("a", title='Institutional Profile Page')
+    affiliations = [ item.string for item in items ]
     # print(affiliations)
-    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
     return affiliations
 
+def load_computerorg(paper, fn):
+    # print('computerorg: {}'.format(paper.paper_id))
+    # if not os.path.exists(doi.old_doi_fn(fn)):
+    pass
+    # with open(fn, 'r') as f:
+    #     try:
+    #         soup = BeautifulSoup(f.read(), 'html.parser')
+    #     except:
+    #         print('computerorg: could not read data')
+    #         return None
+    #     items = soup.find_all("a", title='Institutional Profile Page')
+    #     affiliations = [ item.string for item in items ]
+    #     # print(affiliations)
+    #     institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    #     write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    #     return affiliations
+
+def load_elsevier(paper, fn):
+    print('elsevier: {}'.format(paper.paper_id))
+    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
+        with open(fn, 'r') as f:
+            try:
+                soup = BeautifulSoup(f.read(), 'html.parser')
+            except:
+                print('elsevier: could not read data')
+                return None
+        item = soup.find_all("input", attrs={"name": 'redirectURL'})[0]
+        new_url = unquote(item['value'])
+        if new_url:
+            print(new_url)
+            doi.fetch_doi(paper.paper_id, new_url, replace=True)
+        else:
+            print("missing redirect url: {}".format(paper.paper_id))
+    # print('elsevier: {}'.format(paper.paper_id))
+    # with open(fn, 'r') as f:
+    #     try:
+    #         soup = BeautifulSoup(f.read(), 'html.parser')
+    #     except:
+    #         print('elsevier: could not read data')
+    #         return None
+    #     items = soup.find_all("a", title='Institutional Profile Page')
+    #     affiliations = [ item.string for item in items ]
+    #     # print(affiliations)
+    #     institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    #     write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    #     return affiliations
+
 def find_authors(authors, line):
     for a in authors:
         if a[2] in line:
```
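The new publisher loaders all share one pattern: open the saved `.doi` HTML page, parse it with BeautifulSoup, and pull affiliation strings out with a publisher-specific selector. Below is a minimal standalone sketch of that pattern; the inline HTML snippet is made up, and the `affiliation__item` class mirrors the Springer markup this commit targets (it may have changed since), so treat the selector as an assumption rather than a stable API.

```python
# Minimal sketch of the affiliation-scraping pattern used by the new loaders.
# The HTML below is hypothetical; load_springer() reads it from a saved .doi file instead.
from bs4 import BeautifulSoup

html = """
<ul>
  <li class="affiliation__item">Department of Informatics, University of Example, Exampleville</li>
  <li class="affiliation__item">Example Research Lab, Sample City</li>
</ul>
"""

soup = BeautifulSoup(html, 'html.parser')
# Same selector as load_springer(); load_acm() uses
# soup.find_all("a", title='Institutional Profile Page') instead.
items = soup.find_all(class_='affiliation__item')
# Join every text fragment of each node into one affiliation string.
affiliations = [', '.join(item.strings) for item in items]
print(affiliations)
```

The Springer and ACM loaders differ only in the selector; the IEEE loader instead slices the `global.document.metadata=` JSON blob out of a script tag, and the ScienceDirect loader reads the page's embedded `application/json` payload.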
