Diffstat (limited to 'scraper/s2-doi-report.py')
| -rw-r--r-- | scraper/s2-doi-report.py | 249 |
1 file changed, 249 insertions, 0 deletions
diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
new file mode 100644
index 00000000..b10b5da1
--- /dev/null
+++ b/scraper/s2-doi-report.py
@@ -0,0 +1,249 @@
+# Build affiliation and geocoding reports from scraped DOI landing pages.
+import re
+import os
+import gzip
+import glob
+import simplejson as json
+import click
+import operator
+from util import *
+from bs4 import BeautifulSoup
+from importlib import import_module
+from urllib.parse import unquote
+
+doi = import_module('s2-fetch-doi')
+
+DOI_DIR = 'datasets/s2/doi'
+
+@click.command()
+def doi_report():
+    rows = []
+    domains = {}
+    institutions = {}
+    # geocode_lookup = load_geocode_lookup()
+    addresses = AddressBook()
+
+    geocoded_papers = []
+    unknown_papers = []
+    unattributed_papers = []
+    paper_count = 0
+    ieee_count = 0
+    springer_count = 0
+    sciencedirect_count = 0
+    acm_count = 0
+    computerorg_count = 0
+    elsevier_count = 0
+    unparsed_count = 0
+    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
+        paper_count += 1
+        url_info = read_json(fn)
+        domain = url_info['domain']
+        paper_id = url_info['paper_id']
+        paper = load_paper(paper_id)
+        doi_fn = fn.replace('.url', '.doi')
+        address = None
+        if domain in domains:
+            domains[domain] += 1
+        else:
+            domains[domain] = 1
+        affiliations = None
+        paper_affiliation_count = 0
+
+        # Dispatch to the publisher-specific parser for this domain.
+        if 'ieee.org' in domain:
+            ieee_count += 1
+            affiliations = load_ieee(paper, doi_fn)
+        elif 'link.springer.com' in domain:
+            springer_count += 1
+            affiliations = load_springer(paper, doi_fn)
+        elif 'sciencedirect.com' in domain:
+            sciencedirect_count += 1
+            affiliations = load_sciencedirect(paper, doi_fn)
+        elif 'acm.org' in domain:
+            acm_count += 1
+            affiliations = load_acm(paper, doi_fn)
+        elif 'computer.org' in domain:
+            computerorg_count += 1
+            affiliations = load_computerorg(paper, doi_fn)
+        elif 'elsevier.com' in domain:
+            elsevier_count += 1
+            affiliations = load_elsevier(paper, doi_fn)
+        else:
+            unparsed_count += 1
+
+        if affiliations:
+            for affiliation in affiliations:
+                if affiliation:
+                    paper_affiliation_count += 1
+                    if affiliation in institutions:
+                        institutions[affiliation] += 1
+                    else:
+                        institutions[affiliation] = 1
+                    address = addresses.find(affiliation)
+                    if not address:
+                        unknown_papers.append([paper.paper_id, paper.title, affiliation])
+        if paper_affiliation_count == 0:
+            unattributed_papers.append([paper.paper_id, paper.title])
+        if address:
+            geocoded_papers.append([paper.paper_id, paper.title] + address)
+
+    domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
+    # for domain, count in domain_list:
+    #     print('{}\t{}'.format(count, domain))
+    institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))
+    # for institution, count in institution_list:
+    #     print('{}\t{}'.format(count, institution))
+    display_institution_list = []
+    unknown_institution_list = []
+    for inst in institution_list:
+        addr = addresses.find(inst[0])
+        if addr:
+            display_institution_list.append((BoldLine(inst[0]), inst[1],))
+        elif len(inst[0]) > 1:
+            display_institution_list.append(inst)
+            unknown_institution_list.append(inst)
+    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
+    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
+    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI', keys=None, rows=unknown_institution_list)
+    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
+    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
+    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
+    print("total papers: {}".format(paper_count))
+    print(".. ieee: {}".format(ieee_count))
+    print(".. springer: {}".format(springer_count))
+    print(".. acm: {}".format(acm_count))
+    print(".. computerorg: {}".format(computerorg_count))
+    print(".. sciencedirect: {}".format(sciencedirect_count))
+    print(".. elsevier: {}".format(elsevier_count))
+    print(".. unparsed: {}".format(unparsed_count))
+    print("geocoded papers: {}".format(len(geocoded_papers)))
+    print("unknown papers: {}".format(len(unknown_papers)))
+    print("unattributed papers: {}".format(len(unattributed_papers)))
+
+def load_ieee(paper, fn):
+    # IEEE Xplore embeds paper metadata as an inline JS assignment; slice the
+    # JSON literal out of the raw page rather than parsing the DOM.
+    with open(fn, 'r') as f:
+        try:
+            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
+            data = json.loads(data)
+            write_json(fn.replace('paper.doi', 'ieee.json'), data)
+            # print(data)
+        except Exception:
+            print('ieee: could not read data')
+            return None
+    affiliations = [ author['affiliation'] for author in data['authors'] ]
+    institutions = [ [ paper.paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] ]
+    # print(affiliations)
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_springer(paper, fn):
+    # Springer marks each affiliation with the 'affiliation__item' CSS class.
+    # print('springer: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('springer: could not read data')
+            return None
+    items = soup.find_all(class_='affiliation__item')
+    affiliations = [ ', '.join(item.strings) for item in items ]
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_sciencedirect(paper, fn):
+    # ScienceDirect ships its page state in the first application/json <script>.
+    # print('sciencedirect: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('sciencedirect: could not read data')
+            return None
+
+    items = soup.find_all("script", type='application/json', limit=1)
+    if len(items) == 0:
+        return None
+
+    try:
+        data = json.loads(items[0].string)
+        write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
+        # print(data)
+    except Exception:
+        print('sciencedirect: json error')
+        return None
+
+    affiliations = [ value['$$'][0]['_'] for value in data['authors']['affiliations'].values() ]
+
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_acm(paper, fn):
+    # The ACM DL links each affiliation to an institutional profile page.
+    # print('acm: {}'.format(paper.paper_id))
+    with open(fn, 'r') as f:
+        try:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        except Exception:
+            print('acm: could not read data')
+            return None
+    items = soup.find_all("a", title='Institutional Profile Page')
+    affiliations = [ item.string for item in items ]
+    # print(affiliations)
+    institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    return affiliations
+
+def load_computerorg(paper, fn):
+    # computer.org parsing is not implemented yet; the draft below stays
+    # commented out until the fetched pages are usable.
+    # print('computerorg: {}'.format(paper.paper_id))
+    # if not os.path.exists(doi.old_doi_fn(fn)):
+    pass
+    # with open(fn, 'r') as f:
+    #     try:
+    #         soup = BeautifulSoup(f.read(), 'html.parser')
+    #     except:
+    #         print('computerorg: could not read data')
+    #         return None
+    # items = soup.find_all("a", title='Institutional Profile Page')
+    # affiliations = [ item.string for item in items ]
+    # print(affiliations)
+    # institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    # return affiliations
+
+def load_elsevier(paper, fn):
+    # Elsevier pages fetched so far are interstitials: pull out the
+    # redirectURL field and re-fetch the real landing page instead of
+    # parsing affiliations here.
+    print('elsevier: {}'.format(paper.paper_id))
+    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
+        with open(fn, 'r') as f:
+            try:
+                soup = BeautifulSoup(f.read(), 'html.parser')
+            except Exception:
+                print('elsevier: could not read data')
+                return None
+        item = soup.find_all("input", attrs={"name": 'redirectURL'})[0]
+        new_url = unquote(item['value'])
+        if new_url:
+            print(new_url)
+            doi.fetch_doi(paper.paper_id, new_url, replace=True)
+        else:
+            print("missing redirect url: {}".format(paper.paper_id))
+    # print('elsevier: {}'.format(paper.paper_id))
+    # with open(fn, 'r') as f:
+    #     try:
+    #         soup = BeautifulSoup(f.read(), 'html.parser')
+    #     except:
+    #         print('elsevier: could not read data')
+    #         return None
+    # items = soup.find_all("a", title='Institutional Profile Page')
+    # affiliations = [ item.string for item in items ]
+    # # print(affiliations)
+    # institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ]
+    # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions })
+    # return affiliations
+
+def find_authors(authors, line):
+    for a in authors:
+        if a[2] in line:
+            return a
+    return None
+
+def paper_path(paper_id):
+    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+    doi_report()
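
Of the parsers in this commit, load_ieee is the only one that does not walk the DOM: IEEE Xplore embeds the paper metadata as an inline JavaScript assignment, and the code slices the JSON literal straight out of the raw HTML. Below is a minimal, self-contained sketch of that slicing step. The sample HTML is fabricated for illustration, and the stdlib json module stands in for the script's simplejson.

import json

SAMPLE_HTML = '''<html><head><script>
global.document.metadata={"authors":[{"name":"A. Author","affiliation":"Example University"}]};
</script></head></html>'''

def extract_ieee_metadata(html):
    # Take everything after the assignment and before the closing </script>,
    # strip whitespace, then drop the trailing semicolon before decoding.
    payload = html.split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
    return json.loads(payload)

meta = extract_ieee_metadata(SAMPLE_HTML)
print([a['affiliation'] for a in meta['authors']])  # ['Example University']

The other loaders follow the same open/parse/write pattern but select nodes with BeautifulSoup instead. The slice-based approach is brittle if IEEE ever renames the global, which is presumably why load_ieee wraps the whole extraction in a broad exception handler.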
