diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-07 00:04:38 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-07 00:04:38 +0100 |
| commit | 7e8161af7bbb6dbfaefeef986299f8fb6d2e0915 (patch) | |
| tree | 84a8029410c5a4ccc9cbb0d47feda3a1df70ea4e /s2-doi-report.py | |
| parent | 77226327bf7cc228a47d7765cf76f52e7dd799ae (diff) | |
ieee domain reports
Diffstat (limited to 's2-doi-report.py')
| -rw-r--r-- | s2-doi-report.py | 81 |
1 files changed, 63 insertions, 18 deletions
```diff
diff --git a/s2-doi-report.py b/s2-doi-report.py
index e322b531..8be76192 100644
--- a/s2-doi-report.py
+++ b/s2-doi-report.py
@@ -13,18 +13,53 @@ DOI_DIR = 'datasets/s2/doi'
 def doi_report():
   rows = []
   domains = {}
-  institution_names = []
-  institutions = []
-  no_institutions = []
-  for fn in glob.iglob('{}/**/*.url'.format(PDF_DIR), recursive=True):
+  institutions = {}
+  geocode_lookup = load_geocode_lookup()
+
+  for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
     url_info = read_json(fn)
     domain = url_info['domain']
+    paper_id = url_info['paper_id']
+    doi_fn = fn.replace('.url', '.doi')
+    institutions_fn = fn.replace('paper.url', 'institutions.json')
     if domain in domains:
       domains[domain] += 1
     else:
       domains[domain] = 1
-  domain_list = sorted(domains.items(), key=operator.itemgetter(1))
-  print(domain_list)
+    if os.path.exists(institutions_fn):
+      continue
+    if 'ieee' in domain:
+      affiliations = load_ieee(paper_id, doi_fn)
+      for affiliation in affiliations:
+        if affiliation in institutions:
+          institutions[affiliation] += 1
+        else:
+          institutions[affiliation] = 1
+  domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
+  # for domain, count in domain_list:
+  #   print('{}\t{}'.format(count, domain))
+  institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))
+  # for institution, count in institution_list:
+  #   print('{}\t{}'.format(count, institution))
+  display_institution_list = []
+  raw_institution_list = []
+  for inst in institution_list:
+    raw_institution_list.append(inst)
+    if inst[0] in geocode_lookup:
+      display_institution_list.append((BoldLine(inst[0]), inst[1],))
+      continue
+    inst_parts = inst[0].split(',')
+    if inst_parts[0] in geocode_lookup:
+      display_institution_list.append((BoldLine(inst[0]), inst[1],))
+    elif len(inst_parts) > 1 and inst_parts[1] in geocode_lookup:
+      display_institution_list.append((BoldLine(inst[0]), inst[1],))
+    else:
+      display_institution_list.append(inst)
+  write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
+  write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
+  write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)
+
+  # print(domain_list)
   # rows.append(data['first_pages'])
   # if data['institutions']:
   #   for institution in data['institutions']:
@@ -47,20 +82,33 @@ def dedupe(a):
   ss = sorted(p.keys())
   return ss
 
-def process_paper(fn):
-  paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+def load_ieee(paper_id, fn):
   paper = load_paper(paper_id)
   if paper is None:
     print("{} no paper found!".format(paper_id))
-    return None
+    return []
   with open(fn, 'r') as f:
-    lines = []
-    emails = []
-    institutions = []
     authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
-    journal = paper.journal.lower()
-    found_authors = []
-    for line in f.readlines():
+    try:
+      data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
+      data = json.loads(data)
+      write_json(fn.replace('paper.doi', 'ieee.json'), data)
+      # print(data)
+    except:
+      print('could not read data')
+      return []
+    affiliations = [ author['affiliation'] for author in data['authors'] ]
+    # print(affiliations)
+    return affiliations
+
+def load_geocode_lookup():
+  insts = read_csv('reports/institutions_found.csv', keys=None)
+  lookup = {}
+  for inst in insts:
+    # print(inst)
+    lookup[inst[0]] = True
+    lookup[inst[3]] = True
+  return lookup
 
 class NameLine(object):
   def __init__(self, s):
@@ -80,8 +128,5 @@ def find_authors(authors, line):
       return a
   return None
 
-def paper_path(paper_id):
-  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
 if __name__ == '__main__':
   doi_report()
```
