diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-09 02:52:17 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-09 02:52:17 +0100 |
| commit | ca626447b49c55f40ef58d97ee7ff1784f3481b0 (patch) | |
| tree | bc442fdeeaec70bad6286a03b5ae96738e716428 /s2-doi-report.py | |
| parent | 2fd066e9c3cb0e45d7a055d090084f941a40fadb (diff) | |
arcs on dark maps
Diffstat (limited to 's2-doi-report.py')
| -rw-r--r-- | s2-doi-report.py | 80 |
1 files changed, 31 insertions, 49 deletions
diff --git a/s2-doi-report.py b/s2-doi-report.py index d2facb92..611b6391 100644 --- a/s2-doi-report.py +++ b/s2-doi-report.py @@ -14,27 +14,45 @@ def doi_report(): rows = [] domains = {} institutions = {} - geocode_lookup = load_geocode_lookup() + # geocode_lookup = load_geocode_lookup() + addresses = AddressBook() + + geocoded_papers = [] + unknown_papers = [] + unattributed_papers = [] for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True): url_info = read_json(fn) domain = url_info['domain'] paper_id = url_info['paper_id'] + paper = load_paper(paper_id) doi_fn = fn.replace('.url', '.doi') - institutions_fn = fn.replace('paper.url', 'institutions.json') + # institutions_fn = fn.replace('paper.url', 'institutions.json') + address = None if domain in domains: domains[domain] += 1 else: domains[domain] = 1 - if os.path.exists(institutions_fn): - continue + # if not os.path.exists(institutions_fn): + # continue + paper_affiliation_count = 0 if 'ieee' in domain: affiliations = load_ieee(paper_id, doi_fn) for affiliation in affiliations: - if affiliation in institutions: - institutions[affiliation] += 1 - else: - institutions[affiliation] = 1 + if affiliation: + paper_affiliation_count += 1 + if affiliation in institutions: + institutions[affiliation] += 1 + else: + institutions[affiliation] = 1 + address = addresses.find(affiliation) + if not address: + unknown_papers.append([paper.paper_id, paper.title, affiliation]) + if paper_affiliation_count == 0: + unattributed_papers.append([paper.paper_id, paper.title]) + if address: + geocoded_papers.append([paper.paper_id, paper.title] + address) + domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1))) # for domain, count in domain_list: # print('{}\t{}'.format(count, domain)) @@ -44,43 +62,16 @@ def doi_report(): display_institution_list = [] raw_institution_list = [] for inst in institution_list: - raw_institution_list.append(inst) - if inst[0] in geocode_lookup: - display_institution_list.append((BoldLine(inst[0]), inst[1],)) - continue - inst_parts = inst[0].split(',') - if inst_parts[0] in geocode_lookup: - display_institution_list.append((BoldLine(inst[0]), inst[1],)) - elif len(inst_parts) > 1 and inst_parts[1] in geocode_lookup: + addr = addresses.find(inst[0]) + if addr: display_institution_list.append((BoldLine(inst[0]), inst[1],)) else: display_institution_list.append(inst) write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list) write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list) - write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list) - - # print(domain_list) - # rows.append(data['first_pages']) - # if data['institutions']: - # for institution in data['institutions']: - # institutions.append(institution) - # institution_names.append(institution[1]) - # if data['no_institutions']: - # no_institutions.append(data['no_institutions']) - # deduped_institutions = dedupe(institution_names) - - # write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows) - # write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1])) - # write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions) - # write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions]) - # print("{} deduped institutions".format(len(deduped_institutions))) - -def dedupe(a): - p = {} - for s in a: - p[s] = None - ss = sorted(p.keys()) - return ss + write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers) + write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers) + write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers) def load_ieee(paper_id, fn): paper = load_paper(paper_id) @@ -103,15 +94,6 @@ def load_ieee(paper_id, fn): write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions }) return affiliations -def load_geocode_lookup(): - insts = read_csv('reports/institutions_found.csv', keys=None) - lookup = {} - for inst in insts: - # print(inst) - lookup[inst[0]] = True - lookup[inst[3]] = True - return lookup - def find_authors(authors, line): for a in authors: if a[2] in line: |
