summary refs log tree commit diff
path: root/s2-doi-report.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2018-11-07 00:04:38 +0100
committer Jules Laplace <julescarbon@gmail.com> 2018-11-07 00:04:38 +0100
commit 7e8161af7bbb6dbfaefeef986299f8fb6d2e0915 (patch)
tree 84a8029410c5a4ccc9cbb0d47feda3a1df70ea4e /s2-doi-report.py
parent 77226327bf7cc228a47d7765cf76f52e7dd799ae (diff)
ieee domain reports
Diffstat (limited to 's2-doi-report.py')
-rw-r--r-- s2-doi-report.py 81
1 files changed, 63 insertions, 18 deletions
diff --git a/s2-doi-report.py b/s2-doi-report.py
index e322b531..8be76192 100644
--- a/s2-doi-report.py
+++ b/s2-doi-report.py
@@ -13,18 +13,53 @@ DOI_DIR = 'datasets/s2/doi'
def doi_report():
rows = []
domains = {}
- institution_names = []
- institutions = []
- no_institutions = []
- for fn in glob.iglob('{}/**/*.url'.format(PDF_DIR), recursive=True):
+ institutions = {}
+ geocode_lookup = load_geocode_lookup()
+
+ for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
url_info = read_json(fn)
domain = url_info['domain']
+ paper_id = url_info['paper_id']
+ doi_fn = fn.replace('.url', '.doi')
+ institutions_fn = fn.replace('paper.url', 'institutions.json')
if domain in domains:
domains[domain] += 1
else:
domains[domain] = 1
- domain_list = sorted(domains.items(), key=operator.itemgetter(1))
- print(domain_list)
+ if os.path.exists(institutions_fn):
+ continue
+ if 'ieee' in domain:
+ affiliations = load_ieee(paper_id, doi_fn)
+ for affiliation in affiliations:
+ if affiliation in institutions:
+ institutions[affiliation] += 1
+ else:
+ institutions[affiliation] = 1
+ domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
+ # for domain, count in domain_list:
+ # print('{}\t{}'.format(count, domain))
+ institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))
+ # for institution, count in institution_list:
+ # print('{}\t{}'.format(count, institution))
+ display_institution_list = []
+ raw_institution_list = []
+ for inst in institution_list:
+ raw_institution_list.append(inst)
+ if inst[0] in geocode_lookup:
+ display_institution_list.append((BoldLine(inst[0]), inst[1],))
+ continue
+ inst_parts = inst[0].split(',')
+ if inst_parts[0] in geocode_lookup:
+ display_institution_list.append((BoldLine(inst[0]), inst[1],))
+ elif len(inst_parts) > 1 and inst_parts[1] in geocode_lookup:
+ display_institution_list.append((BoldLine(inst[0]), inst[1],))
+ else:
+ display_institution_list.append(inst)
+ write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
+ write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
+ write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)
+
+ # print(domain_list)
# rows.append(data['first_pages'])
# if data['institutions']:
# for institution in data['institutions']:
@@ -47,20 +82,33 @@ def dedupe(a):
ss = sorted(p.keys())
return ss
-def process_paper(fn):
- paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+def load_ieee(paper_id, fn):
paper = load_paper(paper_id)
if paper is None:
print("{} no paper found!".format(paper_id))
- return None
+ return []
with open(fn, 'r') as f:
- lines = []
- emails = []
- institutions = []
authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
- journal = paper.journal.lower()
- found_authors = []
- for line in f.readlines():
+ try:
+ data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
+ data = json.loads(data)
+ write_json(fn.replace('paper.doi', 'ieee.json'), data)
+ # print(data)
+ except:
+ print('could not read data')
+ return []
+ affiliations = [ author['affiliation'] for author in data['authors'] ]
+ # print(affiliations)
+ return affiliations
+
+def load_geocode_lookup():
+ insts = read_csv('reports/institutions_found.csv', keys=None)
+ lookup = {}
+ for inst in insts:
+ # print(inst)
+ lookup[inst[0]] = True
+ lookup[inst[3]] = True
+ return lookup
class NameLine(object):
def __init__(self, s):
@@ -80,8 +128,5 @@ def find_authors(authors, line):
return a
return None
-def paper_path(paper_id):
- return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
if __name__ == '__main__':
doi_report()