# repository web-viewer navigation: summary | refs | log | tree | commit | diff
# path: root/s2-doi-report.py
# blob: 611b63917beb1bd2b3a5eb79058d9cb4fcfa87c0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
import os
import gzip
import glob
import json
import click
import operator
from util import *

# Root directory of the per-paper DOI data: doi_report() globs it recursively
# for *.url files and paper_path() builds per-paper directories beneath it.
DOI_DIR = 'datasets/s2/doi'

@click.command()
def doi_report():
  """Build the DOI domain report and the IEEE institution reports.

  Walks every ``*.url`` file under DOI_DIR, counts how many entries resolve
  to each domain and, for domains containing 'ieee', counts the author
  affiliations returned by load_ieee().  Output files:

  * reports/doi_domains.html -- (domain, count) rows, highest count first.
  * reports/doi_institutions.html -- (affiliation, count) rows, highest
    count first; affiliations known to the AddressBook are wrapped in
    BoldLine.
  * reports/doi_institutions_geocoded.csv -- one row per IEEE paper with at
    least one affiliation found in the AddressBook: paper id, title, then
    the address fields.
  * reports/doi_institutions_unknown.csv -- one row per affiliation not in
    the AddressBook, attributed to the first paper it was seen on.
  * reports/doi_institutions_unattributed.csv -- IEEE papers for which no
    non-empty affiliation was extracted.
  """
  domains = {}
  institutions = {}
  addresses = AddressBook()
  address_cache = {}

  geocoded_papers = []
  unknown_papers = []
  unattributed_papers = []

  def find_address(affiliation):
    # Memoise AddressBook lookups: the same affiliation recurs across many
    # papers and is looked up again when building the display list.
    if affiliation not in address_cache:
      address_cache[affiliation] = addresses.find(affiliation)
    return address_cache[affiliation]

  for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
    url_info = read_json(fn)
    domain = url_info['domain']
    paper_id = url_info['paper_id']
    domains[domain] = domains.get(domain, 0) + 1

    # Only IEEE pages are parsed for affiliations; other domains are just
    # counted, so there is no need to load their papers.
    if 'ieee' not in domain:
      continue

    paper = load_paper(paper_id)
    if paper is None:
      # Without this guard the paper.paper_id accesses below would raise
      # AttributeError and abort the whole report.
      print("{} no paper found!".format(paper_id))
      continue

    # Swap only the extension; str.replace would also rewrite a '.url'
    # occurring earlier in the path.
    doi_fn = os.path.splitext(fn)[0] + '.doi'
    address = None
    paper_affiliation_count = 0
    for affiliation in load_ieee(paper_id, doi_fn):
      if not affiliation:
        continue
      paper_affiliation_count += 1
      first_sighting = affiliation not in institutions
      institutions[affiliation] = institutions.get(affiliation, 0) + 1
      # Look the address up for every paper, not only the first one that
      # mentions the affiliation; otherwise later papers sharing a known
      # affiliation end up in none of the CSV outputs.
      found = find_address(affiliation)
      if found:
        # Remember a hit so that a later unknown affiliation on the same
        # paper cannot erase it.
        address = found
      elif first_sighting:
        # Report each unknown affiliation once, on the first paper using it.
        unknown_papers.append([paper.paper_id, paper.title, affiliation])

    if paper_affiliation_count == 0:
      unattributed_papers.append([paper.paper_id, paper.title])
    if address:
      geocoded_papers.append([paper.paper_id, paper.title] + address)

  # reversed(sorted(...)) rather than sorted(reverse=True): both give
  # descending counts, but this keeps the existing order among equal counts.
  domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
  institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))

  display_institution_list = []
  for name, count in institution_list:
    if find_address(name):
      display_institution_list.append((BoldLine(name), count,))
    else:
      display_institution_list.append((name, count))

  write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
  write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
  write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
  write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
  write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)

def load_ieee(paper_id, fn):
  """Return the author affiliations embedded in a saved IEEE page.

  The page at ``fn`` is expected to contain a ``global.document.metadata=``
  JavaScript assignment whose value is a JSON object with an ``authors``
  list.  As side effects, the parsed metadata is saved as ``ieee.json`` in
  the same directory as ``fn`` and ``institutions.json`` is written to
  paper_path(paper_id).

  Args:
    paper_id: id handed to load_paper() and paper_path().
    fn: path of the saved page.

  Returns:
    One entry per author, in metadata order: the author's ``affiliation``
    value, or None when the author has none.  An empty list when the paper
    is unknown or the page cannot be read or parsed.
  """
  paper = load_paper(paper_id)
  if paper is None:
    print("{} no paper found!".format(paper_id))
    return []

  # Always write next to the page.  Replacing 'paper.doi' in the path is a
  # no-op when the file has another name, which would overwrite the page
  # itself with the JSON dump.
  ieee_fn = os.path.join(os.path.dirname(fn), 'ieee.json')
  try:
    with open(fn, 'r') as f:
      page = f.read()
    # Take the text between the assignment and the closing </script>, then
    # drop its final character (presumably the trailing ';' of the
    # JavaScript statement) before parsing it as JSON.
    blob = page.split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
    data = json.loads(blob)
    write_json(ieee_fn, data)
  except (OSError, IndexError, ValueError):
    # OSError: page missing/unreadable or ieee.json not writable;
    # IndexError: metadata marker absent; ValueError: undecodable text or
    # invalid JSON.  Unlike a bare except this lets Ctrl-C through.
    print('could not read data')
    return []

  author_list = data.get('authors') or [] if isinstance(data, dict) else []
  # .get() keeps one slot per author even when an entry has no affiliation;
  # callers are expected to skip falsy values.
  affiliations = [ author.get('affiliation') for author in author_list ]
  # NOTE(review): the affiliation is stored twice per row; presumably the
  # second column is meant to be normalised later -- confirm with consumers.
  institutions = [ [ paper_id, name, name ] for name in affiliations ]
  write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
  return affiliations

def find_authors(authors, line):
  """Return the first author whose search key occurs in ``line``.

  Each entry of ``authors`` must be indexable; element 2 is the key tested
  with ``in`` against ``line`` (presumably a lower-cased name -- confirm
  against callers).  Returns None when no entry matches.
  """
  matches = (candidate for candidate in authors if candidate[2] in line)
  return next(matches, None)

def paper_path(paper_id):
  """Return the per-paper directory path under DOI_DIR.

  Paths are sharded by the first two characters of the id:
  ``DOI_DIR/<first two characters>/<paper_id>``.
  """
  shard = paper_id[:2]
  return '{}/{}/{}'.format(DOI_DIR, shard, paper_id)

# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
  doi_report()