diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-citation-report.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-citation-report.py')
| -rw-r--r-- | s2-citation-report.py | 282 |
1 files changed, 0 insertions, 282 deletions
import os
import re
import glob
import json
import math
import operator
import click
from util import *


@click.command()
def s2_citation_report():
    """Build HTML reports summarizing how many citations of each S2 paper
    could be geocoded to a research-institution address.

    Walks every cached paper JSON under datasets/s2/papers/, geocodes the
    paper and its citations, writes per-paper report pages plus two index
    pages, and prints overall geocoding coverage.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}  # citationId -> bool, shared across all papers
    papers = []
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
        # BUG FIX: process_paper() returns None when the paper's cached data
        # is missing; appending None made write_papers_report() crash when
        # sorting on paper_data[key].
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers,
                        'citations_geocoded', reverse=True)

    # Overall tally: every citation seen vs. those that geocoded.
    paper_count = 0
    geocode_count = 0
    for key, value in successful_geocodes.items():
        if value:
            geocode_count += 1
        paper_count += 1
    print("citations: {}".format(paper_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))


def write_papers_report(fn, title, papers, key, reverse=False):
    """Write one HTML summary table of all papers to *fn*, sorted by *key*.

    Each row links to the per-paper report page produced by process_paper().
    """
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
        sorted_papers.append([
            paper['paperId'],
            paper['key'],
            LinkLine(paper['report_link'], paper['title']),
            LinkLine(paper['pdf_link'], '[pdf]'),
            paper['journal'],
            paper['address_type'],
            paper['address'],
            paper['lat'],
            paper['lng'],
            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
            paper['citation_count'],
            paper['citations_geocoded'],
            paper['citations_unknown'],
            paper['citations_empty'],
            paper['citations_pdf'],
            paper['citations_doi'],
        ])
    sorted_paper_keys = [
        'Paper ID',
        'Megapixels Key',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)


def process_paper(fn, addresses, megapixels, success):
    """Geocode one paper and each of its citations; write its report page.

    Reads the S2 paper JSON at *fn*, resolves institution strings through
    *addresses* (an AddressBook), records per-citation geocoding success in
    the shared *success* dict (citationId -> bool), and writes an HTML page
    with a Leaflet map under reports/papers/.

    Returns a summary dict for the index tables, or None when the paper's
    cached data is missing.
    """
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    geocoded_citations = []          # [title, institution] + full address row
    unknown_citations = []           # citations with institutions we couldn't place
    display_geocoded_citations = []  # table rows for the HTML report
    empty_citations = []             # citations with no institutions and no PDF
    pdf_count = 0
    doi_count = 0

    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return None

    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    if res['title'] in megapixels:
        res['key'] = megapixels[res['title']]['Database Name']

    # Geocode the paper itself: the first institution (sorted by name) that
    # resolves in the address book wins.
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)

    if paper_address:
        print(paper_address)
        # Address rows are positional: [name, ?, address, lat, lng, type]
        # — presumably produced by AddressBook; verify against util.
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]

    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        institutions = load_institutions(citationId)
        geocoded_institutions = []
        unknown_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                # NOTE(review): a later hit overwrites an earlier one, so the
                # last geocoded institution wins — kept as-is.
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)

        if not address:
            if has_pdf:
                # Fall back to scanning the extracted PDF text headings for a
                # line the address book recognizes.
                headings, _found_abstract = read_headings(
                    file_path('pdf', citationId, 'paper.txt'), citation)
                for heading in headings:
                    l = heading.lower().strip()
                    if l:
                        next_address = addresses.find(l)
                        if next_address:
                            address = next_address
                            geocoded_institutions.append(heading)
                        else:
                            unknown_institutions.append(heading)
            else:
                empty_citations.append([
                    citationId,
                    citation.title,
                ])

        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address)
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '<br>'.join(unknown_institutions),
            ])

    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    total_citations = len(geocoded_citations) + len(unknown_citations)
    # BUG FIX: guard the coverage percentage against ZeroDivisionError when
    # no citation could be classified at all.
    if total_citations:
        coverage_pct = math.floor(len(geocoded_citations) / total_citations * 100)
    else:
        coverage_pct = 0

    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(paper.title))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(paper.title))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>Journal: {}</li>'.format(paper.journal))
        if paper_address:
            f.write('<li>Research institution: {}</li>'.format(paper_address[0]))
            f.write('<li>Address: {}</li>'.format(paper_address[2]))
            f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
        f.write('<li>Year: {}</li>'.format(paper.year))
        f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, coverage_pct))
        f.write('</ul>')
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write("</body>")
        f.write('<script src="../snap.svg-min.js"></script>')
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script src="../leaflet.arc.js"></script>')
        f.write('<script src="../leaflet.bezier.js"></script>')
        # Embed the geocode data as JSON islands read by ../map.js.
        f.write('<script type="text/json" id="address">')
        json.dump(paper_address, f)
        f.write('</script>')
        f.write('<script type="text/json" id="citations">')
        json.dump(geocoded_citations, f)
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
    return res


def load_megapixels_queries():
    """Load the Megapixels citations CSV as {stripped title: row-dict}."""
    keys, rows = read_csv('datasets/citations-2018310.csv')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        lookup[rec['Title'].strip()] = rec
    return lookup


def load_institutions(paperId):
    """Return the cached institution list for a paper.

    Prefers the PDF-derived extraction over the DOI-derived one; returns []
    when neither cache file exists.
    """
    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    """Dataset directory for a paper, sharded by the id's first two chars."""
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Path of file *fn* inside a paper's dataset directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_citation_report()
