path: root/s2-pdf-report.py
author    Jules Laplace <julescarbon@gmail.com>    2018-11-09 02:52:17 +0100
committer Jules Laplace <julescarbon@gmail.com>    2018-11-09 02:52:17 +0100
commit    ca626447b49c55f40ef58d97ee7ff1784f3481b0 (patch)
tree      bc442fdeeaec70bad6286a03b5ae96738e716428 /s2-pdf-report.py
parent    2fd066e9c3cb0e45d7a055d090084f941a40fadb (diff)
arcs on dark maps
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r--  s2-pdf-report.py | 136
1 file changed, 2 insertions(+), 134 deletions(-)
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index 4475f3a9..d659ed15 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -10,12 +10,8 @@ from util import *
 
 PDF_DIR = 'datasets/s2/pdf'
 
-@click.group()
+@click.command()
 def s2_pdf_report():
-    pass
-
-@s2_pdf_report.command()
-def report_geocoded_papers():
     rows = []
     empty_papers = []
     no_separator_papers = []
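
The hunk above flattens the CLI: the old Click group with a lone report_geocoded_papers subcommand becomes a single top-level command. A minimal sketch of the before/after invocation styles, assuming only that Click is installed (the names mirror the diff; nothing below is taken from the repo itself):

    import click

    # Before: a group dispatches to named subcommands, so the report ran as
    #   python s2-pdf-report.py report-geocoded-papers
    @click.group()
    def s2_pdf_report():
        pass

    @s2_pdf_report.command()
    def report_geocoded_papers():
        print('geocoded-papers report')

    # After: decorating the function with @click.command() instead makes it
    # the whole program, so the report runs with no subcommand:
    #   python s2-pdf-report.py

    if __name__ == '__main__':
        s2_pdf_report()
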
@@ -45,6 +41,7 @@
         if address:
             found_addresses.append(address)
+        # MAYBE try checking the entire string against everything?
         # if not len(found_addresses):
         #     l = heading_string.lower().strip()
         #     address = addresses.find(l)
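
The added MAYBE note points at the commented-out fallback directly below it: when no per-part match was found, try the whole heading against the address book. Spelled out as a hypothetical sketch (addresses, heading_string, and found_addresses are assumed from the elided surrounding code; AddressBook.find is the method removed later in this diff):

    # Assumed names: `addresses` is an AddressBook, `heading_string` the
    # joined heading text, `found_addresses` the list built above.
    if not len(found_addresses):
        l = heading_string.lower().strip()
        address = addresses.find(l)  # whole string, not its comma parts
        if address:
            found_addresses.append(address)
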
@@ -105,135 +102,6 @@ def read_headings(fn, paper):
         headings.append(line.strip())
     return headings, found_abstract
 
-class AddressBook (object):
-    def __init__(self):
-        lookup = {}
-        data = read_csv('reports/all_institutions_sorted.csv', keys=None)
-        for index, line in enumerate(data):
-            lookup[line[1].lower().strip()] = index
-        self.data = data
-        self.lookup = lookup
-    def find(self, address):
-        address = address.lower().strip().strip(string.digits)
-        if address in self.lookup:
-            index = self.lookup[address]
-            return self.data[index]
-        for part in address.split(','):
-            part = part.strip().replace('  ', ' ')
-            if part in self.lookup:
-                index = self.lookup[part]
-                return self.data[index]
-        return None
-
-@s2_pdf_report.command()
-def report_first_pages():
-    rows = []
-    institution_names = []
-    institutions = []
-    no_institutions = []
-    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-        data = process_paper(fn)
-        rows.append(data['first_pages'])
-        if data['institutions']:
-            for institution in data['institutions']:
-                institutions.append(institution)
-                institution_names.append(institution[1])
-        if data['no_institutions']:
-            no_institutions.append(data['no_institutions'])
-    deduped_institutions = dedupe(institution_names)
-
-    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
-    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
-    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
-    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
-    print("{} deduped institutions".format(len(deduped_institutions)))
-
-def dedupe(a):
-    p = {}
-    for s in a:
-        p[s] = None
-    ss = sorted(p.keys())
-    return ss
-
-def process_paper(fn):
-    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
-    paper = load_paper(paper_id)
-    if paper is None:
-        print("{} no paper found!".format(paper_id))
-        return None
-    with open(fn, 'r') as f:
-        lines = []
-        emails = []
-        institutions = []
-        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
-        journal = paper.journal.lower()
-        found_authors = []
-        for line in f.readlines():
-            l = line.lower()
-            if 'abstract' in l:
-                break
-            if len(line) < 3:
-                continue
-            if journal and journal in l:
-                continue
-            if '@' in line:
-                # print('email {}'.format(line))
-                emails.append(line)
-                continue
-            names = [s.strip() for s in re.split(',| and ', l)]
-            was_found = False
-            for name in names:
-                found = find_authors(authors, name)
-                if found:
-                    was_found = True
-                    # print("found {}".format(found[1]))
-                    if found[0]:
-                        found_authors.append(found)
-            if was_found:
-                # lines.append(NameLine(line))
-                continue
-            if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
-                inst = re.sub(r'^[\W\d]+', '', line)
-                inst = re.sub(r'[\W\d]+$', '', inst)
-                inst = re.sub(r'\s+', ' ', inst)
-                inst = re.sub(r'Dept.', 'Department ', inst)
-                if len(inst) < 160:
-                    inst = inst.replace('&', 'and')
-                    inst_parts = []
-                    department = ''
-                    for inst_part in inst.split(','):
-                        inst_part = inst_part.strip()
-                        inst_low = inst_part.lower()
-                        if 'prof' in inst_low:
-                            continue
-                        if 'article ' in inst_low:
-                            continue
-                        if 'department' in inst_low:
-                            department = inst_part
-                        else:
-                            inst_parts.append(inst_part)
-                    inst = ', '.join(inst_parts)
-                if inst:
-                    inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
-                    institutions.append([ paper_id, inst, department ])
-                    lines.append(BoldLine(inst))
-                continue
-            lines.append(line)
-    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
-    return {
-        'first_pages': [
-            paper_id,
-            lines,
-            found_authors,
-            emails,
-        ],
-        'institutions': None if not len(institutions) else institutions,
-        'no_institutions': None if len(institutions) else [
-            paper_id,
-            lines,
-        ],
-    }
-
 def find_authors(authors, line):
     for a in authors:
         if a[2] in line:
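
Most of the removal is the AddressBook lookup plus the report_first_pages pipeline that fed it. For reference, the removed find() matched a normalized affiliation string first as a whole, then comma-part by comma-part, against reports/all_institutions_sorted.csv (whose column 1 held the institution name, per the removed __init__). A hypothetical usage sketch, with a made-up affiliation string:

    book = AddressBook()  # loads reports/all_institutions_sorted.csv once
    # Try the whole string first, then each comma-separated part:
    row = book.find('Department of Informatics, Universiteit van Amsterdam, 1098 XH')
    if row is None:
        print('no institution matched')
    else:
        print(row)  # the matching CSV row from the institutions list
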