moving stuff

author: Jules Laplace <julescarbon@gmail.com> 2018-11-25 22:19:15 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-25 22:19:15 +0100
commit: ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree: 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-pdf-first-pages.py
parent: 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
1 files changed, 0 insertions, 133 deletions
diff --git a/s2-pdf-first-pages.py b/s2-pdf-first-pages.py
deleted file mode 100644
index c8a34af4..00000000
--- a/s2-pdf-first-pages.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import re
-import os
-import gzip
-import glob
-import json
-import click
-import math
-import string
-from util import *
-
-PDF_DIR = 'datasets/s2/pdf'
-
-@click.command()
-def report_first_pages():
-  rows = []
-  institution_names = []
-  institutions = []
-  no_institutions = []
-  for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-    data = process_paper(fn)
-    rows.append(data['first_pages'])
-    if data['institutions']:
-      for institution in data['institutions']:
-        institutions.append(institution)
-        institution_names.append(institution[1])
-    if data['no_institutions']:
-      no_institutions.append(data['no_institutions'])
-  deduped_institutions = dedupe(institution_names)
-
-  write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
-  write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
-  write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
-  write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
-  print("{} deduped institutions".format(len(deduped_institutions)))
-
-def dedupe(a):
-  p = {}
-  for s in a:
-    p[s] = None
-  ss = sorted(p.keys())
-  return ss
-
-def process_paper(fn):
-  paper_id = fn.replace(PDF_DIR, '').split('/')[2]
-  paper = load_paper(paper_id)
-  if paper is None:
-    print("{} no paper found!".format(paper_id))
-    return None
-  with open(fn, 'r') as f:
-    lines = []
-    emails = []
-    institutions = []
-    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
-    journal = paper.journal.lower()
-    found_authors = []
-    for line in f.readlines():
-      l = line.lower()
-      if 'abstract' in l:
-        break
-      if len(line) < 3:
-        continue
-      if journal and journal in l:
-        continue
-      if '@' in line:
-        # print('email {}'.format(line))
-        emails.append(line)
-        continue
-      names = [s.strip() for s in re.split(',| and ', l)]
-      was_found = False
-      for name in names:
-        found = find_authors(authors, name)
-        if found:
-          was_found = True
-          # print("found {}".format(found[1]))
-          if found[0]:
-            found_authors.append(found)
-      if was_found:
-        # lines.append(NameLine(line))
-        continue
-
-      if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
-        inst = re.sub(r'^[\W\d]+', '', line)
-        inst = re.sub(r'[\W\d]+$', '', inst)
-        inst = re.sub(r'\s+', ' ', inst)
-        inst = re.sub(r'Dept.', 'Department ', inst)
-        if len(inst) < 160:
-          inst = inst.replace('&', 'and')
-          inst_parts = []
-          department = ''
-          for inst_part in inst.split(','):
-            inst_part = inst_part.strip()
-            inst_low = inst_part.lower()
-            if 'prof' in inst_low:
-              continue
-            if 'article ' in inst_low:
-              continue
-            if 'department' in inst_low:
-              department = inst_part
-            else:
-              inst_parts.append(inst_part)
-          inst = ', '.join(inst_parts)
-          if inst:
-            inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
-            institutions.append([ paper_id, inst, department ])
-        lines.append(BoldLine(inst))
-        continue
-      lines.append(line)
-    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
-    return {
-      'first_pages': [
-        paper_id,
-        lines,
-        found_authors,
-        emails,
-      ],
-      'institutions': None if not len(institutions) else institutions,
-      'no_institutions': None if len(institutions) else [
-        paper_id,
-        lines,
-      ],
-    }
-
-def find_authors(authors, line):
-  for a in authors:
-    if a[2] in line:
-      return a
-  return None
-
-def paper_path(paper_id):
-  return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
-  
-if __name__ == '__main__':
-  report_first_pages()
author	Jules Laplace <julescarbon@gmail.com>	2018-11-25 22:19:15 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-25 22:19:15 +0100
commit	ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree	41372528e78d4328bc2a47bbbabac7e809c58894 /s2-pdf-first-pages.py
parent	255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)