path: root/s2-pdf-report.py
author    Jules Laplace <julescarbon@gmail.com>    2018-11-09 02:52:17 +0100
committer Jules Laplace <julescarbon@gmail.com>    2018-11-09 02:52:17 +0100
commit    ca626447b49c55f40ef58d97ee7ff1784f3481b0 (patch)
tree      bc442fdeeaec70bad6286a03b5ae96738e716428 /s2-pdf-report.py
parent    2fd066e9c3cb0e45d7a055d090084f941a40fadb (diff)
arcs on dark maps
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r--  s2-pdf-report.py | 136
1 file changed, 2 insertions(+), 134 deletions(-)
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index 4475f3a9..d659ed15 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -10,12 +10,8 @@ from util import *
 
 PDF_DIR = 'datasets/s2/pdf'
 
-@click.group()
+@click.command()
 def s2_pdf_report():
-    pass
-
-@s2_pdf_report.command()
-def report_geocoded_papers():
     rows = []
     empty_papers = []
     no_separator_papers = []
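
The hunk above flattens the CLI: the old Click group with a lone report_geocoded_papers subcommand becomes a single top-level command. A minimal sketch of the before/after invocation styles, assuming only that Click is installed (the names mirror the diff; nothing below is taken from the repo itself):

    import click

    # Before: a group dispatches to named subcommands, so the report ran as
    #   python s2-pdf-report.py report-geocoded-papers
    @click.group()
    def s2_pdf_report():
        pass

    @s2_pdf_report.command()
    def report_geocoded_papers():
        print('geocoded-papers report')

    # After: decorating the function with @click.command() instead makes it
    # the whole program, so the report runs with no subcommand:
    #   python s2-pdf-report.py

    if __name__ == '__main__':
        s2_pdf_report()
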
@@ -45,6 +41,7 @@
         if address:
             found_addresses.append(address)
+        # MAYBE try checking the entire string against everything?
         # if not len(found_addresses):
         #     l = heading_string.lower().strip()
         #     address = addresses.find(l)
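
The added MAYBE note points at the commented-out fallback directly below it: when no per-part match was found, try the whole heading against the address book. Spelled out as a hypothetical sketch (addresses, heading_string, and found_addresses are assumed from the elided surrounding code; AddressBook.find is the method removed later in this diff):

    # Assumed names: `addresses` is an AddressBook, `heading_string` the
    # joined heading text, `found_addresses` the list built above.
    if not len(found_addresses):
        l = heading_string.lower().strip()
        address = addresses.find(l)  # whole string, not its comma parts
        if address:
            found_addresses.append(address)
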
@@ -105,135 +102,6 @@ def read_headings(fn, paper):
         headings.append(line.strip())
     return headings, found_abstract
 
-class AddressBook (object):
-    def __init__(self):
-        lookup = {}
-        data = read_csv('reports/all_institutions_sorted.csv', keys=None)
-        for index, line in enumerate(data):
-            lookup[line[1].lower().strip()] = index
-        self.data = data
-        self.lookup = lookup
-    def find(self, address):
-        address = address.lower().strip().strip(string.digits)
-        if address in self.lookup:
-            index = self.lookup[address]
-            return self.data[index]
-        for part in address.split(','):
-            part = part.strip().replace('  ', ' ')
-            if part in self.lookup:
-                index = self.lookup[part]
-                return self.data[index]
-        return None
-
-@s2_pdf_report.command()
-def report_first_pages():
-    rows = []
-    institution_names = []
-    institutions = []
-    no_institutions = []
-    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-        data = process_paper(fn)
-        rows.append(data['first_pages'])
-        if data['institutions']:
-            for institution in data['institutions']:
-                institutions.append(institution)
-                institution_names.append(institution[1])
-        if data['no_institutions']:
-            no_institutions.append(data['no_institutions'])
-    deduped_institutions = dedupe(institution_names)
-
-    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
-    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
-    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
-    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
-    print("{} deduped institutions".format(len(deduped_institutions)))
-
-def dedupe(a):
-    p = {}
-    for s in a:
-        p[s] = None
-    ss = sorted(p.keys())
-    return ss
-
-def process_paper(fn):
-    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
-    paper = load_paper(paper_id)
-    if paper is None:
-        print("{} no paper found!".format(paper_id))
-        return None
-    with open(fn, 'r') as f:
-        lines = []
-        emails = []
-        institutions = []
-        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
-        journal = paper.journal.lower()
-        found_authors = []
-        for line in f.readlines():
-            l = line.lower()
-            if 'abstract' in l:
-                break
-            if len(line) < 3:
-                continue
-            if journal and journal in l:
-                continue
-            if '@' in line:
-                # print('email {}'.format(line))
-                emails.append(line)
-                continue
-            names = [s.strip() for s in re.split(',| and ', l)]
-            was_found = False
-            for name in names:
-                found = find_authors(authors, name)
-                if found:
-                    was_found = True
-                    # print("found {}".format(found[1]))
-                    if found[0]:
-                        found_authors.append(found)
-            if was_found:
-                # lines.append(NameLine(line))
-                continue
-            if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
-                inst = re.sub(r'^[\W\d]+', '', line)
-                inst = re.sub(r'[\W\d]+$', '', inst)
-                inst = re.sub(r'\s+', ' ', inst)
-                inst = re.sub(r'Dept.', 'Department ', inst)
-                if len(inst) < 160:
-                    inst = inst.replace('&', 'and')
-                    inst_parts = []
-                    department = ''
-                    for inst_part in inst.split(','):
-                        inst_part = inst_part.strip()
-                        inst_low = inst_part.lower()
-                        if 'prof' in inst_low:
-                            continue
-                        if 'article ' in inst_low:
-                            continue
-                        if 'department' in inst_low:
-                            department = inst_part
-                        else:
-                            inst_parts.append(inst_part)
-                    inst = ', '.join(inst_parts)
-                if inst:
-                    inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
-                    institutions.append([ paper_id, inst, department ])
-                    lines.append(BoldLine(inst))
-                continue
-            lines.append(line)
-    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
-    return {
-        'first_pages': [
-            paper_id,
-            lines,
-            found_authors,
-            emails,
-        ],
-        'institutions': None if not len(institutions) else institutions,
-        'no_institutions': None if len(institutions) else [
-            paper_id,
-            lines,
-        ],
-    }
-
 def find_authors(authors, line):
     for a in authors:
         if a[2] in line:
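
Most of the removal is the AddressBook lookup plus the report_first_pages pipeline that fed it. For reference, the removed find() matched a normalized affiliation string first as a whole, then comma-part by comma-part, against reports/all_institutions_sorted.csv (whose column 1 held the institution name, per the removed __init__). A hypothetical usage sketch, with a made-up affiliation string:

    book = AddressBook()  # loads reports/all_institutions_sorted.csv once
    # Try the whole string first, then each comma-separated part:
    row = book.find('Department of Informatics, Universiteit van Amsterdam, 1098 XH')
    if row is None:
        print('no institution matched')
    else:
        print(row)  # the matching CSV row from the institutions list
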