Diffstat (limited to 's2-pdf-report.py')
-rw-r--r--  s2-pdf-report.py  138
1 file changed, 122 insertions(+), 16 deletions(-)
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index b22d44d5..4475f3a9 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -4,12 +4,129 @@ import gzip
import glob
import json
import click
+import math
+import string
from util import *
PDF_DIR = 'datasets/s2/pdf'
-@click.command()
-def pdf_report_first_pages():
+@click.group()
+def s2_pdf_report():
+    pass
+
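+# Scan every extracted PDF text file, match its header lines against known
+# institution addresses, and write per-category CSV reports plus summary stats.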
+@s2_pdf_report.command()
+def report_geocoded_papers():
+    rows = []
+    empty_papers = []
+    no_separator_papers = []
+    geocoded_papers = []
+    unknown_papers = []
+    found_count = 0
+    total_count = 0
+    addresses = AddressBook()
+    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
+        paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+        paper = load_paper(paper_id)
+        total_count += 1
+        # print(paper_id)
+        headings, found_abstract = read_headings(fn, paper)
+        heading_string = '\n'.join(headings[0:20])
+        found_addresses = []
+        if not found_abstract:
+            if len(headings) == 0:
+                empty_papers.append(paper.record())
+                continue
+            if len(headings) > 20:
+                no_separator_papers.append(paper.record())
+                # continue
+        for heading in headings:
+            l = heading.lower().strip()
+            address = addresses.find(l)
+            if address:
+                found_addresses.append(address)
+
+        # if not len(found_addresses):
+        #     l = heading_string.lower().strip()
+        #     address = addresses.find(l)
+        #     if address:
+        #         found_addresses.append(address)
+
+        if len(found_addresses):
+            found_count += 1
+            for address in found_addresses:
+                geocoded_papers.append([paper.paper_id, paper.title] + address)
+        else:
+            unknown_papers.append([paper.paper_id, paper.title, heading_string])
+    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
+    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
+    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
+    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)
+    print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
+    print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
+    print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count)))
+    print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
+    print("{} {} entities".format('geocoded', len(geocoded_papers)))
+
+def percent(a, b):
+    return round(100 * a / b)
+
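+# Read candidate header lines from an extracted text file: strip e-mail
+# addresses, footnote markers, and PDF ligatures, skip journal and author-name
+# lines, and stop once the abstract is reached.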
+def read_headings(fn, paper):
+    headings = []
+    found_abstract = False
+    found_authors = []
+    journal = paper.journal.lower()
+    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
+    with open(fn, 'r') as f:
+        for line in f.readlines():
+            line = re.sub(r"\S*@\S*\s?", '', line)
+            l = line.lower().strip()
+            if len(l) < 5:
+                continue
+            if line[0] in ('a', 'b', 'c', '1', '2', '3', '4'):
+                line = line[1:]
+            line = line.strip("∗†‡")
+            line = line.replace("ﬂ", "fl").replace("ﬀ", "ff").replace("ﬃ", "ffi").replace("ﬄ", "ffl")
+            line = line.strip()
+            if 'abstract' in l:
+                found_abstract = True
+                break
+            if journal and journal in l:
+                continue
+            names = [s.strip() for s in re.split(',| and ', l)]
+            was_found = False
+            for name in names:
+                found = find_authors(authors, name)
+                if found:
+                    was_found = True
+                    # print("found {}".format(found[1]))
+                    if found[0]:
+                        found_authors.append(found)
+                    continue
+            headings.append(line.strip())
+    return headings, found_abstract
+
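+# Index of known institutions keyed by normalized name, loaded from
+# reports/all_institutions_sorted.csv; find() returns the matching CSV row.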
+class AddressBook (object):
+    def __init__(self):
+        lookup = {}
+        data = read_csv('reports/all_institutions_sorted.csv', keys=None)
+        for index, line in enumerate(data):
+            lookup[line[1].lower().strip()] = index
+        self.data = data
+        self.lookup = lookup
+    def find(self, address):
+        address = address.lower().strip().strip(string.digits)
+        if address in self.lookup:
+            index = self.lookup[address]
+            return self.data[index]
+        for part in address.split(','):
+            part = part.strip().replace(' ', ' ')
+            if part in self.lookup:
+                index = self.lookup[part]
+                return self.data[index]
+        return None
+
+@s2_pdf_report.command()
+def report_first_pages():
    rows = []
    institution_names = []
    institutions = []
@@ -102,6 +219,7 @@ def process_paper(fn):
            lines.append(BoldLine(inst))
            continue
        lines.append(line)
+    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
    return {
        'first_pages': [
            paper_id,
@@ -116,18 +234,6 @@ def process_paper(fn):
        ],
    }
-class NameLine(object):
-    def __init__(self, s):
-        self.s = s.strip()
-    def __str__(self):
-        return '<span class="name">' + self.s + '</span>'
-
-class BoldLine(object):
-    def __init__(self, s):
-        self.s = s.strip()
-    def __str__(self):
-        return '<b>' + self.s + '</b>'
-
def find_authors(authors, line):
    for a in authors:
        if a[2] in line:
@@ -135,7 +241,7 @@ def find_authors(authors, line):
    return None
def paper_path(paper_id):
-    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
+    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
if __name__ == '__main__':
-    pdf_report_first_pages()
+    s2_pdf_report()