diff options
Diffstat (limited to 's2-pdf-report.py')
| -rw-r--r-- | s2-pdf-report.py | 138 |
1 files changed, 122 insertions, 16 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py index b22d44d5..4475f3a9 100644 --- a/s2-pdf-report.py +++ b/s2-pdf-report.py @@ -4,12 +4,129 @@ import gzip import glob import json import click +import math +import string from util import * PDF_DIR = 'datasets/s2/pdf' -@click.command() -def pdf_report_first_pages(): +@click.group() +def s2_pdf_report(): + pass + +@s2_pdf_report.command() +def report_geocoded_papers(): + rows = [] + empty_papers = [] + no_separator_papers = [] + geocoded_papers = [] + unknown_papers = [] + found_count = 0 + total_count = 0 + addresses = AddressBook() + for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): + paper_id = fn.replace(PDF_DIR, '').split('/')[2] + paper = load_paper(paper_id) + total_count += 1 + # print(paper_id) + headings, found_abstract = read_headings(fn, paper) + heading_string = '\n'.join(headings[0:20]) + found_addresses = [] + if not found_abstract: + if len(headings) == 0: + empty_papers.append(paper.record()) + continue + if len(headings) > 20: + no_separator_papers.append(paper.record()) + # continue + for heading in headings: + l = heading.lower().strip() + address = addresses.find(l) + if address: + found_addresses.append(address) + + # if not len(found_addresses): + # l = heading_string.lower().strip() + # address = addresses.find(l) + # if address: + # found_addresses.append(address) + + if len(found_addresses): + found_count += 1 + for address in found_addresses: + geocoded_papers.append([paper.paper_id, paper.title] + address) + else: + unknown_papers.append([paper.paper_id, paper.title, heading_string]) + write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers) + write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers) + write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers) + write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers) + print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count))) + print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count))) + print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count))) + print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count))) + print("{} {} entities".format('geocoded', len(geocoded_papers))) + +def percent(a,b): + return round(100 * a / b) + +def read_headings(fn, paper): + headings = [] + found_abstract = False + found_authors = [] + journal = paper.journal.lower() + authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] + with open(fn, 'r') as f: + for line in f.readlines(): + line = re.sub(r"\S*@\S*\s?", '', line) + l = line.lower().strip() + if len(l) < 5: + continue + if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4': + line = line[1:] + line = line.strip("∗†‡") + line = line.replace("fl", "fl").replace('ff', 'ff').replace('ffi', 'ffi').replace('ffl', 'ffl') + line = line.strip() + if 'abstract' in l: + found_abstract = True + break + if journal and journal in l: + continue + names = [s.strip() for s in re.split(',| and ', l)] + was_found = False + for name in names: + found = find_authors(authors, name) + if found: + was_found = True + # print("found {}".format(found[1])) + if found[0]: + found_authors.append(found) + continue + headings.append(line.strip()) + return headings, found_abstract + +class AddressBook (object): + def __init__(self): + lookup = {} + data = read_csv('reports/all_institutions_sorted.csv', keys=None) + for index, line in enumerate(data): + lookup[line[1].lower().strip()] = index + self.data = data + self.lookup = lookup + def find(self, address): + address = address.lower().strip().strip(string.digits) + if address in self.lookup: + index = self.lookup[address] + return self.data[index] + for part in address.split(','): + part = part.strip().replace(' ', ' ') + if part in self.lookup: + index = self.lookup[part] + return self.data[index] + return None + +@s2_pdf_report.command() +def report_first_pages(): rows = [] institution_names = [] institutions = [] @@ -102,6 +219,7 @@ def process_paper(fn): lines.append(BoldLine(inst)) continue lines.append(line) + write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions }) return { 'first_pages': [ paper_id, @@ -116,18 +234,6 @@ def process_paper(fn): ], } -class NameLine(object): - def __init__(self, s): - self.s = s.strip() - def __str__(self): - return '<span class="name">' + self.s + '</span>' - -class BoldLine(object): - def __init__(self, s): - self.s = s.strip() - def __str__(self): - return '<b>' + self.s + '</b>' - def find_authors(authors, line): for a in authors: if a[2] in line: @@ -135,7 +241,7 @@ def find_authors(authors, line): return None def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) + return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id) if __name__ == '__main__': - pdf_report_first_pages() + s2_pdf_report() |
