import re
import os
import gzip
import glob
import simplejson as json
import click
import math
import string
# import nltk
from collections import Counter
from util import *

PDF_DIR = 'datasets/s2/pdf'
punctuation = re.compile(r'[-.?!,":;()|0-9]')

# Heading-cleanup patterns, compiled once instead of on every heading.
_NON_LETTERS = re.compile(r'[^a-zA-Z]+')
_WHITESPACE = re.compile(r'\s+')

# Number of leading headings kept for the "unknown papers" report; a paper
# with more headings than this and no abstract is listed as "no separator".
_MAX_HEADINGS = 20


def _count_unknown_ngrams(headings, terms, bigrams, trigrams):
    """Tally words, bigrams and trigrams from *headings* into the counters.

    Each heading is lower-cased and reduced to letters separated by single
    spaces.  Tokens of length <= 1 and the token 'cid' are skipped entirely
    (presumably 'cid' is a PDF text-extraction artifact -- TODO confirm).
    Single terms are only counted when longer than two characters, but
    two-character tokens still take part in bigrams and trigrams.
    N-grams never span two headings.
    """
    for heading in headings:
        cleaned = _NON_LETTERS.sub(' ', heading.lower().strip())
        cleaned = _WHITESPACE.sub(' ', cleaned)
        last_term = None
        penultimate_term = None
        for term in cleaned.strip().split(' '):
            if len(term) <= 1 or term == 'cid':
                continue
            if len(term) > 2:
                terms[term] += 1
            if last_term:
                bigrams[last_term + ' ' + term] += 1
            if penultimate_term:
                trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1
            penultimate_term = last_term
            last_term = term


@click.command()
def s2_pdf_report():
    """Match the headings of every extracted PDF text against the address book.

    Walks all ``*.txt`` files below ``PDF_DIR``, looks each heading up in
    ``AddressBook`` and writes HTML/CSV reports of geocoded papers, papers
    with no match, and the most common unmatched terms, bigrams and trigrams.
    Summary counts are printed to stdout.
    """
    empty_papers = []
    no_separator_papers = []
    geocoded_papers = []
    unknown_papers = []
    unknown_terms = Counter()
    unknown_bigrams = Counter()
    unknown_trigrams = Counter()
    found_count = 0
    total_count = 0
    addresses = AddressBook()

    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        # The path below PDF_DIR looks like '/<prefix>/<paper_id>/<file>',
        # so the paper id is the third '/'-separated component.
        paper_id = fn.replace(PDF_DIR, '').split('/')[2]
        paper = load_paper(paper_id)
        if paper is None or paper.data is None:
            continue
        total_count += 1
        # print(paper_id)

        headings, found_abstract = read_headings(fn, paper)
        heading_string = '\n'.join(headings[0:_MAX_HEADINGS])

        if not found_abstract:
            if not headings:
                empty_papers.append(paper.record())
                continue
            if len(headings) > _MAX_HEADINGS:
                # Recorded, but still processed below.
                no_separator_papers.append(paper.record())
                # continue

        found_addresses = []
        for heading in headings:
            address = addresses.find(heading.lower().strip())
            if address:
                found_addresses.append(address)

        # Only mine unknown terms when *no* heading matched.  The original
        # tested the leftover loop variable `address`, i.e. only the last
        # heading's result (and an unbound name when `headings` was empty).
        if not found_addresses:
            _count_unknown_ngrams(
                headings, unknown_terms, unknown_bigrams, unknown_trigrams)

        # MAYBE try checking the entire string against everything?
        # if not len(found_addresses):
        #     l = heading_string.lower().strip()
        #     address = addresses.find(l)
        #     if address:
        #         found_addresses.append(address)

        if found_addresses:
            found_count += 1
            for address in found_addresses:
                geocoded_papers.append([paper.paper_id, paper.title] + address)
        else:
            unknown_papers.append([paper.paper_id, paper.title, heading_string])

    write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000))
    write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000))
    write_report('reports/pdf_unknown_trigram.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000))
    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)

    print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
    print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
    print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count)))
    print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
    print("{} {} entities".format('geocoded', len(geocoded_papers)))


def percent(a, b):
    """Return *a* as a rounded whole-number percentage of *b*.

    Returns 0 when *b* is zero so that a run that processed no papers still
    prints its summary instead of raising ZeroDivisionError.
    """
    if not b:
        return 0
    return round(100 * a / b)


def paper_path(paper_id):
    """Return the directory for *paper_id*: PDF_DIR/<first two chars>/<id>."""
    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_pdf_report()