diff options
Diffstat (limited to 's2-pdf-report.py')
| -rw-r--r-- | s2-pdf-report.py | 30 |
1 file changed, 29 insertions, 1 deletion
diff --git a/s2-pdf-report.py b/s2-pdf-report.py index c47579a6..d3e117f1 100644 --- a/s2-pdf-report.py +++ b/s2-pdf-report.py @@ -6,17 +6,23 @@ import json import click import math import string +# import nltk +from collections import Counter from util import * PDF_DIR = 'datasets/s2/pdf' +punctuation = re.compile(r'[-.?!,":;()|0-9]') @click.command() def s2_pdf_report(): rows = [] empty_papers = [] no_separator_papers = [] - geocoded_papers = [] + geocoded_papers = [] unknown_papers = [] + unknown_terms = Counter() + unknown_bigrams = Counter() + unknown_trigrams = Counter() found_count = 0 total_count = 0 addresses = AddressBook() @@ -40,6 +46,24 @@ def s2_pdf_report(): address = addresses.find(l) if address: found_addresses.append(address) + if not address: + for heading in headings: + l = heading.lower().strip() + l = re.sub('[^a-zA-Z]+', ' ', l) + l = re.sub('\s+', ' ', l) + terms = l.strip().split(' ') + last_term = None + penultimate_term = None + for term in terms: + if len(term) > 1 and term != 'cid': + if len(term) > 2: + unknown_terms[term] += 1 + if last_term: + unknown_bigrams[last_term + ' ' + term] += 1 + if penultimate_term: + unknown_trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1 + penultimate_term = last_term + last_term = term # MAYBE try checking the entire string against everything? 
# if not len(found_addresses): @@ -54,6 +78,10 @@ def s2_pdf_report(): geocoded_papers.append([paper.paper_id, paper.title] + address) else: unknown_papers.append([paper.paper_id, paper.title, heading_string]) + + write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000)) + write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000)) + write_report('reports/pdf_unknown_trigram.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000)) write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers) write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers) write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers) |
