summary | refs | log | tree | commit | diff
path: root/s2-pdf-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r-- s2-pdf-report.py 30
1 files changed, 29 insertions, 1 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index c47579a6..d3e117f1 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -6,17 +6,23 @@ import json
import click
import math
import string
+# import nltk
+from collections import Counter
from util import *
PDF_DIR = 'datasets/s2/pdf'
+punctuation = re.compile(r'[-.?!,":;()|0-9]')
@click.command()
def s2_pdf_report():
rows = []
empty_papers = []
no_separator_papers = []
- geocoded_papers = []
+ geocoded_papers = []
unknown_papers = []
+ unknown_terms = Counter()
+ unknown_bigrams = Counter()
+ unknown_trigrams = Counter()
found_count = 0
total_count = 0
addresses = AddressBook()
@@ -40,6 +46,24 @@ def s2_pdf_report():
address = addresses.find(l)
if address:
found_addresses.append(address)
+ if not address:
+ for heading in headings:
+ l = heading.lower().strip()
+ l = re.sub('[^a-zA-Z]+', ' ', l)
+ l = re.sub('\s+', ' ', l)
+ terms = l.strip().split(' ')
+ last_term = None
+ penultimate_term = None
+ for term in terms:
+ if len(term) > 1 and term != 'cid':
+ if len(term) > 2:
+ unknown_terms[term] += 1
+ if last_term:
+ unknown_bigrams[last_term + ' ' + term] += 1
+ if penultimate_term:
+ unknown_trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1
+ penultimate_term = last_term
+ last_term = term
# MAYBE try checking the entire string against everything?
# if not len(found_addresses):
@@ -54,6 +78,10 @@ def s2_pdf_report():
geocoded_papers.append([paper.paper_id, paper.title] + address)
else:
unknown_papers.append([paper.paper_id, paper.title, heading_string])
+
+ write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000))
+ write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000))
+ write_report('reports/pdf_unknown_trigram.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000))
write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)