Diffstat (limited to 'scraper/s2-pdf-report.py')
 scraper/s2-pdf-report.py | 102 +++
 1 file changed, 102 insertions(+), 0 deletions(-)
diff --git a/scraper/s2-pdf-report.py b/scraper/s2-pdf-report.py
new file mode 100644
index 00000000..cdb340f5
--- /dev/null
+++ b/scraper/s2-pdf-report.py
@@ -0,0 +1,102 @@
+import re
+import glob
+import click
+from collections import Counter
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'
+
+@click.command()
+def s2_pdf_report():
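+    """Scan extracted PDF text under PDF_DIR, try to geocode heading lines
+    via AddressBook, and write HTML/CSV reports plus summary counts."""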
+ empty_papers = []
+ no_separator_papers = []
+ geocoded_papers = []
+ unknown_papers = []
+ unknown_terms = Counter()
+ unknown_bigrams = Counter()
+ unknown_trigrams = Counter()
+ found_count = 0
+ total_count = 0
+ addresses = AddressBook()
+ for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
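+        # Files live at datasets/s2/pdf/<first 2 chars of id>/<paper_id>/...,
+        # so the paper id is the second path component after PDF_DIR.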
+ paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+ paper = load_paper(paper_id)
+ total_count += 1
+ # print(paper_id)
+ headings, found_abstract = read_headings(fn, paper)
+ heading_string = '\n'.join(headings[0:20])
+ found_addresses = []
+ if not found_abstract:
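+            # No abstract marker found: record extraction failures (no
+            # headings at all) and papers where no separator was detected.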
+ if len(headings) == 0:
+ empty_papers.append(paper.record())
+ continue
+ if len(headings) > 20:
+ no_separator_papers.append(paper.record())
+ # continue
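+        # Match each heading line against the address book geocoder.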
+ for heading in headings:
+            line = heading.lower().strip()
+            address = addresses.find(line)
+ if address:
+ found_addresses.append(address)
+        if not found_addresses:
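+            # Nothing matched: tally unigram/bigram/trigram frequencies from
+            # the cleaned headings so recurring unmatched terms surface in
+            # the "unknown" reports below.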
+            for heading in headings:
+                line = heading.lower().strip()
+                line = re.sub(r'[^a-zA-Z]+', ' ', line)
+                line = re.sub(r'\s+', ' ', line)
+                terms = line.strip().split(' ')
+ last_term = None
+ penultimate_term = None
+ for term in terms:
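+                    # Skip single characters and 'cid' tokens left behind by
+                    # '(cid:NN)' PDF extraction artifacts.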
+ if len(term) > 1 and term != 'cid':
+ if len(term) > 2:
+ unknown_terms[term] += 1
+ if last_term:
+ unknown_bigrams[last_term + ' ' + term] += 1
+ if penultimate_term:
+ unknown_trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1
+ penultimate_term = last_term
+ last_term = term
+
+ # MAYBE try checking the entire string against everything?
+ # if not len(found_addresses):
+ # l = heading_string.lower().strip()
+ # address = addresses.find(l)
+ # if address:
+ # found_addresses.append(address)
+
+        if found_addresses:
+ found_count += 1
+ for address in found_addresses:
+ geocoded_papers.append([paper.paper_id, paper.title] + address)
+ else:
+ unknown_papers.append([paper.paper_id, paper.title, heading_string])
+
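+    # Write term-frequency reports (top 1000 each) and per-category CSVs.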
+ write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000))
+ write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000))
+    write_report('reports/pdf_unknown_trigrams.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000))
+ write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
+ write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
+ write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
+ write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)
+ print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
+ print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
+ print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count)))
+ print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
+ print("{} {} entities".format('geocoded', len(geocoded_papers)))
+
+def percent(a, b):
+    # Avoid ZeroDivisionError when no papers were scanned.
+    return round(100 * a / b) if b else 0
+
+def paper_path(paper_id):
+ return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+ s2_pdf_report()