diff options
Diffstat (limited to 's2-pdf-report.py')
| -rw-r--r-- | s2-pdf-report.py | 102 |
1 files changed, 0 insertions, 102 deletions
# Reconstructed from a cgit diff view that records the deletion of
# s2-pdf-report.py (blob d3e117f1). Diff markers were stripped and the
# collapsed whitespace re-indented to restore runnable source.
import re
import os
import gzip
import glob
import json
import click
import math
import string
# import nltk
from collections import Counter
from util import *

PDF_DIR = 'datasets/s2/pdf'
punctuation = re.compile(r'[-.?!,":;()|0-9]')

# Compiled once at import time instead of once per heading.
_NON_LETTERS = re.compile(r'[^a-zA-Z]+')
_WHITESPACE = re.compile(r'\s+')


def _count_unknown_ngrams(headings, unknown_terms, unknown_bigrams, unknown_trigrams):
    """Tally words, bigrams and trigrams from *headings* into the counters.

    Each heading is lower-cased and every run of non-letters is collapsed
    to a single space before splitting. N-grams never span two headings.

    Args:
        headings: iterable of heading strings.
        unknown_terms: Counter receiving terms longer than two characters.
        unknown_bigrams: Counter receiving space-joined pairs of terms.
        unknown_trigrams: Counter receiving space-joined triples of terms.
    """
    for heading in headings:
        text = _NON_LETTERS.sub(' ', heading.lower().strip())
        text = _WHITESPACE.sub(' ', text)
        last_term = None
        penultimate_term = None
        for term in text.strip().split(' '):
            # Ignore empty/one-letter tokens and the literal 'cid'
            # (presumably PDF-extraction residue -- TODO confirm).
            if len(term) <= 1 or term == 'cid':
                continue
            # Two-letter terms only take part in bigrams/trigrams.
            if len(term) > 2:
                unknown_terms[term] += 1
            if last_term:
                unknown_bigrams[last_term + ' ' + term] += 1
            if penultimate_term:
                unknown_trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1
            penultimate_term = last_term
            last_term = term


def _print_share(label, count, total):
    """Print *count* for *label* together with its percentage of *total*."""
    print("{} {} ({}%)".format(label, count, percent(count, total)))


@click.command()
def s2_pdf_report():
    """Match the headings of every extracted PDF text against the address book.

    Walks every ``.txt`` file below ``PDF_DIR``, looks each heading up in
    an ``AddressBook`` and writes HTML/CSV reports under ``reports/``:
    the papers that matched, the papers that did not, and the most common
    terms, bigrams and trigrams found in the unmatched papers.
    """
    empty_papers = []
    no_separator_papers = []
    geocoded_papers = []
    unknown_papers = []
    unknown_terms = Counter()
    unknown_bigrams = Counter()
    unknown_trigrams = Counter()
    found_count = 0
    total_count = 0
    addresses = AddressBook()

    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        # Path below PDF_DIR is '/<prefix>/<paper_id>/...', so after the
        # split index 0 is '' and index 2 is the paper id.
        paper_id = fn.replace(PDF_DIR, '').split('/')[2]
        paper = load_paper(paper_id)
        total_count += 1
        headings, found_abstract = read_headings(fn, paper)
        heading_string = '\n'.join(headings[0:20])

        if not found_abstract:
            if not headings:
                empty_papers.append(paper.record())
                continue
            if len(headings) > 20:
                # Recorded for the report, but the paper is still processed.
                no_separator_papers.append(paper.record())

        found_addresses = []
        for heading in headings:
            address = addresses.find(heading.lower().strip())
            if address:
                found_addresses.append(address)

        # TODO: maybe also try matching the whole heading_string against
        # the address book when no individual heading matched.

        if found_addresses:
            found_count += 1
            for address in found_addresses:
                # NOTE(review): list concatenation assumes find() returns
                # a list -- confirm against AddressBook.
                geocoded_papers.append([paper.paper_id, paper.title] + address)
        else:
            # Decide on the whole paper (found_addresses), not on the
            # leftover `address` loop variable, which only reflects the
            # last heading and is unbound/stale when there are none.
            _count_unknown_ngrams(headings, unknown_terms, unknown_bigrams, unknown_trigrams)
            unknown_papers.append([paper.paper_id, paper.title, heading_string])

    write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000))
    write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000))
    write_report('reports/pdf_unknown_trigram.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000))
    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)

    _print_share('empty', len(empty_papers), total_count)
    _print_share('no separator', len(no_separator_papers), total_count)
    _print_share('found', found_count, total_count)
    _print_share('unknown', len(unknown_papers), total_count)
    print("{} {} entities".format('geocoded', len(geocoded_papers)))


def percent(a, b):
    """Return ``a`` as a rounded whole-number percentage of ``b``.

    Returns 0 when ``b`` is zero (e.g. no input files were found) instead
    of raising ZeroDivisionError.
    """
    if not b:
        return 0
    return round(100 * a / b)


def paper_path(paper_id):
    """Return the directory for *paper_id*: PDF_DIR/<first two chars>/<paper_id>."""
    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_pdf_report()
