diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /scraper/s2-pdf-report.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 'scraper/s2-pdf-report.py')
| -rw-r--r-- | scraper/s2-pdf-report.py | 102 |
1 files changed, 102 insertions, 0 deletions
# diff --git a/scraper/s2-pdf-report.py b/scraper/s2-pdf-report.py
# new file mode 100644, index 00000000..d3e117f1
"""Report how many extracted PDF text files can be matched to a known address.

Walks every ``*.txt`` file under ``PDF_DIR``, reads the heading lines of each
paper, looks each heading up in the address book, and writes HTML/CSV reports
of matched papers, unmatched papers, and the most common unmatched terms.
"""
import re
import os
import gzip
import glob
import json
import click
import math
import string
# import nltk
from collections import Counter
from util import *

PDF_DIR = 'datasets/s2/pdf'
punctuation = re.compile(r'[-.?!,":;()|0-9]')

# Any run of characters that are not ASCII letters (whitespace included).
_NON_LETTERS = re.compile(r'[^a-zA-Z]+')

# Only the first MAX_HEADINGS headings are kept for the "unknown" report, and a
# paper with more headings than this and no abstract marker is counted as
# having no separator.
MAX_HEADINGS = 20


def _count_unknown_ngrams(headings, unknown_terms, unknown_bigrams, unknown_trigrams):
    """Tally terms, bigrams and trigrams from *headings* into the counters.

    Each heading is lower-cased and every run of non-letters becomes a single
    space. Terms of one character and the literal term ``cid`` are skipped
    entirely (they neither count nor advance the n-gram window). Terms of
    exactly two characters take part in bigrams/trigrams but are not counted
    as standalone terms. The n-gram window is reset for every heading.
    """
    for heading in headings:
        # A single substitution is enough: whitespace is also "non-letter",
        # so no run of consecutive spaces can survive it.
        terms = _NON_LETTERS.sub(' ', heading.lower()).split()
        last_term = None
        penultimate_term = None
        for term in terms:
            if len(term) <= 1 or term == 'cid':
                continue
            if len(term) > 2:
                unknown_terms[term] += 1
            if last_term:
                unknown_bigrams[last_term + ' ' + term] += 1
            if penultimate_term:
                unknown_trigrams[penultimate_term + ' ' + last_term + ' ' + term] += 1
            penultimate_term = last_term
            last_term = term


def _print_stat(label, count, total):
    """Print one ``label count (pct%)`` summary line."""
    print("{} {} ({}%)".format(label, count, percent(count, total)))


@click.command()
def s2_pdf_report():
    """Scan the PDF text dumps and write the address-matching reports.

    For every paper the headings returned by ``read_headings`` are looked up
    in the ``AddressBook``. Papers with at least one hit are written to
    ``geocoded_papers.csv`` (one row per hit); papers with none go to
    ``unknown_papers.csv`` and their heading text feeds the unknown
    term/bigram/trigram reports. Summary counts are printed at the end.
    """
    empty_papers = []
    no_separator_papers = []
    geocoded_papers = []
    unknown_papers = []
    unknown_terms = Counter()
    unknown_bigrams = Counter()
    unknown_trigrams = Counter()
    found_count = 0
    total_count = 0
    addresses = AddressBook()

    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        # Path below PDF_DIR looks like '/<prefix>/<paper_id>/<file>.txt',
        # so index 2 of the split is the paper id.
        paper_id = fn.replace(PDF_DIR, '').split('/')[2]
        paper = load_paper(paper_id)
        total_count += 1
        # print(paper_id)
        headings, found_abstract = read_headings(fn, paper)
        heading_string = '\n'.join(headings[0:MAX_HEADINGS])

        if not found_abstract:
            if len(headings) == 0:
                empty_papers.append(paper.record())
                continue
            if len(headings) > MAX_HEADINGS:
                # Recorded, but deliberately still processed below.
                no_separator_papers.append(paper.record())
                # continue

        found_addresses = []
        for heading in headings:
            address = addresses.find(heading.lower().strip())
            if address:
                found_addresses.append(address)

        # MAYBE try checking the entire string against everything?
        # if not len(found_addresses):
        #     l = heading_string.lower().strip()
        #     address = addresses.find(l)
        #     if address:
        #         found_addresses.append(address)

        if found_addresses:
            found_count += 1
            for address in found_addresses:
                # address is concatenated onto a list, so find() is expected
                # to return a list of columns.
                geocoded_papers.append([paper.paper_id, paper.title] + address)
        else:
            # Decide on the whole paper (found_addresses), not on the lookup
            # result of whichever heading happened to be last.
            _count_unknown_ngrams(headings, unknown_terms, unknown_bigrams, unknown_trigrams)
            unknown_papers.append([paper.paper_id, paper.title, heading_string])

    write_report('reports/pdf_unknown_terms.html', title='PDF Report: Unknown Terms', keys=None, rows=unknown_terms.most_common(1000))
    write_report('reports/pdf_unknown_bigrams.html', title='PDF Report: Unknown Bigrams', keys=None, rows=unknown_bigrams.most_common(1000))
    write_report('reports/pdf_unknown_trigram.html', title='PDF Report: Unknown Trigrams', keys=None, rows=unknown_trigrams.most_common(1000))
    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)

    _print_stat('empty', len(empty_papers), total_count)
    _print_stat('no separator', len(no_separator_papers), total_count)
    _print_stat('found', found_count, total_count)
    _print_stat('unknown', len(unknown_papers), total_count)
    print("{} {} entities".format('geocoded', len(geocoded_papers)))


def percent(a, b):
    """Return ``a`` as a rounded integer percentage of ``b`` (0 when ``b`` is 0)."""
    if not b:
        # No papers scanned: report 0% instead of raising ZeroDivisionError.
        return 0
    return round(100 * a / b)


def paper_path(paper_id):
    """Return the directory for *paper_id*: ``PDF_DIR/<first two chars>/<paper_id>``."""
    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_pdf_report()
