diff options
Diffstat (limited to 's2-pdf-report.py')
| -rw-r--r-- | s2-pdf-report.py | 136 |
1 files changed, 2 insertions, 134 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py index 4475f3a9..d659ed15 100644 --- a/s2-pdf-report.py +++ b/s2-pdf-report.py @@ -10,12 +10,8 @@ from util import * PDF_DIR = 'datasets/s2/pdf' -@click.group() +@click.command() def s2_pdf_report(): - pass - -@s2_pdf_report.command() -def report_geocoded_papers(): rows = [] empty_papers = [] no_separator_papers = [] @@ -45,6 +41,7 @@ def report_geocoded_papers(): if address: found_addresses.append(address) + # MAYBE try checking the entire string against everything? # if not len(found_addresses): # l = heading_string.lower().strip() # address = addresses.find(l) @@ -105,135 +102,6 @@ def read_headings(fn, paper): headings.append(line.strip()) return headings, found_abstract -class AddressBook (object): - def __init__(self): - lookup = {} - data = read_csv('reports/all_institutions_sorted.csv', keys=None) - for index, line in enumerate(data): - lookup[line[1].lower().strip()] = index - self.data = data - self.lookup = lookup - def find(self, address): - address = address.lower().strip().strip(string.digits) - if address in self.lookup: - index = self.lookup[address] - return self.data[index] - for part in address.split(','): - part = part.strip().replace(' ', ' ') - if part in self.lookup: - index = self.lookup[part] - return self.data[index] - return None - -@s2_pdf_report.command() -def report_first_pages(): - rows = [] - institution_names = [] - institutions = [] - no_institutions = [] - for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - data = process_paper(fn) - rows.append(data['first_pages']) - if data['institutions']: - for institution in data['institutions']: - institutions.append(institution) - institution_names.append(institution[1]) - if data['no_institutions']: - no_institutions.append(data['no_institutions']) - deduped_institutions = dedupe(institution_names) - - write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows) - write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1])) - write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions) - write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions]) - print("{} deduped institutions".format(len(deduped_institutions))) - -def dedupe(a): - p = {} - for s in a: - p[s] = None - ss = sorted(p.keys()) - return ss - -def process_paper(fn): - paper_id = fn.replace(PDF_DIR, '').split('/')[2] - paper = load_paper(paper_id) - if paper is None: - print("{} no paper found!".format(paper_id)) - return None - with open(fn, 'r') as f: - lines = [] - emails = [] - institutions = [] - authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] - journal = paper.journal.lower() - found_authors = [] - for line in f.readlines(): - l = line.lower() - if 'abstract' in l: - break - if len(line) < 3: - continue - if journal and journal in l: - continue - if '@' in line: - # print('email {}'.format(line)) - emails.append(line) - continue - names = [s.strip() for s in re.split(',| and ', l)] - was_found = False - for name in names: - found = find_authors(authors, name) - if found: - was_found = True - # print("found {}".format(found[1])) - if found[0]: - found_authors.append(found) - if was_found: - # lines.append(NameLine(line)) - continue - if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l: - inst = re.sub(r'^[\W\d]+', '', line) - inst = re.sub(r'[\W\d]+$', '', inst) - inst = re.sub(r'\s+', ' ', inst) - inst = re.sub(r'Dept.', 'Department ', inst) - if len(inst) < 160: - inst = inst.replace('&', 'and') - inst_parts = [] - department = '' - for inst_part in inst.split(','): - inst_part = inst_part.strip() - inst_low = inst_part.lower() - if 'prof' in inst_low: - continue - if 'article ' in inst_low: - continue - if 'department' in inst_low: - department = inst_part - else: - inst_parts.append(inst_part) - inst = ', '.join(inst_parts) - if inst: - inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip() - institutions.append([ paper_id, inst, department ]) - lines.append(BoldLine(inst)) - continue - lines.append(line) - write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions }) - return { - 'first_pages': [ - paper_id, - lines, - found_authors, - emails, - ], - 'institutions': None if not len(institutions) else institutions, - 'no_institutions': None if len(institutions) else [ - paper_id, - lines, - ], - } - def find_authors(authors, line): for a in authors: if a[2] in line: |
