import re
import os
import gzip
import glob
import json
import click
import math
import string
from util import *
# Root of the extracted-text tree, laid out as
# <PDF_DIR>/<first-two-chars-of-paper-id>/<paper_id>/*.txt
# (see paper_path and the glob in s2_pdf_report).
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def s2_pdf_report():
    """Scan every extracted-text file under PDF_DIR, attempt to geocode an
    address from each paper's heading lines, and report the results.

    Writes four CSVs under reports/stats/:
      * empty_papers.csv        -- papers whose text yielded no headings
      * no_separator_papers.csv -- papers with >20 headings and no 'abstract' marker
      * geocoded_papers.csv     -- one row per matched address
      * unknown_papers.csv      -- papers with headings but no address match
    and prints per-category counts/percentages to stdout.
    """
    empty_papers = []
    no_separator_papers = []
    geocoded_papers = []
    unknown_papers = []
    found_count = 0
    total_count = 0
    addresses = AddressBook()
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        # Path layout is <PDF_DIR>/<shard>/<paper_id>/<file>.txt, so after
        # stripping the root prefix the id is the second '/'-component.
        paper_id = fn.replace(PDF_DIR, '').split('/')[2]
        paper = load_paper(paper_id)
        total_count += 1
        headings, found_abstract = read_headings(fn, paper)
        heading_string = '\n'.join(headings[:20])
        found_addresses = []
        if not found_abstract:
            if not headings:
                empty_papers.append(paper.record())
                continue
            if len(headings) > 20:
                # The abstract separator was probably missed; record the
                # paper but still attempt address matching below.
                no_separator_papers.append(paper.record())
        for heading in headings:
            candidate = heading.lower().strip()
            address = addresses.find(candidate)
            if address:
                found_addresses.append(address)
        if found_addresses:
            found_count += 1
            for address in found_addresses:
                geocoded_papers.append([paper.paper_id, paper.title] + address)
        else:
            unknown_papers.append([paper.paper_id, paper.title, heading_string])
    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)
    print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
    print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
    print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count)))
    print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
    print("{} {} entities".format('geocoded', len(geocoded_papers)))
def percent(a, b):
    """Return ``a`` as a whole-number percentage of ``b``.

    Returns 0 when ``b`` is 0 (e.g. no papers were scanned) instead of
    raising ZeroDivisionError.
    """
    if b == 0:
        return 0
    return round(100 * a / b)
def read_headings(fn, paper):
    """Parse heading lines from the extracted-text file *fn*.

    Returns ``(headings, found_abstract)``: the cleaned lines that precede
    the word 'abstract', and whether that marker was seen. Heuristics:
    e-mail addresses are removed, very short lines are dropped, lines
    repeating the journal name are dropped, a single leading footnote
    marker (a/b/c/1-4) is stripped, and PDF ligature glyphs are expanded.
    """
    headings = []
    found_abstract = False
    # NOTE(review): collected but never returned or read by callers; the
    # original also set a dead `was_found` flag, so author lines were
    # presumably meant to be skipped -- confirm the intended behavior.
    found_authors = []
    journal = paper.journal.lower()
    # (id, display_name, lowercased_name) per author, for case-insensitive search.
    authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
    # Extracted text is expected to be UTF-8 (it carries ligatures/daggers).
    with open(fn, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop e-mail addresses before any other matching.
            line = re.sub(r"\S*@\S*\s?", '', line)
            l = line.lower().strip()
            if len(l) < 5:
                continue
            # Strip a single leading footnote/affiliation marker.
            if line[0] in 'abc1234':
                line = line[1:]
            line = line.strip("∗†‡")
            # Expand Unicode ligature glyphs produced by PDF extraction.
            # (The original called .replace with identical ASCII args -- a
            # no-op; these are the intended U+FB00..U+FB04 forms.)
            line = (line.replace('\ufb02', 'fl')
                        .replace('\ufb00', 'ff')
                        .replace('\ufb03', 'ffi')
                        .replace('\ufb04', 'ffl'))
            line = line.strip()
            if 'abstract' in l:
                found_abstract = True
                break
            if journal and journal in l:
                continue
            # Split on commas / ' and ' to probe each piece for an author name.
            names = [s.strip() for s in re.split(',| and ', l)]
            for name in names:
                found = find_authors(authors, name)
                if found and found[0]:
                    found_authors.append(found)
            headings.append(line.strip())
    return headings, found_abstract


def find_authors(authors, line):
    """Return the first (id, name, lowercased_name) tuple from *authors*
    whose lowercased name occurs in *line*, or None if no name matches."""
    for a in authors:
        if a[2] in line:
            return a
    return None
def paper_path(paper_id):
    """Return the on-disk directory for *paper_id*, sharded under PDF_DIR
    by the id's first two characters."""
    shard = paper_id[0:2]
    return '/'.join((PDF_DIR, shard, paper_id))
# Script entry point: run the click command when invoked directly.
if __name__ == '__main__':
    s2_pdf_report()