-rw-r--r--  .gitignore                                                             2
-rw-r--r--  README.md                                                             18
-rw-r--r--  reports/map.js                                                         8
-rw-r--r--  reports/pdf_institutions_deduped.csv (renamed from pdf_institutions_deduped.csv)   0
-rw-r--r--  s2-citation-report.py                                                 67
-rw-r--r--  s2-fetch-google-sheet.py                                               4
-rw-r--r--  s2-geocode-spreadsheet.py                                             83
-rw-r--r--  s2-pdf-report.py                                                      44
-rw-r--r--  util.py                                                               63
9 files changed, 206 insertions, 83 deletions
diff --git a/.gitignore b/.gitignore
index 84335112..7ecf4532 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,3 +154,5 @@ datasets/
reports/papers/
+.creds
+
diff --git a/README.md b/README.md
index 249e1ca1..964a3ee3 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ Fetch the files listed in ieee.json and process them.
Use pdfminer.six to extract the first page from the PDFs.
-### s2-pdf-report.py report_first_pages
+### s2-pdf-first-pages.py
Perform initial extraction of university-like terms, to be geocoded.
@@ -115,11 +115,21 @@ After scraping these universities, we got up to 47% match rate on papers from th
### expand-uni-lookup.py
-At this point in the process, I had divided the task of scraping and geocoding between 4 different machines, so I reduced down the output of these scripts into the file `reports/all_institutions.csv`. I got increased accuracy from my paper classifier using just university names, so I wrote this script to group the rows using the extracted university names, and show me which address they geocode to. This file must be gone through manually. This technique geocoded around 47% of papers.
+By this point I had a list of institutions in `reports/all_institutions.csv`, built by merging the geocoding results from the 4 machines (each machine produced its own institutions file). This file must be gone through manually. This technique geocoded around 47% of papers.
-### s2-pdf-report.py report_geocoded_papers
+At this point I moved `reports/all_institutions.csv` into a Google Sheet. All further results use the CSV data on Google Sheets.
-Perform initial extraction of university-like terms, to be geocoded.
+### s2-pdf-report.py
+
+Generates reports of terms extracted from the PDFs that could not be matched to a known institution.
+
+### s2-geocode-spreadsheet.py
+
+To add new institutions, simply list them in the spreadsheet with the lat/lng fields empty. Then run this script and anything missing a lat/lng will get one.
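+
+For example (assuming `MAPS_API_KEY` is set in `.env` and the service-account JSON sits in `.creds/`):
+
+    python s2-geocode-spreadsheet.py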
+
+### s2-citation-report.py
+
+Generate the main report with maps and citation lists.
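+
+For example:
+
+    python s2-citation-report.py
+
+It walks `datasets/s2/papers/` and writes an HTML report for each paper.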
---
diff --git a/reports/map.js b/reports/map.js
index b1b896e7..e6244686 100644
--- a/reports/map.js
+++ b/reports/map.js
@@ -19,8 +19,7 @@ let points = read_json('citations')
let address = read_json('address')
let source = [0,0]
if (address) {
- source[0] = parseFloat(address[1])
- source[1] = parseFloat(address[2])
+ source = address.slice(3,5).map(n => parseFloat(n))
console.log(address, source)
}
points.forEach(point => {
@@ -37,7 +36,9 @@ points.forEach(point => {
]
*/
- const latlng = point[2].slice(1,3)
+ const latlng = point.slice(5,7).map(n => parseFloat(n))
+ console.log(point)
+ if (!latlng.length || isNaN(latlng[0]) || isNaN(latlng[1])) return
var marker = L.marker(latlng).addTo(map);
marker.bindPopup([
"<b>",point[0], "</b>",
@@ -52,6 +53,7 @@ points.forEach(point => {
// vertices: 100,
// }
// L.Polyline.Arc(source, latlng, arcStyle).addTo(map);
+ console.log(latlng)
var pathStyle = {
color: 'rgb(245, 246, 150)',
fillColor: 'rgb(245, 246, 150)',
diff --git a/pdf_institutions_deduped.csv b/reports/pdf_institutions_deduped.csv
index 6a5e23e0..6a5e23e0 100644
--- a/pdf_institutions_deduped.csv
+++ b/reports/pdf_institutions_deduped.csv
diff --git a/s2-citation-report.py b/s2-citation-report.py
index 19b018f8..58b7ed8f 100644
--- a/s2-citation-report.py
+++ b/s2-citation-report.py
@@ -1,5 +1,5 @@
import os
-import gzip
+import re
import glob
import json
import math
@@ -9,7 +9,7 @@ from util import *
@click.command()
def s2_citation_report():
- addresses = load_addresses()
+ addresses = AddressBook()
for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
process_paper(fn, addresses)
@@ -40,22 +40,37 @@ def process_paper(fn, addresses):
continue
institutions = load_institutions(citationId)
geocoded_institutions = []
+ unknown_institutions = []
institution = ''
address = None
for inst in sorted(institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
- if institution in addresses:
- address = addresses[institution]
+ next_address = addresses.find(institution)
+ if next_address:
+ address = next_address
geocode_count += 1
geocoded_institutions.append(institution)
else:
- for part in institution.split(', '):
- if part in addresses:
- address = addresses[part]
- geocode_count += 1
- geocoded_institutions.append(institution)
+ unknown_institutions.append(institution)
+ if not address:
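+            # Fall back to scanning the PDF's extracted headings for a known institution.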
+ if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
+ headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
+ heading_string = '\n'.join(headings[0:20])
+ found_addresses = []
+ if len(headings):
+ for heading in headings:
+ l = heading.lower().strip()
+ if l:
+ next_address = addresses.find(l)
+ if next_address:
+ address = next_address
+ geocode_count += 1
+ geocoded_institutions.append(heading)
+ else:
+ unknown_institutions.append(heading)
+
res['citations'].append({
'title': citation.title,
'journal': citation.journal,
@@ -63,31 +78,27 @@ def process_paper(fn, addresses):
'institutions': [inst[1] for inst in institutions],
'geocoded': geocoded_institutions,
})
- if len(geocoded_institutions):
+ if address:
geocoded_citations.append([
citation.title,
institution,
- address,
- ])
+ ] + address)
display_geocoded_citations.append([
citation.title,
- institution,
- ', '.join(address),
- ])
+ ] + address)
else:
unknown_citations.append([
citation.title,
- institution,
+ '<br>'.join(unknown_institutions),
])
paper_institutions = load_institutions(paper.paper_id)
paper_address = None
- for inst in sorted(institutions, key=operator.itemgetter(1)):
+ for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
- if institution in addresses:
- paper_address = addresses[institution]
+        next_address = addresses.find(institution)
+        if next_address:
+            paper_address = next_address
if paper_address:
print(paper_address)
@@ -134,24 +145,18 @@ def process_paper(fn, addresses):
f.write("</html>")
return res
-def load_addresses():
- data = read_csv('reports/all_institutions.csv', keys=None)
- lookup = {}
- for row in data:
- name = row[0]
- lookup[name] = row
- return lookup
-
def load_institutions(paperId):
- if os.path.exists(os.path.join(data_path('pdf', paperId), 'institutions.json')):
- return read_json(os.path.join(data_path('pdf', paperId), 'institutions.json'))['institutions']
- elif os.path.exists(os.path.join(data_path('doi', paperId), 'institutions.json')):
- return read_json(os.path.join(data_path('doi', paperId), 'institutions.json'))['institutions']
+ if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+ return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+ elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+ return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
else:
return []
def data_path(key, paper_id):
return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+ return os.path.join(data_path(key, paper_id), fn)
if __name__ == '__main__':
s2_citation_report()
diff --git a/s2-fetch-google-sheet.py b/s2-fetch-google-sheet.py
new file mode 100644
index 00000000..1fc887e4
--- /dev/null
+++ b/s2-fetch-google-sheet.py
@@ -0,0 +1,4 @@
+from util import *
+
+if __name__ == '__main__':
+ fetch_google_sheet()
diff --git a/s2-geocode-spreadsheet.py b/s2-geocode-spreadsheet.py
new file mode 100644
index 00000000..d0fd2050
--- /dev/null
+++ b/s2-geocode-spreadsheet.py
@@ -0,0 +1,83 @@
+import os
+import csv
+import click
+import time
+from geopy import geocoders
+from dotenv import load_dotenv
+from util import *
+load_dotenv()
+
+@click.command()
+def s2_geocode_spreadsheet():
+ geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+
+ worksheet = fetch_worksheet()
+ rows = fetch_google_sheet()
+ valid_count = 0
+ invalid_count = 0
+
+ print("got {} rows".format(len(rows)))
+
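+    # First pass: map each canonical name (cname) to the row that defines it,
+    # so rows sharing a cname can later copy its coordinates.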
+ cname_lookup = {}
+ for i, row in enumerate(rows):
+ if len(row) == 6:
+ cname, name, address, lat, lng, org_type = row
+ elif len(row) == 7:
+ cname, name, address, lat, lng, org_type, extra_address = row
+ else:
+ print("Weirdly formatted row {}".format(i))
+ continue
+ if cname == name or cname not in cname_lookup:
+ cname_lookup[cname] = i
+
+    # Columns: 0 cname, 1 name, 2 address, 3 lat, 4 lng, 5 org_type
+ for i, row in enumerate(rows):
+ if len(row) == 6:
+ cname, name, address, lat, lng, org_type = row
+ elif len(row) == 7:
+ cname, name, address, lat, lng, org_type, extra_address = row
+ else:
+ print("Weirdly formatted row {}: {} entries".format(i, len(row)))
+ continue
+ if lat and lng:
+ continue
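+        # If this entity's canonical row already has coordinates, copy them instead of re-geocoding.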
+ c_row = rows[cname_lookup[cname]]
+ if c_row[3] and c_row[4]:
+ print("name {}, found cname: {}".format(name, cname))
+ worksheet.update_cell(i+2, 3, c_row[2])
+ worksheet.update_cell(i+2, 4, c_row[3])
+ worksheet.update_cell(i+2, 5, c_row[4])
+ continue
+        # Prefer the explicit address, then the institution name, then the cname.
+        address_to_geocode = address or name or cname
+
+ if not address_to_geocode:
+ continue
+
+ print(address_to_geocode)
+ location = geolocator.geocode(address_to_geocode)
+ if location:
+ print("{} found: {}".format(i+1, name))
+ print(location.raw)
+ worksheet.update_cell(i+2, 3, location.address)
+ worksheet.update_cell(i+2, 4, location.latitude)
+ worksheet.update_cell(i+2, 5, location.longitude)
+ if address and address != location.address:
+ worksheet.update_cell(i+2, 7, address)
+ valid_count += 1
+ row[2] = location.address
+ row[3] = location.latitude
+ row[4] = location.longitude
+ else:
+ print("{} not found: {}".format(i+1, address_to_geocode))
+ invalid_count += 1
+ time.sleep(2)
+
+ print("geocoded {} addresses, {} found, {} not found".format(len(rows), valid_count, invalid_count))
+
+if __name__ == '__main__':
+ s2_geocode_spreadsheet()
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index d659ed15..0748897f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -22,10 +22,9 @@ def s2_pdf_report():
addresses = AddressBook()
for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
paper_id = fn.replace(PDF_DIR, '').split('/')[2]
- paper = load_paper(paper_id)
total_count += 1
# print(paper_id)
- headings, found_abstract = read_headings(fn, paper)
+ headings, found_abstract = read_headings(fn)
heading_string = '\n'.join(headings[0:20])
found_addresses = []
if not found_abstract:
@@ -67,47 +66,6 @@ def s2_pdf_report():
def percent(a,b):
return round(100 * a / b)
-def read_headings(fn, paper):
- headings = []
- found_abstract = False
- found_authors = []
- journal = paper.journal.lower()
- authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
- with open(fn, 'r') as f:
- for line in f.readlines():
- line = re.sub(r"\S*@\S*\s?", '', line)
- l = line.lower().strip()
- if len(l) < 5:
- continue
- if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4':
- line = line[1:]
- line = line.strip("∗†‡")
-        line = line.replace("ﬂ", "fl").replace('ﬀ', 'ff').replace('ﬃ', 'ffi').replace('ﬄ', 'ffl')
- line = line.strip()
- if 'abstract' in l:
- found_abstract = True
- break
- if journal and journal in l:
- continue
- names = [s.strip() for s in re.split(',| and ', l)]
- was_found = False
- for name in names:
- found = find_authors(authors, name)
- if found:
- was_found = True
- # print("found {}".format(found[1]))
- if found[0]:
- found_authors.append(found)
- continue
- headings.append(line.strip())
- return headings, found_abstract
-
-def find_authors(authors, line):
- for a in authors:
- if a[2] in line:
- return a
- return None
-
def paper_path(paper_id):
return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
diff --git a/util.py b/util.py
index d851d797..ed7ee396 100644
--- a/util.py
+++ b/util.py
@@ -1,8 +1,11 @@
+import re
import os
import csv
import string
import codecs
+import gspread
import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
def read_citation_list(index=0):
filename = './datasets/citations.csv'
@@ -190,13 +193,55 @@ def dedupe(a):
ss = sorted(p.keys())
return ss
+def read_headings(fn, paper=None):
+ headings = []
+ found_abstract = False
+ found_authors = []
+    journal = paper.journal.lower() if paper else ''
+    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] if paper else []
+ with open(fn, 'r') as f:
+ for line in f.readlines():
+ line = re.sub(r"\S*@\S*\s?", '', line)
+ l = line.lower().strip()
+ if len(l) < 5:
+ continue
+            if line[0] in 'abc1234':
+ line = line[1:]
+ line = line.strip("∗†‡")
+            line = line.replace("ﬂ", "fl").replace('ﬀ', 'ff').replace('ﬃ', 'ffi').replace('ﬄ', 'ffl')
+ line = line.strip()
+ if 'abstract' in l:
+ found_abstract = True
+ break
+ if journal and journal in l:
+ continue
+ names = [s.strip() for s in re.split(',| and ', l)]
+ was_found = False
+ for name in names:
+ found = find_authors(authors, name)
+ if found:
+ was_found = True
+ # print("found {}".format(found[1]))
+ if found[0]:
+ found_authors.append(found)
+ continue
+            if was_found:
+                continue
+            headings.append(line.strip())
+ return headings, found_abstract
+
+def find_authors(authors, line):
+ for a in authors:
+ if a[2] in line:
+ return a
+ return None
+
class AddressBook (object):
def __init__(self):
entities = {}
lookup = {}
- data = read_csv('reports/all_institutions_sorted.csv', keys=None)
+ data = fetch_google_sheet()
+ # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True)
for index, line in enumerate(data):
- if line[0] == line[1]:
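+            # Prefer the row where cname == name as an entity's canonical row;
+            # otherwise fall back to the first row seen for that cname.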
+ if line[0] == line[1] or line[0] not in entities:
entities[line[0]] = index
lookup[line[1].lower().strip()] = line[0]
self.data = data
@@ -216,3 +261,17 @@ class AddressBook (object):
index = self.entities[entity]
return self.data[index]
return None
+
+def fetch_worksheet():
+ scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+ credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope)
+ docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+ client = gspread.authorize(credentials)
+ spreadsheet = client.open_by_key(docid)
+ return spreadsheet.worksheet("institutions")
+
+def fetch_google_sheet():
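+    # The first row of the sheet is the header; only data rows are returned.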
+ rows = fetch_worksheet().get_all_values()
+ keys = rows[0]
+ lines = rows[1:]
+ return lines