-rw-r--r--   .gitignore                                                         |  2
-rw-r--r--   README.md                                                          | 18
-rw-r--r--   reports/map.js                                                     |  8
-rw-r--r--   reports/pdf_institutions_deduped.csv (renamed from pdf_institutions_deduped.csv) |  0
-rw-r--r--   s2-citation-report.py                                              | 67
-rw-r--r--   s2-fetch-google-sheet.py                                           |  4
-rw-r--r--   s2-geocode-spreadsheet.py                                          | 83
-rw-r--r--   s2-pdf-report.py                                                   | 44
-rw-r--r--   util.py                                                            | 63
9 files changed, 206 insertions, 83 deletions
diff --git a/.gitignore b/.gitignore
@@ -154,3 +154,5 @@
 
 datasets/
 reports/papers/
+.creds
+
diff --git a/README.md b/README.md
@@ -84,7 +84,7 @@
 Fetch the files listed in ieee.json and process them. Use pdfminer.six to
 extract the first page from the PDFs.
 
-### s2-pdf-report.py report_first_pages
+### s2-pdf-first-pages.py
 
 Perform initial extraction of university-like terms, to be geocoded.
 
@@ -115,11 +115,21 @@ After scraping these universities, we got up to 47% match rate on papers from th
 
 ### expand-uni-lookup.py
 
-At this point in the process, I had divided the task of scraping and geocoding between 4 different machines, so I reduced down the output of these scripts into the file `reports/all_institutions.csv`. I got increased accuracy from my paper classifier using just university names, so I wrote this script to group the rows using the extracted university names, and show me which address they geocode to. This file must be gone through manually. This technique geocoded around 47% of papers.
+By now I had a list of institutions in `reports/all_institutions.csv`, built by merging the geocoding results (I had run the geocoding on 4 computers and thus had 4 files of institutions). This file must be reviewed manually. This technique geocoded around 47% of papers.
 
-### s2-pdf-report.py report_geocoded_papers
+At this point I moved `reports/all_institutions.csv` into Google Sheets. All further results use the copy on Google Sheets.
 
-Perform initial extraction of university-like terms, to be geocoded.
+### s2-pdf-report.py
+
+Generates reports of items extracted from the PDFs that could not be matched.
+
+### s2-geocode-spreadsheet.py
+
+To add new institutions, simply list them in the spreadsheet with the lat/lng fields empty. Then run this script and anything missing a lat/lng will get one.
+
+### s2-citation-report.py
+
+Generates the main report with maps and citation lists.
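A quick sketch of the geocoding step the README describes, for reference. It assumes geopy's GoogleV3 geocoder (the same one s2-geocode-spreadsheet.py uses below), a MAPS_API_KEY environment variable, and the sheet's six-column row layout; the helper name fill_latlng is invented for illustration, not part of the repo.

    # Sketch: fill in lat/lng for one spreadsheet row that lacks them.
    # Row layout assumed: [cname, name, address, lat, lng, org_type]
    import os
    from geopy import geocoders

    def fill_latlng(row):  # hypothetical helper, for illustration only
        geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
        cname, name, address, lat, lng, org_type = row[:6]
        if lat and lng:
            return row  # already geocoded, nothing to do
        location = geolocator.geocode(address or name or cname)
        if location:
            row[2] = location.address
            row[3] = location.latitude
            row[4] = location.longitude
        return row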
diff --git a/reports/map.js b/reports/map.js
index b1b896e7..e6244686 100644
--- a/reports/map.js
+++ b/reports/map.js
@@ -19,8 +19,7 @@ let points = read_json('citations')
 let address = read_json('address')
 let source = [0,0]
 if (address) {
-    source[0] = parseFloat(address[1])
-    source[1] = parseFloat(address[2])
+    source = address.slice(3,5).map(n => parseFloat(n))
     console.log(address, source)
 }
 points.forEach(point => {
@@ -37,7 +36,9 @@ points.forEach(point => {
     ]
     */
 
-    const latlng = point[2].slice(1,3)
+    const latlng = point.slice(5,7).map(n => parseFloat(n))
+    console.log(point)
+    if (!latlng.length || isNaN(latlng[0]) || isNaN(latlng[1])) return
     var marker = L.marker(latlng).addTo(map);
     marker.bindPopup([
         "<b>",point[0], "</b>",
@@ -52,6 +53,7 @@
     //     vertices: 100,
     // }
     // L.Polyline.Arc(source, latlng, arcStyle).addTo(map);
+    console.log(latlng)
     var pathStyle = {
         color: 'rgb(245, 246, 150)',
         fillColor: 'rgb(245, 246, 150)',
diff --git a/pdf_institutions_deduped.csv b/reports/pdf_institutions_deduped.csv
index 6a5e23e0..6a5e23e0 100644
--- a/pdf_institutions_deduped.csv
+++ b/reports/pdf_institutions_deduped.csv
diff --git a/s2-citation-report.py b/s2-citation-report.py
index 19b018f8..58b7ed8f 100644
--- a/s2-citation-report.py
+++ b/s2-citation-report.py
@@ -1,5 +1,5 @@
 import os
-import gzip
+import re
 import glob
 import json
 import math
@@ -9,7 +9,7 @@
 from util import *
 
 @click.command()
 def s2_citation_report():
-    addresses = load_addresses()
+    addresses = AddressBook()
     for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
         process_paper(fn, addresses)
@@ -40,22 +40,37 @@ def process_paper(fn, addresses):
             continue
         institutions = load_institutions(citationId)
         geocoded_institutions = []
+        unknown_institutions = []
         institution = ''
         address = None
         for inst in sorted(institutions, key=operator.itemgetter(1)):
             # print(inst[1])
             address_count += 1
             institution = inst[1]
-            if institution in addresses:
-                address = addresses[institution]
+            next_address = addresses.find(institution)
+            if next_address:
+                address = next_address
                 geocode_count += 1
                 geocoded_institutions.append(institution)
             else:
-                for part in institution.split(', '):
-                    if part in addresses:
-                        address = addresses[part]
-                        geocode_count += 1
-                        geocoded_institutions.append(institution)
+                unknown_institutions.append(institution)
+        if not address:
+            if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
+                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
+                heading_string = '\n'.join(headings[0:20])
+                found_addresses = []
+                if len(headings):
+                    for heading in headings:
+                        l = heading.lower().strip()
+                        if l:
+                            next_address = addresses.find(l)
+                            if next_address:
+                                address = next_address
+                                geocode_count += 1
+                                geocoded_institutions.append(heading)
+                            else:
+                                unknown_institutions.append(heading)
         res['citations'].append({
             'title': citation.title,
             'journal': citation.journal,
@@ -63,31 +78,27 @@
             'institutions': [inst[1] for inst in institutions],
             'geocoded': geocoded_institutions,
         })
-        if len(geocoded_institutions):
+        if address:
             geocoded_citations.append([
                 citation.title,
                 institution,
-                address,
-            ])
+            ] + address)
             display_geocoded_citations.append([
                 citation.title,
-                institution,
-                ', '.join(address),
-            ])
+            ] + address)
         else:
             unknown_citations.append([
                 citation.title,
-                institution,
+                '<br>'.join(unknown_institutions),
             ])
     paper_institutions = load_institutions(paper.paper_id)
     paper_address = None
-    for inst in sorted(institutions, key=operator.itemgetter(1)):
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
         # print(inst[1])
         address_count += 1
         institution = inst[1]
-        if institution in addresses:
-            paper_address = addresses[institution]
+        paper_address = addresses.find(institution)
         if paper_address:
             print(paper_address)
@@ -134,24 +145,18 @@
         f.write("</html>")
     return res
 
-def load_addresses():
-    data = read_csv('reports/all_institutions.csv', keys=None)
-    lookup = {}
-    for row in data:
-        name = row[0]
-        lookup[name] = row
-    return lookup
-
 def load_institutions(paperId):
-    if os.path.exists(os.path.join(data_path('pdf', paperId), 'institutions.json')):
-        return read_json(os.path.join(data_path('pdf', paperId), 'institutions.json'))['institutions']
-    elif os.path.exists(os.path.join(data_path('doi', paperId), 'institutions.json')):
-        return read_json(os.path.join(data_path('doi', paperId), 'institutions.json'))['institutions']
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
     else:
        return []
 
 def data_path(key, paper_id):
     return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
 
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
 
 if __name__ == '__main__':
     s2_citation_report()
diff --git a/s2-fetch-google-sheet.py b/s2-fetch-google-sheet.py
new file mode 100644
index 00000000..1fc887e4
--- /dev/null
+++ b/s2-fetch-google-sheet.py
@@ -0,0 +1,4 @@
+from util import *
+
+if __name__ == '__main__':
+    fetch_google_sheet()
diff --git a/s2-geocode-spreadsheet.py b/s2-geocode-spreadsheet.py
new file mode 100644
index 00000000..d0fd2050
--- /dev/null
+++ b/s2-geocode-spreadsheet.py
@@ -0,0 +1,83 @@
+import os
+import csv
+import click
+import time
+from geopy import geocoders
+from dotenv import load_dotenv
+from util import *
+load_dotenv()
+
+@click.command()
+def s2_geocode_spreadsheet():
+    geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+
+    worksheet = fetch_worksheet()
+    rows = fetch_google_sheet()
+    valid_count = 0
+    invalid_count = 0
+
+    print("got {} rows".format(len(rows)))
+
+    cname_lookup = {}
+    for i, row in enumerate(rows):
+        if len(row) == 6:
+            cname, name, address, lat, lng, org_type = row
+        elif len(row) == 7:
+            cname, name, address, lat, lng, org_type, extra_address = row
+        else:
+            print("Weirdly formatted row {}".format(i))
+            continue
+        if cname == name or cname not in cname_lookup:
+            cname_lookup[cname] = i
+
+    # 0 cname  1 name  2 address  3 lat  4 lng  5 org_type
+    for i, row in enumerate(rows):
+        if len(row) == 6:
+            cname, name, address, lat, lng, org_type = row
+        elif len(row) == 7:
+            cname, name, address, lat, lng, org_type, extra_address = row
+        else:
+            print("Weirdly formatted row {}: {} entries".format(i, len(row)))
+            continue
+        if lat and lng:
+            continue
+        c_row = rows[cname_lookup[cname]]
+        if c_row[3] and c_row[4]:
+            print("name {}, found cname: {}".format(name, cname))
+            worksheet.update_cell(i+2, 3, c_row[2])
+            worksheet.update_cell(i+2, 4, c_row[3])
+            worksheet.update_cell(i+2, 5, c_row[4])
+            continue
+        if address:
+            address_to_geocode = address
+        elif name:
+            address_to_geocode = name
+        else:
+            address_to_geocode = cname
+
+        if not address_to_geocode:
+            continue
+
+        print(address_to_geocode)
+        location = geolocator.geocode(address_to_geocode)
+        if location:
+            print("{} found: {}".format(i+1, name))
+            print(location.raw)
+            worksheet.update_cell(i+2, 3, location.address)
+            worksheet.update_cell(i+2, 4, location.latitude)
+            worksheet.update_cell(i+2, 5, location.longitude)
+            if address and address != location.address:
+                worksheet.update_cell(i+2, 7, address)
+            valid_count += 1
+            row[2] = location.address
+            row[3] = location.latitude
+            row[4] = location.longitude
+        else:
+            print("{} not found: {}".format(i+1, address_to_geocode))
+            invalid_count += 1
+        time.sleep(2)
+
+    print("geocoded {} addresses, {} found, {} not found".format(len(rows), valid_count, invalid_count))
+
+if __name__ == '__main__':
+    s2_geocode_spreadsheet()
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index d659ed15..0748897f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -22,10 +22,9 @@ def s2_pdf_report():
     addresses = AddressBook()
     for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
         paper_id = fn.replace(PDF_DIR, '').split('/')[2]
-        paper = load_paper(paper_id)
         total_count += 1
         # print(paper_id)
-        headings, found_abstract = read_headings(fn, paper)
+        headings, found_abstract = read_headings(fn)
         heading_string = '\n'.join(headings[0:20])
         found_addresses = []
         if not found_abstract:
@@ -67,47 +66,6 @@
 def percent(a,b):
     return round(100 * a / b)
 
-def read_headings(fn, paper):
-    headings = []
-    found_abstract = False
-    found_authors = []
-    journal = paper.journal.lower()
-    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
-    with open(fn, 'r') as f:
-        for line in f.readlines():
-            line = re.sub(r"\S*@\S*\s?", '', line)
-            l = line.lower().strip()
-            if len(l) < 5:
-                continue
-            if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4':
-                line = line[1:]
-            line = line.strip("∗†‡")
-            line = line.replace("ﬂ", "fl").replace('ﬀ', 'ff').replace('ﬃ', 'ffi').replace('ﬄ', 'ffl')
-            line = line.strip()
-            if 'abstract' in l:
-                found_abstract = True
-                break
-            if journal and journal in l:
-                continue
-            names = [s.strip() for s in re.split(',| and ', l)]
-            was_found = False
-            for name in names:
-                found = find_authors(authors, name)
-                if found:
-                    was_found = True
-                    # print("found {}".format(found[1]))
-                    if found[0]:
-                        found_authors.append(found)
-                    continue
-            headings.append(line.strip())
-    return headings, found_abstract
-
-def find_authors(authors, line):
-    for a in authors:
-        if a[2] in line:
-            return a
-    return None
-
 def paper_path(paper_id):
     return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)
diff --git a/util.py b/util.py
--- a/util.py
+++ b/util.py
@@ -1,8 +1,11 @@
+import re
 import os
 import csv
 import string
 import codecs
+import gspread
 import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
 
 def read_citation_list(index=0):
     filename = './datasets/citations.csv'
@@ -190,13 +193,55 @@ def dedupe(a):
     ss = sorted(p.keys())
     return ss
 
+def read_headings(fn, paper=None):  # paper is optional; s2-pdf-report.py calls with fn only
+    headings = []
+    found_abstract = False
+    found_authors = []
+    journal = paper.journal.lower() if paper else ''
+    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] if paper else []
+    with open(fn, 'r') as f:
+        for line in f.readlines():
+            line = re.sub(r"\S*@\S*\s?", '', line)
+            l = line.lower().strip()
+            if len(l) < 5:
+                continue
+            if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4':
+                line = line[1:]
+            line = line.strip("∗†‡")
+            line = line.replace("ﬂ", "fl").replace('ﬀ', 'ff').replace('ﬃ', 'ffi').replace('ﬄ', 'ffl')
+            line = line.strip()
+            if 'abstract' in l:
+                found_abstract = True
+                break
+            if journal and journal in l:
+                continue
+            names = [s.strip() for s in re.split(',| and ', l)]
+            was_found = False
+            for name in names:
+                found = find_authors(authors, name)
+                if found:
+                    was_found = True
+                    # print("found {}".format(found[1]))
+                    if found[0]:
+                        found_authors.append(found)
+                    continue
+            headings.append(line.strip())
+    return headings, found_abstract
+
+def find_authors(authors, line):
+    for a in authors:
+        if a[2] in line:
+            return a
+    return None
+
 class AddressBook (object):
     def __init__(self):
         entities = {}
         lookup = {}
-        data = read_csv('reports/all_institutions_sorted.csv', keys=None)
+        data = fetch_google_sheet()
+        # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True)
         for index, line in enumerate(data):
-            if line[0] == line[1]:
+            if line[0] == line[1] or line[0] not in entities:
                 entities[line[0]] = index
                 lookup[line[1].lower().strip()] = line[0]
         self.data = data
@@ -216,3 +261,17 @@ class AddressBook (object):
             index = self.entities[entity]
             return self.data[index]
         return None
+
+def fetch_worksheet():
+    scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+    credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope)
+    docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+    client = gspread.authorize(credentials)
+    spreadsheet = client.open_by_key(docid)
+    return spreadsheet.worksheet("institutions")
+
+def fetch_google_sheet():
+    rows = fetch_worksheet().get_all_values()
+    keys = rows[0]
+    lines = rows[1:]
+    return lines
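For reference, the file_path() helper added to s2-citation-report.py above resolves per-paper files like this; paper ids are sharded by their first two characters. The paper id in the example is made up.

    import os

    def data_path(key, paper_id):
        return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)

    def file_path(key, paper_id, fn):
        return os.path.join(data_path(key, paper_id), fn)

    # 'ab12cd34' is an invented paper id, for illustration:
    print(file_path('pdf', 'ab12cd34', 'institutions.json'))
    # -> datasets/s2/pdf/ab/ab12cd34/institutions.json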

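And a sketch of how the Google-Sheets-backed AddressBook is consumed, assuming find() accepts an institution string the way the callers in s2-citation-report.py use it; the lookup string here is invented.

    from util import AddressBook

    addresses = AddressBook()  # rows come from fetch_google_sheet()
    row = addresses.find('stanford university')  # invented example string
    if row:
        # row layout assumed: [cname, name, address, lat, lng, org_type]
        cname, name, address, lat, lng = row[:5]
        print(cname, lat, lng)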