author    Jules Laplace <julescarbon@gmail.com>    2019-03-28 17:25:28 +0100
committer Jules Laplace <julescarbon@gmail.com>    2019-03-28 17:25:28 +0100
commit    fd4faf7fb94e7b5cbcf5e232d1fd08822e8825bb (patch)
tree      060e7ff9b25402e90eb3cab078193155e60b1fcf
parent    7347fb5a2a8b966b9dce79d97a5d2bdf3c6557d1 (diff)
build verified citations report
-rw-r--r--  scraper/s2-final-report.py  22
-rw-r--r--  scraper/util.py             14
2 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 3673f516..febbbafd 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -11,6 +11,7 @@ from util import *
 DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
 DIR_FINAL_CITATIONS = "../site/datasets/final"
 DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
+DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
 
 addresses = AddressBook()
 paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')
@@ -18,10 +19,15 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')
 @click.command()
 def s2_final_report():
     megapixels = load_megapixels_lookup()
+    verified_lookup = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
         if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y':
-            items.append((item,))
+            if key in verified_lookup:
+                lookup = verified_lookup[key]
+            else:
+                lookup = {}
+            items.append((item, lookup,))
     parallelize(process_paper, items)
     # key name_short name_full purpose url
     # wild indoor outdoor campus cyberspace parent
@@ -36,7 +42,7 @@ def s2_final_report():
"s3://megapixels/v1/citations/",
])
-def process_paper(row):
+def process_paper(row, verified_lookup):
aggregate_citations = {}
unknown_citations = {}
address = None
@@ -78,6 +84,18 @@ def process_paper(row):
             'address': address_list[0] if len(address_list) else {},
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+    with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper['paper_id'],
+            'paper': {
+                'key': row['key'],
+                'name': row['name'],
+                'title': paper['title'],
+                'year': paper['year'],
+            },
+            'address': address_list[0] if len(address_list) else {},
+            'citations': [aggregate_citations[key] for key in aggregate_citations.keys() if key in verified_lookup],
+        }, f)
 
 def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
     res = {
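
The verified report reuses the aggregated citations but keeps only entries whose key appears in the per-dataset verified_lookup passed into process_paper. A minimal standalone sketch of that filtering step, with invented sample data (in the real pipeline the keys are Semantic Scholar paper ids):

    # Illustrative sketch only: sample data is invented, names follow the diff above.
    aggregate_citations = {
        'p1': {'title': 'Paper One', 'year': 2014},
        'p2': {'title': 'Paper Two', 'year': 2016},
    }
    verified_lookup = {  # paper_id -> verification row for this dataset
        'p1': {'dataset': 'example-dataset', 'paper_id': 'p1', 'uses_dataset': '1'},
    }
    verified = [cite for key, cite in aggregate_citations.items() if key in verified_lookup]
    assert verified == [{'title': 'Paper One', 'year': 2014}]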
diff --git a/scraper/util.py b/scraper/util.py
index 96ced430..ad7b1f4d 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -452,6 +452,20 @@ def fetch_google_lookup(name, item_key='key'):
         lookup[rec[item_key]] = rec
     return lookup
 
+def fetch_verified_paper_lookup():
+    """Fetch a lookup keyed by dataset, where each dataset points to a hash of the papers verified as using it (rows with uses_dataset == '1')."""
+    keys, rows = fetch_google_sheet('verifications')
+    verified_lookup = {}
+    for row in rows:
+        rec = {}
+        for index, key in enumerate(keys):
+            rec[key] = row[index]
+        if rec['dataset'] not in verified_lookup:
+            verified_lookup[rec['dataset']] = {}
+        if str(rec['uses_dataset']) == '1':
+            verified_lookup[rec['dataset']][rec['paper_id']] = rec
+    return verified_lookup
+
 def update_or_append_worksheet(name, form):
     worksheet = fetch_worksheet(name)
     keys = worksheet.row_values(1)
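
fetch_verified_paper_lookup flattens the 'verifications' sheet into a two-level dict: dataset key first, then paper id. Assuming the sheet carries at least the columns dataset, paper_id, and uses_dataset (the three columns the code reads; the sample rows below are invented), the transformation behaves like this:

    # Illustrative sketch only: sample sheet contents are invented.
    keys = ['dataset', 'paper_id', 'uses_dataset']
    rows = [
        ['example-dataset', 'p1', '1'],  # marked as using the dataset -> kept
        ['example-dataset', 'p2', '0'],  # marked as not using it -> dataset key created, paper omitted
    ]
    # fetch_verified_paper_lookup over these rows yields:
    # {'example-dataset': {'p1': {'dataset': 'example-dataset',
    #                             'paper_id': 'p1',
    #                             'uses_dataset': '1'}}}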