summary | refs | log | tree | commit | diff
path: root/scraper/s2-final-report.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2019-02-13 16:46:10 +0100
committer Jules Laplace <julescarbon@gmail.com> 2019-02-13 16:46:10 +0100
commit d0dc5cd83f1c436185d247600c3c5be9360bf1ca (patch)
tree 92db65b2a525b6512fd7f5349da561c476fe997e /scraper/s2-final-report.py
parent 1563d1da307a78ddc388483fd95a68a511e18048 (diff)
displaying more info about the papers
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r-- scraper/s2-final-report.py 44
1 files changed, 21 insertions, 23 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 58ac481f..283ca4fc 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -11,18 +11,18 @@ from util import *
DIR_PUBLIC_CITATIONS = "../site/datasets/final"
DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
+addresses = AddressBook()
+
@click.command()
def s2_final_report():
- addresses = AddressBook()
megapixels = load_megapixels_lookup()
- ft_lookup = load_ft_lookup()
- for key, row in megapixels.items():
- print(key)
- ft_share = ft_lookup[key]
- if ft_share:
- paper_data = process_paper(row, addresses)
+ items = []
+ for key, item in megapixels.items():
+ if item['dataset']['ft_share'] == '1':
+ items.append((item,))
+ parallelize(process_paper, items)
-def process_paper(row, addresses):
+def process_paper(row):
aggregate_citations = {}
unknown_citations = {}
address = None
@@ -39,6 +39,8 @@ def process_paper(row, addresses):
with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': papers[0]['paper_id'],
+ 'dataset': row['dataset'],
+ 'statistics': row['statistics'],
'paper': papers[0],
'address': address,
'additional_papers': papers[1:],
@@ -47,7 +49,6 @@ def process_paper(row, addresses):
with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': papers[0]['paper_id'],
- 'paper': papers[0],
'citations': [unknown_citations[key] for key in unknown_citations.keys()],
}, f)
@@ -161,33 +162,30 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
}
return res
-def load_ft_lookup():
- keys, rows = fetch_google_sheet('datasets')
- lookup = {}
- for row in rows:
- rec = {}
- for index, key in enumerate(keys):
- rec[key] = row[index]
- if rec['ft_share'] == '1' or rec['ft_share'] == 1:
- lookup[rec['key']] = True
- else:
- lookup[rec['key']] = False
- return lookup
-
def load_megapixels_lookup():
keys, rows = fetch_google_sheet('citation_lookup')
+ dataset_lookup = fetch_google_lookup('datasets')
+ statistics_lookup = fetch_google_lookup('statistics')
lookup = {}
for row in rows:
rec = {}
for index, key in enumerate(keys):
rec[key] = row[index]
- if rec['paper_id'] == "":
+ if rec['paper_id'] == "" or (rec['verified'] != 1 and rec['verified'] != '1'):
continue
paper_key = rec['key']
if paper_key not in lookup:
rec['paper_ids'] = []
lookup[paper_key] = rec
lookup[paper_key]['paper_ids'].append(rec['paper_id'])
+ if paper_key in dataset_lookup:
+ lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
+ else:
+ print("not in datasets lookup:", paper_key)
+ if paper_key in statistics_lookup:
+ lookup[paper_key]['statistics'] = statistics_lookup[paper_key]
+ else:
+ print("not in statistics lookup:", paper_key)
# recs.append(rec)
return lookup