-rw-r--r--  scraper/s2-final-report.py   19
-rw-r--r--  scraper/s2-papers.py          5
-rw-r--r--  scraper/util.py               8
3 files changed, 24 insertions, 8 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..ddee18c7 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -22,6 +22,8 @@ def s2_final_report():
     verified_lookup = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
+        if key != 'brainwash':
+            continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
@@ -43,11 +45,11 @@ def s2_final_report():
     # DIR_PUBLIC_CITATIONS + '/',
     # "s3://megapixels/v1/citations/",
     # ])
-    subprocess.call([
-        "s3cmd", "put", "-P", "--recursive",
-        DIR_VERIFIED_CITATIONS + '/',
-        "s3://megapixels/v1/citations/verified/",
-    ])
+    #subprocess.call([
+    #    "s3cmd", "put", "-P", "--recursive",
+    #    DIR_VERIFIED_CITATIONS + '/',
+    #    "s3://megapixels/v1/citations/verified/",
+    #])
 
 def process_paper(row, verified_lookup):
     aggregate_citations = {}
@@ -63,6 +65,7 @@ def process_paper(row, verified_lookup):
         if res['address']:
             address_list.append(res['address'])
     process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
     if not len(papers):
         return
     paper = papers[0]
@@ -114,7 +117,7 @@ def process_paper(row, verified_lookup):
             'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
         }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
     res = {
         'paper_id': '',
         'key': '',
@@ -131,7 +134,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }
 
-    if paper_id == 'search':
+    if paper_id == 'verified':
+        data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+    elif paper_id == 'search':
         dataset = row['key']
         fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
         if not os.path.exists(fn):
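
Note: the new 'verified' branch builds its data dict directly from the caller-supplied paper IDs (verified_lookup.keys(), as passed from process_paper) instead of reading a search_papers JSON file. A minimal sketch of that shape follows; the wrapper function name is hypothetical, only the dict structure is taken from the diff:

    # Sketch: mirrors the 'verified' branch above. Each verified Semantic
    # Scholar paper ID is wrapped in the same {'citations': [{'paperId': ...}]}
    # structure that the 'search' branch loads from datasets/s2/search_papers/.
    def build_verified_data(verified_citations):
        return {'citations': [{'paperId': paper_id} for paper_id in verified_citations]}
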
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 2e382900..8f318d57 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -17,7 +17,8 @@ s2 = SemanticScholarAPI()
 
 @click.command()
 @click.option('--freshen/--no-freshen', '-f', help='Force it to query the paper API again')
-def fetch_papers(freshen):
+@click.option('--dataset', '-d', default=None, help='Specific dataset to query')
+def fetch_papers(freshen, dataset):
     addresses = AddressBook()
     lookup_keys, lines = fetch_google_sheet('citation_lookup')
     report_keys = [
@@ -32,6 +33,8 @@ def fetch_papers(freshen):
         name = line[1]
         title = line[2]
         paper_id = line[3]
+        if dataset is not None and dataset != key:
+            continue
         if paper_id == '':
             continue
         paper = fetch_paper(s2, paper_id, freshen)
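
Note: with the new --dataset/-d option, fetch_papers skips any sheet row whose key does not match, so a single dataset can be refreshed in isolation. Illustrative invocations, run from the scraper/ directory; the 'brainwash' key is the one hard-coded in s2-final-report.py in this same commit:

    python s2-papers.py --dataset brainwash
    python s2-papers.py -d brainwash --freshen
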
diff --git a/scraper/util.py b/scraper/util.py
index 05b01fa7..c7e18b44 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -67,12 +67,20 @@ def read_text(fn):
         return f.read()
 
 def read_json(fn):
+    #try:
     with open(fn, 'r') as json_file:
         return json.load(json_file)
+    #except:
+    #    print("ERROR READING: {}".format(fn))
+    #    return {}
 
 def write_json(fn, data):
+    #try:
     with open(fn, 'w') as outfile:
         json.dump(data, outfile)
+    #except:
+    #    print("ERROR WRITING: {}".format(fn))
+    #    return {}
 
 def write_report(fn, title=None, keys=None, rows=[]):
     with open(fn, 'w') as f:
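
Note: the commented-out try/except blocks added to read_json and write_json hint at an optional error-tolerant mode. If enabled, read_json would look roughly like the sketch below; this is not part of the commit, and the caught exception types are an assumption (the original comments use a bare except):

    import json

    def read_json(fn):
        # Tolerant variant of the commented-out code above: report the bad
        # file and return an empty dict instead of raising.
        try:
            with open(fn, 'r') as json_file:
                return json.load(json_file)
        except (OSError, ValueError):
            print("ERROR READING: {}".format(fn))
            return {}
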