-rw-r--r--  scraper/s2-final-report.py  | 19
-rw-r--r--  scraper/s2-papers.py        |  5
-rw-r--r--  scraper/util.py             |  8
3 files changed, 24 insertions, 8 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..ddee18c7 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -22,6 +22,8 @@ def s2_final_report():
   verified_lookup = fetch_verified_paper_lookup()
   items = []
   for key, item in megapixels.items():
+    if key != 'brainwash':
+      continue
     ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
     nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
     if ft_share or nyt_share:
@@ -43,11 +45,11 @@ def s2_final_report():
   #  DIR_PUBLIC_CITATIONS + '/',
   #  "s3://megapixels/v1/citations/",
   #])
-  subprocess.call([
-    "s3cmd", "put", "-P", "--recursive",
-    DIR_VERIFIED_CITATIONS + '/',
-    "s3://megapixels/v1/citations/verified/",
-  ])
+  #subprocess.call([
+  #  "s3cmd", "put", "-P", "--recursive",
+  #  DIR_VERIFIED_CITATIONS + '/',
+  #  "s3://megapixels/v1/citations/verified/",
+  #])
 
 def process_paper(row, verified_lookup):
   aggregate_citations = {}
@@ -63,6 +65,7 @@ def process_paper(row, verified_lookup):
     if res['address']:
       address_list.append(res['address'])
   process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+  process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
   if not len(papers):
     return
   paper = papers[0]
@@ -114,7 +117,7 @@ def process_paper(row, verified_lookup):
       'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
     }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
   res = {
     'paper_id': '',
     'key': '',
@@ -131,7 +134,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
     # 'citations_doi': 0,
   }
 
-  if paper_id == 'search':
+  if paper_id == 'verified':
+    data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+  elif paper_id == 'search':
     dataset = row['key']
     fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
     if not os.path.exists(fn):
diff --git a/scraper/s2-papers.py b/scraper/s2-papers.py
index 2e382900..8f318d57 100644
--- a/scraper/s2-papers.py
+++ b/scraper/s2-papers.py
@@ -17,7 +17,8 @@ s2 = SemanticScholarAPI()
 
 @click.command()
 @click.option('--freshen/--no-freshen', '-f', help='Force it to query the paper API again')
-def fetch_papers(freshen):
+@click.option('--dataset', '-d', default=None, help='Specific dataset to query')
+def fetch_papers(freshen, dataset):
   addresses = AddressBook()
   lookup_keys, lines = fetch_google_sheet('citation_lookup')
   report_keys = [
@@ -32,6 +33,8 @@ def fetch_papers(freshen):
     name = line[1]
     title = line[2]
     paper_id = line[3]
+    if dataset is not None and dataset != key:
+      continue
     if paper_id == '':
       continue
     paper = fetch_paper(s2, paper_id, freshen)
diff --git a/scraper/util.py b/scraper/util.py
index 05b01fa7..c7e18b44 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -67,12 +67,20 @@ def read_text(fn):
     return f.read()
 
 def read_json(fn):
+  #try:
   with open(fn, 'r') as json_file:
     return json.load(json_file)
+  #except:
+  #  print("ERROR READING: {}".format(fn))
+  #  return {}
 
 def write_json(fn, data):
+  #try:
   with open(fn, 'w') as outfile:
     json.dump(data, outfile)
+  #except:
+  #  print("ERROR WRITING: {}".format(fn))
+  #  return {}
 
 def write_report(fn, title=None, keys=None, rows=[]):
   with open(fn, 'w') as f:
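Usage note: the new --dataset/-d option added to s2-papers.py limits fetching to a single dataset key from the citation_lookup sheet, skipping every other row. A minimal invocation, assuming the script is run directly and exposes fetch_papers as its click entry point (the entry point itself is not shown in this patch), might look like:

    python scraper/s2-papers.py --dataset brainwash --freshen

The 'brainwash' key here simply mirrors the temporary filter added at the top of s2_final_report(); any key from the sheet would work.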

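The error handling in util.py stays commented out in this patch, so read_json and write_json still raise on missing or malformed files. If that guard were enabled, a sketch of read_json could look like the following; catching OSError and ValueError instead of the bare except hinted at in the commented code is an assumption, not part of the patch:

    import json

    def read_json(fn):
      # Return an empty dict if the file is missing or contains invalid JSON,
      # matching the fallback suggested by the commented-out block.
      try:
        with open(fn, 'r') as json_file:
          return json.load(json_file)
      except (OSError, ValueError):
        print("ERROR READING: {}".format(fn))
        return {}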