author    Jules Laplace <julescarbon@gmail.com>  2019-05-03 19:11:07 +0200
committer Jules Laplace <julescarbon@gmail.com>  2019-05-03 19:11:07 +0200
commit    4f823ca49a01becafdcd38c4d342080c1f29ce87 (patch)
tree      e9cb3da884e9101561e6ad74adbbc908b58ba808 /scraper
parent    a53909352266a2258ddfa287508f979da59a9d1d (diff)
final report stuph
Diffstat (limited to 'scraper')
-rw-r--r--  scraper/s2-final-report.py  76
-rw-r--r--  scraper/s2.py                5
2 files changed, 49 insertions, 32 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
papers.append(res)
if res['address']:
address_list.append(res['address'])
+ process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
if not len(papers):
return
paper = papers[0]
+
+ # final citations - a report of all geocoded citations
with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
'additional_papers': papers[1:],
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+    # unknown citations - a report of all non-geocoded citations
with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': papers[0]['paper_id'],
'citations': [unknown_citations[key] for key in unknown_citations.keys()],
}, f)
+
+    # "public" citations - initial citation reports digested by the geocoding frontend (bad name, I know)
+ # this might not need to get built...
with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
},
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+ # verified citations - the final public reports
with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# 'citations_doi': 0,
}
- fn = file_path('papers', paper_id, 'paper.json')
-
- with open(fn, 'r') as f:
- data = json.load(f)
- print('>> {} {}'.format(data['paperId'], row['key']))
- paper = load_paper(data['paperId'])
- if paper is None:
- print("Paper missing! {}".format(data['paperId']))
- return
+ if paper_id == 'search':
+ dataset = row['key']
+ fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+ with open(fn, 'r') as f:
+ citations = json.load(f)
+ data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+ else:
+ fn = file_path('papers', paper_id, 'paper.json')
+ with open(fn, 'r') as f:
+ data = json.load(f)
+ print('>> {} {}'.format(data['paperId'], row['key']))
+ paper = load_paper(data['paperId'])
+ if paper is None:
+ print("Paper missing! {}".format(data['paperId']))
+ return
- res['key'] = row['key']
- res['name'] = row['name']
- res['paper_id'] = paper.paper_id
- res['title'] = paper.title
- # res['journal'] = paper.journal
- res['year'] = paper.year
- res['pdf'] = paper.pdf_links()
- res['doi'] = paper.doi_links()
- # res['authors'] = ', '.join(paper.authors)
- # res['citations'] = []
+ res['key'] = row['key']
+ res['name'] = row['name']
+ res['paper_id'] = paper.paper_id
+ res['title'] = paper.title
+ # res['journal'] = paper.journal
+ res['year'] = paper.year
+ res['pdf'] = paper.pdf_links()
+ res['doi'] = paper.doi_links()
+ # res['authors'] = ', '.join(paper.authors)
+ # res['citations'] = []
- paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
- paper_address = None
- for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
- #print(inst[1])
- institution = inst[1]
- if paper_address is None:
- paper_address = addresses.findObject(institution)
+ paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+ paper_address = None
+ for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+ #print(inst[1])
+ institution = inst[1]
+ if paper_address is None:
+ paper_address = addresses.findObject(institution)
- if paper_address:
- # print(paper_address)
- res['address'] = paper_address
+ if paper_address:
+ # print(paper_address)
+ res['address'] = paper_address
for cite in data['citations']:
citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# if has_doi:
# doi_count += 1
if citation is None or citation.data is None:
- print("Citation missing! {}".format(cite['paperId']))
+ print("Citation missing! {}".format(citationId))
continue
institutions = load_institutions(citationId, paper_location_lookup)
geocoded_addresses = []
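
A minimal sketch (not part of the diff) of the two code paths process_single_paper() now takes after this change; the dataset file name and the example id are assumptions for illustration:

    import json

    # 'search' mode: the dataset file is a flat JSON list of paper ids, so it is
    # wrapped into the same {'citations': [{'paperId': ...}]} shape that paper.json uses.
    with open('datasets/s2/search_papers/example-dataset.json') as f:  # hypothetical dataset
        citations = json.load(f)                                       # e.g. ["abc123", ...]
    data = {'citations': [{'paperId': paperId} for paperId in citations]}

    # regular mode: paper.json already carries a 'citations' list of objects with a
    # 'paperId' field, so it is loaded and used as-is.
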
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..62fd9a94 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,7 +178,7 @@ class SemanticScholarAPI(object):
@staticmethod
def search(q, page=1, pageSize=10, yearFilter=None):
- resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
+ query = {
'authors': [],
'coAuthors': [],
'facets': {},
@@ -190,6 +190,7 @@ class SemanticScholarAPI(object):
'sort': "relevance",
'venues': [],
'yearFilter': yearFilter,
- }, headers=SemanticScholarAPI.headers)
+ }
+ resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json=query, headers=SemanticScholarAPI.headers)
# print(resp.status_code)
return None if resp.status_code != 200 else resp.json()
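
A minimal usage sketch of the refactored search() (the query string is illustrative; per the code above, the method returns the parsed JSON response, or None on a non-200 status):

    from s2 import SemanticScholarAPI  # assuming scraper/s2.py is importable as s2

    results = SemanticScholarAPI.search("citation geocoding", page=1, pageSize=10)
    if results is None:
        print("search request failed (non-200 response)")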