author     jules@lens <julescarbon@gmail.com>    2019-05-03 19:12:48 +0200
committer  jules@lens <julescarbon@gmail.com>    2019-05-03 19:12:48 +0200
commit     44c3cd19655db3877ec3b8e2fbcab23302973f27 (patch)
tree       83e1f4e4f33c7d22f64d872ce67f889cd39b3a86 /scraper
parent     e3ac08949f737e0c9d0c10f797294725361a4547 (diff)
parent     8594f42dc20909e6e8a1f245504ffdcc577413cf (diff)
Merge branch 'master' of asdf.us:megapixels_dev
Diffstat (limited to 'scraper')
-rw-r--r--  scraper/s2-final-report.py  76
-rw-r--r--  scraper/s2.py                9
2 files changed, 50 insertions(+), 35 deletions(-)
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
papers.append(res)
if res['address']:
address_list.append(res['address'])
+ process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
if not len(papers):
return
paper = papers[0]
+
+ # final citations - a report of all geocoded citations
with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
'additional_papers': papers[1:],
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+ # unknown citations - a report of all non-geocoded citations
with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': papers[0]['paper_id'],
'citations': [unknown_citations[key] for key in unknown_citations.keys()],
}, f)
+
+ # "public" citations - initial citation reports digested by the geocoding frontend -bad name i know
+ # this might not need to get built...
with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
},
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+ # verified citations - the final public reports
with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# 'citations_doi': 0,
}
- fn = file_path('papers', paper_id, 'paper.json')
-
- with open(fn, 'r') as f:
- data = json.load(f)
- print('>> {} {}'.format(data['paperId'], row['key']))
- paper = load_paper(data['paperId'])
- if paper is None:
- print("Paper missing! {}".format(data['paperId']))
- return
+ if paper_id == 'search':
+ dataset = row['key']
+ fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+ with open(fn, 'r') as f:
+ citations = json.load(f)
+ data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+ else:
+ fn = file_path('papers', paper_id, 'paper.json')
+ with open(fn, 'r') as f:
+ data = json.load(f)
+ print('>> {} {}'.format(data['paperId'], row['key']))
+ paper = load_paper(data['paperId'])
+ if paper is None:
+ print("Paper missing! {}".format(data['paperId']))
+ return
- res['key'] = row['key']
- res['name'] = row['name']
- res['paper_id'] = paper.paper_id
- res['title'] = paper.title
- # res['journal'] = paper.journal
- res['year'] = paper.year
- res['pdf'] = paper.pdf_links()
- res['doi'] = paper.doi_links()
- # res['authors'] = ', '.join(paper.authors)
- # res['citations'] = []
+ res['key'] = row['key']
+ res['name'] = row['name']
+ res['paper_id'] = paper.paper_id
+ res['title'] = paper.title
+ # res['journal'] = paper.journal
+ res['year'] = paper.year
+ res['pdf'] = paper.pdf_links()
+ res['doi'] = paper.doi_links()
+ # res['authors'] = ', '.join(paper.authors)
+ # res['citations'] = []
- paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
- paper_address = None
- for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
- #print(inst[1])
- institution = inst[1]
- if paper_address is None:
- paper_address = addresses.findObject(institution)
+ paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+ paper_address = None
+ for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+ #print(inst[1])
+ institution = inst[1]
+ if paper_address is None:
+ paper_address = addresses.findObject(institution)
- if paper_address:
- # print(paper_address)
- res['address'] = paper_address
+ if paper_address:
+ # print(paper_address)
+ res['address'] = paper_address
for cite in data['citations']:
citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# if has_doi:
# doi_count += 1
if citation is None or citation.data is None:
- print("Citation missing! {}".format(cite['paperId']))
+ print("Citation missing! {}".format(citationId))
continue
institutions = load_institutions(citationId, paper_location_lookup)
geocoded_addresses = []
diff --git a/scraper/s2.py b/scraper/s2.py
index 36dbc906..ec6a9172 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,9 +178,7 @@ class SemanticScholarAPI(object):
@staticmethod
def search(q, page=1, pageSize=10, yearFilter=None):
- #print(q)
- #print(yearFilter)
- resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
+ query = {
'authors': [],
'coAuthors': [],
#'facets': {},
@@ -192,6 +190,7 @@ class SemanticScholarAPI(object):
'sort': "relevance",
'venues': [],
'yearFilter': yearFilter,
- }, headers=SemanticScholarAPI.headers)
- #print(resp.status_code)
+ }
+ resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json=query, headers=SemanticScholarAPI.headers)
return None if resp.status_code != 200 else resp.json()
+