author     jules@lens <julescarbon@gmail.com>    2019-05-03 19:12:48 +0200
committer  jules@lens <julescarbon@gmail.com>    2019-05-03 19:12:48 +0200
commit     44c3cd19655db3877ec3b8e2fbcab23302973f27 (patch)
tree       83e1f4e4f33c7d22f64d872ce67f889cd39b3a86 /scraper
parent     e3ac08949f737e0c9d0c10f797294725361a4547 (diff)
parent     8594f42dc20909e6e8a1f245504ffdcc577413cf (diff)
Merge branch 'master' of asdf.us:megapixels_dev
Diffstat (limited to 'scraper')
-rw-r--r--  scraper/s2-final-report.py  76
-rw-r--r--  scraper/s2.py                9
2 files changed, 50 insertions(+), 35 deletions(-)
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
papers.append(res)
if res['address']:
address_list.append(res['address'])
+ process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
if not len(papers):
return
paper = papers[0]
+
+ # final citations - a report of all geocoded citations
with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
'additional_papers': papers[1:],
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+ # unknown citations - a report of all non-geocoded citations
with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': papers[0]['paper_id'],
'citations': [unknown_citations[key] for key in unknown_citations.keys()],
}, f)
+
+ # "public" citations - initial citation reports digested by the geocoding frontend -bad name i know
+ # this might not need to get built...
with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
},
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+
+ # verified citations - the final public reports
with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
json.dump({
'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# 'citations_doi': 0,
}
- fn = file_path('papers', paper_id, 'paper.json')
-
- with open(fn, 'r') as f:
- data = json.load(f)
- print('>> {} {}'.format(data['paperId'], row['key']))
- paper = load_paper(data['paperId'])
- if paper is None:
- print("Paper missing! {}".format(data['paperId']))
- return
+ if paper_id == 'search':
+ dataset = row['key']
+ fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+ with open(fn, 'r') as f:
+ citations = json.load(f)
+ data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+ else:
+ fn = file_path('papers', paper_id, 'paper.json')
+ with open(fn, 'r') as f:
+ data = json.load(f)
+ print('>> {} {}'.format(data['paperId'], row['key']))
+ paper = load_paper(data['paperId'])
+ if paper is None:
+ print("Paper missing! {}".format(data['paperId']))
+ return
- res['key'] = row['key']
- res['name'] = row['name']
- res['paper_id'] = paper.paper_id
- res['title'] = paper.title
- # res['journal'] = paper.journal
- res['year'] = paper.year
- res['pdf'] = paper.pdf_links()
- res['doi'] = paper.doi_links()
- # res['authors'] = ', '.join(paper.authors)
- # res['citations'] = []
+ res['key'] = row['key']
+ res['name'] = row['name']
+ res['paper_id'] = paper.paper_id
+ res['title'] = paper.title
+ # res['journal'] = paper.journal
+ res['year'] = paper.year
+ res['pdf'] = paper.pdf_links()
+ res['doi'] = paper.doi_links()
+ # res['authors'] = ', '.join(paper.authors)
+ # res['citations'] = []
- paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
- paper_address = None
- for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
- #print(inst[1])
- institution = inst[1]
- if paper_address is None:
- paper_address = addresses.findObject(institution)
+ paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+ paper_address = None
+ for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+ #print(inst[1])
+ institution = inst[1]
+ if paper_address is None:
+ paper_address = addresses.findObject(institution)
- if paper_address:
- # print(paper_address)
- res['address'] = paper_address
+ if paper_address:
+ # print(paper_address)
+ res['address'] = paper_address
for cite in data['citations']:
citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
# if has_doi:
# doi_count += 1
if citation is None or citation.data is None:
- print("Citation missing! {}".format(cite['paperId']))
+ print("Citation missing! {}".format(citationId))
continue
institutions = load_institutions(citationId, paper_location_lookup)
geocoded_addresses = []
diff --git a/scraper/s2.py b/scraper/s2.py
index 36dbc906..ec6a9172 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,9 +178,7 @@ class SemanticScholarAPI(object):
@staticmethod
def search(q, page=1, pageSize=10, yearFilter=None):
- #print(q)
- #print(yearFilter)
- resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
+ query = {
'authors': [],
'coAuthors': [],
#'facets': {},
@@ -192,6 +190,7 @@ class SemanticScholarAPI(object):
'sort': "relevance",
'venues': [],
'yearFilter': yearFilter,
- }, headers=SemanticScholarAPI.headers)
- #print(resp.status_code)
+ }
+ resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json=query, headers=SemanticScholarAPI.headers)
return None if resp.status_code != 200 else resp.json()
+