From a53909352266a2258ddfa287508f979da59a9d1d Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 3 May 2019 17:48:22 +0200
Subject: search-deep.py

---
 scraper/s2.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2.py b/scraper/s2.py
index 01c0b4d5..26334de8 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -177,19 +177,19 @@ class SemanticScholarAPI(object):
         return None if resp.status_code != 200 else resp.json() # Paper(**resp.json())

     @staticmethod
-    def search(q):
+    def search(q, page=1, pageSize=10, yearFilter=None):
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
             'facets': {},
-            'page': 1,
-            'pageSize': 10,
+            'page': page,
+            'pageSize': pageSize,
             'publicationTypes': [],
             'queryString': q,
             'requireViewablePdf': False,
             'sort': "relevance",
             'venues': [],
-            'yearFilter': None,
+            'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2

From 1be5e0e1a85a84d9eca7d1d89d14a562b356f2e0 Mon Sep 17 00:00:00 2001
From: "jules@lens"
Date: Fri, 3 May 2019 18:31:12 +0200
Subject: fixing up deep search

---
 scraper/s2-search-deep.py | 36 +++++++++++++++++++++++-------------
 scraper/s2.py             |  6 ++++--
 2 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
index 47c2f021..9846f2a3 100644
--- a/scraper/s2-search-deep.py
+++ b/scraper/s2-search-deep.py
@@ -27,7 +27,7 @@ totalResults

 s2 = SemanticScholarAPI()

-def fetch_query(query, since=None):
+def fetch_query(query, since=None, refresh=False):
     clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
     yearFilter = {'min': since, 'max': 2020 } if since else None
     results_path = './datasets/s2/search/{}'.format(clean_title)
@@ -38,47 +38,57 @@ def fetch_query(query, since=None):

     while True:
         dump_fn = '{}/{}.json'.format(results_path, page)
+        #print(dump_fn)
         if not refresh and os.path.exists(dump_fn):
             results = read_json(dump_fn)
         else:
-            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            results = s2.search(q=clean_title.replace(' ', '+'), page=page, pageSize=10, yearFilter=yearFilter)
             write_json(dump_fn, results)
+            time.sleep(5)

-        total += len(results['results'])
-        if len(results['results']) == 0:
+        #print(results)
+        if not results or len(results['results']) == 0:
             break
+        total += len(results['results'])
+        print("+ {} page {}".format(query, page))
         for result in results['results']:
             paper_id = result['id']
             if paper_id not in paper_ids:
                 paper_ids[paper_id] = True
-        if total >= results['totalResults']:
+        page += 1
+        if total >= results['totalResults'] - 9:
             break

     return paper_ids

 @click.command()
 @click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
-def search_deep(index, refresh):
+def search_deep(refresh):
     s2_queries = fetch_google_sheet_objects("s2_queries")
     os.makedirs('./datasets/s2/search_papers', exist_ok=True)
     for row in s2_queries:
         since = row['since']
+        if not since:
+            continue
+        since = int(since)
         queries = []
         row_paper_ids = {}
         for i in range(1, 6):
-            query_key = 'query' + i
+            query_key = 'query{}'.format(i)
             query = row[query_key]
             if query:
-                paper_ids = fetch_query(query, since)
+                paper_ids = fetch_query(query, since, refresh)
                 for paper_id in paper_ids:
                     row_paper_ids[paper_id] = True
-        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
-        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
-
-        # parallelize(raw_paper_module.fetch_raw_paper, paper_ids.keys())
+        paper_ids = list(row_paper_ids.keys())
+        if len(paper_ids):
+            print("Writing {} paper ids".format(len(paper_ids)))
+            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
+            write_json(row_fn, paper_ids)
+            parallelize(raw_paper_module.fetch_raw_paper, [(id,) for id in paper_ids])

 if __name__ == '__main__':
-    fetch_entries()
+    search_deep()
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..36dbc906 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,10 +178,12 @@ class SemanticScholarAPI(object):

     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
+        #print(q)
+        #print(yearFilter)
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
-            'facets': {},
+            #'facets': {},
             'page': page,
             'pageSize': pageSize,
             'publicationTypes': [],
@@ -191,5 +193,5 @@ class SemanticScholarAPI(object):
             'venues': [],
             'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
-        # print(resp.status_code)
+        #print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2

From 4f823ca49a01becafdcd38c4d342080c1f29ce87 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 3 May 2019 19:11:07 +0200
Subject: final report stuph

---
 scraper/s2-final-report.py | 76 ++++++++++++++++++++++++++++------------------
 scraper/s2.py              |  5 +--
 2 files changed, 49 insertions(+), 32 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
     if not len(papers):
         return
     paper = papers[0]
+
+    # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
             'additional_papers': papers[1:],
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # unknown citations - a report of all non-geocoded citations
     with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': papers[0]['paper_id'],
             'citations': [unknown_citations[key] for key in unknown_citations.keys()],
         }, f)
+
+    # "public" citations - initial citation reports digested by the geocoding frontend - bad name, I know
+    # this might not need to get built...
     with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
             },
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # verified citations - the final public reports
     with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }

-    fn = file_path('papers', paper_id, 'paper.json')
-
-    with open(fn, 'r') as f:
-        data = json.load(f)
-    print('>> {} {}'.format(data['paperId'], row['key']))
-    paper = load_paper(data['paperId'])
-    if paper is None:
-        print("Paper missing! {}".format(data['paperId']))
-        return
+    if paper_id == 'search':
+        dataset = row['key']
+        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+        with open(fn, 'r') as f:
+            citations = json.load(f)
+        data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+    else:
+        fn = file_path('papers', paper_id, 'paper.json')
+        with open(fn, 'r') as f:
+            data = json.load(f)
+        print('>> {} {}'.format(data['paperId'], row['key']))
+        paper = load_paper(data['paperId'])
+        if paper is None:
+            print("Paper missing! {}".format(data['paperId']))
+            return

-    res['key'] = row['key']
-    res['name'] = row['name']
-    res['paper_id'] = paper.paper_id
-    res['title'] = paper.title
-    # res['journal'] = paper.journal
-    res['year'] = paper.year
-    res['pdf'] = paper.pdf_links()
-    res['doi'] = paper.doi_links()
-    # res['authors'] = ', '.join(paper.authors)
-    # res['citations'] = []
+        res['key'] = row['key']
+        res['name'] = row['name']
+        res['paper_id'] = paper.paper_id
+        res['title'] = paper.title
+        # res['journal'] = paper.journal
+        res['year'] = paper.year
+        res['pdf'] = paper.pdf_links()
+        res['doi'] = paper.doi_links()
+        # res['authors'] = ', '.join(paper.authors)
+        # res['citations'] = []

-    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
-    paper_address = None
-    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
-        #print(inst[1])
-        institution = inst[1]
-        if paper_address is None:
-            paper_address = addresses.findObject(institution)
+        paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+        paper_address = None
+        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+            #print(inst[1])
+            institution = inst[1]
+            if paper_address is None:
+                paper_address = addresses.findObject(institution)

-    if paper_address:
-        # print(paper_address)
-        res['address'] = paper_address
+        if paper_address:
+            # print(paper_address)
+            res['address'] = paper_address

     for cite in data['citations']:
         citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # if has_doi:
         #     doi_count += 1
         if citation is None or citation.data is None:
-            print("Citation missing! {}".format(cite['paperId']))
+            print("Citation missing! {}".format(citationId))
             continue
         institutions = load_institutions(citationId, paper_location_lookup)
         geocoded_addresses = []
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..62fd9a94 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,7 +178,7 @@ class SemanticScholarAPI(object):

     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
-        resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
+        query = {
             'authors': [],
             'coAuthors': [],
             'facets': {},
@@ -190,6 +190,7 @@ class SemanticScholarAPI(object):
             'sort': "relevance",
             'venues': [],
             'yearFilter': yearFilter,
-        }, headers=SemanticScholarAPI.headers)
+        }
+        resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json=query, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2