summary | refs | log | tree | commit | diff
path: root/scraper/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/util.py')
-rw-r--r-- scraper/util.py 38
1 files changed, 32 insertions, 6 deletions
diff --git a/scraper/util.py b/scraper/util.py
index 27465487..7b55afae 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -156,11 +156,16 @@ class DbPaper(object):
return [ (author['ids'][0] if len(author['ids']) else '', author['name']) for author in self.data['authors'] ]
@property
def pdf_link(self):
    """Return the paper's PDF link, or None when no link is recorded.

    ``self.data['s2PdfUrl']`` is used when it is truthy; otherwise the
    first entry of ``self.data['pdfUrls']`` is used.  A link stored as a
    mapping containing a ``'url'`` key is unwrapped to that value; any
    other link value is returned unchanged.
    """
    data = self.data
    if data['s2PdfUrl']:
        link = data['s2PdfUrl']
    elif data['pdfUrls']:
        link = data['pdfUrls'][0]
    else:
        return None
    # isinstance, unlike an exact type comparison, also unwraps dict
    # subclasses (e.g. OrderedDict).
    if isinstance(link, dict) and 'url' in link:
        return link['url']
    return link
def record(self):
    """Return ``[paper_id, title, journal, year]`` for this paper."""
    columns = ('paper_id', 'title', 'journal', 'year')
    return [getattr(self, column) for column in columns]
@@ -192,7 +197,10 @@ class RawPaper(object):
@property
def pdf_link(self):
    """Return the value of ``self.data['primaryPaperLink']``, or None.

    None is returned when the key is absent.  A link stored as a mapping
    containing a ``'url'`` key is unwrapped to that value; any other link
    value is returned unchanged.
    """
    link = self.data.get('primaryPaperLink')
    # isinstance, unlike an exact type comparison, also unwraps dict
    # subclasses (e.g. OrderedDict).
    if isinstance(link, dict) and 'url' in link:
        return link['url']
    return link
def record(self):
    """Return this paper's id, title, journal and year as a list."""
    row = (self.paper_id, self.title, self.journal, self.year)
    return list(row)
@@ -290,10 +298,28 @@ class AddressBook (object):
'address': row[0],
'lat': row[3],
'lng': row[4],
- 'address_type': row[5],
+ 'type': row[5],
}
return None
def fetch_paper(s2, paper_id):
    """Return the record for *paper_id*, using an on-disk JSON cache.

    The cache file is
    ``./datasets/s2/papers/<first two characters of id>/<id>/paper.json``.
    If it exists, its contents are returned via ``read_json`` and *s2* is
    not called.  Otherwise ``s2.paper(paper_id)`` is called, and called once
    more if the first call returns None.  A non-None result is stored with
    ``write_json`` and returned.

    Returns None when both calls return None; nothing is written then.
    """
    paper_dir = os.path.join('./datasets/s2/papers', paper_id[0:2], paper_id)
    paper_fn = os.path.join(paper_dir, 'paper.json')
    # The directory is created up front, before the cache check, so it exists
    # even when the lookup below fails.
    os.makedirs(paper_dir, exist_ok=True)
    if os.path.exists(paper_fn):
        return read_json(paper_fn)
    print(paper_id)
    paper = s2.paper(paper_id)
    if paper is None:
        # Retry exactly once before giving up.
        print("Got none paper??")
        paper = s2.paper(paper_id)
    if paper is None:
        print("Paper not found")
        return None
    write_json(paper_fn, paper)
    return paper
+
def fetch_spreadsheet():
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope)