diff options
| -rw-r--r-- | check-counts.py | 39 | ||||
| -rw-r--r-- | s2-index.py | 24 | ||||
| -rw-r--r-- | s2-papers.py | 89 | ||||
| -rw-r--r-- | s2-search.py | 68 | ||||
| -rw-r--r-- | s2.py | 151 | ||||
| -rw-r--r-- | scholar-fetch.py (renamed from fetch-entries.py) | 0 |
6 files changed, 371 insertions, 0 deletions
diff --git a/check-counts.py b/check-counts.py new file mode 100644 index 00000000..4fed4494 --- /dev/null +++ b/check-counts.py @@ -0,0 +1,39 @@ +import os +import sys +import csv +from math import ceil +import subprocess +import random + +import click + +@click.command() +def check_counts(): + """Split a CSV into groups.""" + mypath = './datasets/scholar/entries/' + onlyfiles = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))] + recs = [] + for f in onlyfiles: + with open(os.path.join(mypath, f), 'rb') as f: + reader = csv.reader(f, delimiter='|') + print f + print repr(reader) + lines = list(reader) + rec = lines[0] + recs.append(rec) + + out_fn = './datasets/scholar_entries.csv' + write_csv(out_fn, keys=None, chunk=recs) + +# Write a CSV +def write_csv(fn, keys, chunk): + print(fn) + with open(fn, 'w') as f: + writer = csv.writer(f) + if keys is not None: + writer.writerow(keys) + for row in chunk: + writer.writerow(row) + +if __name__ == '__main__': + check_counts() diff --git a/s2-index.py b/s2-index.py new file mode 100644 index 00000000..779f63a5 --- /dev/null +++ b/s2-index.py @@ -0,0 +1,24 @@ +import click +import ijson.backends.yajl2_cffi as ijson + +@click.command() +@click.option('--index', '-n', default=0, help='Index of CSV.') +def build_index(): + pass + +def index_file(fn): + with open(fn, 'r') as f: + parser = ijson.parse(urlopen('http://.../')) + stream.write('<geo>') + for prefix, event, value in parser: + if (prefix, event) == ('earth', 'map_key'): + stream.write('<%s>' % value) + continent = value + elif prefix.endswith('.name'): + stream.write('<object name="%s"/>' % value) + elif (prefix, event) == ('earth.%s' % continent, 'end_map'): + stream.write('</%s>' % continent) + stream.write('</geo>') + +if __name__ == '__main__': + build_index() diff --git a/s2-papers.py b/s2-papers.py new file mode 100644 index 00000000..7320c095 --- /dev/null +++ b/s2-papers.py @@ -0,0 +1,89 @@ +import os +import sys +import csv +import subprocess +import time +import random +import re +import json +import click +from s2 import SemanticScholarAPI + +''' +s2 search API format: +results +matchedAuthors +matchedPresentations +query +querySuggestions +results +stats +totalPages +totalResults +''' + +s2 = SemanticScholarAPI() + +@click.command() +@click.option('--index', '-n', default=0, help='Index of CSV.') +@click.option('--depth', '-d', default=1, help='Depth to recurse.') +def fetch_papers(index, depth): + keys, lines = read_citation_list(index) + for line in lines: + label = line[0] + title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1]) + entry_fn = './datasets/s2/entries/{}.json'.format(title) + if not os.path.exists(entry_fn): + print('not found: {}'.format(entry_fn)) + continue + result = read_json(entry_fn) + paper_id = result['id'] + paper = fetch_paper(paper_id) + # get all of the paper's citations + +def fetch_paper(paper_id): + os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True) + paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id) + if os.path.exists(paper_fn): + return read_json(paper_fn) + print(paper_id) + paper = s2.paper(paper_id) + if paper is None: + print("Got none paper??") + time.sleep(random.randint(20, 30)) + paper = s2.paper(paper_id) + if paper is None: + print("Paper not found") + return None + write_json(paper_fn, paper) + time.sleep(random.randint(5, 10)) + return paper + +def read_citation_list(index=0): + filename = './datasets/citations.csv' + if index > 0: + fn, ext = os.path.splitext(filename) + filename = fn + '-' + str(index) + ext + with open(filename, 'r') as f: + reader = csv.reader(f) + lines = list(reader) + keys = lines[0] + lines = lines[1:] + return keys, lines + +def read_json(fn): + with open(fn, 'r') as json_file: + return json.load(json_file) +def write_json(fn, data): + with open(fn, 'w') as outfile: + json.dump(data, outfile) +def write_csv(fn, keys, rows): + with open(fn, 'w') as f: + writer = csv.writer(f) + if keys is not None: + writer.writerow(keys) + for row in rows: + writer.writerow(row) + +if __name__ == '__main__': + fetch_papers() diff --git a/s2-search.py b/s2-search.py new file mode 100644 index 00000000..1804207b --- /dev/null +++ b/s2-search.py @@ -0,0 +1,68 @@ +import os +import sys +import csv +import subprocess +import time +import random +import re +import json +import click +from s2 import SemanticScholarAPI + +''' +s2 search API format: +results +matchedAuthors +matchedPresentations +query +querySuggestions +results +stats +totalPages +totalResults +''' + +@click.command() +@click.option('--index', '-n', default=0, help='Index of CSV.') +def fetch_entries(index): + keys, lines = read_citation_list(index) + s2 = SemanticScholarAPI() + for line in lines: + label = line[0] + title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1]) + entry_fn = './datasets/s2/entries/{}.json'.format(title) + if not os.path.exists(entry_fn): + results = s2.search(title) + write_json(dump_fn, results) + if len(results['results']) == 0: + print("No results for {}".format(title)) + else: + print(title) + write_json(entry_fn, results['results'][0]) + time.sleep(random.randint(10, 20)) + +def read_citation_list(index=0): + filename = './datasets/citations.csv' + if index > 0: + fn, ext = os.path.splitext(filename) + filename = fn + '-' + str(index) + ext + with open(filename, 'r') as f: + reader = csv.reader(f) + lines = list(reader) + keys = lines[0] + lines = lines[1:] + return keys, lines + +def write_json(fn, data): + with open(fn, 'w') as outfile: + json.dump(data, outfile) +def write_csv(fn, keys, rows): + with open(fn, 'w') as f: + writer = csv.writer(f) + if keys is not None: + writer.writerow(keys) + for row in rows: + writer.writerow(row) + +if __name__ == '__main__': + fetch_entries() @@ -0,0 +1,151 @@ +import requests + +class AuthorStub(object): + + def __init__(self, **kwargs): + self.authorId = kwargs["authorId"] + self.name = kwargs.get("name", None) + self.url = kwargs.get("url", None) + + def __str__(self): + return self.authorId + + def __eq__(self, other): + return isinstance(other, AuthorStub) and self.authorId == other.authorId + + def __hash__(self): + return hash(self.authorId) + + def json(self): + return { + "authorId" : self.authorId, + "name" : self.name, + "url" : self.url + } + + def full(self, **kwargs): + return SemanticScholarAPI.author(self.authorId, **kwargs) + +class Author(object): + + def __init__(self, **kwargs): + self._kwargs = kwargs + self.authorId = kwargs["authorId"] + self.name = kwargs.get("name", None) + self.aliases = kwargs.get("aliases", []) + self.citationVelocity = kwargs.get("citationVelocity", None) + self.influentialCitationCount = kwargs.get("influentialCitationCount", None) + self.url = kwargs.get("url", None) + + def __str__(self): + return self.authorId + + def __eq__(self, other): + return isinstance(other, Author) and self.authorId == other.authorId + + def __hash__(self): + return hash(self.authorId) + + def papers(self): + for elem in self._kwargs.get("papers", []): + yield SemanticScholarAPI.paper(elem["paperId"]) + + def json(self): + return self._kwargs + +class PaperStub(object): + + def __init__(self, **kwargs): + self.paperId = kwargs["paperId"] + self.isInfluential = kwargs.get("isInfluential", False) + self.title = kwargs.get("title", None) + self.venue = kwargs.get("venue", None) + self.year = kwargs.get("year", None) + + def __str__(self): + return self.paperId + + def __eq__(self, other): + return isinstance(other, PaperStub) and self.paperId == other.paperId + + def __hash__(self): + return hash(self.paperId) + + def json(self): + return { + "paperId" : self.paperId, + "isInfluential" : self.isInfluential, + "title" : self.title, + "venue" : self.venue, + "year" : self.year, + } + + def full(self, **kwargs): + return SemanticScholarAPI.paper(self.paperId, **kwargs) + +class Paper(object): + + def __init__(self, **kwargs): + self.doi = kwargs.get("doi", None) + self.citationVelocity = kwargs.get("citationVelocity", None) + self.influentialCitationCount = kwargs.get("influentialCitationCount", None) + self.url = kwargs.get("url", None) + self.authors = [AuthorStub(**elem) for elem in kwargs.get("authors", [])] + self.citations = [PaperStub(**elem) for elem in kwargs.get("citations", [])] + self.references = [PaperStub(**elem) for elem in kwargs.get("references", [])] + self.venue = kwargs.get("venue", None) + self.references = kwargs.get("references", []) + self.title = kwargs.get("title", None) + self.year = kwargs.get("year", None) + + def __str__(self): + return self.paperId + + def __eq__(self, other): + return isinstance(other, Paper) and self.paperId == other.paperId + + def __hash__(self): + return hash(self.paperId) + + def json(self): + return self._kwargs + +class SemanticScholarAPI(object): + BASE_URL = "http://api.semanticscholar.org/v1" + AUTHOR_ENDPOINT = "{}/{}".format(BASE_URL, "author") + PAPER_ENDPOINT = "{}/{}".format(BASE_URL, "paper") + SEARCH_ENDPOINT = "https://www.semanticscholar.org/api/1/search" + + @staticmethod + def paper(paper_id, **kwargs): + url = "{}/{}".format(SemanticScholarAPI.PAPER_ENDPOINT, paper_id) + resp = requests.get(url, params=kwargs) + return None if resp.status_code != 200 else resp.json() # Paper(**resp.json()) + + @staticmethod + def author(author_id, **kwargs): + url = "{}/{}".format(SemanticScholarAPI.AUTHOR_ENDPOINT, author_id) + resp = requests.get(url, params=kwargs) + return None if resp.status_code != 200 else resp.json() # Author(**resp.json()) + + @staticmethod + def pdf_url(paper_id): + return "http://pdfs.semanticscholar.org/{}/{}.pdf".format(paper_id[:4], paper_id[4:]) + + @staticmethod + def search(q): + resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={ + 'authors': [], + 'coAuthors': [], + 'facets': {}, + 'page': 1, + 'pageSize': 10, + 'publicationTypes': [], + 'queryString': q, + 'requireViewablePdf': False, + 'sort': "relevance", + 'venues': [], + 'yearFilter': None, + }) + # print(resp.status_code) + return None if resp.status_code != 200 else resp.json() diff --git a/fetch-entries.py b/scholar-fetch.py index e206b058..e206b058 100644 --- a/fetch-entries.py +++ b/scholar-fetch.py |
