| author | jules@lens <julescarbon@gmail.com> | 2019-05-03 18:31:12 +0200 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-05-03 18:31:12 +0200 |
| commit | 1be5e0e1a85a84d9eca7d1d89d14a562b356f2e0 (patch) | |
| tree | c85d4382bf974dd66e674546889c891c19b7fa8f /scraper | |
| parent | a53909352266a2258ddfa287508f979da59a9d1d (diff) | |
fixing up deep search
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/s2-search-deep.py | 36 |
| -rw-r--r-- | scraper/s2.py | 6 |
2 files changed, 27 insertions, 15 deletions
```diff
diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
index 47c2f021..9846f2a3 100644
--- a/scraper/s2-search-deep.py
+++ b/scraper/s2-search-deep.py
@@ -27,7 +27,7 @@ totalResults
 
 s2 = SemanticScholarAPI()
 
-def fetch_query(query, since=None):
+def fetch_query(query, since=None, refresh=False):
     clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
     yearFilter = {'min': since, 'max': 2020 } if since else None
     results_path = './datasets/s2/search/{}'.format(clean_title)
@@ -38,47 +38,57 @@ def fetch_query(query, since=None):
 
     while True:
         dump_fn = '{}/{}.json'.format(results_path, page)
+        #print(dump_fn)
         if not refresh and os.path.exists(dump_fn):
             results = read_json(dump_fn)
         else:
-            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            results = s2.search(q=clean_title.replace(' ', '+'), page=page, pageSize=10, yearFilter=yearFilter)
             write_json(dump_fn, results)
+            time.sleep(5)
 
-        total += len(results['results'])
-        if len(results['results']) == 0:
+        #print(results)
+        if not results or len(results['results']) == 0:
             break
 
+        total += len(results['results'])
+        print("+ {} page {}".format(query, page))
         for result in results['results']:
             paper_id = result['id']
             if paper_id not in paper_ids:
                 paper_ids[paper_id] = True
 
-        if total >= results['totalResults']:
+        page += 1
+        if total >= results['totalResults'] - 9:
             break
 
     return paper_ids
 
 @click.command()
 @click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
-def search_deep(index, refresh):
+def search_deep(refresh):
     s2_queries = fetch_google_sheet_objects("s2_queries")
     os.makedirs('./datasets/s2/search_papers', exist_ok=True)
     for row in s2_queries:
         since = row['since']
+        if not since:
+            continue
+        since = int(since)
         queries = []
         row_paper_ids = {}
         for i in range(1, 6):
-            query_key = 'query' + i
+            query_key = 'query{}'.format(i)
             query = row[query_key]
             if query:
-                paper_ids = fetch_query(query, since)
+                paper_ids = fetch_query(query, since, refresh)
                 for paper_id in paper_ids:
                     row_paper_ids[paper_id] = True
-        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
-        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
-
-        # parallelize(raw_paper_module.fetch_raw_paper, paper_ids.keys())
+        paper_ids = list(row_paper_ids.keys())
+        if len(paper_ids):
+            print("Writing {} paper ids".format(len(paper_ids)))
+            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
+            write_json(row_fn, paper_ids)
+            parallelize(raw_paper_module.fetch_raw_paper, [(id,) for id in paper_ids])
 
 if __name__ == '__main__':
-    fetch_entries()
+    search_deep()
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..36dbc906 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,10 +178,12 @@ class SemanticScholarAPI(object):
 
     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
+        #print(q)
+        #print(yearFilter)
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
-            'facets': {},
+            #'facets': {},
             'page': page,
             'pageSize': pageSize,
             'publicationTypes': [],
@@ -191,5 +193,5 @@ class SemanticScholarAPI(object):
             'venues': [],
             'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
-        # print(resp.status_code)
+        #print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
```
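For reference, the paging-with-caching loop this patch reworks in `fetch_query` boils down to the sketch below. It is a minimal, hypothetical rendering rather than the repository's exact code: `search_fn` stands in for `SemanticScholarAPI.search`, plain `json`/`os` calls stand in for the repo's `read_json`/`write_json` helpers, and the cache layout (one JSON file per results page) mirrors what the diff shows.

```python
import json
import os
import time


def fetch_query(query, search_fn, cache_dir, refresh=False, page_size=10, delay=5):
    """Page through search results, caching each page as JSON and deduplicating ids."""
    os.makedirs(cache_dir, exist_ok=True)
    paper_ids = {}
    total = 0
    page = 1
    while True:
        dump_fn = os.path.join(cache_dir, '{}.json'.format(page))
        if not refresh and os.path.exists(dump_fn):
            # Reuse a previously fetched page instead of hitting the API again.
            with open(dump_fn) as f:
                results = json.load(f)
        else:
            results = search_fn(query, page)
            with open(dump_fn, 'w') as f:
                json.dump(results, f)
            time.sleep(delay)  # space out live requests

        # Stop on a failed request (None) or an empty page.
        if not results or not results['results']:
            break

        total += len(results['results'])
        for result in results['results']:
            paper_ids[result['id']] = True  # dict keys keep ids unique

        page += 1
        # Stop once the final (possibly short) page has been consumed.
        if total >= results['totalResults'] - (page_size - 1):
            break

    return list(paper_ids.keys())
```

The `totalResults - 9` cutoff in the patch (written here as `page_size - 1`) stops paging once the last, possibly partial, 10-result page has been consumed, and the sleep only applies to live requests, so reruns against the cache stay fast.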
