From a53909352266a2258ddfa287508f979da59a9d1d Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 3 May 2019 17:48:22 +0200
Subject: search-deep.py

---
 scraper/s2.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2.py b/scraper/s2.py
index 01c0b4d5..26334de8 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -177,19 +177,19 @@ class SemanticScholarAPI(object):
         return None if resp.status_code != 200 else resp.json() # Paper(**resp.json())

     @staticmethod
-    def search(q):
+    def search(q, page=1, pageSize=10, yearFilter=None):
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
             'facets': {},
-            'page': 1,
-            'pageSize': 10,
+            'page': page,
+            'pageSize': pageSize,
             'publicationTypes': [],
             'queryString': q,
             'requireViewablePdf': False,
             'sort': "relevance",
             'venues': [],
-            'yearFilter': None,
+            'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2

From 1be5e0e1a85a84d9eca7d1d89d14a562b356f2e0 Mon Sep 17 00:00:00 2001
From: "jules@lens"
Date: Fri, 3 May 2019 18:31:12 +0200
Subject: fixing up deep search

---
 scraper/s2-search-deep.py | 36 +++++++++++++++++++++++-------------
 scraper/s2.py             |  6 ++++--
 2 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2-search-deep.py b/scraper/s2-search-deep.py
index 47c2f021..9846f2a3 100644
--- a/scraper/s2-search-deep.py
+++ b/scraper/s2-search-deep.py
@@ -27,7 +27,7 @@ totalResults

 s2 = SemanticScholarAPI()

-def fetch_query(query, since=None):
+def fetch_query(query, since=None, refresh=False):
     clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
     yearFilter = {'min': since, 'max': 2020 } if since else None
     results_path = './datasets/s2/search/{}'.format(clean_title)
@@ -38,47 +38,57 @@ def fetch_query(query, since=None):

     while True:
         dump_fn = '{}/{}.json'.format(results_path, page)
+        #print(dump_fn)
         if not refresh and os.path.exists(dump_fn):
             results = read_json(dump_fn)
         else:
-            results = s2.search(q=clean_title, page=page, pageSize=10, yearFilter=yearFilter)
+            results = s2.search(q=clean_title.replace(' ', '+'), page=page, pageSize=10, yearFilter=yearFilter)
             write_json(dump_fn, results)
+            time.sleep(5)

-        total += len(results['results'])
-        if len(results['results']) == 0:
+        #print(results)
+        if not results or len(results['results']) == 0:
             break
+        total += len(results['results'])
+        print("+ {} page {}".format(query, page))
         for result in results['results']:
             paper_id = result['id']
             if paper_id not in paper_ids:
                 paper_ids[paper_id] = True
-        if total >= results['totalResults']:
+        page += 1
+        if total >= results['totalResults'] - 9:
             break

     return paper_ids

 @click.command()
 @click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
-def search_deep(index, refresh):
+def search_deep(refresh):
     s2_queries = fetch_google_sheet_objects("s2_queries")
     os.makedirs('./datasets/s2/search_papers', exist_ok=True)
     for row in s2_queries:
         since = row['since']
+        if not since:
+            continue
+        since = int(since)
         queries = []
         row_paper_ids = {}
         for i in range(1, 6):
-            query_key = 'query' + i
+            query_key = 'query{}'.format(i)
             query = row[query_key]
             if query:
-                paper_ids = fetch_query(query, since)
+                paper_ids = fetch_query(query, since, refresh)
                 for paper_id in paper_ids:
                     row_paper_ids[paper_id] = True
-        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
-        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
-
-        # parallelize(raw_paper_module.fetch_raw_paper, paper_ids.keys())
+        paper_ids = list(row_paper_ids.keys())
+        if len(paper_ids):
+            print("Writing {} paper ids".format(len(paper_ids)))
+            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
+            write_json(row_fn, paper_ids)
+            parallelize(raw_paper_module.fetch_raw_paper, [(id,) for id in paper_ids])

 if __name__ == '__main__':
-    fetch_entries()
+    search_deep()
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..36dbc906 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,10 +178,12 @@ class SemanticScholarAPI(object):

     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
+        #print(q)
+        #print(yearFilter)
         resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
             'authors': [],
             'coAuthors': [],
-            'facets': {},
+            #'facets': {},
             'page': page,
             'pageSize': pageSize,
             'publicationTypes': [],
@@ -191,5 +193,5 @@ class SemanticScholarAPI(object):
             'venues': [],
             'yearFilter': yearFilter,
         }, headers=SemanticScholarAPI.headers)
-        # print(resp.status_code)
+        #print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2

From 4f823ca49a01becafdcd38c4d342080c1f29ce87 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 3 May 2019 19:11:07 +0200
Subject: final report stuph

---
 scraper/s2-final-report.py | 76 ++++++++++++++++++++++++++++------------------
 scraper/s2.py              |  5 +--
 2 files changed, 49 insertions(+), 32 deletions(-)

(limited to 'scraper/s2.py')

diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
     if not len(papers):
         return
     paper = papers[0]
+
+    # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
             'additional_papers': papers[1:],
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # unknown citations - a report of all non-geocoded citations
     with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': papers[0]['paper_id'],
             'citations': [unknown_citations[key] for key in unknown_citations.keys()],
         }, f)
+
+    # "public" citations - initial citation reports digested by the geocoding frontend - bad name, I know
+    # this might not need to get built...
     with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
             },
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # verified citations - the final public reports
     with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }

-    fn = file_path('papers', paper_id, 'paper.json')
-
-    with open(fn, 'r') as f:
-        data = json.load(f)
-    print('>> {} {}'.format(data['paperId'], row['key']))
-    paper = load_paper(data['paperId'])
-    if paper is None:
-        print("Paper missing! {}".format(data['paperId']))
-        return
+    if paper_id == 'search':
+        dataset = row['key']
+        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+        with open(fn, 'r') as f:
+            citations = json.load(f)
+        data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+    else:
+        fn = file_path('papers', paper_id, 'paper.json')
+        with open(fn, 'r') as f:
+            data = json.load(f)
+        print('>> {} {}'.format(data['paperId'], row['key']))
+        paper = load_paper(data['paperId'])
+        if paper is None:
+            print("Paper missing! {}".format(data['paperId']))
+            return

-    res['key'] = row['key']
-    res['name'] = row['name']
-    res['paper_id'] = paper.paper_id
-    res['title'] = paper.title
-    # res['journal'] = paper.journal
-    res['year'] = paper.year
-    res['pdf'] = paper.pdf_links()
-    res['doi'] = paper.doi_links()
-    # res['authors'] = ', '.join(paper.authors)
-    # res['citations'] = []
+        res['key'] = row['key']
+        res['name'] = row['name']
+        res['paper_id'] = paper.paper_id
+        res['title'] = paper.title
+        # res['journal'] = paper.journal
+        res['year'] = paper.year
+        res['pdf'] = paper.pdf_links()
+        res['doi'] = paper.doi_links()
+        # res['authors'] = ', '.join(paper.authors)
+        # res['citations'] = []

-    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
-    paper_address = None
-    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
-        #print(inst[1])
-        institution = inst[1]
-        if paper_address is None:
-            paper_address = addresses.findObject(institution)
+        paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+        paper_address = None
+        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+            #print(inst[1])
+            institution = inst[1]
+            if paper_address is None:
+                paper_address = addresses.findObject(institution)

-    if paper_address:
-        # print(paper_address)
-        res['address'] = paper_address
+        if paper_address:
+            # print(paper_address)
+            res['address'] = paper_address

     for cite in data['citations']:
         citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # if has_doi:
         #     doi_count += 1
         if citation is None or citation.data is None:
-            print("Citation missing! {}".format(cite['paperId']))
+            print("Citation missing! {}".format(citationId))
             continue
         institutions = load_institutions(citationId, paper_location_lookup)
         geocoded_addresses = []
diff --git a/scraper/s2.py b/scraper/s2.py
index 26334de8..62fd9a94 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -178,7 +178,7 @@ class SemanticScholarAPI(object):

     @staticmethod
     def search(q, page=1, pageSize=10, yearFilter=None):
-        resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json={
+        query = {
             'authors': [],
             'coAuthors': [],
             'facets': {},
@@ -190,6 +190,7 @@ class SemanticScholarAPI(object):
             'sort': "relevance",
             'venues': [],
             'yearFilter': yearFilter,
-        }, headers=SemanticScholarAPI.headers)
+        }
+        resp = requests.post(SemanticScholarAPI.SEARCH_ENDPOINT, json=query, headers=SemanticScholarAPI.headers)
         # print(resp.status_code)
         return None if resp.status_code != 200 else resp.json()
--
cgit v1.2.3-70-g09d2