| author | adamhrv <adam@ahprojects.com> | 2019-02-11 00:37:31 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-11 00:37:31 +0100 |
| commit | 730a32a51cac1d1b70fdade93d0986b8b4e1ac69 | (patch) |
| tree | 59df856eede67f855eaa157eacbe8da219e85c70 | /megapixels |
| parent | 439d0d5aca826917da5467750fa0f3b9e6ce7402 | (diff) |
s2 scraper bs4
Diffstat (limited to 'megapixels')
| -rw-r--r-- | megapixels/commands/datasets/semantic_scholar_scrape.py | 123 |
1 file changed, 123 insertions, 0 deletions
```diff
diff --git a/megapixels/commands/datasets/semantic_scholar_scrape.py b/megapixels/commands/datasets/semantic_scholar_scrape.py
new file mode 100644
index 00000000..80c44590
--- /dev/null
+++ b/megapixels/commands/datasets/semantic_scholar_scrape.py
@@ -0,0 +1,123 @@
```

```python
import urllib.parse

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()
sort_types = ['is-influential', 'year']

# 370b5757a5379b15e30d619e4d3fb9e8e13f3256 is LFW
@click.command()
@click.option('--paper-id', 'opt_paper_id', required=True,
              help='Semantic Scholar paper ID')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
              help='Number of threads')
@click.option('--sort', 'opt_sort_type', default='is-influential',
              type=click.Choice(sort_types), help='Citation sort order')
@click.option('--citations', 'opt_num_citations', required=True, type=int,
              help='Number of citations to fetch (TODO: read from page)')
@click.pass_context
def cli(ctx, opt_paper_id, opt_fp_out, opt_sort_type, opt_threads, opt_num_citations):
    """Scrape citing papers for a Semantic Scholar paper ID"""

    import urllib.request
    from multiprocessing.dummy import Pool as ThreadPool

    import pandas as pd
    from tqdm import tqdm
    from bs4 import BeautifulSoup

    def pool_process(url_obj):
        # threaded worker: fetch one results page, parse out citing paper titles/IDs
        results = []
        try:
            data = urllib.request.urlopen(url_obj['url'], timeout=30).read()
            soup = BeautifulSoup(data, 'lxml')
            titles = soup.find_all('h2', attrs={'class': 'citation__title'})
            for t in titles:
                results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})
        except Exception as e:
            log.error(f'Error: {e}, {url_obj["url"]}')
        pbar.update(1)
        return results  # list of paper titles and IDs for one page

    # pre-generate one URL per results page (ceil division keeps a partial last page)
    page_limit = 10
    num_pages = (opt_num_citations + page_limit - 1) // page_limit
    if num_pages > 990:
        log.warning('semanticscholar cannot page past 990 result pages yet')
        num_pages = 990

    url_objs = []
    for page_num in range(num_pages):
        url = gen_url(opt_paper_id, page_num, opt_sort_type, page_limit=page_limit)
        url_objs.append({'url': url, 'sort_type': opt_sort_type})

    # fetch all pages concurrently
    with ThreadPool(opt_threads) as pool, tqdm(total=len(url_objs)) as pbar:
        results = pool.map(pool_process, url_objs)

    # dedupe by paper ID across pages
    papers_all = []
    paper_ids = set()
    for paper_list in results:
        for paper in paper_list:
            if paper['paper_id'] not in paper_ids:
                paper_ids.add(paper['paper_id'])
                papers_all.append(paper)

    # save
    df = pd.DataFrame(papers_all)
    df.index.name = 'index'
    df.to_csv(opt_fp_out)


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def gen_url(paper_id, page_num, sort_type, page_limit=10):
    params = {
        'tab': 'abstract',
        'citingPapersSort': sort_type,  # citingPapersSort=year,is-influential
        'citingPapersLimit': page_limit,
        'citingPapersOffset': page_num * page_limit,
        'citedPapersSort': 'is-influential',
        'citedPapersLimit': 1,
        'citedPapersOffset': 0,
    }
    url_args = urllib.parse.urlencode(params)
    # build the URL directly; os.path.join is not URL-safe across platforms
    return f'https://www.semanticscholar.org/paper/{paper_id}/?{url_args}'
```
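The scraper's one structural assumption about the page is visible in `pool_process`: each citing paper is rendered as an `<h2 class="citation__title">` element whose `data-heap-paper-id` attribute carries the paper's ID. A minimal sketch of that extraction, run against an invented HTML fragment (the real semanticscholar.org markup may differ or have changed since this commit):

```python
from bs4 import BeautifulSoup

# synthetic fragment mimicking the markup the scraper expects;
# the live page may render citations differently
html = '''
<div class="search-results">
  <h2 class="citation__title" data-heap-paper-id="abc123">Paper One</h2>
  <h2 class="citation__title" data-heap-paper-id="def456">Paper Two</h2>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
for t in soup.find_all('h2', attrs={'class': 'citation__title'}):
    print(t['data-heap-paper-id'], t.text)
# abc123 Paper One
# def456 Paper Two
```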
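For reference, here is roughly what `gen_url` produces for the first results page of the LFW paper ID noted in the code comment. The import path assumes the repo root is on `PYTHONPATH` with the `megapixels` directories importable as packages; the query-string order follows `urllib.parse.urlencode` over the dict's insertion order:

```python
from megapixels.commands.datasets.semantic_scholar_scrape import gen_url

# page 0 of citing papers for LFW, sorted by influence
url = gen_url('370b5757a5379b15e30d619e4d3fb9e8e13f3256', 0, 'is-influential')
print(url)
# one line, wrapped here for readability:
# https://www.semanticscholar.org/paper/370b5757a5379b15e30d619e4d3fb9e8e13f3256/?tab=abstract
#   &citingPapersSort=is-influential&citingPapersLimit=10&citingPapersOffset=0
#   &citedPapersSort=is-influential&citedPapersLimit=1&citedPapersOffset=0
```

Each page returns at most `page_limit` (10) citations, so `--citations 100` expands to 10 such URLs differing only in `citingPapersOffset`.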
