import urllib.parse

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

sort_types = ['is-influential', 'year']

# Example paper ID: 370b5757a5379b15e30d619e4d3fb9e8e13f3256 is LFW (Labeled Faces in the Wild)

@click.command()
@click.option('--paper-id', 'opt_paper_id', required=True,
  help='Semantic Scholar paper ID to fetch citations for')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.option('--sort', 'opt_sort_type', default='is-influential',
  type=click.Choice(sort_types),
  help='Citation sort order')
@click.option('--citations', 'opt_num_citations', required=True, type=int,
  help='Number of citations to fetch (TODO: read count from the paper page)')
@click.pass_context
def cli(ctx, opt_paper_id, opt_fp_out, opt_sort_type, opt_threads, opt_num_citations):
  """Scrape citing papers for a Semantic Scholar paper and save them to CSV"""

  import urllib.request
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm
  from bs4 import BeautifulSoup  # the 'lxml' parser below also requires the lxml package

  def pool_process(url_obj):
    # threaded function: fetch one results page and parse out citing paper titles/IDs
    results = []
    try:
      data = urllib.request.urlopen(url_obj['url'], timeout=30).read()
      soup = BeautifulSoup(data, 'lxml')
      titles = soup.find_all('h2', attrs={'class': 'citation__title'})
      for t in titles:
        results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})
    except Exception as e:
      log.error(f'Error: {e}, {url_obj["url"]}')
    pbar.update(1)
    return results  # list of {'title', 'paper_id'} dicts for this page

  # pregenerate URLs, one per results page, rounding up to cover all requested citations
  page_limit = 10
  num_pages = (opt_num_citations + page_limit - 1) // page_limit
  if num_pages > 990:
    log.warning('semanticscholar cannot page beyond 990 result pages yet, truncating')
    num_pages = 990

  url_objs = []
  for page_num in range(num_pages):
    url = gen_url(opt_paper_id, page_num, opt_sort_type, page_limit=page_limit)
    url_objs.append({'url': url, 'sort_type': opt_sort_type})

  # fetch all pages with a thread pool, updating a shared progress bar
  with ThreadPool(opt_threads) as pool, tqdm(total=len(url_objs)) as pbar:
    results = pool.map(pool_process, url_objs)

  # dedupe by paper ID (the same paper can appear on multiple result pages)
  papers_all = []
  paper_ids = set()
  for paper_list in results:
    for paper in paper_list:
      if paper['paper_id'] not in paper_ids:
        paper_ids.add(paper['paper_id'])
        papers_all.append(paper)

  # save
  df = pd.DataFrame(papers_all)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def gen_url(paper_id, page_num, sort_type, page_limit=10):
  """Build a Semantic Scholar paper URL paginated over its citing papers"""
  params = {
    'tab': 'abstract',
    'citingPapersSort': sort_type,  # citingPapersSort=year,is-influential
    'citingPapersLimit': page_limit,
    'citingPapersOffset': page_num * page_limit,
    'citedPapersSort': 'is-influential',
    'citedPapersLimit': 1,
    'citedPapersOffset': 0,
  }
  url_args = urllib.parse.urlencode(params)
  return f'https://www.semanticscholar.org/paper/{paper_id}?{url_args}'
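
# Usage sketch (hedged: the module path below is an assumption; adapt to however this
# repo registers its click commands). Using the LFW paper ID from the comment above:
#
#   python -m app.commands.scrape_citations \
#     --paper-id 370b5757a5379b15e30d619e4d3fb9e8e13f3256 \
#     -o data/lfw_citations.csv \
#     --citations 500 --sort is-influential -t 8
#
# For page 0 of that paper sorted by influence, gen_url() produces a URL of the form:
#   https://www.semanticscholar.org/paper/370b5757a5379b15e30d619e4d3fb9e8e13f3256?tab=abstract&citingPapersSort=is-influential&citingPapersLimit=10&citingPapersOffset=0&citedPapersSort=is-influential&citedPapersLimit=1&citedPapersOffset=0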