author    adamhrv <adam@ahprojects.com>    2019-02-11 00:37:31 +0100
committer adamhrv <adam@ahprojects.com>    2019-02-11 00:37:31 +0100
commit    730a32a51cac1d1b70fdade93d0986b8b4e1ac69 (patch)
tree      59df856eede67f855eaa157eacbe8da219e85c70 /megapixels
parent    439d0d5aca826917da5467750fa0f3b9e6ce7402 (diff)
Add Semantic Scholar (s2) citation scraper using bs4
Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/commands/datasets/semantic_scholar_scrape.py  123
1 file changed, 123 insertions(+), 0 deletions(-)
diff --git a/megapixels/commands/datasets/semantic_scholar_scrape.py b/megapixels/commands/datasets/semantic_scholar_scrape.py
new file mode 100644
index 00000000..80c44590
--- /dev/null
+++ b/megapixels/commands/datasets/semantic_scholar_scrape.py
@@ -0,0 +1,123 @@
+import urllib.parse
+from os.path import join
+
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+sort_types = ['is-influential', 'year']
+
+# 370b5757a5379b15e30d619e4d3fb9e8e13f3256 is LFW (Labeled Faces in the Wild); use it as --paper-id
+@click.command()
+@click.option('--paper-id', 'opt_paper_id', required=True,
+  help='Semantic Scholar paper ID')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output CSV filepath')
+@click.option('-t', '--threads', 'opt_threads', default=4,
+ help='Number of threads')
+@click.option('--sort', 'opt_sort_type', default='is-influential', type=click.Choice(sort_types),
+ help='Sort types')
+@click.option('--citations', 'opt_num_citations', required=True, type=int,
+  help='Number of citations to fetch (TODO: read total from the page)')
+@click.pass_context
+def cli(ctx, opt_paper_id, opt_fp_out, opt_sort_type, opt_threads, opt_num_citations):
+  """Scrape citing paper titles and IDs for a Semantic Scholar paper"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+ from bs4 import BeautifulSoup
+
+  import urllib.request
+  import lxml  # imported so the 'lxml' parser used by BeautifulSoup below is available
+  from multiprocessing.dummy import Pool as ThreadPool
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+
+ def pool_process(url_obj):
+ # threaded function
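+    # fetch one citations page, parse it with BeautifulSoup, and collect the
+    # title and data-heap-paper-id attribute of each listed citing paper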
+ results = []
+ try:
+ data = urllib.request.urlopen(url_obj['url'], timeout=30).read()
+ soup = BeautifulSoup(data,'lxml')
+ titles = soup.find_all('h2', attrs={'class': 'citation__title'})
+ page_results = []
+ for t in titles:
+ page_results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})
+ results += page_results
+ except Exception as e:
+ print(f'Error: {e}, {url_obj["url"]}')
+ pbar.update(1)
+ return results # list of paper title and id
+
+ # pregenerate URLs
+ page_limit = 10
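+  # each results page lists page_limit citations; floor division means a
+  # trailing partial page of citations is not fetched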
+ num_pages = opt_num_citations // page_limit
+  if num_pages > 990:
+    log.warn('semanticscholar cannot page past 990 pages of citations; clamping')
+    num_pages = 990
+
+ url_objs = []
+ for page_num in range(num_pages):
+ url = gen_url(opt_paper_id, page_num, opt_sort_type, page_limit=page_limit)
+ url_objs.append({'url': url, 'sort_type':opt_sort_type})
+
+  # multiprocessing.dummy gives a thread pool, so workers can update the shared pbar
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=len(url_objs)) as pbar:
+    results = pool.map(pool_process, url_objs)
+  pool.close()
+  pool.join()
+
+ # dedupe
+ papers_all = []
+ papers_ids = []
+ for paper_list in results:
+ for paper in paper_list:
+ paper_id = paper['paper_id']
+      if paper_id not in papers_ids:
+ papers_ids.append(paper_id)
+ papers_all.append(paper)
+
+
+ # save
+ df = pd.DataFrame.from_dict(papers_all)
+ df.index.name = 'index'
+ df.to_csv(opt_fp_out)
+
+
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+
+def gen_url(paper_id, page_num, sort_type, page_limit=10):
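+  # builds a semanticscholar.org paper page URL; paging is controlled by the
+  # citingPapersOffset/citingPapersLimit query parameters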
+ url = 'https://www.semanticscholar.org/paper/'
+ params = {
+ 'tab': 'abstract',
+ 'citingPapersSort': sort_type, # citingPapersSort=year,is-influential
+ 'citingPapersLimit': page_limit,
+ 'citingPapersOffset': page_num * page_limit,
+ 'citedPapersSort': 'is-influential',
+ 'citedPapersLimit': 1,
+ 'citedPapersOffset': 0,
+ }
+ url_args = urllib.parse.urlencode(params)
+ url = join(url, paper_id, f'?{url_args}')
+  return url
\ No newline at end of file