# megapixels/commands/datasets/semantic_scholar_scrape.py
import urllib.parse
from os.path import join

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()
sort_types = ['is-influential', 'year']

# 370b5757a5379b15e30d619e4d3fb9e8e13f3256 is LFW
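# Example invocation (a hedged sketch: the exact entry point depends on how the
# megapixels CLI registers this command group, and the citation count is illustrative):
#   python -m megapixels datasets semantic_scholar_scrape \
#     --paper-id 370b5757a5379b15e30d619e4d3fb9e8e13f3256 \
#     --citations 500 --sort is-influential -o lfw_citations.csv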
@click.command()
@click.option('--paper-id', 'opt_paper_id', required=True,
  help='Semantic Scholar paper ID to scrape citations for')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.option('--sort', 'opt_sort_type', default='is-influential', type=click.Choice(sort_types),
  help='Sort order for citing papers')
@click.option('--citations', 'opt_num_citations', required=True, type=int,
  help='Number of citations to fetch (TODO: read total from the paper page)')
@click.pass_context
def cli(ctx, opt_paper_id, opt_fp_out, opt_sort_type, opt_threads, opt_num_citations):
  """Scrape citing papers from Semantic Scholar into a CSV"""
  
  import sys
  from glob import glob
  from os.path import join
  from pathlib import Path
  import time

  import pandas as pd
  from tqdm import tqdm
  from bs4 import BeautifulSoup

  import urllib.request
  import lxml  # parser backend used by BeautifulSoup below
  from multiprocessing.dummy import Pool as ThreadPool

  from app.utils import file_utils, im_utils
  from app.models.data_store import DataStore

  log = Logger.getLogger()
 
  def pool_process(url_obj):
    # threaded worker: fetch one results page and parse out citing paper titles/ids
    results = []
    try:
      data = urllib.request.urlopen(url_obj['url'], timeout=30).read()
      soup = BeautifulSoup(data, 'lxml')
      titles = soup.find_all('h2', attrs={'class': 'citation__title'})
      for t in titles:
        results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})
    except Exception as e:
      log.error(f'Error: {e}, {url_obj["url"]}')
    pbar.update(1)  # pbar is shared from the enclosing scope
    return results  # list of {'title', 'paper_id'} dicts for this page

  # pregenerate one URL per page of citing-paper results
  page_limit = 10
  num_pages = -(-opt_num_citations // page_limit)  # ceiling division to cover all requested citations
  if num_pages > 990:
    log.warning('Semantic Scholar cannot page beyond 990 result pages yet; capping')
    num_pages = 990

  url_objs = []
  for page_num in range(num_pages):
    url = gen_url(opt_paper_id, page_num, opt_sort_type, page_limit=page_limit)
    url_objs.append({'url': url, 'sort_type': opt_sort_type})

  # scrape pages in parallel; pool_process updates the shared progress bar
  with ThreadPool(opt_threads) as pool, tqdm(total=len(url_objs)) as pbar:
    results = pool.map(pool_process, url_objs)
  
  # dedupe by paper_id, keeping the first occurrence
  papers_all = []
  paper_ids_seen = set()
  for paper_list in results:
    for paper in paper_list:
      paper_id = paper['paper_id']
      if paper_id not in paper_ids_seen:
        paper_ids_seen.add(paper_id)
        papers_all.append(paper)
  
  # save deduped results to CSV
  df = pd.DataFrame(papers_all)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)
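  # The resulting CSV has the shape below (values shown are placeholders):
  #   index,title,paper_id
  #   0,<citing paper title>,<40-char Semantic Scholar paper id>
  #   1,<citing paper title>,<40-char Semantic Scholar paper id>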



# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def gen_url(paper_id, page_num, sort_type, page_limit=10):
  """Build a Semantic Scholar paper URL for one page of citing papers"""
  url = 'https://www.semanticscholar.org/paper/'
  params = {
    'tab': 'abstract',
    'citingPapersSort': sort_type,  # 'is-influential' or 'year'
    'citingPapersLimit': page_limit,
    'citingPapersOffset': page_num * page_limit,
    'citedPapersSort': 'is-influential',
    'citedPapersLimit': 1,
    'citedPapersOffset': 0,
  }
  url_args = urllib.parse.urlencode(params)
  url = join(url, paper_id, f'?{url_args}')
  return url
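# Rough shape of the URL gen_url builds (paper id is illustrative; query-string key
# order follows the params dict above, shown here for page_num=2, sort_type='year'):
#   https://www.semanticscholar.org/paper/370b5757a5379b15e30d619e4d3fb9e8e13f3256/?tab=abstract
#   &citingPapersSort=year&citingPapersLimit=10&citingPapersOffset=20
#   &citedPapersSort=is-influential&citedPapersLimit=1&citedPapersOffset=0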