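"""Scrape citing-paper titles and Semantic Scholar paper IDs for a given paper and save them, deduplicated, to a CSV."""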
import urllib.parse

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

sort_types = ['is-influential', 'year']

# 370b5757a5379b15e30d619e4d3fb9e8e13f3256 is LFW
@click.command()
@click.option('--paper-id', 'opt_paper_id', required=True,
  help='Semantic Scholar paper ID')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.option('--sort', 'opt_sort_type', default='is-influential', type=click.Choice(sort_types),
  help='Sort order for citing papers')
@click.option('--citations', 'opt_num_citations', required=True, type=int,
  help='Approximate number of citations to fetch (TODO: read this from the paper page)')
@click.pass_context
def cli(ctx, opt_paper_id, opt_fp_out, opt_sort_type, opt_threads, opt_num_citations):
  """Scrapes citing paper titles and IDs from Semantic Scholar and writes them to a CSV"""
  import urllib.request
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm
  from bs4 import BeautifulSoup
  import lxml  # noqa: F401  (backend for BeautifulSoup's 'lxml' parser)
  def pool_process(url_obj):
    """Fetch one results page and return a list of {title, paper_id} dicts (runs in a worker thread)"""
    results = []
    try:
      data = urllib.request.urlopen(url_obj['url'], timeout=30).read()
      soup = BeautifulSoup(data, 'lxml')
      titles = soup.find_all('h2', attrs={'class': 'citation__title'})
      for t in titles:
        results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})
    except Exception as e:
      log.error(f'Error: {e}, {url_obj["url"]}')
    pbar.update(1)  # pbar is the shared tqdm bar from the enclosing scope
    return results  # list of paper title and id
  # pre-generate one URL per results page
  page_limit = 10
  num_pages = -(-opt_num_citations // page_limit)  # ceiling division so the last partial page is included
  if num_pages > 990:
    log.warning('semanticscholar can not page past 990 result pages yet; truncating')
    num_pages = 990
  url_objs = []
  for page_num in range(num_pages):
    url = gen_url(opt_paper_id, page_num, opt_sort_type, page_limit=page_limit)
    url_objs.append({'url': url, 'sort_type': opt_sort_type})

  # scrape pages in a thread pool, updating the shared progress bar from each worker
  pool = ThreadPool(opt_threads)
  with tqdm(total=len(url_objs)) as pbar:
    results = pool.map(pool_process, url_objs)
  pool.close()
  pool.join()
  # dedupe by paper_id across all pages
  papers_all = []
  paper_ids = set()
  for paper_list in results:
    for paper in paper_list:
      paper_id = paper['paper_id']
      if paper_id not in paper_ids:
        paper_ids.add(paper_id)
        papers_all.append(paper)

  # save
  df = pd.DataFrame.from_dict(papers_all)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)
  log.info(f'Wrote {len(papers_all):,} papers to {opt_fp_out}')
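
# Example invocation (the entry-point script name is hypothetical; the paper ID is the LFW hash noted above):
#   python cli.py --paper-id 370b5757a5379b15e30d619e4d3fb9e8e13f3256 \
#     -o data/lfw_citations.csv --threads 8 --citations 500 --sort is-influential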
# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def gen_url(paper_id, page_num, sort_type, page_limit=10):
  """Build a Semantic Scholar paper URL with citation paging and sorting query params"""
  base_url = 'https://www.semanticscholar.org/paper/'
  params = {
    'tab': 'abstract',
    'citingPapersSort': sort_type,  # 'year' or 'is-influential'
    'citingPapersLimit': page_limit,
    'citingPapersOffset': page_num * page_limit,
    'citedPapersSort': 'is-influential',
    'citedPapersLimit': 1,
    'citedPapersOffset': 0,
  }
  url_args = urllib.parse.urlencode(params)
  # f-string instead of os.path.join so the URL is built the same way on every platform
  return f'{base_url}{paper_id}/?{url_args}'
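
# For reference, gen_url('370b5757a5379b15e30d619e4d3fb9e8e13f3256', 0, 'is-influential') yields:
#   https://www.semanticscholar.org/paper/370b5757a5379b15e30d619e4d3fb9e8e13f3256/?tab=abstract&citingPapersSort=is-influential&citingPapersLimit=10&citingPapersOffset=0&citedPapersSort=is-influential&citedPapersLimit=1&citedPapersOffset=0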