diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-10 16:38:53 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-10 16:38:53 +0100 |
| commit | efadb39172eb36b4ffebeaa5169219c7954a263b (patch) | |
| tree | 078ece0d9af403527a575f63a8997f5de5073726 /scraper/util.py | |
| parent | b248ce4f21a94e4d081d93328961aca5942ac8f0 (diff) | |
parallelize fetch pdf
Diffstat (limited to 'scraper/util.py')
| -rw-r--r-- | scraper/util.py | 7 |
1 files changed, 7 insertions, 0 deletions
```diff
diff --git a/scraper/util.py b/scraper/util.py
index 7b55afae..d3f4e751 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -4,6 +4,7 @@ import csv
 import string
 import codecs
 import gspread
+from multiprocessing import Pool
 import simplejson as json
 from oauth2client.service_account import ServiceAccountCredentials
 
@@ -302,6 +303,12 @@ class AddressBook (object):
       }
     return None
 
+def parallelize(func, rows):
+  print("Fetching {} items".format(len(rows)))
+  chunksize = 3
+  with Pool(processes=len(os.sched_getaffinity(0))) as pool:
+    pool.starmap(func, rows, chunksize)
+
 def fetch_paper(s2, paper_id):
   os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
   paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
```
