summary refs log tree commit diff
path: root/scraper/util.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2019-02-10 16:38:53 +0100
committer Jules Laplace <julescarbon@gmail.com> 2019-02-10 16:38:53 +0100
commit efadb39172eb36b4ffebeaa5169219c7954a263b (patch)
tree 078ece0d9af403527a575f63a8997f5de5073726 /scraper/util.py
parent b248ce4f21a94e4d081d93328961aca5942ac8f0 (diff)
parallelize fetch pdf
Diffstat (limited to 'scraper/util.py')
-rw-r--r-- scraper/util.py 7
1 file changed, 7 insertions, 0 deletions
diff --git a/scraper/util.py b/scraper/util.py
index 7b55afae..d3f4e751 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -4,6 +4,7 @@ import csv
import string
import codecs
import gspread
+from multiprocessing import Pool
import simplejson as json
from oauth2client.service_account import ServiceAccountCredentials
@@ -302,6 +303,12 @@ class AddressBook (object):
}
return None
def parallelize(func, rows, chunksize=3):
    """Run ``func(*row)`` for every row across a pool of worker processes.

    Args:
        func: Callable applied to each unpacked row. It is sent to the
            worker processes, so it must be picklable (for example a
            module-level function).
        rows: Iterable of argument tuples, one tuple per call.
        chunksize: Number of rows handed to a worker per task. Defaults to
            3, the value that used to be hard-coded.

    Returns:
        A list with the result of each call, in the same order as ``rows``.
    """
    # Materialize so that generators are accepted and len() is available.
    rows = list(rows)
    print("Fetching {} items".format(len(rows)))
    if not rows:
        # Nothing to do: skip the cost of spawning worker processes.
        return []
    try:
        # CPUs this process is actually allowed to run on.
        workers = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is only available on some Unix platforms.
        workers = os.cpu_count() or 1
    # Never start more workers than there are rows to process.
    with Pool(processes=min(workers, len(rows))) as pool:
        return pool.starmap(func, rows, chunksize)
+
def fetch_paper(s2, paper_id):
os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)