diff options
 -rw-r--r--  scraper/s2-fetch-pdf.py | 7
 -rw-r--r--  scraper/util.py         | 7
2 files changed, 9 insertions, 5 deletions
diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py index 30bc5a40..72ca4ca8 100644 --- a/scraper/s2-fetch-pdf.py +++ b/scraper/s2-fetch-pdf.py @@ -7,6 +7,7 @@ import random import re import simplejson as json import click +from multiprocessing import Pool from s2 import SemanticScholarAPI from util import * @@ -16,9 +17,7 @@ s2 = SemanticScholarAPI() @click.option('--fn', '-i', default='db_paper_pdf.csv', help='Filename of CSV (id, url,)') def fetch_pdfs(fn): lines = read_csv(fn, keys=False) - for line in lines: - paper_id, url = line - fetch_pdf(paper_id, url) + parallelize(fetch_pdf, lines) print("{} papers processed".format(len(lines))) def fetch_pdf(paper_id, url): @@ -33,8 +32,6 @@ def fetch_pdf(paper_id, url): print("{} empty?".format(paper_id)) return None print("{} {} kb".format(paper_id, int(size / 1024))) - return - # return paper def make_pdf_path(paper_id): return './datasets/s2/pdf/{}/{}'.format(paper_id[0:2], paper_id) diff --git a/scraper/util.py b/scraper/util.py index 7b55afae..d3f4e751 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -4,6 +4,7 @@ import csv import string import codecs import gspread +from multiprocessing import Pool import simplejson as json from oauth2client.service_account import ServiceAccountCredentials @@ -302,6 +303,12 @@ class AddressBook (object): } return None +def parallelize(func, rows): + print("Fetching {} items".format(len(rows))) + chunksize = 3 + with Pool(processes=len(os.sched_getaffinity(0))) as pool: + pool.starmap(func, rows, chunksize) + def fetch_paper(s2, paper_id): os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True) paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id) |
