author     Jules Laplace <julescarbon@gmail.com>  2019-02-13 02:00:57 +0100
committer  Jules Laplace <julescarbon@gmail.com>  2019-02-13 02:00:57 +0100
commit     dc7d9cbba842472efb33186e97ee55751e4d50ca (patch)
tree       ca12f9b2f381fee7590bc8587ee08b59ccff6487
parent     dc1889f15ab1b1338c557cda0b1bcd989e1fdf9b (diff)
parent     6b3923624f352b13633e83ac18ac2f7fd74be34a (diff)
Merge branch 'master' of github.com:adamhrv/megapixels_dev
-rw-r--r--  megapixels/commands/datasets/imdb_face_download.py  60
-rw-r--r--  megapixels/commands/templates/multithreaded.py      17
2 files changed, 68 insertions, 9 deletions
diff --git a/megapixels/commands/datasets/imdb_face_download.py b/megapixels/commands/datasets/imdb_face_download.py
new file mode 100644
index 00000000..4180fac0
--- /dev/null
+++ b/megapixels/commands/datasets/imdb_face_download.py
@@ -0,0 +1,60 @@
+import click
+
+fp_in = '/data_store/datasets/people/imdb_face/downloads/IMDb-Face.csv'
+fp_out = '/data_store_hdd/datasets/people/imdb_face/media/'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
+  help='Input')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
+  help='Output')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+  help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+  """Download IMDb-Face URLs"""
+
+  from os.path import join
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+
+  import pandas as pd
+  from tqdm import tqdm
+  from app.utils import file_utils
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+
+  # setup multithreading function
+  def pool_process(item):
+    # threaded function: download one image and flag success/failure
+    try:
+      # download image
+      file_utils.mkdirs(item['fp'])
+      # urlretrieve() has no timeout parameter; use urlopen() instead
+      with urllib.request.urlopen(item['url'], timeout=20) as resp:
+        with open(item['fp'], 'wb') as f:
+          f.write(resp.read())
+      item['status'] = True
+    except Exception as e:
+      log.debug(f'Error: {e}')
+      item['status'] = False
+    pbar.update(1)
+    return item
+
+  # setup multithreading data holds
+  log.debug(f'loading {opt_fp_in}')
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+  pool_items = [{'url': x['url'], 'fp': join(opt_fp_out, x['index'], x['image'])} for x in records]
+  num_items = len(pool_items)
+  log.info(f'processing {num_items:,} items')
+  pool_results = []
+
+  # run the multithreading with progress bar; pool_process updates pbar via closure
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+
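For reference, the command above reduces to one pattern: a thread pool maps a download worker over a list of {'url', 'fp'} dicts while a shared tqdm bar, captured by closure, tracks progress. Below is a minimal standalone sketch of that pattern; it swaps the repo's app.utils helpers (file_utils, Logger) for stdlib equivalents, so treat the helper choices and values here as assumptions, not the repo's API.

import os
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

from tqdm import tqdm

def download_all(items, num_threads=8):
  # items: list of {'url': ..., 'fp': ...} dicts
  with tqdm(total=len(items)) as pbar:
    def fetch(item):
      try:
        # os.makedirs stands in for the repo's file_utils.mkdirs (assumption)
        os.makedirs(os.path.dirname(item['fp']), exist_ok=True)
        # urlopen() accepts a timeout; urlretrieve() does not
        with urllib.request.urlopen(item['url'], timeout=20) as resp:
          with open(item['fp'], 'wb') as f:
            f.write(resp.read())
        item['status'] = True
      except Exception:
        item['status'] = False
      pbar.update(1)  # tqdm serializes writes with an internal lock
      return item

    pool = ThreadPool(num_threads)
    try:
      return pool.map(fetch, items)
    finally:
      pool.close()
      pool.join()
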
diff --git a/megapixels/commands/templates/multithreaded.py b/megapixels/commands/templates/multithreaded.py
index fec3dac4..a9b287f8 100644
--- a/megapixels/commands/templates/multithreaded.py
+++ b/megapixels/commands/templates/multithreaded.py
@@ -2,9 +2,9 @@ import click
 
 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input file')
+  help='Input')
 @click.option('-o', '--output', 'opt_fp_out', required=True,
-  help='Output file')
+  help='Output')
 @click.option('-t', '--threads', 'opt_threads', default=4,
   help='Number of threads')
 @click.pass_context
@@ -22,28 +22,27 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
 
   log.info('multithreaded template')
 
   # setup multithreading function
-  def pool_process(data_obj):
+  def pool_process(item):
     # threaded function
-    global parse_yt_page
     results = []
     try:
-      # do something here with data_obj
+      pass  # do something here with item (placeholder so the try block parses)
     except Exception as e:
       log.debug(f'Error: {e}')
     pbar.update(1)
     return results
 
   # setup multithreading data holds
-  items = [] # list of dicts to process
-  results = []
-  num_items = len(items)
+  pool_items = []  # list of dicts to process
+  pool_results = []
+  num_items = len(pool_items)
 
   # run the multithreading with progress bar
   pbar = tqdm(total=num_items)
   pool_process = partial(pool_process)
   pool = ThreadPool(opt_threads)
   with tqdm(total=num_items) as pbar:
-    results = pool.map(pool_process, media_items)
+    pool_results = pool.map(pool_process, pool_items)
   pbar.close()
 
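One note on the template: the pool_process = partial(pool_process) line kept as context above is a no-op as written (no extra arguments are bound), but it marks where shared options would be attached before the map. A short sketch of that binding, with illustrative names and values:

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

def pool_process(item, opt_fp_out, timeout):
  # per-item data arrives first; the shared options are pre-bound
  return (item['url'], opt_fp_out, timeout)

# bind the shared arguments once, then map over the per-item dicts
worker = partial(pool_process, opt_fp_out='/tmp/media', timeout=20)
with ThreadPool(4) as pool:
  results = pool.map(worker, [{'url': 'http://example.com/a.jpg'}])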