summary refs log tree commit diff
path: root/megapixels/commands/datasets/msceleb.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/msceleb.py')
-rw-r--r-- megapixels/commands/datasets/msceleb.py 66
1 files changed, 66 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/msceleb.py b/megapixels/commands/datasets/msceleb.py
new file mode 100644
index 00000000..969a1df2
--- /dev/null
+++ b/megapixels/commands/datasets/msceleb.py
@@ -0,0 +1,66 @@
+'''
+Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
+'''
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Path to input TSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output path for images')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+    help='Slice list of files')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
+    """Converts MSCeleb TSV to images
+
+    Each row has six tab-separated fields ending in a base64-encoded image,
+    which is re-saved as <output>/<freebase_mid>/<index>.jpg. Bad rows are
+    logged and skipped; --slice START STOP restricts the rows processed.
+    """
+    import base64
+    import os
+    from io import BytesIO
+    from itertools import islice
+    from os.path import join
+    from pathlib import Path
+
+    from PIL import Image
+    from tqdm import tqdm
+
+    from app.utils import file_utils
+
+    log.debug(f'opening "{opt_fp_in}" ...')
+    try:
+        # binary mode: count rows without decoding; close the handle promptly
+        with open(opt_fp_in, 'rb') as fp:
+            n_lines = sum(1 for _ in fp)
+    except OSError:
+        n_lines = 1
+    log.debug('{:,}'.format(n_lines))
+
+    start, stop = opt_slice or (None, None)
+    n_rows = len(range(n_lines)[start:stop])
+    with open(opt_fp_in, 'rb') as fp:
+        for data_line in tqdm(islice(fp, start, stop), total=n_rows):
+            # reset per row so the error log never sees unbound or stale names
+            query_name = url_image = None
+            try:
+                fields = data_line.decode().split('\t')
+                freebase_mid, query_name, _, url_image, _, b64_bytes = fields
+                im = Image.open(BytesIO(base64.b64decode(b64_bytes)))
+                dir_out = join(opt_fp_out, freebase_mid)
+                Path(dir_out).mkdir(parents=True, exist_ok=True)
+                idx = len(os.listdir(dir_out))
+                fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
+                im.save(fp_out, quality=100)
+            except Exception as e:
+                log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))