diff options
Diffstat (limited to 'megapixels/commands/datasets/msceleb.py')
| -rw-r--r-- | megapixels/commands/datasets/msceleb.py | 66 |
1 files changed, 66 insertions, 0 deletions
'''
Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
'''
import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


# Command overview (kept out of the docstring on purpose: click prints the
# docstring verbatim as the command's --help text).
#
#   opt_fp_in   path to the TSV file; each row is split on tabs into six
#               fields: freebase_mid, query_name, search_rank, url_image,
#               url_page and a base64-encoded image payload.
#   opt_fp_out  root output directory; one sub-directory is created per
#               freebase_mid and images are written into it as
#               zero-padded, sequentially numbered .jpg files.
#   opt_slice   NOTE(review): accepted from the command line but not applied
#               anywhere in this command - confirm whether row slicing was
#               intended before relying on it.
#
# Rows that cannot be parsed, decoded or saved are logged and skipped so a
# single bad record does not abort the whole conversion.
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
    help='Path to input TSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
    help='Output path for images')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
    help='Slice list of files')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
    """Converts MSCeleb TSV to images"""

    # Imports are local to the command so they are only loaded when it runs.
    import os
    from os.path import join
    from pathlib import Path
    import base64
    from io import BytesIO

    from PIL import Image
    from tqdm import tqdm

    from app.utils import file_utils

    log.debug(f'opening "{opt_fp_in}" ...')

    # First pass: count rows so the progress bar has a total. Binary mode
    # avoids decode errors during counting, and the context manager closes
    # the handle (the count is only cosmetic, so failure is not fatal).
    try:
        with open(opt_fp_in, 'rb') as fp_count:
            n_lines = sum(1 for _ in fp_count)
    except OSError as e:
        log.error('Could not count lines in "{}": {}'.format(opt_fp_in, e))
        n_lines = None  # tqdm shows an open-ended bar when total is None

    if n_lines is not None:
        log.debug('{:,}'.format(n_lines))

    with open(opt_fp_in, 'rb') as fp:
        for data_line in tqdm(fp, total=n_lines):
            # Reset per row so the error message below never raises NameError
            # or reports values left over from a previous row.
            query_name = None
            url_image = None
            try:
                fields = data_line.decode().rstrip('\r\n').split('\t')
                (freebase_mid, query_name, search_rank,
                 url_image, url_page, b64_bytes) = fields

                # decode image
                im64 = base64.b64decode(b64_bytes)

                # save image
                dir_out = join(opt_fp_out, freebase_mid)
                Path(dir_out).mkdir(parents=True, exist_ok=True)
                # Next index = number of entries already in the directory.
                idx = len(os.listdir(dir_out))
                fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
                with Image.open(BytesIO(im64)) as im:
                    im.save(fp_out, quality=100)
            except Exception as e:
                # Best-effort conversion: log the bad record and keep going.
                log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))
