'''
Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
'''

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Path to input TSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output path for images')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
  """Converts MSCeleb TSV to images

  Reads the TSV at ``opt_fp_in`` one row at a time. Each row is split on
  tabs into six fields; the last field is base64-decoded into an image,
  which is saved as a JPEG under ``<opt_fp_out>/<freebase_mid>/``. The
  filename is the zero-padded count of entries already in that directory.

  Rows that cannot be parsed, decoded or saved are logged and skipped.

  NOTE: ``opt_slice`` is accepted by the CLI but is not applied here.
  """
  # Imports are kept local to the command, as in the rest of this function
  import os
  from os.path import join
  from pathlib import Path
  import base64
  from io import BytesIO

  from PIL import Image
  from tqdm import tqdm

  from app.utils import file_utils

  log.debug(f'opening "{opt_fp_in}" ...')

  # Pre-count rows so the progress bar has a total. Count in binary mode
  # (matching the read loop below) and close the handle when done.
  try:
    with open(opt_fp_in, 'rb') as fp_count:
      n_lines = sum(1 for _ in fp_count)
  except OSError:
    # the count is best-effort; the open() below reports real failures
    n_lines = 1

  log.debug('{:,}'.format(n_lines))

  with open(opt_fp_in, 'rb') as fp:
    for data_line in tqdm(fp, total=n_lines):
      # Reset per row so the error handler never hits an unbound name or
      # reports values left over from the previous row.
      query_name = url_image = None
      try:
        fields = data_line.decode().split('\t')
        # raises ValueError (before binding anything) unless exactly 6 fields
        freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = fields

        # decode image
        im64 = base64.b64decode(b64_bytes)
        im = Image.open(BytesIO(im64))

        # save image
        dir_out = join(opt_fp_out, freebase_mid)
        Path(dir_out).mkdir(parents=True, exist_ok=True)
        # next index = number of entries already in the identity's directory
        idx = len(os.listdir(dir_out))
        fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
        im.save(fp_out, quality=100)
      except Exception as e:
        # deliberate best-effort: log the bad row and continue with the next
        log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))