'''
Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
'''

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
    help='Path to input TSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
    help='Output path for images')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Converts MSCeleb TSV to names file with image count"""
    # NOTE: the docstring above is shown by click as the command's --help text,
    # so it is kept verbatim. What the command does: for every row of the input
    # TSV it base64-decodes the last column into an image and writes it to
    #   <opt_fp_out>/<freebase_mid>/<zero-padded index>.jpg
    #
    # ctx        -- click context (unused here)
    # opt_fp_in  -- path to the input TSV file
    # opt_fp_out -- root directory that receives one sub-directory per freebase_mid
    #
    # Raises ValueError when a row does not have exactly six tab-separated fields.

    import os
    from os.path import join
    from pathlib import Path
    import base64
    from io import BytesIO

    from PIL import Image
    from tqdm import tqdm

    from app.utils import file_utils

    # Image modes Pillow's JPEG writer accepts as-is; any other mode
    # (e.g. RGBA, P, LA) makes im.save() raise, so those are converted to RGB.
    jpeg_modes = ('L', 'RGB', 'CMYK')
    n_fields = 6

    log.debug(f'opening "{opt_fp_in}" ...')

    # First pass: count rows so tqdm can show a total. Read as bytes inside a
    # context manager so the handle is closed and no text decoding is attempted.
    with open(opt_fp_in, 'rb') as fp:
        n_lines = sum(1 for _ in fp)
    log.debug('{:,}'.format(n_lines))

    # Next file index per output directory. Seeded from the directory's current
    # entry count the first time the directory is seen, then incremented in
    # memory instead of re-listing the directory for every image.
    next_idx = {}

    with open(opt_fp_in, 'rb') as fp:
        for line_no, data_line in enumerate(tqdm(fp, total=n_lines), start=1):
            fields = data_line.decode().rstrip('\r\n').split('\t')
            if len(fields) != n_fields:
                raise ValueError(
                    'line {}: expected {} tab-separated fields, found {}'.format(
                        line_no, n_fields, len(fields)))
            (freebase_mid, query_name, search_rank,
                url_image, url_page, b64_bytes) = fields

            # decode image
            im64 = base64.b64decode(b64_bytes)
            im = Image.open(BytesIO(im64))
            if im.mode not in jpeg_modes:
                im = im.convert('RGB')

            # save image
            dir_out = join(opt_fp_out, freebase_mid)
            if dir_out not in next_idx:
                Path(dir_out).mkdir(parents=True, exist_ok=True)
                next_idx[dir_out] = len(os.listdir(dir_out))
            idx = next_idx[dir_out]
            fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
            im.save(fp_out, quality=100)
            # advance only after a successful save
            next_idx[dir_out] = idx + 1