1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
'''
Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
'''
import click
from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger
log = Logger.getLogger()
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Path to input TSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output path for images')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
    """Converts MSCeleb TSV to images"""
    # NOTE: the docstring above is kept to a single line on purpose; click
    # displays the command function's docstring as its --help text.
    #
    # Reads the TSV at `opt_fp_in` one row at a time. Each row is split on
    # tabs into six fields (freebase_mid, query_name, search_rank, url_image,
    # url_page, b64_bytes). The last field is base64-decoded, opened as an
    # image and re-saved as
    #   <opt_fp_out>/<freebase_mid>/<zero-padded index>.jpg
    # Rows that cannot be processed are logged and skipped.
    #
    # Parameters:
    #   ctx        -- click context (unused here).
    #   opt_fp_in  -- path to the input TSV file.
    #   opt_fp_out -- root directory for the extracted images.
    #   opt_slice  -- NOTE(review): accepted but currently not applied to the
    #                 rows; confirm the intended semantics before wiring it in.

    # Heavy / optional dependencies are imported lazily so that loading the
    # command (e.g. for --help) stays cheap.
    import os
    from os.path import join
    from pathlib import Path
    import base64
    from io import BytesIO

    from PIL import Image
    from tqdm import tqdm

    from app.utils import file_utils

    log = Logger.getLogger()

    log.debug(f'opening "{opt_fp_in}" ...')

    # Count rows up front only to give the progress bar a total. Count in
    # binary mode (same mode as the processing loop) so that no text decoding
    # can fail here, and close the handle deterministically.
    try:
        with open(opt_fp_in, 'rb') as fp_count:
            n_lines = sum(1 for _ in fp_count)
    except OSError:
        # Unknown total: tqdm accepts None and shows a plain counter instead.
        n_lines = None
    if n_lines is not None:
        log.debug('{:,}'.format(n_lines))

    with open(opt_fp_in, 'rb') as fp:
        for data_line in tqdm(fp, total=n_lines):
            # Reset per row so the error message below never references
            # unbound names (first row malformed) or stale values carried
            # over from the previous row.
            query_name = url_image = None
            try:
                # A row with a different number of fields raises ValueError
                # here, before any of the names are rebound.
                (freebase_mid, query_name, search_rank,
                 url_image, url_page, b64_bytes) = data_line.decode().split('\t')

                # decode image
                im64 = base64.b64decode(b64_bytes)
                im = Image.open(BytesIO(im64))

                # save image
                dir_out = join(opt_fp_out, freebase_mid)
                Path(dir_out).mkdir(parents=True, exist_ok=True)
                # The next file index is the number of entries already in the
                # per-identity directory.
                # NOTE(review): this assumes a single writer and that the
                # directory holds only files produced by this command.
                idx = len(os.listdir(dir_out))
                fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
                im.save(fp_out, quality=100)
            except Exception as e:
                # Best effort by design: one bad row must not abort the
                # whole conversion.
                log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))
|