summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/msceleb_names.py
blob: 6ee2ad9a855176520b5b129c15ffbf807c35e566 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
'''
Extracts the base64-encoded thumbnails in MsCelebV1-ImageThumbnails.part.00.tsv
to JPEG images, written into one directory per Freebase MID
'''
import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

# Module-level logger obtained from the project's Logger helper.
log = Logger.getLogger()

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Path to input TSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output path for images')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Extracts MSCeleb TSV thumbnails to JPEG files grouped by Freebase MID"""
  # NOTE: the docstring above doubles as the click --help text, so it is kept
  # to a single line; the details live in the comments below.
  #
  # ctx        -- click context injected by @click.pass_context (unused here)
  # opt_fp_in  -- path to the TSV file; each non-empty row must have 6
  #               tab-separated columns, the last one a base64-encoded image
  # opt_fp_out -- root output directory; images are written to
  #               <opt_fp_out>/<freebase_mid>/<index>.jpg
  #
  # Raises ValueError when a non-empty row does not have exactly 6 columns.

  # Function-scope imports; only the names this command actually uses.
  import os
  import base64
  from io import BytesIO
  from os.path import join
  from pathlib import Path

  from PIL import Image
  from tqdm import tqdm

  from app.utils import file_utils

  log = Logger.getLogger()
  log.debug(f'opening "{opt_fp_in}" ...')
  # Count lines in binary mode, exactly as the main loop iterates them, so the
  # progress-bar total matches and no text decoding is attempted on the
  # (potentially huge) file. The context manager closes the handle.
  with open(opt_fp_in, 'rb') as fp:
    n_lines = sum(1 for _ in fp)
  log.debug('{:,}'.format(n_lines))

  # Next free image index per Freebase MID. Seeded from the directory contents
  # the first time a MID is seen (so a re-run keeps appending after existing
  # files), then incremented in memory instead of re-listing the directory for
  # every single image.
  next_idx = {}

  with open(opt_fp_in, 'rb') as fp:
    for line_no, data_line in enumerate(tqdm(fp, total=n_lines), start=1):
      row = data_line.decode().rstrip('\r\n')
      if not row:
        # tolerate blank lines (e.g. a trailing newline at end of file)
        continue
      fields = row.split('\t')
      if len(fields) != 6:
        raise ValueError(
          f'{opt_fp_in}:{line_no}: expected 6 tab-separated fields, got {len(fields)}')
      # column order: freebase_mid, query_name, search_rank, url_image,
      # url_page, b64_bytes -- only the first and last are needed
      freebase_mid, b64_bytes = fields[0], fields[5]
      # decode image
      im64 = base64.b64decode(b64_bytes)
      im = Image.open(BytesIO(im64))
      # save image
      dir_out = join(opt_fp_out, freebase_mid)
      if freebase_mid not in next_idx:
        Path(dir_out).mkdir(parents=True, exist_ok=True)
        next_idx[freebase_mid] = len(os.listdir(dir_out))
      idx = next_idx[freebase_mid]
      next_idx[freebase_mid] = idx + 1
      # zpad is a project helper; presumably zero-pads the index -- TODO confirm
      fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
      im.save(fp_out, quality=100)