Diffstat (limited to 'megapixels/commands/datasets')
| -rw-r--r-- | megapixels/commands/datasets/file_record.py (renamed from megapixels/commands/datasets/records.py) | 89 |
| -rw-r--r-- | megapixels/commands/datasets/msceleb.py | 66 |
| -rw-r--r-- | megapixels/commands/datasets/msceleb_names.py | 57 |

3 files changed, 186 insertions(+), 26 deletions(-)
diff --git a/megapixels/commands/datasets/records.py b/megapixels/commands/datasets/file_record.py
index b6ef618b..355b22f2 100644
--- a/megapixels/commands/datasets/records.py
+++ b/megapixels/commands/datasets/file_record.py
@@ -10,7 +10,12 @@ from app.utils.logger_utils import Logger
 
 log = Logger.getLogger()
 
-identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
+# Choose the part of the filepath that will be used for the person identity
+# e.g. "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama"
+#      (uses the last component of the subdirectory path)
+# e.g. [numeric] --> 0, 1, 2, ... (an incrementing index per file)
+
+identity_sources = ['subdir', 'numeric']
 
 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', default=None,
@@ -21,7 +26,7 @@ identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
   help='Override enum media directory')
 @click.option('--data_store', 'opt_data_store',
   type=cfg.DataStoreVar,
-  default=click_utils.get_default(types.DataStore.SSD),
+  default=click_utils.get_default(types.DataStore.HDD),
   show_default=True,
   help=click_utils.show_help(types.Dataset))
 @click.option('--dataset', 'opt_dataset',
@@ -35,7 +40,8 @@ identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
   help='Number of threads')
 @click.option('-f', '--force', 'opt_force', is_flag=True,
   help='Force overwrite file')
-@click.option('--identity', 'opt_identity', default=None, type=click.Choice(identity_sources),
+@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources),
+  default='numeric',
   help='Identity source, blank for no identity')
 @click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
   help='Use glob recursion (slower)')
@@ -44,7 +50,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   opt_identity, opt_force, opt_recursive):
   """Generates sha256, uuid, and identity index CSV file"""
 
-  import sys
+  import sys, os
   from glob import glob
   from os.path import join
   from pathlib import Path
@@ -53,9 +59,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   import random
   import uuid
 
+  import cv2 as cv
   import pandas as pd
   from tqdm import tqdm
   from glob import glob
+  from operator import itemgetter
 
   from app.models.data_store import DataStore
   from app.utils import file_utils, im_utils
@@ -91,15 +99,26 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
 
   pbar = tqdm(total=len(fp_ims))
 
-  def as_sha256(fp_im):
+  def pool_mapper(fp_im):
     pbar.update(1)
-    return file_utils.sha256(fp_im)
+    sha256 = file_utils.sha256(fp_im)
+    im = cv.imread(fp_im)
+    w, h = im.shape[:2][::-1]
+    file_size_kb = os.stat(fp_im).st_size // 1000
+    num_channels = im_utils.num_channels(im)
+    return {
+      'width': w,
+      'height': h,
+      'sha256': sha256,
+      'file_size_kb': file_size_kb,
+      'num_channels': num_channels
+    }
 
   # convert to thread pool
-  sha256s = [] # ?
+  pool_maps = [] # ?
   pool = ThreadPool(opt_threads)
   with tqdm(total=len(fp_ims)) as pbar:
-    sha256s = pool.map(as_sha256, fp_ims)
+    pool_maps = pool.map(pool_mapper, fp_ims)
   pbar.close()
@@ -108,11 +127,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
 
   data = []
   indentity_count = 0
-  for sha256, fp_im in zip(sha256s, fp_ims):
+  for pool_map, fp_im in zip(pool_maps, fp_ims):
     fpp_im = Path(fp_im)
     subdir = str(fpp_im.parent.relative_to(fp_in))
+    #subdir = '' if subdir is '.' else subdir
+    log.debug(subdir)
-
     if opt_identity:
       subdirs = subdir.split('/')
       if not len(subdirs) > 0:
@@ -120,27 +140,46 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
         log.error('exiting')
         return
       if opt_identity == 'subdir':
-        identity = subdirs[0] # use first/only part
-      elif opt_identity == 'subdir_head':
-        identity = subdirs[0] # use first part of subdir path
-      elif opt_identity == 'subdir_tail':
         identity = subdirs[-1] # use last part of subdir path
+      elif opt_identity == 'numeric':
+        identity = indentity_count # use incrementing number
+        indentity_count += 1
       else:
-        identity = indentity_count # use incrementing number
-        indentity_count += 1
+        identity = ''
 
     data.append({
       'subdir': subdir,
+      'num_channels': pool_map['num_channels'],
      'fn': fpp_im.stem,
      'ext': fpp_im.suffix.replace('.',''),
-     'sha256': sha256,
+     'sha256': pool_map['sha256'],
      'uuid': uuid.uuid4(),
-     'identity_key': identity
+     'identity_key': identity,
+     'width': pool_map['width'],
+     'height': pool_map['height']
    })
 
+  # create dataframe
   df_records = pd.DataFrame.from_dict(data)
+
+  df_records.index.name = 'index' # reassign 'index' as primary key column
+  # write to CSV
+  file_utils.mkdirs(fp_out)
+  df_records.to_csv(fp_out)
+  # done
+  log.info(f'wrote {len(df_records)} rows to "{fp_out}"')
+  # save script
+  cmd_line = ' '.join(sys.argv)
+  file_utils.write_text(cmd_line, '{}.sh'.format(fp_out))
+
+
+'''
+# create dataframe
+  df_records = pd.DataFrame.from_dict(data)
+
+  # add identity key (used for associating identity)
   if opt_identity:
-    log.info(f'adding identity index using: "{opt_identity}". This may take a while...')
+    log.info(f'adding identity index using: "{opt_identity}" subdirectory')
     # convert dict to DataFrame
     # sort based on identity_key
     df_records = df_records.sort_values(by=['identity_key'], ascending=True)
@@ -149,19 +188,17 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
     # populate the identity_index
     df_records_identity_groups = df_records.groupby('identity_key')
     # enumerate groups to create identity indices
+    log.info(f'updating records with identity_key. This may take a while...')
+    st = time.time()
     for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups):
       identity_key, df_records_identity_group = df_records_identity_group_tuple
       for ds_record in df_records_identity_group.itertuples():
         df_records.at[ds_record.Index, 'identity_index'] = identity_index
     # reset index after being sorted
     df_records = df_records.reset_index(drop=True)
+    log.debug('update time: {:.2f}s'.format(time.time() - st))
   else:
     # name everyone person 1, 2, 3...
+    df_records = df_records.sort_values(by=['subdir'], ascending=True)
     pass
-
-  df_records.index.name = 'index' # reassign 'index' as primary key column
-  # write to CSV
-  file_utils.mkdirs(fp_out)
-  df_records.to_csv(fp_out)
-  # done
-  log.info(f'wrote rows: {len(df_records)} to {fp_out}')
\ No newline at end of file
+'''
\ No newline at end of file
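The rename from records.py to file_record.py also widens the worker from a bare sha256 call to a pool_mapper that gathers hash, pixel dimensions, channel count, and file size in one pass. Below is a minimal standalone sketch of that pattern, assuming hypothetical input paths and swapping the repo helpers (file_utils.sha256, im_utils.num_channels) for stdlib and OpenCV equivalents; it also drives tqdm from the main thread via imap, rather than creating two progress bars and updating one from inside the workers as the patch does.

```python
import hashlib
import os
from multiprocessing.pool import ThreadPool

import cv2 as cv
import pandas as pd
from tqdm import tqdm

def sha256_file(fp_im, blocksize=65536):
  # hash in fixed-size blocks so large files never load fully into memory
  h = hashlib.sha256()
  with open(fp_im, 'rb') as f:
    for block in iter(lambda: f.read(blocksize), b''):
      h.update(block)
  return h.hexdigest()

def pool_mapper(fp_im):
  # one record per file: hash, pixel dimensions, channels, size on disk
  im = cv.imread(fp_im)
  if im is None:
    raise ValueError('could not read {}'.format(fp_im))
  h, w = im.shape[:2]
  return {
    'sha256': sha256_file(fp_im),
    'width': w,
    'height': h,
    'num_channels': 1 if im.ndim == 2 else im.shape[2],
    'file_size_kb': os.stat(fp_im).st_size // 1000,
  }

if __name__ == '__main__':
  fp_ims = ['001.jpg', '002.jpg']  # hypothetical input paths
  with ThreadPool(4) as pool:
    # imap yields results as they finish, so tqdm ticks from the main thread
    rows = list(tqdm(pool.imap(pool_mapper, fp_ims), total=len(fp_ims)))
  df_records = pd.DataFrame(rows)
  df_records.index.name = 'index'
  print(df_records.head())
```

A ThreadPool is a reasonable fit here because the work is dominated by disk reads and C-level image decoding, both of which release the GIL.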
diff --git a/megapixels/commands/datasets/msceleb.py b/megapixels/commands/datasets/msceleb.py
new file mode 100644
index 00000000..969a1df2
--- /dev/null
+++ b/megapixels/commands/datasets/msceleb.py
@@ -0,0 +1,66 @@
+'''
+Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
+'''
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Path to input TSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output path for images')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
+  """Converts MSCeleb TSV to images"""
+
+  import sys
+  import os
+  from glob import glob
+  from os.path import join
+  from pathlib import Path
+  import time
+  import base64
+  from io import BytesIO
+
+  import pandas as pd
+  import cv2 as cv
+  from PIL import Image
+  from tqdm import tqdm
+
+  from app.utils import file_utils, im_utils
+  from app.models.data_store import DataStore
+
+
+  log = Logger.getLogger()
+  log.debug(f'opening "{opt_fp_in}" ...')
+  try:
+    n_lines = sum(1 for line in open(opt_fp_in))
+  except:
+    n_lines = 1
+
+  log.debug('{:,}'.format(n_lines))
+
+  with open(opt_fp_in, 'rb') as fp:
+    for data_line in tqdm(fp, total=n_lines):
+      try:
+        freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = data_line.decode().split('\t')
+        # decode image
+        im64 = base64.b64decode(b64_bytes)
+        im = Image.open(BytesIO(im64))
+        # save image
+        dir_out = join(opt_fp_out, freebase_mid)
+        Path(dir_out).mkdir(parents=True, exist_ok=True)
+        idx = len(os.listdir(dir_out))
+        fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
+        im.save(fp_out, quality=100)
+      except Exception as e:
+        log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))
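msceleb.py treats every TSV row as six tab-separated columns, the last being a base64-encoded JPEG thumbnail. A minimal sketch of that decode step in isolation, using the column order unpacked above (the name decode_line and the sample filename are illustrative, not from the repo):

```python
import base64
from io import BytesIO

from PIL import Image

def decode_line(data_line):
  # column layout as unpacked by the command above
  freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = \
    data_line.decode().split('\t')
  # b64decode discards the trailing newline left by line iteration
  im = Image.open(BytesIO(base64.b64decode(b64_bytes)))
  return freebase_mid, query_name, im

if __name__ == '__main__':
  with open('MsCelebV1-ImageThumbnails.part.00.tsv', 'rb') as fp:
    mid, name, im = decode_line(next(fp))
    print(mid, name, im.size)
```

Note that the command names each saved image by counting the files already in the identity directory (idx = len(os.listdir(dir_out))), so re-running it against the same output directory appends duplicate copies rather than overwriting.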
diff --git a/megapixels/commands/datasets/msceleb_names.py b/megapixels/commands/datasets/msceleb_names.py
new file mode 100644
index 00000000..6ee2ad9a
--- /dev/null
+++ b/megapixels/commands/datasets/msceleb_names.py
@@ -0,0 +1,57 @@
+'''
+Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
+'''
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Path to input TSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output path for images')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+  """Converts MSCeleb TSV to names file with image count"""
+
+  import sys
+  import os
+  from glob import glob
+  from os.path import join
+  from pathlib import Path
+  import time
+  import base64
+  from io import BytesIO
+
+  import pandas as pd
+  import cv2 as cv
+  from PIL import Image
+  from tqdm import tqdm
+
+  from app.utils import file_utils, im_utils
+  from app.models.data_store import DataStore
+
+
+  log = Logger.getLogger()
+  log.debug(f'opening "{opt_fp_in}" ...')
+  n_lines = sum(1 for line in open(opt_fp_in))
+  log.debug('{:,}'.format(n_lines))
+
+  with open(opt_fp_in, 'rb') as fp:
+    for data_line in tqdm(fp, total=n_lines):
+      freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = data_line.decode().split('\t')
+      # decode image
+      im64 = base64.b64decode(b64_bytes)
+      im = Image.open(BytesIO(im64))
+      # save image
+      dir_out = join(opt_fp_out, freebase_mid)
+      Path(dir_out).mkdir(parents=True, exist_ok=True)
+      idx = len(os.listdir(dir_out))
+      fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
+      im.save(fp_out, quality=100)
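As committed, msceleb_names.py repeats the image-extraction loop from msceleb.py (minus the try/except) and never writes a names file, despite its docstring and the "names file with image count" summary. A speculative sketch of what that summary step could look like, assuming the same six-column TSV layout; write_name_counts and the CSV schema are illustrative only, not from the repo:

```python
import csv
from collections import Counter

def write_name_counts(fp_tsv, fp_csv):
  # tally thumbnails per Freebase MID and keep one display name per MID
  counts = Counter()
  names = {}
  with open(fp_tsv, 'rb') as fp:
    for data_line in fp:
      freebase_mid, query_name = data_line.decode().split('\t')[:2]
      counts[freebase_mid] += 1
      names[freebase_mid] = query_name
  with open(fp_csv, 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(['freebase_mid', 'name', 'num_images'])
    for mid, n in counts.most_common():
      writer.writerow([mid, names[mid], n])

# e.g. write_name_counts('MsCelebV1-ImageThumbnails.part.00.tsv', 'msceleb_names.csv')
```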
