From 27340ac4cd43f8eec7414495b541a65566ae2656 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Tue, 8 Oct 2019 16:02:47 +0200 Subject: update site, white --- megapixels/app/models/bbox.py | 14 +- megapixels/app/models/dataset.py | 14 +- megapixels/app/site/parser.py | 30 ++- megapixels/app/utils/draw_utils.py | 42 +++- megapixels/app/utils/im_utils.py | 14 +- megapixels/commands/datasets/file_record.py | 234 --------------------- .../commands/datasets/megaface_age_from_orig.py | 62 ++++++ megapixels/commands/demo/face_search.py | 2 +- .../commands/processor/_old_files_to_face_rois.py | 2 +- .../commands/processor/face_roi_from_annos.py | 187 ++++++++++++++++ megapixels/commands/processor/file_record.py | 234 +++++++++++++++++++++ megapixels/commands/site/age_gender_to_site.py | 100 +++++++++ 12 files changed, 684 insertions(+), 251 deletions(-) delete mode 100644 megapixels/commands/datasets/file_record.py create mode 100644 megapixels/commands/datasets/megaface_age_from_orig.py create mode 100644 megapixels/commands/processor/face_roi_from_annos.py create mode 100644 megapixels/commands/processor/file_record.py create mode 100644 megapixels/commands/site/age_gender_to_site.py (limited to 'megapixels') diff --git a/megapixels/app/models/bbox.py b/megapixels/app/models/bbox.py index 8ecc8971..c840ea1b 100644 --- a/megapixels/app/models/bbox.py +++ b/megapixels/app/models/bbox.py @@ -207,11 +207,21 @@ class BBox: # ----------------------------------------------------------------- # Convert to - def to_square(self, bounds): + def to_square(self): '''Forces bbox to square dimensions - :param bounds: (int, int) w, h of the image :returns (BBox) in square ratio ''' + if self._width > self._height: + delta = (self._width - self._height) / 2 + self._y1 -= delta + self._y2 += delta + elif self._height > self._width: + delta = (self._height - self._width) / 2 + self._x1 -= delta + self._x2 += delta + return BBox(self._x1, self._y1, self._x2, self._y2) + + def to_dim(self, dim): """scale is (w, h) is tuple of dimensions""" diff --git a/megapixels/app/models/dataset.py b/megapixels/app/models/dataset.py index a7227a70..c908da1b 100644 --- a/megapixels/app/models/dataset.py +++ b/megapixels/app/models/dataset.py @@ -152,6 +152,8 @@ class Dataset: image_records = [] # list of image matches w/identity if available # find most similar feature vectors indexes #match_idxs = self.similar(query_vec, n_results, threshold) + + # TODO: add cosine similarity sim_scores = np.linalg.norm(np.array([query_vec]) - np.array(self._face_vectors), axis=1) match_idxs = np.argpartition(sim_scores, range(n_results))[:n_results] @@ -180,7 +182,17 @@ class Dataset: s3_url = self.data_store_s3.face(ds_record.uuid) bbox_norm = BBox.from_xywh_norm_dim(ds_roi.x, ds_roi.y, ds_roi.w, ds_roi.h, dim) self.log.debug(f'bbox_norm: {bbox_norm}') - score = sim_scores[match_idx] + self.log.debug(f'match_idx: {match_idx}, record_idx: {record_idx}, roi_index: {roi_index}, len sim_scores: {len(sim_scores)}') + try: + score = sim_scores[match_idx] + except Exception as e: + self.log.error(e) + try: + score = sim_scores[record_idx] + except Exception as e: + self.log.error(e) + + if types.Metadata.IDENTITY in self._metadata.keys(): ds_id = df_identity.loc[df_identity['identity_key'] == ds_record.identity_key].iloc[0] diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py index 3700efd1..6ab8c700 100644 --- a/megapixels/app/site/parser.py +++ b/megapixels/app/site/parser.py @@ -162,6 +162,35 @@ def intro_section(metadata, 
s3_path):
     Build the intro section for datasets
     """
+    section = "".format(s3_path + metadata['image'])
+    # section += ""
+
+    # parts = []
+    # if 'desc' in metadata:
+    # desc = metadata['desc']
+    # # colorize the first instance of the database name in the header
+    # if 'color' in metadata and metadata['title'] in desc:
+    # desc = desc.replace(metadata['title'], "{}".format(metadata['color'], metadata['title']), 1)
+    # section += "{}".format(desc, desc)
+
+    # if 'subdesc' in metadata:
+    # subdesc = markdown(metadata['subdesc']).replace('', '').replace('', '')
+    # section += "{}".format(subdesc, subdesc)
+
+    # section += ""
+    section += ""
+
+    if 'caption' in metadata:
+        section += "{}".format(metadata['caption'])
+
+    return section
+
+
+def intro_section_v1(metadata, s3_path):
+    """
+    Build the intro section for datasets
+    """
+    section = "".format(s3_path + metadata['image'])
     section += "
" @@ -185,7 +214,6 @@ def intro_section(metadata, s3_path): return section - def fix_images(lines, s3_path): """ do our own transformation of the markdown around images to handle wide images etc diff --git a/megapixels/app/utils/draw_utils.py b/megapixels/app/utils/draw_utils.py index 7044a62f..1836768b 100644 --- a/megapixels/app/utils/draw_utils.py +++ b/megapixels/app/utils/draw_utils.py @@ -3,8 +3,10 @@ from math import sqrt import numpy as np import cv2 as cv +import PIL +from PIL import ImageDraw -from app.utils import logger_utils +from app.utils import logger_utils, im_utils log = logger_utils.Logger.getLogger() @@ -118,6 +120,22 @@ def draw_landmarks2D(im, points_norm, radius=3, color=(0,255,0)): cv.circle(im_dst, pt, radius, color, -1, cv.LINE_AA) return im_dst +def draw_landmarks2D_pil(im, points_norm, radius=3, color=(0,255,0)): + '''Draws facial landmarks, either 5pt or 68pt + ''' + im_pil = im_utils.ensure_pil(im_utils.bgr2rgb(im)) + draw = ImageDraw.Draw(im_pil) + dim = im.shape[:2][::-1] + for x,y in points_norm: + x1, y1 = (int(x*dim[0]), int(y*dim[1])) + xyxy = (x1, y1, x1+radius, y1+radius) + draw.ellipse(xyxy, fill='white') + del draw + im_dst = im_utils.ensure_np(im_pil) + im_dst = im_utils.rgb2bgr(im_dst) + return im_dst + + def draw_landmarks3D(im, points, radius=3, color=(0,255,0)): '''Draws 3D facial landmarks ''' @@ -126,12 +144,26 @@ def draw_landmarks3D(im, points, radius=3, color=(0,255,0)): cv.circle(im_dst, (x,y), radius, color, -1, cv.LINE_AA) return im_dst -def draw_bbox(im, bbox_norm, color=(0,255,0), stroke_weight=2): +def draw_bbox(im, bboxes_norm, color=(0,255,0), stroke_weight=2): '''Draws BBox onto cv image + :param color: RGB value ''' - im_dst = im.copy() - bbox_dim = bbox_norm.to_dim(im.shape[:2][::-1]) - cv.rectangle(im_dst, bbox_dim.pt_tl, bbox_dim.pt_br, color, stroke_weight, cv.LINE_AA) + #im_dst = im.copy() + if not type(bboxes_norm) == list: + bboxes_norm = [bboxes_norm] + + im_pil = im_utils.ensure_pil(im_utils.bgr2rgb(im)) + im_pil_draw = ImageDraw.ImageDraw(im_pil) + + for bbox_norm in bboxes_norm: + bbox_dim = bbox_norm.to_dim(im.shape[:2][::-1]) + #cv.rectangle(im_dst, bbox_dim.pt_tl, bbox_dim.pt_br, color, stroke_weight, cv.LINE_AA) + xyxy = (bbox_dim.pt_tl, bbox_dim.pt_br) + im_pil_draw.rectangle(xyxy, outline=color, width=stroke_weight) + # draw.rectangle([x1, y1, x2, y2], outline=, width=3) + im_dst = im_utils.ensure_np(im_pil) + im_dst = im_utils.rgb2bgr(im_dst) + del im_pil_draw return im_dst def draw_pose(im, pt_nose, image_pts): diff --git a/megapixels/app/utils/im_utils.py b/megapixels/app/utils/im_utils.py index d36c1c32..670d5168 100644 --- a/megapixels/app/utils/im_utils.py +++ b/megapixels/app/utils/im_utils.py @@ -11,11 +11,6 @@ from skimage import feature import imutils import time import numpy as np -import torch -import torch.nn as nn -import torchvision.models as models -import torchvision.transforms as transforms -from torch.autograd import Variable from sklearn.metrics.pairwise import cosine_similarity import datetime @@ -293,6 +288,13 @@ def bgr2rgb(im): """ return cv.cvtColor(im,cv.COLOR_BGR2RGB) +def rgb2bgr(im): + """Wrapper for cv2.cvtColor transform + :param im: Numpy.ndarray (BGR) + :returns: Numpy.ndarray (RGB) + """ + return cv.cvtColor(im,cv.COLOR_RGB2BGR) + def compute_laplacian(im): # below 100 is usually blurry return cv.Laplacian(im, cv.CV_64F).var() @@ -329,7 +331,7 @@ def normalizedGraylevelVariance(img): s = stdev[0]**2 / mean[0] return s[0] -def 
compute_if_blank(im,width=100,sigma=0,thresh_canny=.1,thresh_mean=4,mask=None): +def is_blank(im,width=100,sigma=0,thresh_canny=.1,thresh_mean=4,mask=None): # im is graysacale np #im = imutils.resize(im,width=width) #mask = imutils.resize(mask,width=width) diff --git a/megapixels/commands/datasets/file_record.py b/megapixels/commands/datasets/file_record.py deleted file mode 100644 index 41a5df28..00000000 --- a/megapixels/commands/datasets/file_record.py +++ /dev/null @@ -1,234 +0,0 @@ -''' - -''' -import click - -from app.settings import types -from app.utils import click_utils -from app.settings import app_cfg as cfg -from app.utils.logger_utils import Logger - -log = Logger.getLogger() - -# Choose part of the filepath that will be used for the person identity -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_tail] --> "barack_obama" -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_head] --> "batch_1" -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama" - -identity_sources = ['subdir', 'numeric'] - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', default=None, - help='Override enum input filename CSV') -@click.option('-o', '--output', 'opt_fp_out', default=None, - help='Override enum output filename CSV') -@click.option('-m', '--media', 'opt_dir_media', default=None, - help='Override enum media directory') -@click.option('--data_store', 'opt_data_store', - type=cfg.DataStoreVar, - default=click_utils.get_default(types.DataStore.HDD), - show_default=True, - help=click_utils.show_help(types.Dataset)) -@click.option('--dataset', 'opt_dataset', - type=cfg.DatasetVar, - required=True, - show_default=True, - help=click_utils.show_help(types.Dataset)) -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('-t', '--threads', 'opt_threads', default=12, - help='Number of threads') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite file') -@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources), - required=True, - help='Identity source key') -@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, - help='Use glob recursion (slower)') -@click.option('--max-depth', 'opt_max_depth', default=None, type=int, - help='Max number of images per subdirectory') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads, - opt_identity, opt_force, opt_recursive, opt_max_depth): - """Generates sha256, uuid, and identity index CSV file""" - - import sys, os - from glob import glob - from os.path import join - from pathlib import Path - import time - from multiprocessing.dummy import Pool as ThreadPool - import random - import uuid - - from PIL import Image - import cv2 as cv - import pandas as pd - from tqdm import tqdm - from glob import glob - from operator import itemgetter - - from app.models.data_store import DataStore - from app.utils import file_utils, im_utils - - - # set data_store - data_store = DataStore(opt_data_store, opt_dataset) - # get filepath out - fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out - # exit if exists - if not opt_force and Path(fp_out).exists(): - log.error('File exists. 
Use "-f / --force" to overwite') - return - - # ---------------------------------------------------------------- - # glob files - - fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original() - log.info(f'Globbing {fp_in}') - fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive) - - log.info('Found {:,} images'.format(len(fp_ims))) - subdir_groups = {} - if opt_max_depth: - log.debug(f'using max depth: {opt_max_depth}') - for fp_im in fp_ims: - fpp_im = Path(fp_im) - - subdir = fp_im.split('/')[-2] - if not subdir in subdir_groups.keys(): - subdir_groups[subdir] = [] - else: - subdir_groups[subdir].append(fp_im) - # for each subgroup, limit number of files - fp_ims = [] - for subdir_name, items in subdir_groups.items(): - ims = items[0:opt_max_depth] - fp_ims += ims - - log.debug(f'num subdirs: {len(subdir_groups.keys())}') - # fail if none - if not fp_ims: - log.error('No images. Try with "--recursive"') - return - # slice to reduce - if opt_slice: - fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] - log.info('Found {:,} images'.format(len(fp_ims))) - - # ---------------------------------------------------------------- - # multithread process into SHA256 - - pbar = tqdm(total=len(fp_ims)) - - def pool_mapper(fp_im): - pbar.update(1) - try: - sha256 = file_utils.sha256(fp_im) - im = Image.open(fp_im) - im.verify() # throws error if bad file - assert(im.size[0] > 60 and im.size[1] > 60) - except Exception as e: - log.warn(f'skipping file: {fp_im}') - return None - im = cv.imread(fp_im) - w, h = im.shape[:2][::-1] - file_size_kb = os.stat(fp_im).st_size // 1000 - num_channels = im_utils.num_channels(im) - return { - 'width': w, - 'height': h, - 'sha256': sha256, - 'file_size_kb': file_size_kb, - 'num_channels': num_channels - } - - # convert to thread pool - pool_maps = [] # ? - pool = ThreadPool(opt_threads) - with tqdm(total=len(fp_ims)) as pbar: - pool_maps = pool.map(pool_mapper, fp_ims) - pbar.close() - - - # ---------------------------------------------------------------- - # convert data to dict - - data = [] - indentity_count = 0 - for pool_map, fp_im in zip(pool_maps, fp_ims): - if pool_map is None: - log.warn(f'skipping file: {fp_im}') - continue # skip error files - fpp_im = Path(fp_im) - subdir = str(fpp_im.parent.relative_to(fp_in)) - - if opt_identity: - subdirs = subdir.split('/') - if not len(subdirs) > 0: - log.error(f'Could not split subdir: "{subdir}. 
Try different option for "--identity"') - log.error('exiting') - return - if opt_identity == 'subdir': - identity = subdirs[-1] # use last part of subdir path - elif opt_identity == 'numeric': - identity = indentity_count # use incrementing number - indentity_count += 1 - else: - identity = '' - - data.append({ - 'subdir': subdir, - 'num_channels': pool_map['num_channels'], - 'fn': fpp_im.stem, - 'ext': fpp_im.suffix.replace('.',''), - 'sha256': pool_map['sha256'], - 'uuid': uuid.uuid4(), - 'identity_key': identity, - 'width': pool_map['width'], - 'height': pool_map['height'] - }) - - # create dataframe - df_records = pd.DataFrame.from_dict(data) - - df_records.index.name = 'index' # reassign 'index' as primary key column - # write to CSV - file_utils.mkdirs(fp_out) - df_records.to_csv(fp_out) - # done - log.info(f'wrote {len(df_records)} rows to "{fp_out}"') - # save script - cmd_line = ' '.join(sys.argv) - file_utils.write_text(cmd_line, '{}.sh'.format(fp_out)) - - -''' -# create dataframe - df_records = pd.DataFrame.from_dict(data) - - # add identity key (used for associating identity) - if opt_identity: - log.info(f'adding identity index using: "{opt_identity}" subdirectory') - # convert dict to DataFrame - # sort based on identity_key - df_records = df_records.sort_values(by=['identity_key'], ascending=True) - # add new column for identity - df_records['identity_index'] = [-1] * len(df_records) - # populate the identity_index - df_records_identity_groups = df_records.groupby('identity_key') - # enumerate groups to create identity indices - log.info(f'updating records with identity_key. This may take a while...') - st = time.time() - for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): - identity_key, df_records_identity_group = df_records_identity_group_tuple - for ds_record in df_records_identity_group.itertuples(): - df_records.at[ds_record.Index, 'identity_index'] = identity_index - # reset index after being sorted - df_records = df_records.reset_index(drop=True) - log.debug('update time: {:.2f}s'.format(time.time() - st)) - else: - # name everyone person 1, 2, 3... 
- df_records = df_records.sort_values(by=['subdir'], ascending=True) - pass -''' \ No newline at end of file diff --git a/megapixels/commands/datasets/megaface_age_from_orig.py b/megapixels/commands/datasets/megaface_age_from_orig.py new file mode 100644 index 00000000..489bebf3 --- /dev/null +++ b/megapixels/commands/datasets/megaface_age_from_orig.py @@ -0,0 +1,62 @@ +import click + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input path to metadata directory') +@click.option('-o', '--output', 'opt_fp_out', + help='Output path to age CSV') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out): + """Creates CSV of MegaFace ages from original BBoxes""" + + import os + from os.path import join + from pathlib import Path + from glob import glob + + import dlib + import pandas as pd + from tqdm import tqdm + + from app.settings import types + from app.utils import click_utils + from app.settings import app_cfg + + from PIL import Image, ImageOps, ImageFilter + from app.utils import file_utils, im_utils, logger_utils + + log = logger_utils.Logger.getLogger() + + # ------------------------------------------------- + # process + fp_im_dirs = glob(join(opt_fp_in, '**/'), recursive=True) + + log.info('Found {} directories'.format(len(fp_im_dirs))) + + identities = {} + + for fp_im_dir in tqdm(fp_im_dirs): + # 1234567@N05_identity_1 + try: + dir_id_name = Path(fp_im_dir).name + nsid = dir_id_name.split('_')[0] + identity_num = dir_id_name.split('_')[2] + id_key = '{}_{}'.format(nsid, identity_num) + num_images = len(glob(join(fp_im_dir, '*.jpg'))) + if not id_key in identities.keys(): + identities[id_key] = {'nsid': nsid, 'identity': identity_num, 'images': num_images} + else: + identities[id_key]['images'] += num_images + except Exception as e: + continue + + # convert to dict + identities_list = [v for k, v in identities.items()] + df = pd.DataFrame.from_dict(identities_list) + + file_utils.mkdirs(opt_fp_out) + + log.info('Wrote {} lines to {}'.format(len(df), opt_fp_out)) + df.to_csv(opt_fp_out, index=False) + + diff --git a/megapixels/commands/demo/face_search.py b/megapixels/commands/demo/face_search.py index 4c7036f4..5218d501 100644 --- a/megapixels/commands/demo/face_search.py +++ b/megapixels/commands/demo/face_search.py @@ -10,7 +10,7 @@ log = Logger.getLogger() @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, - help='File to lookup') + help='Face image file to lookup') @click.option('--data_store', 'opt_data_store', type=cfg.DataStoreVar, default=click_utils.get_default(types.DataStore.HDD), diff --git a/megapixels/commands/processor/_old_files_to_face_rois.py b/megapixels/commands/processor/_old_files_to_face_rois.py index 895f4718..d92cbd74 100644 --- a/megapixels/commands/processor/_old_files_to_face_rois.py +++ b/megapixels/commands/processor/_old_files_to_face_rois.py @@ -1,4 +1,4 @@ - """ +""" Crop images to prepare for training """ diff --git a/megapixels/commands/processor/face_roi_from_annos.py b/megapixels/commands/processor/face_roi_from_annos.py new file mode 100644 index 00000000..fc933049 --- /dev/null +++ b/megapixels/commands/processor/face_roi_from_annos.py @@ -0,0 +1,187 @@ +""" +Crop images to prepare for training +""" + +import click +# from PIL import Image, ImageOps, ImageFilter, ImageDraw + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + +color_filters = {'color': 1, 'gray': 2, 'all': 3} + +@click.command() +@click.option('-i', 
'--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--size', 'opt_size', + type=(int, int), default=(480, 480), + help='Output image size') +@click.option('-d', '--detector', 'opt_detector_type', + type=cfg.FaceDetectNetVar, + default=click_utils.get_default(types.FaceDetectNet.CVDNN), + help=click_utils.show_help(types.FaceDetectNet)) +@click.option('-g', '--gpu', 'opt_gpu', default=0, + help='GPU index') +@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), + help='Confidence minimum threshold') +@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), + help='Number pyramids to upscale for DLIB detectors') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, + help='Display detections to debug') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--color', 'opt_color_filter', + type=click.Choice(color_filters.keys()), default='color', + help='Filter to keep color or grayscale images (color = keep color') +@click.option('--keep', 'opt_largest', type=click.Choice(['largest', 'all']), default='largest', + help='Only keep largest face') +@click.option('--zone', 'opt_zone', default=(0.0, 0.0), type=(float, float), + help='Face center must be located within zone region (0.5 = half width/height)') +@click.pass_context +def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_data_store, opt_dataset, opt_size, opt_detector_type, + opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_force, opt_color_filter, + opt_largest, opt_zone): + """Converts frames with faces to CSV of ROIs""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import dlib # must keep a local reference for dlib + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils, file_utils, im_utils, display_utils, draw_utils + from app.processors import face_detector + from app.models.data_store import DataStore + + # ------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + # set data_store + data_store = DataStore(opt_data_store, opt_dataset) + + # get filepath out + fp_out = data_store.metadata(types.Metadata.FACE_ROI) if opt_fp_out is None else opt_fp_out + if not opt_force and Path(fp_out).exists(): + log.error('File exists. 
Use "-f / --force" to overwite') + return + + # set detector + if opt_detector_type == types.FaceDetectNet.CVDNN: + detector = face_detector.DetectorCVDNN() + elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: + detector = face_detector.DetectorDLIBCNN(gpu=opt_gpu) + elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: + detector = face_detector.DetectorDLIBHOG() + elif opt_detector_type == types.FaceDetectNet.MTCNN_TF: + detector = face_detector.DetectorMTCNN_TF(gpu=opt_gpu) + elif opt_detector_type == types.FaceDetectNet.HAAR: + log.error('{} not yet implemented'.format(opt_detector_type.name)) + return + + + # get list of files to process + fp_record = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_in is None else opt_fp_in + df_record = pd.read_csv(fp_record, dtype=cfg.FILE_RECORD_DTYPES).set_index('index') + if opt_slice: + df_record = df_record[opt_slice[0]:opt_slice[1]] + log.debug('processing {:,} files'.format(len(df_record))) + + # filter out grayscale + color_filter = color_filters[opt_color_filter] + # set largest flag, to keep all or only largest + opt_largest = (opt_largest == 'largest') + + data = [] + skipped_files = [] + processed_files = [] + + for df_record in tqdm(df_record.itertuples(), total=len(df_record)): + fp_im = data_store.face(str(df_record.subdir), str(df_record.fn), str(df_record.ext)) + try: + im = cv.imread(fp_im) + im_resized = im_utils.resize(im, width=opt_size[0], height=opt_size[1]) + except Exception as e: + log.debug(f'could not read: {fp_im}') + return + # filter out color or grayscale iamges + if color_filter != color_filters['all']: + try: + is_gray = im_utils.is_grayscale(im) + if is_gray and color_filter != color_filters['gray']: + log.debug('Skipping grayscale image: {}'.format(fp_im)) + continue + except Exception as e: + log.error('Could not check grayscale: {}'.format(fp_im)) + continue + + try: + bboxes_norm = detector.detect(im_resized, pyramids=opt_pyramids, largest=opt_largest, + zone=opt_zone, conf_thresh=opt_conf_thresh) + except Exception as e: + log.error('could not detect: {}'.format(fp_im)) + log.error('{}'.format(e)) + continue + + if len(bboxes_norm) == 0: + skipped_files.append(fp_im) + log.warn(f'no faces in: {fp_im}') + log.warn(f'skipped: {len(skipped_files)}. 
found:{len(processed_files)} files') + else: + processed_files.append(fp_im) + for bbox in bboxes_norm: + roi = { + 'record_index': int(df_record.Index), + 'x': bbox.x, + 'y': bbox.y, + 'w': bbox.w, + 'h': bbox.h + } + data.append(roi) + + # if display optined + if opt_display and len(bboxes_norm): + # draw each box + for bbox_norm in bboxes_norm: + dim = im_resized.shape[:2][::-1] + bbox_dim = bbox.to_dim(dim) + if dim[0] > 1000: + im_resized = im_utils.resize(im_resized, width=1000) + im_resized = draw_utils.draw_bbox(im_resized, bbox_norm) + + # display and wait + cv.imshow('', im_resized) + display_utils.handle_keyboard() + + # create DataFrame and save to CSV + file_utils.mkdirs(fp_out) + df = pd.DataFrame.from_dict(data) + df.index.name = 'index' + df.to_csv(fp_out) + + # save script + file_utils.write_text(' '.join(sys.argv), '{}.sh'.format(fp_out)) \ No newline at end of file diff --git a/megapixels/commands/processor/file_record.py b/megapixels/commands/processor/file_record.py new file mode 100644 index 00000000..6403c768 --- /dev/null +++ b/megapixels/commands/processor/file_record.py @@ -0,0 +1,234 @@ +''' + +''' +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg +from app.utils.logger_utils import Logger + +log = Logger.getLogger() + +# Choose part of the filepath that will be used for the person identity +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_tail] --> "barack_obama" +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_head] --> "batch_1" +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama" + +identity_sources = ['subdir', 'numeric'] + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--data_store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-t', '--threads', 'opt_threads', default=12, + help='Number of threads') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources), + required=True, + help='Identity source key') +@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, + help='Use glob recursion (slower)') +@click.option('--max-depth', 'opt_max_depth', default=None, type=int, + help='Max number of images per subdirectory') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads, + opt_identity, opt_force, opt_recursive, opt_max_depth): + """Generates sha256, uuid, and identity index CSV file""" + + import sys, os + from glob import glob + from os.path import join + from pathlib import Path + import time + from multiprocessing.dummy import Pool as ThreadPool + import random + 
import uuid + + from PIL import Image + import cv2 as cv + import pandas as pd + from tqdm import tqdm + from glob import glob + from operator import itemgetter + + from app.models.data_store import DataStore + from app.utils import file_utils, im_utils + + + # set data_store + data_store = DataStore(opt_data_store, opt_dataset) + # get filepath out + fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out + # exit if exists + if not opt_force and Path(fp_out).exists(): + log.error(f'File {fp_out} exists. Use "-f / --force" to overwite') + return + + # ---------------------------------------------------------------- + # glob files + + fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original() + log.info(f'Globbing {fp_in}') + fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive) + + log.info('Found {:,} images'.format(len(fp_ims))) + subdir_groups = {} + if opt_max_depth: + log.debug(f'using max depth: {opt_max_depth}') + for fp_im in fp_ims: + fpp_im = Path(fp_im) + + subdir = fp_im.split('/')[-2] + if not subdir in subdir_groups.keys(): + subdir_groups[subdir] = [] + else: + subdir_groups[subdir].append(fp_im) + # for each subgroup, limit number of files + fp_ims = [] + for subdir_name, items in subdir_groups.items(): + ims = items[0:opt_max_depth] + fp_ims += ims + + log.debug(f'num subdirs: {len(subdir_groups.keys())}') + # fail if none + if not fp_ims: + log.error('No images. Try with "--recursive"') + return + # slice to reduce + if opt_slice: + fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] + log.info('Found {:,} images'.format(len(fp_ims))) + + # ---------------------------------------------------------------- + # multithread process into SHA256 + + pbar = tqdm(total=len(fp_ims)) + + def pool_mapper(fp_im): + pbar.update(1) + try: + sha256 = file_utils.sha256(fp_im) + im = Image.open(fp_im) + im.verify() # throws error if bad file + assert(im.size[0] > 60 and im.size[1] > 60) + except Exception as e: + log.warn(f'skipping file: {fp_im}') + return None + im = cv.imread(fp_im) + w, h = im.shape[:2][::-1] + file_size_kb = os.stat(fp_im).st_size // 1000 + num_channels = im_utils.num_channels(im) + return { + 'width': w, + 'height': h, + 'sha256': sha256, + 'file_size_kb': file_size_kb, + 'num_channels': num_channels + } + + # convert to thread pool + pool_maps = [] # ? + pool = ThreadPool(opt_threads) + with tqdm(total=len(fp_ims)) as pbar: + pool_maps = pool.map(pool_mapper, fp_ims) + pbar.close() + + + # ---------------------------------------------------------------- + # convert data to dict + + data = [] + indentity_count = 0 + for pool_map, fp_im in zip(pool_maps, fp_ims): + if pool_map is None: + log.warn(f'skipping file: {fp_im}') + continue # skip error files + fpp_im = Path(fp_im) + subdir = str(fpp_im.parent.relative_to(fp_in)) + + if opt_identity: + subdirs = subdir.split('/') + if not len(subdirs) > 0: + log.error(f'Could not split subdir: "{subdir}. 
Try different option for "--identity"') + log.error('exiting') + return + if opt_identity == 'subdir': + identity = subdirs[-1] # use last part of subdir path + elif opt_identity == 'numeric': + identity = indentity_count # use incrementing number + indentity_count += 1 + else: + identity = '' + + data.append({ + 'subdir': subdir, + 'num_channels': pool_map['num_channels'], + 'fn': fpp_im.stem, + 'ext': fpp_im.suffix.replace('.',''), + 'sha256': pool_map['sha256'], + 'uuid': uuid.uuid4(), + 'identity_key': identity, + 'width': pool_map['width'], + 'height': pool_map['height'] + }) + + # create dataframe + df_records = pd.DataFrame.from_dict(data) + + df_records.index.name = 'index' # reassign 'index' as primary key column + # write to CSV + file_utils.mkdirs(fp_out) + df_records.to_csv(fp_out) + # done + log.info(f'wrote {len(df_records)} rows to "{fp_out}"') + # save script + cmd_line = ' '.join(sys.argv) + file_utils.write_text(cmd_line, '{}.sh'.format(fp_out)) + + +''' +# create dataframe + df_records = pd.DataFrame.from_dict(data) + + # add identity key (used for associating identity) + if opt_identity: + log.info(f'adding identity index using: "{opt_identity}" subdirectory') + # convert dict to DataFrame + # sort based on identity_key + df_records = df_records.sort_values(by=['identity_key'], ascending=True) + # add new column for identity + df_records['identity_index'] = [-1] * len(df_records) + # populate the identity_index + df_records_identity_groups = df_records.groupby('identity_key') + # enumerate groups to create identity indices + log.info(f'updating records with identity_key. This may take a while...') + st = time.time() + for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): + identity_key, df_records_identity_group = df_records_identity_group_tuple + for ds_record in df_records_identity_group.itertuples(): + df_records.at[ds_record.Index, 'identity_index'] = identity_index + # reset index after being sorted + df_records = df_records.reset_index(drop=True) + log.debug('update time: {:.2f}s'.format(time.time() - st)) + else: + # name everyone person 1, 2, 3... 
+ df_records = df_records.sort_values(by=['subdir'], ascending=True) + pass +''' \ No newline at end of file diff --git a/megapixels/commands/site/age_gender_to_site.py b/megapixels/commands/site/age_gender_to_site.py new file mode 100644 index 00000000..3ad24a8d --- /dev/null +++ b/megapixels/commands/site/age_gender_to_site.py @@ -0,0 +1,100 @@ +""" + +""" + +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_media, opt_data_store, opt_dataset, opt_force): + """Converts age/gender to CSV for pie chartgs""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils + from app.models.data_store import DataStore + + # ------------------------------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + # init filepaths + data_store = DataStore(opt_data_store, opt_dataset) + # set file output path + metadata_type = types.Metadata.FACE_ATTRIBUTES + fp_in = data_store.metadata(metadata_type) if opt_fp_out is None else opt_fp_in + dk = opt_dataset.name.lower() + log.debug(f'dk: {dk}') + fp_out_age = f'../site/content/pages/datasets/{dk}/assets/age.csv' + fp_out_gender = f'../site/content/pages/datasets/{dk}/assets/gender.csv' + + if not opt_force and (Path(fp_out_age).exists() or Path(fp_out_gender).exists()): + log.error('File exists. Use "-f / --force" to overwite') + return + + # ------------------------------------------------------------------------- + # Age + + df = pd.read_csv(fp_in) + + results = [] + brackets = [(0, 12), (13, 18), (19,24), (25, 34), (35, 44), (45, 54), (55, 64), (64, 75), (75, 100)] + df_age = df['age_real'] + + for a1, a2 in brackets: + n = len(df_age.loc[((df_age >= a1) & (df_age <= a2))]) + results.append({'age': f'{a1} - {a2}', 'faces': n}) + + df_out = pd.DataFrame.from_dict(results) + df_out = df_out[['age','faces']] + df_out.to_csv(fp_out_age, index=False) + + # Gender + results = [] + + df_f = df['f'] + nm = len(df_f.loc[((df_f < 0.33))]) + nnb = len(df_f.loc[((df_f >= 0.33) & (df_f <= 0.66))]) + nf = len(df_f.loc[((df_f > 0.66))]) + + results = [] + results.append({'gender': 'Male', 'faces':nm}) + results.append({'gender': 'Female', 'faces': nf}) + results.append({'gender': 'They', 'faces': nnb}) + + df_out = pd.DataFrame.from_dict(results) + df_out = df_out[['gender','faces']] + df_out.to_csv(fp_out_gender, index=False) \ No newline at end of file -- cgit v1.2.3-70-g09d2
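Note on the search code touched in megapixels/app/models/dataset.py: it ranks stored face vectors by L2 distance with np.linalg.norm, picks the top matches with np.argpartition, and leaves cosine similarity as a TODO. The sketch below is a self-contained illustration of both rankings over plain NumPy arrays; the function name and the sample data are hypothetical and not part of the repository.

    import numpy as np

    def top_n_matches(query_vec, face_vectors, n_results=5, metric='l2'):
        """Return indices of the n_results most similar face vectors (illustrative only)."""
        query = np.asarray(query_vec, dtype=np.float32)
        vectors = np.asarray(face_vectors, dtype=np.float32)

        if metric == 'l2':
            # Euclidean distance: smaller is more similar
            scores = np.linalg.norm(vectors - query, axis=1)
            idxs = np.argpartition(scores, n_results)[:n_results]
            return idxs[np.argsort(scores[idxs])]
        if metric == 'cosine':
            # cosine similarity: larger is more similar, so rank descending
            sims = vectors @ query / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query) + 1e-10)
            idxs = np.argpartition(-sims, n_results)[:n_results]
            return idxs[np.argsort(-sims[idxs])]
        raise ValueError('unknown metric: {}'.format(metric))

    # hypothetical usage:
    # vecs = np.random.rand(1000, 128).astype('float32')
    # print(top_n_matches(vecs[0], vecs, n_results=10))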
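Similarly, BBox.to_square() in megapixels/app/models/bbox.py now expands the shorter side of the box symmetrically and no longer takes image bounds. A rough standalone sketch of that logic on plain coordinates follows; the tuple-based helper is illustrative only, not the project's BBox API.

    def to_square(x1, y1, x2, y2):
        """Expand the shorter side symmetrically so the box becomes square."""
        w, h = x2 - x1, y2 - y1
        if w > h:
            delta = (w - h) / 2
            y1, y2 = y1 - delta, y2 + delta
        elif h > w:
            delta = (h - w) / 2
            x1, x2 = x1 - delta, x2 + delta
        # unlike the earlier signature, no image bounds are applied here, so
        # coordinates can fall outside the image and may need clamping later
        return x1, y1, x2, y2

    # e.g. to_square(10, 20, 110, 60) -> (10, -10.0, 110, 90.0)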