From 27340ac4cd43f8eec7414495b541a65566ae2656 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Tue, 8 Oct 2019 16:02:47 +0200 Subject: update site, white --- TODO.md | 14 ++ client/chart/chart.css | 14 +- client/modalImage/modal.css | 2 +- client/table/tabulator.css | 9 +- environment.yml | 153 -------------- megapixels/app/models/bbox.py | 14 +- megapixels/app/models/dataset.py | 14 +- megapixels/app/site/parser.py | 30 ++- megapixels/app/utils/draw_utils.py | 42 +++- megapixels/app/utils/im_utils.py | 14 +- megapixels/commands/datasets/file_record.py | 234 --------------------- .../commands/datasets/megaface_age_from_orig.py | 62 ++++++ megapixels/commands/demo/face_search.py | 2 +- .../commands/processor/_old_files_to_face_rois.py | 2 +- .../commands/processor/face_roi_from_annos.py | 187 ++++++++++++++++ megapixels/commands/processor/file_record.py | 234 +++++++++++++++++++++ megapixels/commands/site/age_gender_to_site.py | 100 +++++++++ site/assets/css/applets.css | 3 +- site/assets/css/css.css | 117 +++++++---- site/assets/css/tabulator.css | 2 +- site/includes/age_gender_disclaimer.html | 3 + site/includes/chart.html | 14 -- site/includes/dashboard.html | 4 +- site/includes/map.html | 22 -- site/public/about/index.html | 29 +-- site/public/about/legal/index.html | 4 +- site/public/datasets/adience/index.html | 7 +- site/public/datasets/brainwash/index.html | 17 +- site/public/datasets/duke_mtmc/index.html | 11 +- site/public/datasets/helen/index.html | 94 ++++++++- site/public/datasets/ibm_dif/index.html | 29 +-- site/public/datasets/ijb_c/index.html | 7 +- site/public/datasets/index.html | 30 ++- site/public/datasets/lfpw/index.html | 15 +- site/public/datasets/megaface/index.html | 12 +- site/public/datasets/msceleb/index.html | 29 ++- site/public/datasets/oxford_town_centre/index.html | 10 +- site/public/datasets/pipa/index.html | 7 +- site/public/datasets/uccs/index.html | 11 +- site/public/datasets/who_goes_there/index.html | 7 +- 
site/public/research/index.html | 2 +- .../research/munich_security_conference/index.html | 5 +- todo.md | 130 ------------ 43 files changed, 1007 insertions(+), 741 deletions(-) create mode 100644 TODO.md delete mode 100644 environment.yml delete mode 100644 megapixels/commands/datasets/file_record.py create mode 100644 megapixels/commands/datasets/megaface_age_from_orig.py create mode 100644 megapixels/commands/processor/face_roi_from_annos.py create mode 100644 megapixels/commands/processor/file_record.py create mode 100644 megapixels/commands/site/age_gender_to_site.py create mode 100644 site/includes/age_gender_disclaimer.html delete mode 100644 site/includes/chart.html delete mode 100644 site/includes/map.html delete mode 100644 todo.md diff --git a/TODO.md b/TODO.md new file mode 100644 index 00000000..90de6790 --- /dev/null +++ b/TODO.md @@ -0,0 +1,14 @@ +# TODO + +## CSS + +- change font size in Tabulator to 12px (can't find where to edit it) + + +## Charts, JS + +- can we make the age/gender all in one include? +- can we auto-add download links to age/gender csv? +- can the pie chart labels keep same order as in CSV? 
+ + diff --git a/client/chart/chart.css b/client/chart/chart.css index f9c33247..2df5a97a 100644 --- a/client/chart/chart.css +++ b/client/chart/chart.css @@ -1,22 +1,24 @@ .chart text { - fill: white; + fill: black; } .chart line { - stroke: white; + stroke: black; } .chart path { - stroke: white; + stroke: black; } .c3 path, .c3 line { - stroke: white; + stroke: black; } .c3-tooltip, .c3-tooltip td { background: rgba(0,0,0,0.8); + color: #fff; } .c3-tooltip th { font-family: 'Roboto', sans-serif; - background: black; -} + background: rgba(255,255,255,0.0); + /*background: black;*/ +} \ No newline at end of file diff --git a/client/modalImage/modal.css b/client/modalImage/modal.css index d628cc48..cc9a1f32 100644 --- a/client/modalImage/modal.css +++ b/client/modalImage/modal.css @@ -31,7 +31,7 @@ .modal .caption { display: block; text-align: center; - background: black; + /*background: black;*/ padding: 10px; } .modal .prev span, diff --git a/client/table/tabulator.css b/client/table/tabulator.css index 0ea81974..0d9e8ff1 100644 --- a/client/table/tabulator.css +++ b/client/table/tabulator.css @@ -1,6 +1,7 @@ .tabulator { border-left: 1px solid #333; border-bottom: 1px solid #333; + font-size: 12px; } .tabulator-row.tabulator-row-odd { background-color: #222; @@ -42,7 +43,7 @@ background-image: url(/assets/img/icon-search.png); background-position: 378px center; background-repeat: no-repeat; - box-shadow: 0px 2px 4px rgba(0,0,0,0.2); + box-shadow: 1px 2px 4px rgba(0,0,0,0.6); border: 0; } @@ -55,10 +56,10 @@ } .download { display: block; - font-size: 13px; - color: #ddd; + font-size: 12px; + color: #333; cursor: pointer; - background: #333; + background: #ddd; padding: 5px 8px 5px 8px; border-radius: 5px; transition: all 0.2s; diff --git a/environment.yml b/environment.yml deleted file mode 100644 index b3d28c7e..00000000 --- a/environment.yml +++ /dev/null @@ -1,153 +0,0 @@ -name: megapixels -channels: - - pytorch - - conda-forge - - defaults -dependencies: - - 
atk=2.25.90=hb9dd440_1002 - - attrs=18.2.0=py_0 - - backcall=0.1.0=py_0 - - blas=1.0=mkl - - bleach=3.1.0=py_0 - - ca-certificates=2018.11.29=ha4d7672_0 - - cairo=1.16.0=ha4e643d_1000 - - certifi=2018.11.29=py36_1000 - - cffi=1.11.5=py36h9745a5d_1001 - - cudatoolkit=9.0=h13b8566_0 - - dbus=1.13.0=h4e0c4b3_1000 - - decorator=4.3.2=py_0 - - entrypoints=0.3=py36_1000 - - expat=2.2.5=hf484d3e_1002 - - fontconfig=2.13.1=h2176d3f_1000 - - freetype=2.9.1=h94bbf69_1005 - - gdk-pixbuf=2.36.12=h49783d7_1002 - - gettext=0.19.8.1=h9745a5d_1001 - - glib=2.58.2=hf63aee3_1001 - - gobject-introspection=1.58.2=py36h2da5eee_1000 - - graphite2=1.3.13=hf484d3e_1000 - - gstreamer=1.14.4=h66beb1c_1001 - - gtk2=2.24.31=hb68c50a_1001 - - harfbuzz=2.3.1=h6824563_0 - - icu=58.2=hf484d3e_1000 - - ipykernel=5.1.0=py36h24bf2e0_1002 - - ipython=7.2.0=py36h24bf2e0_1000 - - ipython_genutils=0.2.0=py_1 - - ipywidgets=7.4.2=py_0 - - jedi=0.13.2=py36_1000 - - jinja2=2.10=py_1 - - jpeg=9c=h14c3975_1001 - - jsonschema=3.0.0a3=py36_1000 - - jupyter=1.0.0=py_1 - - jupyter_client=5.2.4=py_1 - - jupyter_console=6.0.0=py_0 - - jupyter_core=4.4.0=py_0 - - libffi=3.2.1=hf484d3e_1005 - - libgcc-ng=7.3.0=hdf63c60_0 - - libgfortran-ng=7.2.0=hdf63c60_3 - - libiconv=1.15=h14c3975_1004 - - libpng=1.6.36=h84994c4_1000 - - libsodium=1.0.16=h14c3975_1001 - - libstdcxx-ng=7.3.0=hdf63c60_0 - - libtiff=4.0.10=h648cc4a_1001 - - libuuid=2.32.1=h14c3975_1000 - - libxcb=1.13=h14c3975_1002 - - libxml2=2.9.8=h143f9aa_1005 - - markupsafe=1.1.0=py36h14c3975_1000 - - mistune=0.8.4=py36h14c3975_1000 - - mkl_fft=1.0.10=py36h14c3975_1 - - mkl_random=1.0.2=py36h637b7d7_2 - - nb_conda=2.2.1=py36_0 - - nb_conda_kernels=2.2.0=py36_1000 - - nbconvert=5.3.1=py_1 - - nbformat=4.4.0=py_1 - - ncurses=6.1=hf484d3e_1002 - - ninja=1.9.0=h6bb024c_0 - - notebook=5.7.4=py36_1000 - - numpy-base=1.15.4=py36hde5b4d6_0 - - olefile=0.46=py_0 - - openssl=1.1.1a=h14c3975_1000 - - pandoc=2.6=1 - - pandocfilters=1.4.2=py_1 - - pango=1.40.14=h4ea9474_1004 
- - parso=0.3.3=py_0 - - pcre=8.41=hf484d3e_1003 - - pexpect=4.6.0=py36_1000 - - pickleshare=0.7.5=py36_1000 - - pip=19.0.2=py36_0 - - pixman=0.34.0=h14c3975_1003 - - prometheus_client=0.5.0=py_0 - - prompt_toolkit=2.0.8=py_0 - - pthread-stubs=0.4=h14c3975_1001 - - ptyprocess=0.6.0=py36_1000 - - pycparser=2.19=py_0 - - pygments=2.3.1=py_0 - - pyqt=4.11.4=py36_3 - - pyrsistent=0.14.10=py36h14c3975_0 - - python=3.6.8=h0371630_0 - - python-dateutil=2.8.0=py_0 - - pytorch=1.0.1=py3.6_cuda9.0.176_cudnn7.4.2_2 - - pyzmq=17.1.2=py36h6afc9c9_1001 - - qt=4.8.7=2 - - qtconsole=4.4.3=py_0 - - readline=7.0=hf8c457e_1001 - - send2trash=1.5.0=py_0 - - setuptools=40.7.3=py36_0 - - sip=4.18=py36_1 - - six=1.12.0=py36_1000 - - sqlite=3.26.0=h67949de_1000 - - terminado=0.8.1=py36_1001 - - testpath=0.4.2=py36_1000 - - tk=8.6.9=h84994c4_1000 - - torchvision=0.2.1=py_2 - - tornado=5.1.1=py36h14c3975_1000 - - traitlets=4.3.2=py36_1000 - - wcwidth=0.1.7=py_1 - - webencodings=0.5.1=py_1 - - wheel=0.32.3=py36_0 - - widgetsnbextension=3.4.2=py36_1000 - - xorg-kbproto=1.0.7=h14c3975_1002 - - xorg-libice=1.0.9=h14c3975_1004 - - xorg-libsm=1.2.3=h4937e3b_1000 - - xorg-libx11=1.6.7=h14c3975_1000 - - xorg-libxau=1.0.8=h14c3975_1006 - - xorg-libxdmcp=1.1.2=h14c3975_1007 - - xorg-libxext=1.3.3=h14c3975_1004 - - xorg-libxrender=0.9.10=h14c3975_1002 - - xorg-libxt=1.1.5=h14c3975_1002 - - xorg-renderproto=0.11.1=h14c3975_1002 - - xorg-xextproto=7.3.0=h14c3975_1002 - - xorg-xproto=7.0.31=h14c3975_1007 - - xz=5.2.4=h14c3975_1001 - - zeromq=4.2.5=hf484d3e_1006 - - zlib=1.2.11=h14c3975_1004 - - pip: - - click==7.0 - - cloudpickle==0.7.0 - - cmake==3.13.3 - - colorlog==4.0.2 - - cycler==0.10.0 - - dask==1.1.1 - - dlib==19.16.0 - - imagehash==4.0 - - imutils==0.5.2 - - intel-openmp==2019.0 - - kiwisolver==1.0.1 - - matplotlib==3.0.2 - - mkl==2019.0 - - networkx==2.2 - - numpy==1.16.1 - - opencv-python==4.0.0.21 - - pandas==0.24.1 - - pillow==5.4.1 - - pymediainfo==3.0 - - pyparsing==2.3.1 - - 
python-dotenv==0.10.1 - - pytz==2018.9 - - pywavelets==1.0.1 - - scikit-image==0.14.2 - - scikit-learn==0.20.2 - - scipy==1.2.1 - - toolz==0.9.0 - - tqdm==4.31.0 -prefix: /home/adam/anaconda3/envs/megapixels - diff --git a/megapixels/app/models/bbox.py b/megapixels/app/models/bbox.py index 8ecc8971..c840ea1b 100644 --- a/megapixels/app/models/bbox.py +++ b/megapixels/app/models/bbox.py @@ -207,11 +207,21 @@ class BBox: # ----------------------------------------------------------------- # Convert to - def to_square(self, bounds): + def to_square(self): '''Forces bbox to square dimensions - :param bounds: (int, int) w, h of the image :returns (BBox) in square ratio ''' + if self._width > self._height: + delta = (self._width - self._height) / 2 + self._y1 -= delta + self._y2 += delta + elif self._height > self._width: + delta = (self._height - self._width) / 2 + self._x1 -= delta + self._x2 += delta + return BBox(self._x1, self._y1, self._x2, self._y2) + + def to_dim(self, dim): """scale is (w, h) is tuple of dimensions""" diff --git a/megapixels/app/models/dataset.py b/megapixels/app/models/dataset.py index a7227a70..c908da1b 100644 --- a/megapixels/app/models/dataset.py +++ b/megapixels/app/models/dataset.py @@ -152,6 +152,8 @@ class Dataset: image_records = [] # list of image matches w/identity if available # find most similar feature vectors indexes #match_idxs = self.similar(query_vec, n_results, threshold) + + # TODO: add cosine similarity sim_scores = np.linalg.norm(np.array([query_vec]) - np.array(self._face_vectors), axis=1) match_idxs = np.argpartition(sim_scores, range(n_results))[:n_results] @@ -180,7 +182,17 @@ class Dataset: s3_url = self.data_store_s3.face(ds_record.uuid) bbox_norm = BBox.from_xywh_norm_dim(ds_roi.x, ds_roi.y, ds_roi.w, ds_roi.h, dim) self.log.debug(f'bbox_norm: {bbox_norm}') - score = sim_scores[match_idx] + self.log.debug(f'match_idx: {match_idx}, record_idx: {record_idx}, roi_index: {roi_index}, len sim_scores: {len(sim_scores)}') + 
try: + score = sim_scores[match_idx] + except Exception as e: + self.log.error(e) + try: + score = sim_scores[record_idx] + except Exception as e: + self.log.error(e) + + if types.Metadata.IDENTITY in self._metadata.keys(): ds_id = df_identity.loc[df_identity['identity_key'] == ds_record.identity_key].iloc[0] diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py index 3700efd1..6ab8c700 100644 --- a/megapixels/app/site/parser.py +++ b/megapixels/app/site/parser.py @@ -162,6 +162,35 @@ def intro_section(metadata, s3_path): Build the intro section for datasets """ + section = "
".format(s3_path + metadata['image']) + # section += "
" + + # parts = [] + # if 'desc' in metadata: + # desc = metadata['desc'] + # # colorize the first instance of the database name in the header + # if 'color' in metadata and metadata['title'] in desc: + # desc = desc.replace(metadata['title'], "{}".format(metadata['color'], metadata['title']), 1) + # section += "
{}
".format(desc, desc) + + # if 'subdesc' in metadata: + # subdesc = markdown(metadata['subdesc']).replace('

', '').replace('

', '') + # section += "
{}
".format(subdesc, subdesc) + + # section += "
" + section += "
" + + if 'caption' in metadata: + section += "
{}
".format(metadata['caption']) + + return section + + +def intro_section_v1(metadata, s3_path): + """ + Build the intro section for datasets + """ + section = "
".format(s3_path + metadata['image']) section += "
" @@ -185,7 +214,6 @@ def intro_section(metadata, s3_path): return section - def fix_images(lines, s3_path): """ do our own transformation of the markdown around images to handle wide images etc diff --git a/megapixels/app/utils/draw_utils.py b/megapixels/app/utils/draw_utils.py index 7044a62f..1836768b 100644 --- a/megapixels/app/utils/draw_utils.py +++ b/megapixels/app/utils/draw_utils.py @@ -3,8 +3,10 @@ from math import sqrt import numpy as np import cv2 as cv +import PIL +from PIL import ImageDraw -from app.utils import logger_utils +from app.utils import logger_utils, im_utils log = logger_utils.Logger.getLogger() @@ -118,6 +120,22 @@ def draw_landmarks2D(im, points_norm, radius=3, color=(0,255,0)): cv.circle(im_dst, pt, radius, color, -1, cv.LINE_AA) return im_dst +def draw_landmarks2D_pil(im, points_norm, radius=3, color=(0,255,0)): + '''Draws facial landmarks, either 5pt or 68pt + ''' + im_pil = im_utils.ensure_pil(im_utils.bgr2rgb(im)) + draw = ImageDraw.Draw(im_pil) + dim = im.shape[:2][::-1] + for x,y in points_norm: + x1, y1 = (int(x*dim[0]), int(y*dim[1])) + xyxy = (x1, y1, x1+radius, y1+radius) + draw.ellipse(xyxy, fill='white') + del draw + im_dst = im_utils.ensure_np(im_pil) + im_dst = im_utils.rgb2bgr(im_dst) + return im_dst + + def draw_landmarks3D(im, points, radius=3, color=(0,255,0)): '''Draws 3D facial landmarks ''' @@ -126,12 +144,26 @@ def draw_landmarks3D(im, points, radius=3, color=(0,255,0)): cv.circle(im_dst, (x,y), radius, color, -1, cv.LINE_AA) return im_dst -def draw_bbox(im, bbox_norm, color=(0,255,0), stroke_weight=2): +def draw_bbox(im, bboxes_norm, color=(0,255,0), stroke_weight=2): '''Draws BBox onto cv image + :param color: RGB value ''' - im_dst = im.copy() - bbox_dim = bbox_norm.to_dim(im.shape[:2][::-1]) - cv.rectangle(im_dst, bbox_dim.pt_tl, bbox_dim.pt_br, color, stroke_weight, cv.LINE_AA) + #im_dst = im.copy() + if not type(bboxes_norm) == list: + bboxes_norm = [bboxes_norm] + + im_pil = 
im_utils.ensure_pil(im_utils.bgr2rgb(im)) + im_pil_draw = ImageDraw.ImageDraw(im_pil) + + for bbox_norm in bboxes_norm: + bbox_dim = bbox_norm.to_dim(im.shape[:2][::-1]) + #cv.rectangle(im_dst, bbox_dim.pt_tl, bbox_dim.pt_br, color, stroke_weight, cv.LINE_AA) + xyxy = (bbox_dim.pt_tl, bbox_dim.pt_br) + im_pil_draw.rectangle(xyxy, outline=color, width=stroke_weight) + # draw.rectangle([x1, y1, x2, y2], outline=, width=3) + im_dst = im_utils.ensure_np(im_pil) + im_dst = im_utils.rgb2bgr(im_dst) + del im_pil_draw return im_dst def draw_pose(im, pt_nose, image_pts): diff --git a/megapixels/app/utils/im_utils.py b/megapixels/app/utils/im_utils.py index d36c1c32..670d5168 100644 --- a/megapixels/app/utils/im_utils.py +++ b/megapixels/app/utils/im_utils.py @@ -11,11 +11,6 @@ from skimage import feature import imutils import time import numpy as np -import torch -import torch.nn as nn -import torchvision.models as models -import torchvision.transforms as transforms -from torch.autograd import Variable from sklearn.metrics.pairwise import cosine_similarity import datetime @@ -293,6 +288,13 @@ def bgr2rgb(im): """ return cv.cvtColor(im,cv.COLOR_BGR2RGB) +def rgb2bgr(im): + """Wrapper for cv2.cvtColor transform + :param im: Numpy.ndarray (BGR) + :returns: Numpy.ndarray (RGB) + """ + return cv.cvtColor(im,cv.COLOR_RGB2BGR) + def compute_laplacian(im): # below 100 is usually blurry return cv.Laplacian(im, cv.CV_64F).var() @@ -329,7 +331,7 @@ def normalizedGraylevelVariance(img): s = stdev[0]**2 / mean[0] return s[0] -def compute_if_blank(im,width=100,sigma=0,thresh_canny=.1,thresh_mean=4,mask=None): +def is_blank(im,width=100,sigma=0,thresh_canny=.1,thresh_mean=4,mask=None): # im is graysacale np #im = imutils.resize(im,width=width) #mask = imutils.resize(mask,width=width) diff --git a/megapixels/commands/datasets/file_record.py b/megapixels/commands/datasets/file_record.py deleted file mode 100644 index 41a5df28..00000000 --- a/megapixels/commands/datasets/file_record.py +++ 
/dev/null @@ -1,234 +0,0 @@ -''' - -''' -import click - -from app.settings import types -from app.utils import click_utils -from app.settings import app_cfg as cfg -from app.utils.logger_utils import Logger - -log = Logger.getLogger() - -# Choose part of the filepath that will be used for the person identity -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_tail] --> "barack_obama" -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_head] --> "batch_1" -# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama" - -identity_sources = ['subdir', 'numeric'] - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', default=None, - help='Override enum input filename CSV') -@click.option('-o', '--output', 'opt_fp_out', default=None, - help='Override enum output filename CSV') -@click.option('-m', '--media', 'opt_dir_media', default=None, - help='Override enum media directory') -@click.option('--data_store', 'opt_data_store', - type=cfg.DataStoreVar, - default=click_utils.get_default(types.DataStore.HDD), - show_default=True, - help=click_utils.show_help(types.Dataset)) -@click.option('--dataset', 'opt_dataset', - type=cfg.DatasetVar, - required=True, - show_default=True, - help=click_utils.show_help(types.Dataset)) -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('-t', '--threads', 'opt_threads', default=12, - help='Number of threads') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite file') -@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources), - required=True, - help='Identity source key') -@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, - help='Use glob recursion (slower)') -@click.option('--max-depth', 'opt_max_depth', default=None, type=int, - help='Max number of images 
per subdirectory') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads, - opt_identity, opt_force, opt_recursive, opt_max_depth): - """Generates sha256, uuid, and identity index CSV file""" - - import sys, os - from glob import glob - from os.path import join - from pathlib import Path - import time - from multiprocessing.dummy import Pool as ThreadPool - import random - import uuid - - from PIL import Image - import cv2 as cv - import pandas as pd - from tqdm import tqdm - from glob import glob - from operator import itemgetter - - from app.models.data_store import DataStore - from app.utils import file_utils, im_utils - - - # set data_store - data_store = DataStore(opt_data_store, opt_dataset) - # get filepath out - fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out - # exit if exists - if not opt_force and Path(fp_out).exists(): - log.error('File exists. Use "-f / --force" to overwite') - return - - # ---------------------------------------------------------------- - # glob files - - fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original() - log.info(f'Globbing {fp_in}') - fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive) - - log.info('Found {:,} images'.format(len(fp_ims))) - subdir_groups = {} - if opt_max_depth: - log.debug(f'using max depth: {opt_max_depth}') - for fp_im in fp_ims: - fpp_im = Path(fp_im) - - subdir = fp_im.split('/')[-2] - if not subdir in subdir_groups.keys(): - subdir_groups[subdir] = [] - else: - subdir_groups[subdir].append(fp_im) - # for each subgroup, limit number of files - fp_ims = [] - for subdir_name, items in subdir_groups.items(): - ims = items[0:opt_max_depth] - fp_ims += ims - - log.debug(f'num subdirs: {len(subdir_groups.keys())}') - # fail if none - if not fp_ims: - log.error('No images. 
Try with "--recursive"') - return - # slice to reduce - if opt_slice: - fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] - log.info('Found {:,} images'.format(len(fp_ims))) - - # ---------------------------------------------------------------- - # multithread process into SHA256 - - pbar = tqdm(total=len(fp_ims)) - - def pool_mapper(fp_im): - pbar.update(1) - try: - sha256 = file_utils.sha256(fp_im) - im = Image.open(fp_im) - im.verify() # throws error if bad file - assert(im.size[0] > 60 and im.size[1] > 60) - except Exception as e: - log.warn(f'skipping file: {fp_im}') - return None - im = cv.imread(fp_im) - w, h = im.shape[:2][::-1] - file_size_kb = os.stat(fp_im).st_size // 1000 - num_channels = im_utils.num_channels(im) - return { - 'width': w, - 'height': h, - 'sha256': sha256, - 'file_size_kb': file_size_kb, - 'num_channels': num_channels - } - - # convert to thread pool - pool_maps = [] # ? - pool = ThreadPool(opt_threads) - with tqdm(total=len(fp_ims)) as pbar: - pool_maps = pool.map(pool_mapper, fp_ims) - pbar.close() - - - # ---------------------------------------------------------------- - # convert data to dict - - data = [] - indentity_count = 0 - for pool_map, fp_im in zip(pool_maps, fp_ims): - if pool_map is None: - log.warn(f'skipping file: {fp_im}') - continue # skip error files - fpp_im = Path(fp_im) - subdir = str(fpp_im.parent.relative_to(fp_in)) - - if opt_identity: - subdirs = subdir.split('/') - if not len(subdirs) > 0: - log.error(f'Could not split subdir: "{subdir}. 
Try different option for "--identity"') - log.error('exiting') - return - if opt_identity == 'subdir': - identity = subdirs[-1] # use last part of subdir path - elif opt_identity == 'numeric': - identity = indentity_count # use incrementing number - indentity_count += 1 - else: - identity = '' - - data.append({ - 'subdir': subdir, - 'num_channels': pool_map['num_channels'], - 'fn': fpp_im.stem, - 'ext': fpp_im.suffix.replace('.',''), - 'sha256': pool_map['sha256'], - 'uuid': uuid.uuid4(), - 'identity_key': identity, - 'width': pool_map['width'], - 'height': pool_map['height'] - }) - - # create dataframe - df_records = pd.DataFrame.from_dict(data) - - df_records.index.name = 'index' # reassign 'index' as primary key column - # write to CSV - file_utils.mkdirs(fp_out) - df_records.to_csv(fp_out) - # done - log.info(f'wrote {len(df_records)} rows to "{fp_out}"') - # save script - cmd_line = ' '.join(sys.argv) - file_utils.write_text(cmd_line, '{}.sh'.format(fp_out)) - - -''' -# create dataframe - df_records = pd.DataFrame.from_dict(data) - - # add identity key (used for associating identity) - if opt_identity: - log.info(f'adding identity index using: "{opt_identity}" subdirectory') - # convert dict to DataFrame - # sort based on identity_key - df_records = df_records.sort_values(by=['identity_key'], ascending=True) - # add new column for identity - df_records['identity_index'] = [-1] * len(df_records) - # populate the identity_index - df_records_identity_groups = df_records.groupby('identity_key') - # enumerate groups to create identity indices - log.info(f'updating records with identity_key. 
This may take a while...') - st = time.time() - for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): - identity_key, df_records_identity_group = df_records_identity_group_tuple - for ds_record in df_records_identity_group.itertuples(): - df_records.at[ds_record.Index, 'identity_index'] = identity_index - # reset index after being sorted - df_records = df_records.reset_index(drop=True) - log.debug('update time: {:.2f}s'.format(time.time() - st)) - else: - # name everyone person 1, 2, 3... - df_records = df_records.sort_values(by=['subdir'], ascending=True) - pass -''' \ No newline at end of file diff --git a/megapixels/commands/datasets/megaface_age_from_orig.py b/megapixels/commands/datasets/megaface_age_from_orig.py new file mode 100644 index 00000000..489bebf3 --- /dev/null +++ b/megapixels/commands/datasets/megaface_age_from_orig.py @@ -0,0 +1,62 @@ +import click + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input path to metadata directory') +@click.option('-o', '--output', 'opt_fp_out', + help='Output path to age CSV') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out): + """Creates CSV of MegaFace ages from original BBoxes""" + + import os + from os.path import join + from pathlib import Path + from glob import glob + + import dlib + import pandas as pd + from tqdm import tqdm + + from app.settings import types + from app.utils import click_utils + from app.settings import app_cfg + + from PIL import Image, ImageOps, ImageFilter + from app.utils import file_utils, im_utils, logger_utils + + log = logger_utils.Logger.getLogger() + + # ------------------------------------------------- + # process + fp_im_dirs = glob(join(opt_fp_in, '**/'), recursive=True) + + log.info('Found {} directories'.format(len(fp_im_dirs))) + + identities = {} + + for fp_im_dir in tqdm(fp_im_dirs): + # 1234567@N05_identity_1 + try: + dir_id_name = Path(fp_im_dir).name + nsid = 
dir_id_name.split('_')[0] + identity_num = dir_id_name.split('_')[2] + id_key = '{}_{}'.format(nsid, identity_num) + num_images = len(glob(join(fp_im_dir, '*.jpg'))) + if not id_key in identities.keys(): + identities[id_key] = {'nsid': nsid, 'identity': identity_num, 'images': num_images} + else: + identities[id_key]['images'] += num_images + except Exception as e: + continue + + # convert to dict + identities_list = [v for k, v in identities.items()] + df = pd.DataFrame.from_dict(identities_list) + + file_utils.mkdirs(opt_fp_out) + + log.info('Wrote {} lines to {}'.format(len(df), opt_fp_out)) + df.to_csv(opt_fp_out, index=False) + + diff --git a/megapixels/commands/demo/face_search.py b/megapixels/commands/demo/face_search.py index 4c7036f4..5218d501 100644 --- a/megapixels/commands/demo/face_search.py +++ b/megapixels/commands/demo/face_search.py @@ -10,7 +10,7 @@ log = Logger.getLogger() @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, - help='File to lookup') + help='Face image file to lookup') @click.option('--data_store', 'opt_data_store', type=cfg.DataStoreVar, default=click_utils.get_default(types.DataStore.HDD), diff --git a/megapixels/commands/processor/_old_files_to_face_rois.py b/megapixels/commands/processor/_old_files_to_face_rois.py index 895f4718..d92cbd74 100644 --- a/megapixels/commands/processor/_old_files_to_face_rois.py +++ b/megapixels/commands/processor/_old_files_to_face_rois.py @@ -1,4 +1,4 @@ - """ +""" Crop images to prepare for training """ diff --git a/megapixels/commands/processor/face_roi_from_annos.py b/megapixels/commands/processor/face_roi_from_annos.py new file mode 100644 index 00000000..fc933049 --- /dev/null +++ b/megapixels/commands/processor/face_roi_from_annos.py @@ -0,0 +1,187 @@ +""" +Crop images to prepare for training +""" + +import click +# from PIL import Image, ImageOps, ImageFilter, ImageDraw + +from app.settings import types +from app.utils import click_utils +from app.settings import 
app_cfg as cfg + +color_filters = {'color': 1, 'gray': 2, 'all': 3} + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--size', 'opt_size', + type=(int, int), default=(480, 480), + help='Output image size') +@click.option('-d', '--detector', 'opt_detector_type', + type=cfg.FaceDetectNetVar, + default=click_utils.get_default(types.FaceDetectNet.CVDNN), + help=click_utils.show_help(types.FaceDetectNet)) +@click.option('-g', '--gpu', 'opt_gpu', default=0, + help='GPU index') +@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), + help='Confidence minimum threshold') +@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), + help='Number pyramids to upscale for DLIB detectors') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, + help='Display detections to debug') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--color', 'opt_color_filter', + type=click.Choice(color_filters.keys()), default='color', + help='Filter to keep color or grayscale images (color = keep color') +@click.option('--keep', 'opt_largest', type=click.Choice(['largest', 'all']), default='largest', + help='Only keep 
largest face') +@click.option('--zone', 'opt_zone', default=(0.0, 0.0), type=(float, float), + help='Face center must be located within zone region (0.5 = half width/height)') +@click.pass_context +def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_data_store, opt_dataset, opt_size, opt_detector_type, + opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_force, opt_color_filter, + opt_largest, opt_zone): + """Converts frames with faces to CSV of ROIs""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import dlib # must keep a local reference for dlib + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils, file_utils, im_utils, display_utils, draw_utils + from app.processors import face_detector + from app.models.data_store import DataStore + + # ------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + # set data_store + data_store = DataStore(opt_data_store, opt_dataset) + + # get filepath out + fp_out = data_store.metadata(types.Metadata.FACE_ROI) if opt_fp_out is None else opt_fp_out + if not opt_force and Path(fp_out).exists(): + log.error('File exists. 
Use "-f / --force" to overwite') + return + + # set detector + if opt_detector_type == types.FaceDetectNet.CVDNN: + detector = face_detector.DetectorCVDNN() + elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: + detector = face_detector.DetectorDLIBCNN(gpu=opt_gpu) + elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: + detector = face_detector.DetectorDLIBHOG() + elif opt_detector_type == types.FaceDetectNet.MTCNN_TF: + detector = face_detector.DetectorMTCNN_TF(gpu=opt_gpu) + elif opt_detector_type == types.FaceDetectNet.HAAR: + log.error('{} not yet implemented'.format(opt_detector_type.name)) + return + + + # get list of files to process + fp_record = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_in is None else opt_fp_in + df_record = pd.read_csv(fp_record, dtype=cfg.FILE_RECORD_DTYPES).set_index('index') + if opt_slice: + df_record = df_record[opt_slice[0]:opt_slice[1]] + log.debug('processing {:,} files'.format(len(df_record))) + + # filter out grayscale + color_filter = color_filters[opt_color_filter] + # set largest flag, to keep all or only largest + opt_largest = (opt_largest == 'largest') + + data = [] + skipped_files = [] + processed_files = [] + + for df_record in tqdm(df_record.itertuples(), total=len(df_record)): + fp_im = data_store.face(str(df_record.subdir), str(df_record.fn), str(df_record.ext)) + try: + im = cv.imread(fp_im) + im_resized = im_utils.resize(im, width=opt_size[0], height=opt_size[1]) + except Exception as e: + log.debug(f'could not read: {fp_im}') + return + # filter out color or grayscale iamges + if color_filter != color_filters['all']: + try: + is_gray = im_utils.is_grayscale(im) + if is_gray and color_filter != color_filters['gray']: + log.debug('Skipping grayscale image: {}'.format(fp_im)) + continue + except Exception as e: + log.error('Could not check grayscale: {}'.format(fp_im)) + continue + + try: + bboxes_norm = detector.detect(im_resized, pyramids=opt_pyramids, largest=opt_largest, + zone=opt_zone, 
conf_thresh=opt_conf_thresh) + except Exception as e: + log.error('could not detect: {}'.format(fp_im)) + log.error('{}'.format(e)) + continue + + if len(bboxes_norm) == 0: + skipped_files.append(fp_im) + log.warn(f'no faces in: {fp_im}') + log.warn(f'skipped: {len(skipped_files)}. found:{len(processed_files)} files') + else: + processed_files.append(fp_im) + for bbox in bboxes_norm: + roi = { + 'record_index': int(df_record.Index), + 'x': bbox.x, + 'y': bbox.y, + 'w': bbox.w, + 'h': bbox.h + } + data.append(roi) + + # if display optined + if opt_display and len(bboxes_norm): + # draw each box + for bbox_norm in bboxes_norm: + dim = im_resized.shape[:2][::-1] + bbox_dim = bbox.to_dim(dim) + if dim[0] > 1000: + im_resized = im_utils.resize(im_resized, width=1000) + im_resized = draw_utils.draw_bbox(im_resized, bbox_norm) + + # display and wait + cv.imshow('', im_resized) + display_utils.handle_keyboard() + + # create DataFrame and save to CSV + file_utils.mkdirs(fp_out) + df = pd.DataFrame.from_dict(data) + df.index.name = 'index' + df.to_csv(fp_out) + + # save script + file_utils.write_text(' '.join(sys.argv), '{}.sh'.format(fp_out)) \ No newline at end of file diff --git a/megapixels/commands/processor/file_record.py b/megapixels/commands/processor/file_record.py new file mode 100644 index 00000000..6403c768 --- /dev/null +++ b/megapixels/commands/processor/file_record.py @@ -0,0 +1,234 @@ +''' + +''' +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg +from app.utils.logger_utils import Logger + +log = Logger.getLogger() + +# Choose part of the filepath that will be used for the person identity +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_tail] --> "barack_obama" +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_head] --> "batch_1" +# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> 
[subdir] --> "barack_obama" + +identity_sources = ['subdir', 'numeric'] + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--data_store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-t', '--threads', 'opt_threads', default=12, + help='Number of threads') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources), + required=True, + help='Identity source key') +@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, + help='Use glob recursion (slower)') +@click.option('--max-depth', 'opt_max_depth', default=None, type=int, + help='Max number of images per subdirectory') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads, + opt_identity, opt_force, opt_recursive, opt_max_depth): + """Generates sha256, uuid, and identity index CSV file""" + + import sys, os + from glob import glob + from os.path import join + from pathlib import Path + import time + from multiprocessing.dummy import Pool as ThreadPool + import random + import uuid + + from PIL import Image + import cv2 as cv + import pandas as pd + from tqdm import tqdm + from glob import glob + from operator import itemgetter 
+ + from app.models.data_store import DataStore + from app.utils import file_utils, im_utils + + + # set data_store + data_store = DataStore(opt_data_store, opt_dataset) + # get filepath out + fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out + # exit if exists + if not opt_force and Path(fp_out).exists(): + log.error(f'File {fp_out} exists. Use "-f / --force" to overwite') + return + + # ---------------------------------------------------------------- + # glob files + + fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original() + log.info(f'Globbing {fp_in}') + fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive) + + log.info('Found {:,} images'.format(len(fp_ims))) + subdir_groups = {} + if opt_max_depth: + log.debug(f'using max depth: {opt_max_depth}') + for fp_im in fp_ims: + fpp_im = Path(fp_im) + + subdir = fp_im.split('/')[-2] + if not subdir in subdir_groups.keys(): + subdir_groups[subdir] = [] + else: + subdir_groups[subdir].append(fp_im) + # for each subgroup, limit number of files + fp_ims = [] + for subdir_name, items in subdir_groups.items(): + ims = items[0:opt_max_depth] + fp_ims += ims + + log.debug(f'num subdirs: {len(subdir_groups.keys())}') + # fail if none + if not fp_ims: + log.error('No images. 
Try with "--recursive"') + return + # slice to reduce + if opt_slice: + fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] + log.info('Found {:,} images'.format(len(fp_ims))) + + # ---------------------------------------------------------------- + # multithread process into SHA256 + + pbar = tqdm(total=len(fp_ims)) + + def pool_mapper(fp_im): + pbar.update(1) + try: + sha256 = file_utils.sha256(fp_im) + im = Image.open(fp_im) + im.verify() # throws error if bad file + assert(im.size[0] > 60 and im.size[1] > 60) + except Exception as e: + log.warn(f'skipping file: {fp_im}') + return None + im = cv.imread(fp_im) + w, h = im.shape[:2][::-1] + file_size_kb = os.stat(fp_im).st_size // 1000 + num_channels = im_utils.num_channels(im) + return { + 'width': w, + 'height': h, + 'sha256': sha256, + 'file_size_kb': file_size_kb, + 'num_channels': num_channels + } + + # convert to thread pool + pool_maps = [] # ? + pool = ThreadPool(opt_threads) + with tqdm(total=len(fp_ims)) as pbar: + pool_maps = pool.map(pool_mapper, fp_ims) + pbar.close() + + + # ---------------------------------------------------------------- + # convert data to dict + + data = [] + indentity_count = 0 + for pool_map, fp_im in zip(pool_maps, fp_ims): + if pool_map is None: + log.warn(f'skipping file: {fp_im}') + continue # skip error files + fpp_im = Path(fp_im) + subdir = str(fpp_im.parent.relative_to(fp_in)) + + if opt_identity: + subdirs = subdir.split('/') + if not len(subdirs) > 0: + log.error(f'Could not split subdir: "{subdir}. 
Try different option for "--identity"') + log.error('exiting') + return + if opt_identity == 'subdir': + identity = subdirs[-1] # use last part of subdir path + elif opt_identity == 'numeric': + identity = indentity_count # use incrementing number + indentity_count += 1 + else: + identity = '' + + data.append({ + 'subdir': subdir, + 'num_channels': pool_map['num_channels'], + 'fn': fpp_im.stem, + 'ext': fpp_im.suffix.replace('.',''), + 'sha256': pool_map['sha256'], + 'uuid': uuid.uuid4(), + 'identity_key': identity, + 'width': pool_map['width'], + 'height': pool_map['height'] + }) + + # create dataframe + df_records = pd.DataFrame.from_dict(data) + + df_records.index.name = 'index' # reassign 'index' as primary key column + # write to CSV + file_utils.mkdirs(fp_out) + df_records.to_csv(fp_out) + # done + log.info(f'wrote {len(df_records)} rows to "{fp_out}"') + # save script + cmd_line = ' '.join(sys.argv) + file_utils.write_text(cmd_line, '{}.sh'.format(fp_out)) + + +''' +# create dataframe + df_records = pd.DataFrame.from_dict(data) + + # add identity key (used for associating identity) + if opt_identity: + log.info(f'adding identity index using: "{opt_identity}" subdirectory') + # convert dict to DataFrame + # sort based on identity_key + df_records = df_records.sort_values(by=['identity_key'], ascending=True) + # add new column for identity + df_records['identity_index'] = [-1] * len(df_records) + # populate the identity_index + df_records_identity_groups = df_records.groupby('identity_key') + # enumerate groups to create identity indices + log.info(f'updating records with identity_key. 
This may take a while...') + st = time.time() + for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): + identity_key, df_records_identity_group = df_records_identity_group_tuple + for ds_record in df_records_identity_group.itertuples(): + df_records.at[ds_record.Index, 'identity_index'] = identity_index + # reset index after being sorted + df_records = df_records.reset_index(drop=True) + log.debug('update time: {:.2f}s'.format(time.time() - st)) + else: + # name everyone person 1, 2, 3... + df_records = df_records.sort_values(by=['subdir'], ascending=True) + pass +''' \ No newline at end of file diff --git a/megapixels/commands/site/age_gender_to_site.py b/megapixels/commands/site/age_gender_to_site.py new file mode 100644 index 00000000..3ad24a8d --- /dev/null +++ b/megapixels/commands/site/age_gender_to_site.py @@ -0,0 +1,100 @@ +""" + +""" + +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', default=None, + help='Override enum input filename CSV') +@click.option('-o', '--output', 'opt_fp_out', default=None, + help='Override enum output filename CSV') +@click.option('-m', '--media', 'opt_dir_media', default=None, + help='Override enum media directory') +@click.option('--store', 'opt_data_store', + type=cfg.DataStoreVar, + default=click_utils.get_default(types.DataStore.HDD), + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('--dataset', 'opt_dataset', + type=cfg.DatasetVar, + required=True, + show_default=True, + help=click_utils.show_help(types.Dataset)) +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_media, opt_data_store, opt_dataset, opt_force): + """Converts age/gender to CSV for pie chartgs""" + + import sys + import os + from os.path import 
join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils + from app.models.data_store import DataStore + + # ------------------------------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + # init filepaths + data_store = DataStore(opt_data_store, opt_dataset) + # set file output path + metadata_type = types.Metadata.FACE_ATTRIBUTES + fp_in = data_store.metadata(metadata_type) if opt_fp_out is None else opt_fp_in + dk = opt_dataset.name.lower() + log.debug(f'dk: {dk}') + fp_out_age = f'../site/content/pages/datasets/{dk}/assets/age.csv' + fp_out_gender = f'../site/content/pages/datasets/{dk}/assets/gender.csv' + + if not opt_force and (Path(fp_out_age).exists() or Path(fp_out_gender).exists()): + log.error('File exists. Use "-f / --force" to overwite') + return + + # ------------------------------------------------------------------------- + # Age + + df = pd.read_csv(fp_in) + + results = [] + brackets = [(0, 12), (13, 18), (19,24), (25, 34), (35, 44), (45, 54), (55, 64), (64, 75), (75, 100)] + df_age = df['age_real'] + + for a1, a2 in brackets: + n = len(df_age.loc[((df_age >= a1) & (df_age <= a2))]) + results.append({'age': f'{a1} - {a2}', 'faces': n}) + + df_out = pd.DataFrame.from_dict(results) + df_out = df_out[['age','faces']] + df_out.to_csv(fp_out_age, index=False) + + # Gender + results = [] + + df_f = df['f'] + nm = len(df_f.loc[((df_f < 0.33))]) + nnb = len(df_f.loc[((df_f >= 0.33) & (df_f <= 0.66))]) + nf = len(df_f.loc[((df_f > 0.66))]) + + results = [] + results.append({'gender': 'Male', 'faces':nm}) + results.append({'gender': 'Female', 'faces': nf}) + results.append({'gender': 'They', 'faces': nnb}) + + df_out = pd.DataFrame.from_dict(results) + df_out = df_out[['gender','faces']] + df_out.to_csv(fp_out_gender, index=False) \ No newline at end of file diff --git 
a/site/assets/css/applets.css b/site/assets/css/applets.css index daf36a19..245643f1 100644 --- a/site/assets/css/applets.css +++ b/site/assets/css/applets.css @@ -187,6 +187,7 @@ .tabulator { font-family: 'Roboto', sans-serif; + font-size:10px; } .tabulator-row { transition: background-color 100ms cubic-bezier(0,0,1,1); @@ -247,7 +248,7 @@ stroke: rgba(64,64,64,0.3); } .chartCaption { - color: #888; + color: #333; font-size: 12px; font-family: 'Roboto', sans-serif; font-weight: 400; diff --git a/site/assets/css/css.css b/site/assets/css/css.css index 6b1f40cd..75f1ad3f 100644 --- a/site/assets/css/css.css +++ b/site/assets/css/css.css @@ -12,11 +12,11 @@ html, body { min-height: 100%; /*font-family: 'Roboto Mono', sans-serif;*/ font-family: 'Roboto', sans-serif; - color: #eee; + color: #000; overflow-x: hidden; } html { - background: #181818; + background: #fff; } a { outline: none; } img { border: 0; } @@ -32,6 +32,7 @@ html.mobile .content { html.mobile .content{ } +/* header */ /* header */ header { @@ -155,7 +156,7 @@ footer { display: flex; flex-direction: row; justify-content: space-between; - color: #666; + color: #000; font-size: 13px; /*line-height: 17px;*/ padding: 15px; @@ -211,30 +212,34 @@ footer ul:last-child li { /* headings */ h1 { - color: #eee; - font-weight: 400; - font-size: 34pt; + color: #000; + font-weight: 500; + font-size: 30pt; margin: 20px auto 10px auto; padding: 0; transition: color 0.1s cubic-bezier(0,0,1,1); font-family: 'Roboto Mono', monospace; + text-transform: uppercase; } h2 { - color: #eee; - font-weight: 400; + color: #111; + font-weight: 500; font-size: 34px; line-height: 43px; margin: 20px auto 20px auto; padding: 0; transition: color 0.1s cubic-bezier(0,0,1,1); font-family: 'Roboto Mono', monospace; + text-transform: uppercase; } h3 { + color: #333; margin: 20px auto 10px auto; font-size: 28px; font-weight: 400; transition: color 0.1s cubic-bezier(0,0,1,1); font-family: 'Roboto Mono', monospace; + text-transform: uppercase; 
} h4 { margin: 6px auto 10px auto; @@ -243,6 +248,7 @@ h4 { font-weight: 400; transition: color 0.1s cubic-bezier(0,0,1,1); font-family: 'Roboto Mono', monospace; + text-transform: uppercase; } h5 { margin: 6px auto 10px auto; @@ -253,11 +259,11 @@ h5 { font-family: 'Roboto Mono', monospace; } .content h3 a { - color: #888; + color: #333; text-decoration: none; } .desktop .content h3 a:hover { - color: #fff; + color: #111; text-decoration: underline; } .right-sidebar h3 { @@ -272,12 +278,15 @@ h5 { .right-sidebar ul li a { border-bottom: 0; } +.right-sidebar ul li:last-child{ + border-bottom: 0; +} th, .gray { font-family: 'Roboto', monospace; font-weight: 500; text-transform: uppercase; letter-spacing: .15rem; - color: #777; + color: #333; } th, .gray { font-size: 9pt; @@ -354,10 +363,10 @@ section { } section p { margin: 10px auto 20px auto; - line-height: 1.9rem; - font-size: 17px; + line-height: 1.95rem; + font-size: 16px; font-weight: 400; - color: #cdcdcd; + color: #111; } section ul { margin: 10px auto 20px auto; @@ -367,22 +376,32 @@ section h1, section h2, section h3, section h4, section h5, section h6, section max-width: 720px; } -.content-dataset section:nth-child(2) p:first-child{ - font-size:19px; +.content-dataset-list section:nth-child(1) p:nth-child(2){ + font-size:22px; + line-height:34px; +} +.content-dataset section:nth-child(4) p:nth-child(2){ + font-size:20px; + line-height: 32px; + color:#000; +} +.content-dataset section:nth-child(3) p:nth-child(2) { + /* highlight news text */ + color:#f00; } p.subp{ font-size: 14px; } .content a { - color: #dedede; + color: #333; text-decoration: none; - border-bottom: 2px solid #666; + border-bottom: 1px solid #333; padding-bottom: 1px; transition: color 0.1s cubic-bezier(0,0,1,1); } .desktop .content a:hover { - color: #fff; - border-bottom: 2px solid #ccc; + color: #111; + border-bottom: 1px solid #111; } /* top of post metadata */ @@ -393,7 +412,7 @@ p.subp{ justify-content: flex-start; align-items: 
flex-start; font-size: 12px; - color: #ccc; + color: #111; margin-bottom: 20px; font-family: 'Roboto', sans-serif; margin-right: 20px; @@ -412,7 +431,6 @@ p.subp{ float: right; width: 200px; margin: 0px 20px 20px 20px; - padding-top: 12px; padding-left: 20px; border-left: 1px solid #333; font-family: 'Roboto'; @@ -442,7 +460,10 @@ p.subp{ border-bottom: 1px solid #333; padding:10px 10px 10px 0; margin: 0 4px 4px 0; - color: #bbb; + color: #111; +} +.right-sidebar .meta:last-child{ + border-bottom: 0; } .right-sidebar ul { margin-bottom: 10px; @@ -477,7 +498,7 @@ ul { } ul li { margin-bottom: 8px; - color: #dedede; + color: #333; font-weight: 400; font-size: 14px; } @@ -497,8 +518,9 @@ pre { border-radius: 2px; padding: 10px; display: block; - background: #333; + background: #ddd; overflow: auto + /*margin-bottom: 10px;*/ } pre code { display: block; @@ -533,10 +555,10 @@ table tr td{ font-size:12px; } table tbody tr:nth-child(odd){ - background-color:#292929; + background-color:#ebebeb; } table tbody tr:nth-child(even){ - background-color:#333; + background-color:#ccc; } hr { @@ -670,22 +692,24 @@ section.fullwidth .image { } .image .caption.intro-caption{ text-align: center; + color:#666; } .caption { text-align: center; font-size: 10pt; - color: #999; + line-height: 14pt; + color: #555; max-width: 960px; margin: 10px auto 10px auto; font-family: 'Roboto'; } .caption a { - color: #ccc; - border: 0; + color: #333; + border-bottom: 1px solid #333; } .desktop .caption a:hover { - color: #fff; - border: 0; + color: #111; + border-bottom: 1px solid #111; } @@ -873,7 +897,7 @@ section.fullwidth .image { .dataset-list .dataset { width: 300px; padding: 12px; - color: white; + color: #000; font-weight: 400; font-family: 'Roboto'; position: relative; @@ -884,21 +908,22 @@ section.fullwidth .image { height: 178px; } .desktop .content .dataset-list a { - border: 1px solid #333; + border: 1px solid #999; } .desktop .dataset-list a:hover { - border: 1px solid #666; + border: 
1px solid #000; } .dataset-list .fields { font-size: 12px; - color: #ccc; + line-height: 17px; + color: #333; } .dataset-list .dataset .title{ font-size: 16px; line-height: 20px; margin-bottom: 4px; - font-weight: 400; + font-weight: 500; display: block; } .dataset-list .fields div { @@ -965,7 +990,7 @@ section.intro_section { justify-content: center; align-items: center; background-color: #111111; - margin-bottom: 20px; + /*margin-bottom: 20px;*/ padding: 0; } .intro_section .inner { @@ -1091,7 +1116,8 @@ ul.map-legend li:before { } ul.map-legend li.active { text-decoration: underline; - color: #fff; + color: #000; + font-weight: 500; } ul.map-legend li.edu:before { background-color: #f2f293; @@ -1118,7 +1144,7 @@ ul.map-legend li.source:before { } .content-about { - color: #fff; + /*color: #fff;*/ } .content-about p { font-size: 16px; @@ -1141,12 +1167,13 @@ ul.map-legend li.source:before { } .content-about .about-menu ul li a { border-bottom: 0; - color: #aaa; + color: #555; } .content-about .about-menu ul li a.current { - border-bottom: 1px solid #ddd; - color: #ddd; + border-bottom: 1px solid #000; + color: #000; + font-weight: 500; } /* columns */ @@ -1237,7 +1264,7 @@ a.footnote { /*display: inline-block;*/ bottom: 7px; text-decoration: none; - color: #ff8; + color: #666; border: 0; left: -1px; transition-duration: 0s; @@ -1255,7 +1282,7 @@ a.footnote_shim { } .desktop a.footnote:hover { /*background-color: #ff8;*/ - color: #fff; + color: #000; border: 0; } .backlinks { diff --git a/site/assets/css/tabulator.css b/site/assets/css/tabulator.css index d7a3fab3..baf44536 100755 --- a/site/assets/css/tabulator.css +++ b/site/assets/css/tabulator.css @@ -1,7 +1,7 @@ /* Tabulator v4.1.3 (c) Oliver Folkerd */ .tabulator { position: relative; - font-size: 13px; + font-size: 12px; text-align: left; overflow: hidden; -ms-transform: translatez(0); diff --git a/site/includes/age_gender_disclaimer.html b/site/includes/age_gender_disclaimer.html new file mode 100644 index 
00000000..f8dceb62 --- /dev/null +++ b/site/includes/age_gender_disclaimer.html @@ -0,0 +1,3 @@ +
+

Age and gender estimation distributions were calculated by analyzing all faces in the dataset images. This may include additional faces appearing next to an annotated face, or this may skip false faces that were erroneously included as part of the original dataset. These numbers are provided as an estimation and not a factual representation of the exact gender and age of all faces.

+
\ No newline at end of file diff --git a/site/includes/chart.html b/site/includes/chart.html deleted file mode 100644 index 01c2e83b..00000000 --- a/site/includes/chart.html +++ /dev/null @@ -1,14 +0,0 @@ -
-

Who used {{ metadata.meta.dataset.name_display }}?

- -

- This bar chart presents a ranking of the top countries where dataset citations originated. Mouse over individual columns to see yearly totals. These charts show at most the top 10 countries. -

- -
- -
- -
-
diff --git a/site/includes/dashboard.html b/site/includes/dashboard.html index d5e5693d..02d054b5 100644 --- a/site/includes/dashboard.html +++ b/site/includes/dashboard.html @@ -19,10 +19,10 @@
-

Information Supply chain

+

Information Supply Chain

- To help understand how {{ metadata.meta.dataset.name_display }} has been used around the world by commercial, military, and academic organizations; existing publicly available research citing {{ metadata.meta.dataset.name_full }} was collected, verified, and geocoded to show the biometric trade routes of people appearing in the images. Click on the markers to reveal research projects at that location. + To help understand how {{ metadata.meta.dataset.name_display }} has been used around the world by commercial, military, and academic organizations; existing publicly available research citing {{ metadata.meta.dataset.name_full }} was collected, verified, and geocoded to show how AI training data has proliferated around the world. Click on the markers to reveal research projects at that location.

diff --git a/site/includes/map.html b/site/includes/map.html deleted file mode 100644 index 372bed8d..00000000 --- a/site/includes/map.html +++ /dev/null @@ -1,22 +0,0 @@ -
- -

Information Supply Chain

- -

- To help understand how {{ metadata.meta.dataset.name_display }} has been used around the world by commercial, military, and academic organizations; existing publicly available research citing {{ metadata.meta.dataset.name_full }} was collected, verified, and geocoded to show the biometric trade routes of people appearing in the images. Click on the location markers to reveal research projects at that location. -

- -
- -
-
-
- -
-
    -
  • Academic
  • -
  • Commercial
  • -
  • Military / Government
  • -
-
Citation data is collected using SemanticScholar.org and then dataset usage verified and geolocated.
-
\ No newline at end of file diff --git a/site/public/about/index.html b/site/public/about/index.html index ce2b6228..427a97a2 100644 --- a/site/public/about/index.html +++ b/site/public/about/index.html @@ -63,22 +63,9 @@
  • Attribution
  • Legal / Privacy
  • -

    MegaPixels is an independent art and research project by Adam Harvey and Jules LaPlace that investigates the ethics, origins, and individual privacy implications of face recognition image datasets and their role in the expansion of biometric surveillance technologies.

    +

    MegaPixels is an independent art and research project by Adam Harvey and Jules LaPlace that investigates the ethics, origins, and individual privacy implications of face recognition image datasets and their role in the expansion of biometric surveillance technologies.

    MegaPixels is made possible with support from Mozilla

    -
    -
    -

    Adam Harvey

    -

    is Berlin-based American artist and researcher. His previous projects (CV Dazzle, Stealth Wear, and SkyLift) explore the potential for counter-surveillance as artwork. He is the founder of VFRAME (visual forensics software for human rights groups) and is a currently researcher in residence at Karlsruhe HfG.

    -

    ahprojects.com

    -

    -
    -
    -

    Jules LaPlace

    -

    is an American technologist and artist also based in Berlin. He was previously the CTO of a digital agency in NYC and now also works at VFRAME, developing computer vision and data analysis software for human rights groups. Jules also builds experimental software for artists and musicians. -

    -

    asdf.us

    -
    -

    MegaPixels is an art and research project first launched in 2017 for an installation at Tactical Technology Collective's GlassRoom about face recognition datasets. In 2018 MegaPixels was extended to cover pedestrian analysis datasets for a commission by Elevate Arts festival in Austria. Since then MegaPixels has evolved into a large-scale interrogation of hundreds of publicly-available face and person analysis datasets, the first of which launched on this site in April 2019.

    +

    MegaPixels is an art and research project first launched in 2017 for an installation at Tactical Technology Collective's GlassRoom about face recognition datasets. In 2018 MegaPixels was extended to cover pedestrian analysis datasets for a commission by Elevate Arts festival in Austria. Since then MegaPixels has evolved into a large-scale interrogation of hundreds of publicly-available face and person analysis datasets, the first of which launched on this site in April 2019.

    MegaPixels aims to provide a critical perspective on machine learning image datasets, one that might otherwise escape academia and industry funded artificial intelligence think tanks that are often supported by the same technology companies who created many of the datasets presented on this site.

    MegaPixels is an independent project, designed as a public resource for educators, students, journalists, and researchers. Each dataset presented on this site undergoes a thorough review of its images, intent, and citations. MegaPixels is a website-first research project, with an academic publication to follow in fall 2019.

    A dataset of verified geocoded citations and dataset statistics will be published in Fall 2019 along with a research paper as part of a research fellowship for KIM (Critical Artificial Intelligence) Karlsruhe HfG.

    @@ -90,18 +77,18 @@
  • June 26, 2019: The Atlantic writes about image training datasets "in the wild" and research ethics: Universities Record Students on Campuses for Research by Sidney Fussell
  • Read more news

    -
    Team
    +
    Team
    • Adam Harvey: Concept, research and analysis, design, computer vision
    • Jules LaPlace: Information and systems architecture, data management, web applications
    -
    Contributing Researchers
    +
    Contributing Researchers
    • Beth (aka Ms. Celeb)
    • Berit Gilma
    • Mathana Stender
    -
    Code and Libraries
    +
    Code and Libraries
    • Semantic Scholar for citation aggregation
    • Leaflet.js for maps
    • @@ -109,7 +96,7 @@
    • ThreeJS for 3D visualizations
    • PDFMiner.Six and Pandas for research paper analysis
    -
    Attribution
    +
    Attribution

    If you use MegaPixels or any data derived from it for your work, please cite our original work as follows:

     @online{megapixels,
    @@ -119,9 +106,7 @@
      url = {https://megapixels.cc/},
      urldate = {2019-04-18}
     }
    -
    Contact
    -

    Please direct questions, comments, or feedback to mastodon.social/@adamhrv or contact via https://ahprojects.com/about

    -
    +