diff options
| author | adamhrv <adam@ahprojects.com> | 2018-12-16 19:37:58 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2018-12-16 19:37:58 +0100 |
| commit | c3839ea797401d740db64691c0b4922c935b131c (patch) | |
| tree | ef64b6b441dd677a41f79a423af8b7a44e68b23f /megapixels/app/models/dataset.py | |
| parent | 10f467b64e3be528ac246d5cf664d675aca3e7f3 (diff) | |
still sorting CSV vectors indexes
Diffstat (limited to 'megapixels/app/models/dataset.py')
| -rw-r--r-- | megapixels/app/models/dataset.py | 41 |
1 file changed, 25 insertions, 16 deletions
diff --git a/megapixels/app/models/dataset.py b/megapixels/app/models/dataset.py index 11d568a5..8fef8a7e 100644 --- a/megapixels/app/models/dataset.py +++ b/megapixels/app/models/dataset.py @@ -2,6 +2,7 @@ Dataset model: container for all CSVs about a dataset """ import os +import sys from os.path import join from pathlib import Path import logging @@ -12,7 +13,8 @@ import numpy as np from app.settings import app_cfg as cfg from app.settings import types from app.models.bbox import BBox -from app.utils import file_utils, im_utils, path_utils +from app.utils import file_utils, im_utils +from app.models.data_store import DataStore, DataStoreS3 from app.utils.logger_utils import Logger # ------------------------------------------------------------------------- @@ -21,17 +23,19 @@ from app.utils.logger_utils import Logger class Dataset: - def __init__(self, opt_dataset_type, opt_data_store=types.DataStore.NAS): + def __init__(self, opt_data_store, opt_dataset_type, load_files=True): self._dataset_type = opt_dataset_type # enum type self.log = Logger.getLogger() self._metadata = {} self._face_vectors = [] self._nullframe = pd.DataFrame() # empty placeholder - self.data_store = path_utils.DataStore(opt_data_store, self._dataset_type) - self.data_store_s3 = path_utils.DataStoreS3(self._dataset_type) + self.data_store = DataStore(opt_data_store, self._dataset_type) + self.data_store_s3 = DataStoreS3(self._dataset_type) + self.load_metadata() - def load(self, opt_data_store): + def load_metadata(self): '''Loads all CSV files into (dict) of DataFrames''' + self.log.info(f'creating dataset: {self._dataset_type}...') for metadata_type in types.Metadata: self.log.info(f'load metadata: {metadata_type}') fp_csv = self.data_store.metadata(metadata_type) @@ -40,11 +44,12 @@ class Dataset: self._metadata[metadata_type] = pd.read_csv(fp_csv).set_index('index') if metadata_type == types.Metadata.FACE_VECTOR: # convert DataFrame to list of floats - self._face_vecs = 
self.df_to_vec_list(self._metadata[metadata_type]) + self._face_vectors = self.df_to_vec_list(self._metadata[metadata_type]) + self.log.info(f'build face vector dict: {len(self._face_vectors)}') self._metadata[metadata_type].drop('vec', axis=1, inplace=True) else: - self.log.error('File not found: {fp_csv}. Replaced with empty DataFrame') - self._metadata[metadata_type] = self._nullframe + self.log.error(f'File not found: {fp_csv}. Exiting.') + sys.exit() self.log.info('finished loading') def metadata(self, opt_metadata_type): @@ -80,7 +85,7 @@ class Dataset: image_record = ImageRecord(image_index, sha256, uuid, bbox, fp_im, fp_url) # now get the identity index (if available) identity_index = ds_sha256.identity_index - if identity_index: + if identity_index > -1: # then use the identity index to get the identity meta df_identity = df_filepath = self._metadata[types.Metadata.IDENTITY] ds_identity = df_identity.iloc[identity_index] @@ -95,18 +100,24 @@ class Dataset: identity = Identity(identity_index, name=name, desc=desc, gender=gender, n_images=n_images, url=url, age=age, nationality=nationality) image_record.identity = identity + else: + self.log.info(f'no identity index: {ds_sha256}') return image_record - def matches(self, query_vec, n_results=5, threshold=0.5): + def find_matches(self, query_vec, n_results=5, threshold=0.6): image_records = [] # list of image matches w/identity if available # find most similar feature vectors indexes - match_idxs = self.similar(query_vec, n_results, threshold) + #match_idxs = self.similar(query_vec, n_results, threshold) + sim_scores = np.linalg.norm(np.array([query_vec]) - np.array(self._face_vectors), axis=1) + match_idxs = np.argpartition(sim_scores, n_results)[:n_results] + for match_idx in match_idxs: # get the corresponding face vector row + self.log.debug(f'find match index: {match_idx}') image_record = self.roi_idx_to_record(match_idx) - results.append(image_record) + image_records.append(image_record) return 
image_records # ---------------------------------------------------------------------- @@ -114,8 +125,7 @@ class Dataset: def df_to_vec_list(self, df): # convert the DataFrame CSV to float list of vecs - vecs = [list(map(float,x.vec.split(','))) for x in df.itertuples()] - return vecs + return [list(map(float,x.vec.split(','))) for x in df.itertuples()] def similar(self, query_vec, n_results): '''Finds most similar N indices of query face vector @@ -124,8 +134,7 @@ class Dataset: :returns (list) of (int) indices ''' # uses np.linalg based on the ageitgey/face_recognition code - vecs_sim_scores = np.linalg.norm(np.array([query_vec]) - np.array(self._face_vectors), axis=1) - top_idxs = np.argpartition(vecs_sim_scores, n_results)[:n_results] + return top_idxs |
