import os
from pathlib import Path
import difflib

import unidecode

from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()

# character whitelist used for strict name comparison
az = 'abcdefghijklmnopqrstuvwxyz'
AZ = az.upper()
z9 = list(map(str, range(10)))
aZ9 = list(az) + list(AZ) + z9


def letter_strip(a, b=aZ9):
    # strip every character from a that is not in b
    return ''.join([x for x in a if x in b])


def letter_match(a, b):
    # check that every character (a-zA-Z0-9) of a exists in b
    return sum([x in b for x in a]) == len(a)


def names_match_strict(a, b):
    # names match strictly if they contain exactly the same alphanumeric characters
    clean_a = letter_strip(a)
    clean_b = letter_strip(b)
    return len(clean_a) == len(clean_b) and letter_match(clean_a, clean_b) and letter_match(clean_b, clean_a)


'''
class Dataset(Enum):
    LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(17)
'''


# Get list of names based on Dataset type
def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_dataset = data_store.dir_dataset  # path to dataset root
    dir_media_orig = data_store.dir_media_original

    # default to empty lists so unimplemented datasets return a valid result
    names_orig = []
    names_query = []

    if opt_dataset == types.Dataset.AFW:
        # Annotated Faces in the Wild
        pass
    elif opt_dataset == types.Dataset.BRAINWASH:
        # Brainwash IP cam dataset
        pass
    elif opt_dataset == types.Dataset.CASIA_WEBFACE:
        # CASIA WebFace
        pass
    elif opt_dataset == types.Dataset.HELEN:
        # Helen
        pass
    elif opt_dataset == types.Dataset.IMDB_WIKI:
        # IMDB-WIKI
        pass
    elif opt_dataset == types.Dataset.LAG:
        # Large Age Gap
        pass
    elif opt_dataset == types.Dataset.LFW:
        # Labeled Faces in the Wild
        names_orig = [x for x in os.listdir(dir_media_orig)]
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.MEGAFACE:
        # MegaFace
        pass
    elif opt_dataset == types.Dataset.MSCELEB:
        # MS Celeb
        pass
    elif opt_dataset == types.Dataset.PIPA:
        # People in Photo Albums
        pass
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83
        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.SCUT_FBP:
        # SCUT Facial Beauty Perception
        pass
    elif opt_dataset == types.Dataset.UCCS:
        # Unconstrained College Students
        pass
    elif opt_dataset == types.Dataset.UMD_FACES:
        # University of Maryland Faces
        pass
    elif opt_dataset == types.Dataset.UTK:
        # University of Tennessee Knoxville
        pass
    elif opt_dataset == types.Dataset.UCF_SELFIE:
        # University of Central Florida Selfie
        pass
    elif opt_dataset == types.Dataset.VGG_FACE:
        # Visual Geometry Group Face 1
        pass
    elif opt_dataset == types.Dataset.VGG_FACE2:
        # Visual Geometry Group Face 2
        pass
    else:
        log.warn(f'{opt_dataset} not yet implemented')

    result = {'names_orig': names_orig, 'names_query': names_query}
    return result


def similarity(a, b):
    # case-insensitive SequenceMatcher ratio between two strings
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()

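# Illustrative behaviour of the helpers above. The example strings are
# assumptions chosen for this sketch, not values used elsewhere in the app:
#
#   letter_strip('Jean-Luc Picard')             -> 'JeanLucPicard'
#   names_match_strict('Jean-Luc', 'Jean Luc')  -> True   (same characters, same count)
#   similarity('picard', 'pickard')             -> ~0.92  (SequenceMatcher ratio)
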
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    '''Return True if the names are similar enough (or the raw score if as_float is True)'''
    # strip accents and whitespace, split names into lists of plain-text words
    name_a_clean = [unidecode.unidecode(x.strip().lower()) for x in name_a.strip().split(' ')]
    name_b_clean = [unidecode.unidecode(x.strip().lower()) for x in name_b.strip().split(' ')]

    # compute pairwise word similarity scores:
    # one row per word in name_a, one column per word in name_b
    scores = []
    for word_a in name_a_clean:
        subscores = []
        for word_b in name_b_clean:
            subscores.append(similarity(word_a, word_b))
        scores.append(subscores)

    # average the best match for each word in name_a
    ratio_similar = sum(max(x) for x in scores) / len(scores)

    if compound_score:
        # penalize missing letters/words by blending in the letter-count ratio
        letters_a = sum(len(x) for x in name_a_clean)
        letters_b = sum(len(x) for x in name_b_clean)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar

    if as_float:
        return score
    return score > threshold
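
# A minimal usage sketch of the fuzzy name matching above. The example names
# are assumptions chosen for illustration; running this file directly also
# requires the app.* modules imported at the top to be importable.
if __name__ == '__main__':
    pairs = [
        ('Jean-Luc Picard', 'Jean Luc Picard'),
        ('Jon Smith', 'John Smith'),
        ('Jane Doe', 'John Doe'),
    ]
    for a, b in pairs:
        print(f'{a!r} vs {b!r}: '
              f'strict={names_match_strict(a, b)}, '
              f'score={names_match(a, b, as_float=True):.3f}, '
              f'compound={names_match(a, b, as_float=True, compound_score=True):.3f}, '
              f'match={names_match(a, b)}')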