import os
from pathlib import Path
from glob import glob

import unidecode
import difflib

from app.settings import types
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()


def get_names(enum_dataset):
    """Return name lists for a face dataset.

    :param enum_dataset: a ``types.Dataset`` enum member
    :returns: dict with keys ``'names_orig'`` (directory names as stored
        on disk) and ``'names_query'`` (underscores replaced by spaces),
        or an empty dict for unimplemented datasets.
    """
    # Initialize up front: the original code left `result` unassigned on
    # the YOUTUBE_FACES path, which made the final return raise.
    result = {}
    if enum_dataset == types.Dataset.LFW:
        dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'
        names_orig = os.listdir(dir_lfw)
        # LFW folder names use underscores between name parts;
        # the query form uses plain spaces.
        names_query = [x.replace('_', ' ') for x in names_orig]
        result = {'names_orig': names_orig, 'names_query': names_query}
    elif enum_dataset == types.Dataset.YOUTUBE_FACES:
        # FIXME: the original branch filtered an undefined `names` variable
        # (NameError: `names = [x for x in names if 'labeled faces.txt'
        # not in x]`) — the listing source was never wired up. Treat as
        # not implemented until the YouTube Faces directory is known.
        # NOTE(review): `log.warn` kept as-is — `log` comes from the
        # project's logger_utils and may not expose `warning`; confirm.
        log.warn(f'{enum_dataset} not yet implemented')
    else:
        log.warn(f'{enum_dataset} not yet implemented')
    return result


def similarity(a, b):
    """Return the case-insensitive difflib similarity ratio of two strings.

    :param a: first string
    :param b: second string
    :returns: float in [0, 1]
    """
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()


def names_match(name_a, name_b, threshold=0.9, as_float=False,
                compound_score=False):
    """Decide whether two person names are similar enough to match.

    Each name is split into words, accent-stripped and lower-cased, then
    every word of ``name_a`` is compared against every word of ``name_b``
    and the best per-word similarities are averaged.

    :param name_a: first name string
    :param name_b: second name string
    :param threshold: minimum score to count as a match (used when
        ``as_float`` is False)
    :param as_float: if True, return the raw score instead of a boolean
    :param compound_score: if True, blend the word-similarity ratio (0.8)
        with a letter-count ratio (0.2) to penalize missing letters/words
    :returns: bool (score > threshold) or float score when ``as_float``
    """
    # Normalize: strip, lower-case, and transliterate accents to ASCII.
    name_a_clean = [unidecode.unidecode(x.strip().lower())
                    for x in name_a.strip().split(' ')]
    name_b_clean = [unidecode.unidecode(x.strip().lower())
                    for x in name_b.strip().split(' ')]

    # split(' ') always yields at least one element, so len_min >= 1.
    len_min = min(len(name_a_clean), len(name_b_clean))

    # Best match of each word in name_a against any word in name_b.
    scores = [
        [similarity(word_a, word_b) for word_b in name_b_clean]
        for word_a in name_a_clean
    ]

    # NOTE(review): summing over name_a's words but dividing by len_min
    # is asymmetric and can exceed 1.0 when name_a has more words than
    # name_b — preserved as-is; confirm intent before changing.
    ratio_similar = sum(max(row) for row in scores) / len_min

    if compound_score:
        # Blend in the letter-count ratio so dropped words/letters
        # reduce the score.
        letters_a = sum(len(x) for x in name_a_clean)
        letters_b = sum(len(x) for x in name_b_clean)
        letters_max = max(letters_a, letters_b)
        # Guard: both names empty/whitespace would otherwise divide by 0.
        ratio_letters = (min(letters_a, letters_b) / letters_max
                         if letters_max else 1.0)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar

    if as_float:
        return score
    return score > threshold