diff options
| -rw-r--r-- | megapixels/app/utils/identity_utils.py | 68 |
1 files changed, 68 insertions, 0 deletions
"""Helpers for listing dataset identity names and fuzzy-matching person names."""
import os
from pathlib import Path
from glob import glob
import unidecode
import difflib

from app.settings import types
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()


# Get list of names based on Dataset type
def get_names(enum_dataset):
    """Return the identity names available for a dataset.

    :param enum_dataset: a ``types.Dataset`` member
    :returns: for ``types.Dataset.LFW`` a dict with ``names_orig`` (the entries
        of the LFW media directory as listed by ``os.listdir``) and
        ``names_query`` (the same entries with underscores replaced by
        spaces); an empty dict for every other dataset
    :raises OSError: if the LFW media directory cannot be listed
    """
    if enum_dataset == types.Dataset.LFW:
        dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'
        names_orig = os.listdir(dir_lfw)
        names_query = [x.replace('_', ' ') for x in names_orig]
        return {'names_orig': names_orig, 'names_query': names_query}

    # NOTE: types.Dataset.YOUTUBE_FACES lands here as well. Its former branch
    # filtered 'labeled faces.txt' out of a list that was never assigned and
    # never set a result, so it always raised UnboundLocalError.
    # TODO: list the YouTube Faces directory once its location is known.
    log.warn(f'{enum_dataset} not yet implemented')
    return {}


def similarity(a, b):
    """Return the case-insensitive difflib similarity ratio (0.0-1.0) of two strings."""
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()


def _clean_words(name):
    """Split a name on whitespace into lowercased, ASCII-transliterated words."""
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces do not produce empty "words" that would distort the score
    words = (unidecode.unidecode(word.lower()) for word in name.split())
    return [word for word in words if word]


def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    """Decide whether two names are similar enough to be the same identity.

    Each word of the shorter name (by word count) is paired with its best
    matching word in the longer name; the score is the mean of those best
    matches, so it always lies in 0.0-1.0 regardless of argument order when the
    word counts differ.

    :param name_a: first name
    :param name_b: second name
    :param threshold: the score must be strictly greater than this to match
    :param as_float: return the raw score instead of a boolean
    :param compound_score: blend in the ratio of total letter counts
        (80% word similarity, 20% letter ratio) to penalize missing words
    :returns: the float score if ``as_float`` is set, otherwise a bool
    """
    words_a = _clean_words(name_a)
    words_b = _clean_words(name_b)

    if not words_a or not words_b:
        # nothing to compare; also avoids dividing by zero below
        score = 0.0
    else:
        # iterate over the name with fewer words so the divisor matches the
        # number of summed terms (ties keep name_a as the outer name)
        if len(words_a) <= len(words_b):
            shorter, longer = words_a, words_b
        else:
            shorter, longer = words_b, words_a

        best_matches = [
            max(similarity(word, other) for other in longer)
            for word in shorter
        ]
        ratio_similar = sum(best_matches) / len(shorter)

        if compound_score:
            # combine with any missing letters/words
            letters_a = sum(len(x) for x in words_a)
            letters_b = sum(len(x) for x in words_b)
            ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
            score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
        else:
            score = ratio_similar

    if as_float:
        return score
    return score > threshold
\ No newline at end of file |
