diff options
Diffstat (limited to 'cli/app/utils/identity_utils.py')
| -rw-r--r-- | cli/app/utils/identity_utils.py | 161 |
1 files changed, 161 insertions, 0 deletions
import os
import string
from pathlib import Path
from glob import glob
import unidecode
import difflib

from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()

# Alphanumeric alphabets used to normalize names before comparison.
# BUGFIX: the original hand-typed alphabet ('abcdefghijklmlopqrstuvwzxyz')
# was missing 'n' and contained duplicated/transposed letters, so
# letter_strip() silently dropped every 'n'. Use the stdlib constants.
az = string.ascii_lowercase
AZ = string.ascii_uppercase
z9 = list(string.digits)
aZ9 = list(az) + list(AZ) + z9


def letter_strip(a, b=aZ9):
    """Return `a` with every character not present in `b` removed.

    :param a: input string
    :param b: iterable of allowed characters (defaults to [a-zA-Z0-9])
    """
    return ''.join(x for x in a if x in b)


def letter_match(a, b):
    """Return True if every character of `a` also occurs in `b`.

    Membership only — character counts are ignored ('aab' matches 'ab').
    """
    return all(x in b for x in a)


def names_match_strict(a, b):
    """Return True if `a` and `b` consist of the same alphanumeric characters.

    Both strings are reduced to their [a-zA-Z0-9] characters; the results must
    have equal length and each must contain every character of the other.
    NOTE: per-character counts are not compared, so same-length pairs over the
    same character set (e.g. 'aab' vs 'abb') still match.
    """
    clean_a = letter_strip(a)
    clean_b = letter_strip(b)
    return (len(clean_a) == len(clean_b)
            and letter_match(clean_a, clean_b)
            and letter_match(clean_b, clean_a))


def sanitize_name(name, as_str=False):
    """Lowercase, ASCII-fold (unidecode), and tokenize a person's name.

    :param name: raw name string (may contain accents / extra whitespace)
    :param as_str: if True return a single space-joined string,
        otherwise the list of sanitized words
    """
    words = [unidecode.unidecode(w.strip().lower()) for w in name.strip().split(' ')]
    return ' '.join(words) if as_str else words


def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    """Return identity names available in a face dataset.

    :param opt_dataset: a `types.Dataset` enum member
    :param opt_data_store: storage location enum (defaults to HDD)
    :returns: dict with 'names_orig' (directory names on disk) and
        'names_query' (human-readable names, underscores replaced by spaces).
        Unimplemented datasets yield empty lists.
    """
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_dataset = data_store.dir_dataset  # path to dataset root
    dir_media_orig = data_store.dir_media_original

    # BUGFIX: initialize defaults so unimplemented datasets return empty
    # lists instead of raising NameError when building the result dict.
    names_orig = []
    names_query = []

    if opt_dataset == types.Dataset.LFW:
        # Labeled Faces in the Wild: one directory per identity,
        # name words joined by underscores
        names_orig = [x for x in os.listdir(dir_media_orig)]
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: one directory per identity, plus stray .txt metadata files.
        # BUGFIX: was `Path(x).suffix is not '.txt'` — identity comparison on a
        # string literal (always True in practice); use `!=`.
        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset in (
            types.Dataset.AFW,           # Annotated Faces in the Wild
            types.Dataset.BRAINWASH,     # Brainwash IP Cam dataset
            types.Dataset.CASIA_WEBFACE, # CASIA WebFace
            types.Dataset.HELEN,         # Helen
            types.Dataset.IMDB_WIKI,     # IMDB-Wiki (original comment was a copy-paste error)
            types.Dataset.LAG,           # Large Age Gap
            types.Dataset.MEGAFACE,      # MegaFace
            types.Dataset.MSCELEB,       # MS Celeb
            types.Dataset.PIPA,          # People in Photo Albums
            types.Dataset.SCUT_FBP,      # SCUT Facial Beauty Perception
            types.Dataset.UCCS,          # Unconstrained College Students
            types.Dataset.UMD_FACES,     # University of Maryland Faces
            types.Dataset.UTK,           # University of Tennessee Knoxville
            types.Dataset.UCF_SELFIE,    # University of Central Florida Selfie
            types.Dataset.VGG_FACE,      # Visual Geometry Group Face 1
            types.Dataset.VGG_FACE2,     # Visual Geometry Group Face 2
    ):
        # Known datasets with no name listing implemented yet.
        pass
    else:
        log.warn(f'{opt_dataset} not yet implemented')

    return {'names_orig': names_orig, 'names_query': names_query}


def similarity(a, b):
    """Case-insensitive difflib similarity ratio between two strings (0.0-1.0)."""
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()


def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False,
                name_a_pre=False, name_b_pre=False):
    """Fuzzy-compare two person names word by word.

    Each word of `name_a` is scored against its best-matching word of
    `name_b` (via `similarity`) and the best scores are averaged. The measure
    is asymmetric in word count: extra words in `name_b` are not penalized
    unless `compound_score` is set.

    :param name_a: first name (raw string, or pre-sanitized word list if name_a_pre)
    :param name_b: second name (raw string, or pre-sanitized word list if name_b_pre)
    :param threshold: similarity cutoff for the boolean result
    :param as_float: if True return the raw score instead of a boolean
    :param compound_score: blend in a letter-count ratio
        (0.8 * similarity + 0.2 * letters) to penalize missing words/letters
    :param name_a_pre: `name_a` is already a sanitized word list
    :param name_b_pre: `name_b` is already a sanitized word list
    :returns: float score if as_float, else bool(score > threshold)
    """
    # Consistency fix: reuse sanitize_name() instead of duplicating its
    # strip/lower/unidecode logic inline.
    name_a_clean = name_a if name_a_pre else sanitize_name(name_a)
    name_b_clean = name_b if name_b_pre else sanitize_name(name_b)

    # Best similarity of each word in name_a against any word in name_b.
    best_scores = [
        max(similarity(word_a, word_b) for word_b in name_b_clean)
        for word_a in name_a_clean
    ]
    ratio_similar = sum(best_scores) / len(best_scores)

    if compound_score:
        # Penalize overall length mismatch (missing words / letters).
        letters_a = sum(len(w) for w in name_a_clean)
        letters_b = sum(len(w) for w in name_b_clean)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar

    return score if as_float else score > threshold
\ No newline at end of file |
