diff options
Diffstat (limited to 'cli/app/utils/identity_utils.py')
| -rw-r--r-- | cli/app/utils/identity_utils.py | 161 |
1 files changed, 161 insertions, 0 deletions
import os
import string
from pathlib import Path
from glob import glob
import unidecode
import difflib

from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()

# Alphanumeric alphabets used to normalize names before comparison.
# BUGFIX: the original hand-typed alphabet ('abcdefghijklmlopqrstuvwzxyz')
# was missing 'n' and contained duplicated/transposed letters, so
# letter_strip() silently dropped every 'n'. Use the stdlib constants.
az = string.ascii_lowercase
AZ = string.ascii_uppercase
z9 = list(string.digits)
aZ9 = list(az) + list(AZ) + z9


def letter_strip(a, b=aZ9):
    """Return `a` with every character not present in `b` removed.

    :param a: input string
    :param b: iterable of allowed characters (defaults to [a-zA-Z0-9])
    """
    return ''.join(x for x in a if x in b)


def letter_match(a, b):
    """Return True if every character of `a` also occurs in `b`.

    Membership only — character counts are ignored ('aab' matches 'ab').
    """
    return all(x in b for x in a)


def names_match_strict(a, b):
    """Return True if `a` and `b` consist of the same alphanumeric characters.

    Both strings are reduced to their [a-zA-Z0-9] characters; the results must
    have equal length and each must contain every character of the other.
    NOTE: per-character counts are not compared, so same-length pairs over the
    same character set (e.g. 'aab' vs 'abb') still match.
    """
    clean_a = letter_strip(a)
    clean_b = letter_strip(b)
    return (len(clean_a) == len(clean_b)
            and letter_match(clean_a, clean_b)
            and letter_match(clean_b, clean_a))


def sanitize_name(name, as_str=False):
    """Lowercase, ASCII-fold (unidecode), and tokenize a person's name.

    :param name: raw name string (may contain accents / extra whitespace)
    :param as_str: if True return a single space-joined string,
        otherwise the list of sanitized words
    """
    words = [unidecode.unidecode(w.strip().lower()) for w in name.strip().split(' ')]
    return ' '.join(words) if as_str else words


def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    """Return identity names available in a face dataset.

    :param opt_dataset: a `types.Dataset` enum member
    :param opt_data_store: storage location enum (defaults to HDD)
    :returns: dict with 'names_orig' (directory names on disk) and
        'names_query' (human-readable names, underscores replaced by spaces).
        Unimplemented datasets yield empty lists.
    """
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_dataset = data_store.dir_dataset  # path to dataset root
    dir_media_orig = data_store.dir_media_original

    # BUGFIX: initialize defaults so unimplemented datasets return empty
    # lists instead of raising NameError when building the result dict.
    names_orig = []
    names_query = []

    if opt_dataset == types.Dataset.LFW:
        # Labeled Faces in the Wild: one directory per identity,
        # name words joined by underscores
        names_orig = [x for x in os.listdir(dir_media_orig)]
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: one directory per identity, plus stray .txt metadata files.
        # BUGFIX: was `Path(x).suffix is not '.txt'` — identity comparison on a
        # string literal (always True in practice); use `!=`.
        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset in (
            types.Dataset.AFW,           # Annotated Faces in the Wild
            types.Dataset.BRAINWASH,     # Brainwash IP Cam dataset
            types.Dataset.CASIA_WEBFACE, # CASIA WebFace
            types.Dataset.HELEN,         # Helen
            types.Dataset.IMDB_WIKI,     # IMDB-Wiki (original comment was a copy-paste error)
            types.Dataset.LAG,           # Large Age Gap
            types.Dataset.MEGAFACE,      # MegaFace
            types.Dataset.MSCELEB,       # MS Celeb
            types.Dataset.PIPA,          # People in Photo Albums
            types.Dataset.SCUT_FBP,      # SCUT Facial Beauty Perception
            types.Dataset.UCCS,          # Unconstrained College Students
            types.Dataset.UMD_FACES,     # University of Maryland Faces
            types.Dataset.UTK,           # University of Tennessee Knoxville
            types.Dataset.UCF_SELFIE,    # University of Central Florida Selfie
            types.Dataset.VGG_FACE,      # Visual Geometry Group Face 1
            types.Dataset.VGG_FACE2,     # Visual Geometry Group Face 2
    ):
        # Known datasets with no name listing implemented yet.
        pass
    else:
        log.warn(f'{opt_dataset} not yet implemented')

    return {'names_orig': names_orig, 'names_query': names_query}


def similarity(a, b):
    """Case-insensitive difflib similarity ratio between two strings (0.0-1.0)."""
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()


def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False,
                name_a_pre=False, name_b_pre=False):
    """Fuzzy-compare two person names word by word.

    Each word of `name_a` is scored against its best-matching word of
    `name_b` (via `similarity`) and the best scores are averaged. The measure
    is asymmetric in word count: extra words in `name_b` are not penalized
    unless `compound_score` is set.

    :param name_a: first name (raw string, or pre-sanitized word list if name_a_pre)
    :param name_b: second name (raw string, or pre-sanitized word list if name_b_pre)
    :param threshold: similarity cutoff for the boolean result
    :param as_float: if True return the raw score instead of a boolean
    :param compound_score: blend in a letter-count ratio
        (0.8 * similarity + 0.2 * letters) to penalize missing words/letters
    :param name_a_pre: `name_a` is already a sanitized word list
    :param name_b_pre: `name_b` is already a sanitized word list
    :returns: float score if as_float, else bool(score > threshold)
    """
    # Consistency fix: reuse sanitize_name() instead of duplicating its
    # strip/lower/unidecode logic inline.
    name_a_clean = name_a if name_a_pre else sanitize_name(name_a)
    name_b_clean = name_b if name_b_pre else sanitize_name(name_b)

    # Best similarity of each word in name_a against any word in name_b.
    best_scores = [
        max(similarity(word_a, word_b) for word_b in name_b_clean)
        for word_a in name_a_clean
    ]
    ratio_similar = sum(best_scores) / len(best_scores)

    if compound_score:
        # Penalize overall length mismatch (missing words / letters).
        letters_a = sum(len(w) for w in name_a_clean)
        letters_b = sum(len(w) for w in name_b_clean)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar

    return score if as_float else score > threshold
\ No newline at end of file |
