Diffstat (limited to 'cli/app/utils/identity_utils.py')
 cli/app/utils/identity_utils.py | 161
 1 file changed, 161 insertions(+), 0 deletions(-)
diff --git a/cli/app/utils/identity_utils.py b/cli/app/utils/identity_utils.py
new file mode 100644
index 0000000..5855fbb
--- /dev/null
+++ b/cli/app/utils/identity_utils.py
@@ -0,0 +1,161 @@
+import os
+from pathlib import Path
+from glob import glob
+import unidecode
+import difflib
+
+from app.settings import types
+from app.models.data_store import DataStore
+from app.utils import logger_utils
+
+log = logger_utils.Logger.getLogger()
+
+az = 'abcdefghijklmnopqrstuvwxyz'
+AZ = az.upper()
+z9 = [str(x) for x in range(10)]
+aZ9 = list(az) + list(AZ) + z9
+
+def letter_strip(a, b=aZ9):
+    # keep only the characters of a that appear in b (alphanumerics by default)
+    return ''.join([x for x in a if x in b])
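+# e.g. letter_strip('Jean-Luc') -> 'JeanLuc' (the hyphen is not alphanumeric)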
+
+def letter_match(a, b):
+    # check that every character of a also occurs in b
+    return all(x in b for x in a)
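+# e.g. letter_match('abc', 'cab') -> True; letter_match('abd', 'cab') -> False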
+
+def names_match_strict(a, b):
+    # strict match: both names contain exactly the same multiset of
+    # alphanumeric characters (order-insensitive, punctuation ignored)
+    return sorted(letter_strip(a)) == sorted(letter_strip(b))
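+# e.g. names_match_strict('Tim Cook', 'Cook, Tim') -> True (same characters, reordered)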
+
+
+def sanitize_name(name, as_str=False):
+    # lowercase, strip diacritics (via unidecode), and split into plain-ASCII words
+    splits = [unidecode.unidecode(x.lower()) for x in name.strip().split()]
+    if as_str:
+        return ' '.join(splits)
+    return splits
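+# e.g. sanitize_name('  José  Carreras ') -> ['jose', 'carreras']
+# e.g. sanitize_name('José Carreras', as_str=True) -> 'jose carreras'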
+
+# For reference, types.Dataset enumerates the supported datasets:
+# LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK,
+# CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI, LAG
+# Get the lists of identity names for a given Dataset type
+def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
+    data_store = DataStore(opt_data_store, opt_dataset)
+    dir_dataset = data_store.dir_dataset  # path to dataset root
+    dir_media_orig = data_store.dir_media_original
+    # initialize defaults so datasets without a handler below return empty lists
+    names_orig = []
+    names_query = []
+ if opt_dataset == types.Dataset.AFW:
+ # Annotated Faces in the Wild
+ pass
+ elif opt_dataset == types.Dataset.BRAINWASH:
+ # Brainwash IP Cam dataset
+ pass
+ elif opt_dataset == types.Dataset.CASIA_WEBFACE:
+        # CASIA WebFace
+ pass
+ elif opt_dataset == types.Dataset.HELEN:
+ # Helen
+ pass
+    elif opt_dataset == types.Dataset.IMDB_WIKI:
+        # IMDb-Wiki
+        pass
+ elif opt_dataset == types.Dataset.LAG:
+ # Large Age Gap
+ pass
+ elif opt_dataset == types.Dataset.LFW:
+ # Labeled Faces in The Wild
+        names_orig = os.listdir(dir_media_orig)
+        names_query = [x.replace('_', ' ') for x in names_orig]
+ elif opt_dataset == types.Dataset.MEGAFACE:
+ # MegaFace
+ pass
+ elif opt_dataset == types.Dataset.MSCELEB:
+ # MS Celeb
+ pass
+ elif opt_dataset == types.Dataset.PIPA:
+ # People in Photo Albums
+ pass
+ elif opt_dataset == types.Dataset.PUBFIG83:
+ # PubFig83
+        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
+ names_query = [x.replace('_', ' ') for x in names_orig]
+ elif opt_dataset == types.Dataset.SCUT_FBP:
+ # SCUT Facial Beauty Perception
+ pass
+ elif opt_dataset == types.Dataset.UCCS:
+        # Unconstrained College Students
+ pass
+ elif opt_dataset == types.Dataset.UMD_FACES:
+ # University of Maryland Faces
+ pass
+ elif opt_dataset == types.Dataset.UTK:
+ # University of Tennessee Knoxville
+ pass
+ elif opt_dataset == types.Dataset.UCF_SELFIE:
+ # University of Central Florida Selfie
+ pass
+ elif opt_dataset == types.Dataset.VGG_FACE:
+ # Visual Geometry Group Face 1
+ pass
+ elif opt_dataset == types.Dataset.VGG_FACE2:
+ # Visual Geometry Group Face 2
+ pass
+    else:
+        log.warn(f'{opt_dataset} not yet implemented')
+ result = {'names_orig': names_orig, 'names_query': names_query}
+ return result
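+# e.g. get_names(types.Dataset.LFW) might return
+# {'names_orig': ['Aaron_Eckhart', ...], 'names_query': ['Aaron Eckhart', ...]}
+# (illustrative; actual contents depend on the local copy of the dataset)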
+
+
+def similarity(a, b):
+ return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
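+# e.g. similarity('Jon', 'John') -> 2*3/7 ~ 0.86 (case-insensitive SequenceMatcher ratio)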
+
+def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False,
+                name_a_pre=False, name_b_pre=False):
+    '''Returns True if the names are similar enough (or the raw score if as_float).
+
+    Set name_a_pre/name_b_pre when a name is already sanitized into a word list.
+    '''
+    # sanitize into lists of plain-ASCII lowercase words, unless pre-sanitized
+    name_a_clean = name_a if name_a_pre else sanitize_name(name_a)
+    name_b_clean = name_b if name_b_pre else sanitize_name(name_b)
+
+    if not name_a_clean or not name_b_clean:
+        # nothing to compare
+        return 0.0 if as_float else False
+
+    # score each word of name_a against every word of name_b
+    scores = []
+    for word_a in name_a_clean:
+        subscores = [similarity(word_a, word_b) for word_b in name_b_clean]
+        scores.append(subscores)
+
+    # average the best per-word similarity
+    ratio_similar = sum(max(x) for x in scores) / len(scores)
+
+    if compound_score:
+        # blend similarity with a penalty for differing total name length
+        letters_a = sum(len(x) for x in name_a_clean)
+        letters_b = sum(len(x) for x in name_b_clean)
+        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
+        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
+ else:
+ score = ratio_similar
+
+ if as_float:
+ return score
+ else:
+        return score > threshold
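+
+
+if __name__ == '__main__':
+    # Minimal smoke test (illustrative names only, not drawn from any dataset).
+    print(names_match('Jon Smith', 'John Smith', as_float=True))  # ~0.93
+    print(names_match('Jon Smyth', 'Jane Doe'))                   # False
+    print(names_match_strict('John Smith', 'Smith, John'))        # True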