summaryrefslogtreecommitdiff
path: root/megapixels/app
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/app')
-rw-r--r--megapixels/app/utils/identity_utils.py68
1 files changed, 68 insertions, 0 deletions
diff --git a/megapixels/app/utils/identity_utils.py b/megapixels/app/utils/identity_utils.py
new file mode 100644
index 00000000..5b73a8e9
--- /dev/null
+++ b/megapixels/app/utils/identity_utils.py
@@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+from glob import glob
+import unidecode
+import difflib
+
+from app.settings import types
+from app.utils import logger_utils
+
+log = logger_utils.Logger.getLogger()
+
+# Get list of names based on Dataset type
def get_names(enum_dataset):
    """Return the identity names available for a dataset.

    Args:
        enum_dataset: a ``types.Dataset`` member selecting the dataset.

    Returns:
        For ``types.Dataset.LFW``: a dict with ``'names_orig'`` (the entry
        names found in the LFW media directory) and ``'names_query'`` (the
        same names with underscores replaced by spaces).
        For any other dataset: an empty dict, after logging a warning.

    Raises:
        OSError: if the LFW media directory cannot be listed.
    """
    if enum_dataset == types.Dataset.LFW:
        dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'
        names_orig = os.listdir(dir_lfw)
        names_query = [x.replace('_', ' ') for x in names_orig]
        return {'names_orig': names_orig, 'names_query': names_query}

    # NOTE: the YOUTUBE_FACES branch used to filter a `names` list that was
    # never populated and never produced a result, so it always raised
    # UnboundLocalError. Until a source directory is defined for it, treat it
    # like every other unsupported dataset. When implemented, entries
    # containing 'labeled faces.txt' should be excluded from the names.
    log.warn(f'{enum_dataset} not yet implemented')
    return {}
+
def similarity(a, b):
    """Return the case-insensitive difflib similarity ratio of two strings.

    The result is a float in [0, 1]; 1.0 means the lowercased strings are
    identical.
    """
    lowered_a = a.lower()
    lowered_b = b.lower()
    matcher = difflib.SequenceMatcher(None, lowered_a, lowered_b)
    return matcher.ratio()
+
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    """Decide whether two personal names are similar enough to match.

    Each name is split on whitespace into lowercased, ASCII-transliterated
    words. Every word of the name with fewer words is paired with its most
    similar word in the other name, and the mean of those best similarities
    is the word score.

    Args:
        name_a: first name string.
        name_b: second name string.
        threshold: the score must be strictly greater than this to match.
        as_float: if true, return the raw score instead of a boolean.
        compound_score: if true, blend the word score (weight 0.8) with the
            ratio of total letter counts (weight 0.2), which penalizes
            missing letters or words.

    Returns:
        The score as a float when ``as_float`` is true, otherwise
        ``score > threshold`` as a bool.
    """
    # split() without arguments collapses runs of whitespace, so repeated
    # spaces do not create empty "words" that would drag the score down
    words_a = [unidecode.unidecode(w.lower()) for w in name_a.split()]
    words_b = [unidecode.unidecode(w.lower()) for w in name_b.split()]

    if not words_a or not words_b:
        # nothing to compare word by word: two blank names are identical,
        # a blank name never matches a non-blank one
        score = 1.0 if words_a == words_b else 0.0
    else:
        # Compare the shorter name against the longer one so the number of
        # summed best-matches equals the divisor; this keeps the ratio in
        # [0, 1] and makes the result independent of argument order.
        # sorted() is stable, so equal-length names keep name_a as the driver.
        shorter, longer = sorted((words_a, words_b), key=len)
        best_matches = [
            max(similarity(word, other) for other in longer)
            for word in shorter
        ]
        ratio_similar = sum(best_matches) / len(shorter)

        if compound_score:
            # combine with any missing letters/words
            letters_a = sum(len(w) for w in words_a)
            letters_b = sum(len(w) for w in words_b)
            letters_max = max(letters_a, letters_b)
            # guard against names that transliterate to no letters at all
            if letters_max:
                ratio_letters = min(letters_a, letters_b) / letters_max
            else:
                ratio_letters = 1.0
            score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
        else:
            score = ratio_similar

    if as_float:
        return score
    return score > threshold