import os
from pathlib import Path
import unidecode
import difflib
from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils
log = logger_utils.Logger.getLogger()
az = 'abcdefghijklmnopqrstuvwxyz'
AZ = az.upper()
z9 = [str(d) for d in range(10)]
aZ9 = list(az) + list(AZ) + z9
def letter_strip(a, b=aZ9):
    # keep only the characters of a that appear in b (default: a-zA-Z0-9)
    return ''.join(x for x in a if x in b)
def letter_match(a, b):
    # check that every character of a also appears in b
    # (membership only; character counts and order are ignored)
    return all(x in b for x in a)
def names_match_strict(a, b):
    clean_a = letter_strip(a)
    clean_b = letter_strip(b)
    return len(clean_a) == len(clean_b) and letter_match(clean_a, clean_b) and letter_match(clean_b, clean_a)
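# Illustrative examples (not from the original source): the strict match is
# case-sensitive and membership-based, so punctuation and ordering are ignored:
#   names_match_strict('jon-doe', 'jondoe')   -> True
#   names_match_strict('listen', 'silent')    -> True  (same letters, any order)
#   names_match_strict('Jon Doe', 'jon doe')  -> False (case differs)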
'''
class Dataset(Enum):
LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
'''
# Get list of names based on Dataset type
def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_dataset = data_store.dir_dataset  # path to dataset root
    dir_media_orig = data_store.dir_media_original
    # default to empty lists so branches that are not implemented yet
    # still return a well-formed result instead of raising NameError
    names_orig = []
    names_query = []
    if opt_dataset == types.Dataset.AFW:
        # Annotated Faces in the Wild
        pass
    elif opt_dataset == types.Dataset.BRAINWASH:
        # Brainwash IP Cam dataset
        pass
    elif opt_dataset == types.Dataset.CASIA_WEBFACE:
        # CASIA WebFace
        pass
    elif opt_dataset == types.Dataset.HELEN:
        # Helen
        pass
    elif opt_dataset == types.Dataset.IMDB_WIKI:
        # IMDb-Wiki
        pass
    elif opt_dataset == types.Dataset.LAG:
        # Large Age Gap
        pass
    elif opt_dataset == types.Dataset.LFW:
        # Labeled Faces in the Wild: one directory per identity,
        # with underscores in place of spaces (e.g. 'George_W_Bush')
        names_orig = os.listdir(dir_media_orig)
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.MEGAFACE:
        # MegaFace
        pass
    elif opt_dataset == types.Dataset.MSCELEB:
        # MS-Celeb-1M
        pass
    elif opt_dataset == types.Dataset.PIPA:
        # People in Photo Albums
        pass
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: identity directories plus stray .txt files to skip
        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.SCUT_FBP:
        # SCUT Facial Beauty Perception
        pass
    elif opt_dataset == types.Dataset.UCCS:
        # Unconstrained College Students
        pass
    elif opt_dataset == types.Dataset.UMD_FACES:
        # University of Maryland Faces
        pass
    elif opt_dataset == types.Dataset.UTK:
        # University of Tennessee Knoxville
        pass
    elif opt_dataset == types.Dataset.UCF_SELFIE:
        # University of Central Florida Selfie
        pass
    elif opt_dataset == types.Dataset.VGG_FACE:
        # Visual Geometry Group Face 1
        pass
    elif opt_dataset == types.Dataset.VGG_FACE2:
        # Visual Geometry Group Face 2
        pass
    else:
        log.warn(f'{opt_dataset} not yet implemented')
    result = {'names_orig': names_orig, 'names_query': names_query}
    return result
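# Usage sketch (illustrative; assumes the dataset media is present at the
# paths configured in DataStore):
#   names = get_names(types.Dataset.LFW)
#   log.info(f"loaded {len(names['names_orig'])} identities")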
def similarity(a, b):
    # case-insensitive sequence similarity in [0, 1]
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
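# Example (illustrative): difflib's ratio is 2*matches / total length, so
#   similarity('Jon', 'John')  ->  6/7 ≈ 0.857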
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    '''Compare two names word-by-word.
    Returns the similarity score as a float if as_float is True,
    otherwise a boolean indicating whether the score exceeds threshold.
    '''
    # normalize: strip accents, lowercase, and split into words
    # (split() also collapses repeated whitespace)
    name_a_clean = [unidecode.unidecode(x.lower()) for x in name_a.strip().split()]
    name_b_clean = [unidecode.unidecode(x.lower()) for x in name_b.strip().split()]
    # score every word of name_a against every word of name_b
    scores = []
    for word_a in name_a_clean:
        subscores = [similarity(word_a, word_b) for word_b in name_b_clean]
        scores.append(subscores)
    # average the best match found for each word of name_a
    ratio_similar = sum(max(x) for x in scores) / len(scores)
    if compound_score:
        # penalize large differences in total name length
        letters_a = sum(len(x) for x in name_a_clean)
        letters_b = sum(len(x) for x in name_b_clean)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar
    if as_float:
        return score
    return score > threshold
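if __name__ == '__main__':
    # Smoke test with illustrative values (not from the original source)
    assert names_match('Jon Smith', 'John Smith')    # word ratios average ≈ 0.93
    assert not names_match('Jon Smith', 'Jane Doe')  # ≈ 0.29, below threshold
    assert names_match_strict('jon-doe', 'jondoe')
    print('name matching sanity checks passed')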