1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
import os
from pathlib import Path
from glob import glob
import unidecode
import difflib
from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils
log = logger_utils.Logger.getLogger()
'''
class Dataset(Enum):
LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
'''
# Get list of names based on Dataset type
def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    '''Return identity names for a face dataset.

    :param opt_dataset: types.Dataset enum member selecting the dataset
    :param opt_data_store: types.DataStore enum member selecting storage location
    :returns: dict with keys
        'names_orig'  - directory names as stored on disk (e.g. "Aaron_Eckhart")
        'names_query' - display/query form with underscores replaced by spaces
        Both lists are empty for datasets that are not yet implemented.
    '''
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_media_orig = data_store.dir_media_original
    # Default to empty lists so unimplemented datasets return a valid
    # (empty) result instead of raising NameError at the return statement.
    names_orig = []
    names_query = []
    if opt_dataset == types.Dataset.LFW:
        # Labeled Faces in The Wild: one directory per identity
        names_orig = list(os.listdir(dir_media_orig))
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: identity directories, skipping metadata .txt files.
        # NOTE: original used `is not '.txt'` (identity check on a string
        # literal) which is incorrect; use `!=` for value comparison.
        names_orig = [x for x in os.listdir(dir_media_orig)
                      if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    else:
        # Covers AFW, BRAINWASH, CASIA_WEBFACE, HELEN, IMDB_WIKI, LAG,
        # MEGAFACE, MSCELEB, PIPA, SCUT_FBP, UCCS, UMD_FACES, UTK,
        # UCF_SELFIE, VGG_FACE, VGG_FACE2 and any future datasets.
        log.warn(f'{opt_dataset} not yet implemented')
    result = {'names_orig': names_orig, 'names_query': names_query}
    return result
def similarity(a, b):
    '''Case-insensitive similarity ratio between two strings, in [0, 1].
    '''
    matcher = difflib.SequenceMatcher(a=a.lower(), b=b.lower())
    return matcher.ratio()
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    '''Compare two person names for approximate equality.

    Each name is split into words; for every word of `name_a` the best
    matching word of `name_b` is found, and the per-word best scores are
    averaged over the shorter name's word count.

    :param name_a: first name string
    :param name_b: second name string
    :param threshold: similarity cutoff used when returning a boolean
    :param as_float: if True, return the raw similarity score instead of a bool
    :param compound_score: if True, blend in a penalty for differing
        total letter counts (80% word similarity, 20% length ratio)
    :returns: float score when as_float is True, otherwise bool
        (True when score > threshold)
    '''
    # Strip accents/diacritics and normalize case so e.g. "José" ~ "jose".
    words_a = [unidecode.unidecode(w.strip().lower()) for w in name_a.strip().split(' ')]
    words_b = [unidecode.unidecode(w.strip().lower()) for w in name_b.strip().split(' ')]
    len_min = min(len(words_a), len(words_b))
    # For each word in name_a, take its best match among name_b's words.
    best_per_word = [max(similarity(word_a, word_b) for word_b in words_b)
                     for word_a in words_a]
    # Normalize by the shorter name so "John Smith" vs "John Q Smith"
    # is not penalized for the extra middle word.
    ratio_similar = sum(best_per_word) / len_min
    if compound_score:
        # Blend in a penalty for names of very different total length
        # (missing letters/words lower the compound score).
        letters_a = sum(len(w) for w in words_a)
        letters_b = sum(len(w) for w in words_b)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar
    if as_float:
        return score
    else:
        return score > threshold
|