import os
from pathlib import Path
import unidecode
import difflib
from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils
log = logger_utils.Logger.getLogger()
az = 'abcdefghijklmnopqrstuvwxyz'
AZ = az.upper()
z9 = [str(d) for d in range(10)]
aZ9 = list(az) + list(AZ) + z9
def letter_strip(a, b=aZ9):
    # keep only the characters of a that appear in b (default: a-zA-Z0-9)
    return ''.join(x for x in a if x in b)
def letter_match(a, b):
    # check that every character of a also appears in b
    # (membership only; character counts and order are ignored)
    return all(x in b for x in a)
def names_match_strict(a, b):
    clean_a = letter_strip(a)
    clean_b = letter_strip(b)
    return len(clean_a) == len(clean_b) and letter_match(clean_a, clean_b) and letter_match(clean_b, clean_a)
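# Illustrative examples (not from the original source): the strict match is
# case-sensitive and membership-based, so punctuation and ordering are ignored:
#   names_match_strict('jon-doe', 'jondoe')   -> True
#   names_match_strict('listen', 'silent')    -> True  (same letters, any order)
#   names_match_strict('Jon Doe', 'jon doe')  -> False (case differs)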
'''
class Dataset(Enum):
LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
'''
# Get list of names based on Dataset type
def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_dataset = data_store.dir_dataset  # path to dataset root
    dir_media_orig = data_store.dir_media_original
    # default to empty lists so branches that are not implemented yet
    # still return a well-formed result instead of raising NameError
    names_orig = []
    names_query = []
    if opt_dataset == types.Dataset.AFW:
        # Annotated Faces in the Wild
        pass
    elif opt_dataset == types.Dataset.BRAINWASH:
        # Brainwash IP Cam dataset
        pass
    elif opt_dataset == types.Dataset.CASIA_WEBFACE:
        # CASIA WebFace
        pass
    elif opt_dataset == types.Dataset.HELEN:
        # Helen
        pass
    elif opt_dataset == types.Dataset.IMDB_WIKI:
        # IMDb-Wiki
        pass
    elif opt_dataset == types.Dataset.LAG:
        # Large Age Gap
        pass
    elif opt_dataset == types.Dataset.LFW:
        # Labeled Faces in the Wild: one directory per identity,
        # with underscores in place of spaces (e.g. 'George_W_Bush')
        names_orig = os.listdir(dir_media_orig)
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.MEGAFACE:
        # MegaFace
        pass
    elif opt_dataset == types.Dataset.MSCELEB:
        # MS-Celeb-1M
        pass
    elif opt_dataset == types.Dataset.PIPA:
        # People in Photo Albums
        pass
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: identity directories plus stray .txt files to skip
        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.SCUT_FBP:
        # SCUT Facial Beauty Perception
        pass
    elif opt_dataset == types.Dataset.UCCS:
        # Unconstrained College Students
        pass
    elif opt_dataset == types.Dataset.UMD_FACES:
        # University of Maryland Faces
        pass
    elif opt_dataset == types.Dataset.UTK:
        # University of Tennessee Knoxville
        pass
    elif opt_dataset == types.Dataset.UCF_SELFIE:
        # University of Central Florida Selfie
        pass
    elif opt_dataset == types.Dataset.VGG_FACE:
        # Visual Geometry Group Face 1
        pass
    elif opt_dataset == types.Dataset.VGG_FACE2:
        # Visual Geometry Group Face 2
        pass
    else:
        log.warn(f'{opt_dataset} not yet implemented')
    result = {'names_orig': names_orig, 'names_query': names_query}
    return result
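# Usage sketch (illustrative; assumes the dataset media is present at the
# paths configured in DataStore):
#   names = get_names(types.Dataset.LFW)
#   log.info(f"loaded {len(names['names_orig'])} identities")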
def similarity(a, b):
    # case-insensitive sequence similarity in [0, 1]
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
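# Example (illustrative): difflib's ratio is 2*matches / total length, so
#   similarity('Jon', 'John')  ->  6/7 ≈ 0.857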
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    '''Compare two names word-by-word.
    Returns the similarity score as a float if as_float is True,
    otherwise a boolean indicating whether the score exceeds threshold.
    '''
    # normalize: strip accents, lowercase, and split into words
    # (split() also collapses repeated whitespace)
    name_a_clean = [unidecode.unidecode(x.lower()) for x in name_a.strip().split()]
    name_b_clean = [unidecode.unidecode(x.lower()) for x in name_b.strip().split()]
    # score every word of name_a against every word of name_b
    scores = []
    for word_a in name_a_clean:
        subscores = [similarity(word_a, word_b) for word_b in name_b_clean]
        scores.append(subscores)
    # average the best match found for each word of name_a
    ratio_similar = sum(max(x) for x in scores) / len(scores)
    if compound_score:
        # penalize large differences in total name length
        letters_a = sum(len(x) for x in name_a_clean)
        letters_b = sum(len(x) for x in name_b_clean)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar
    if as_float:
        return score
    return score > threshold
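if __name__ == '__main__':
    # Smoke test with illustrative values (not from the original source)
    assert names_match('Jon Smith', 'John Smith')    # word ratios average ≈ 0.93
    assert not names_match('Jon Smith', 'Jane Doe')  # ≈ 0.29, below threshold
    assert names_match_strict('jon-doe', 'jondoe')
    print('name matching sanity checks passed')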