1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
import os
from pathlib import Path
from glob import glob
import unidecode
import difflib
from app.settings import types
from app.models.data_store import DataStore
from app.utils import logger_utils
log = logger_utils.Logger.getLogger()
'''
class Dataset(Enum):
LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
'''
# Get list of names based on Dataset type
def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
    '''Return identity names for a face dataset.

    :param opt_dataset: types.Dataset enum member selecting the dataset
    :param opt_data_store: types.DataStore enum member selecting storage location
    :returns: dict with keys
        'names_orig'  - directory names as stored on disk (e.g. "Aaron_Eckhart")
        'names_query' - display/query form with underscores replaced by spaces
        Both lists are empty for datasets that are not yet implemented.
    '''
    data_store = DataStore(opt_data_store, opt_dataset)
    dir_media_orig = data_store.dir_media_original
    # Default to empty lists so unimplemented datasets return a valid
    # (empty) result instead of raising NameError at the return statement.
    names_orig = []
    names_query = []
    if opt_dataset == types.Dataset.LFW:
        # Labeled Faces in The Wild: one directory per identity
        names_orig = list(os.listdir(dir_media_orig))
        names_query = [x.replace('_', ' ') for x in names_orig]
    elif opt_dataset == types.Dataset.PUBFIG83:
        # PubFig83: identity directories, skipping metadata .txt files.
        # NOTE: original used `is not '.txt'` (identity check on a string
        # literal) which is incorrect; use `!=` for value comparison.
        names_orig = [x for x in os.listdir(dir_media_orig)
                      if Path(x).suffix != '.txt']
        names_query = [x.replace('_', ' ') for x in names_orig]
    else:
        # Covers AFW, BRAINWASH, CASIA_WEBFACE, HELEN, IMDB_WIKI, LAG,
        # MEGAFACE, MSCELEB, PIPA, SCUT_FBP, UCCS, UMD_FACES, UTK,
        # UCF_SELFIE, VGG_FACE, VGG_FACE2 and any future datasets.
        log.warn(f'{opt_dataset} not yet implemented')
    result = {'names_orig': names_orig, 'names_query': names_query}
    return result
def similarity(a, b):
    '''Case-insensitive similarity ratio between two strings, in [0, 1].
    '''
    matcher = difflib.SequenceMatcher(a=a.lower(), b=b.lower())
    return matcher.ratio()
def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=False):
    '''Compare two person names for approximate equality.

    Each name is split into words; for every word of `name_a` the best
    matching word of `name_b` is found, and the per-word best scores are
    averaged over the shorter name's word count.

    :param name_a: first name string
    :param name_b: second name string
    :param threshold: similarity cutoff used when returning a boolean
    :param as_float: if True, return the raw similarity score instead of a bool
    :param compound_score: if True, blend in a penalty for differing
        total letter counts (80% word similarity, 20% length ratio)
    :returns: float score when as_float is True, otherwise bool
        (True when score > threshold)
    '''
    # Strip accents/diacritics and normalize case so e.g. "José" ~ "jose".
    words_a = [unidecode.unidecode(w.strip().lower()) for w in name_a.strip().split(' ')]
    words_b = [unidecode.unidecode(w.strip().lower()) for w in name_b.strip().split(' ')]
    len_min = min(len(words_a), len(words_b))
    # For each word in name_a, take its best match among name_b's words.
    best_per_word = [max(similarity(word_a, word_b) for word_b in words_b)
                     for word_a in words_a]
    # Normalize by the shorter name so "John Smith" vs "John Q Smith"
    # is not penalized for the extra middle word.
    ratio_similar = sum(best_per_word) / len_min
    if compound_score:
        # Blend in a penalty for names of very different total length
        # (missing letters/words lower the compound score).
        letters_a = sum(len(w) for w in words_a)
        letters_b = sum(len(w) for w in words_b)
        ratio_letters = min(letters_a, letters_b) / max(letters_a, letters_b)
        score = (0.8 * ratio_similar) + (0.2 * ratio_letters)
    else:
        score = ratio_similar
    if as_float:
        return score
    else:
        return score > threshold
|