Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/app/models/sql_factory.py        152
-rw-r--r--  megapixels/app/server/api.py                  55
-rw-r--r--  megapixels/app/server/create.py               36
l---------  megapixels/app/server/static                   1
-rw-r--r--  megapixels/app/settings/app_cfg.py            14
-rw-r--r--  megapixels/app/utils/file_utils.py            46
-rw-r--r--  megapixels/cli_faiss.py                       36
-rw-r--r--  megapixels/cli_flask.py                       19
-rw-r--r--  megapixels/commands/faiss/build_db.py         15
-rw-r--r--  megapixels/commands/faiss/build_faiss.py      56
-rw-r--r--  megapixels/commands/faiss/sync_metadata.py    18
11 files changed, 447 insertions(+), 1 deletion(-)
diff --git a/megapixels/app/models/sql_factory.py b/megapixels/app/models/sql_factory.py
new file mode 100644
index 00000000..e35c3e15
--- /dev/null
+++ b/megapixels/app/models/sql_factory.py
@@ -0,0 +1,152 @@
+import os
+import glob
+import time
+import pandas as pd
+
+from sqlalchemy import create_engine, Table, Column, String, Integer, DateTime, Float
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+connection_url = "mysql+mysqldb://{}:{}@{}/{}".format(
+ os.getenv("DB_USER"),
+ os.getenv("DB_PASS"),
+ os.getenv("DB_HOST"),
+ os.getenv("DB_NAME")
+)
+
+datasets = {}
+loaded = False
+
+def list_datasets():
+ return [dataset.describe() for dataset in datasets.values()]
+
+def get_dataset(name):
+ return datasets[name] if name in datasets else None
+
+def get_table(name, table_name):
+ dataset = get_dataset(name)
+ return dataset.get_table(table_name) if dataset else None
+
+def load_sql_datasets(replace=False, base_model=None):
+ global datasets, loaded
+ if loaded:
+ return datasets
+ engine = create_engine(connection_url) if replace else None
+ for path in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+ dataset = load_sql_dataset(path, replace, engine, base_model)
+ datasets[dataset.name] = dataset
+ loaded = True
+ return datasets
+
+def load_sql_dataset(path, replace=False, engine=None, base_model=None):
+ name = os.path.basename(path)
+ dataset = SqlDataset(name, base_model=base_model)
+
+ for fn in glob.iglob(os.path.join(path, "*.csv")):
+ key = os.path.basename(fn).replace(".csv", "")
+ table = dataset.get_table(key)
+ if table is None:
+ continue
+ if replace:
+ print('loading dataset {}'.format(fn))
+ df = pd.read_csv(fn)
+ # fix columns that are named "index", a sql reserved word
+ df.columns = table.__table__.columns.keys()
+ df.to_sql(name=table.__tablename__, con=engine, if_exists='replace', index=False)
+ return dataset
+
+class SqlDataset:
+ """
+ Bridge between the per-dataset facial information CSVs and MySQL
+ - each dataset should have files that can be loaded into these database models
+ - names will be fixed to work in SQL (index -> id)
+ - we can then have more generic models for fetching this info after doing a FAISS query
+ """
+ def __init__(self, name, engine=None, base_model=None):
+ self.name = name
+ self.tables = {}
+ if base_model is None:
+ self.engine = engine or create_engine(connection_url)
+ base_model = declarative_base(bind=self.engine)
+ self.base_model = base_model
+
+ def describe(self):
+ return {
+ 'name': self.name,
+ 'tables': list(self.tables.keys()),
+ }
+
+ def get_table(self, table_type):
+ if table_type in self.tables:
+ return self.tables[table_type]
+ elif table_type == 'uuids':
+ self.tables[table_type] = self.uuid_table()
+ elif table_type == 'roi':
+ self.tables[table_type] = self.roi_table()
+ elif table_type == 'identity_meta':
+ self.tables[table_type] = self.identity_table()
+ elif table_type == 'pose':
+ self.tables[table_type] = self.pose_table()
+ else:
+ return None
+ return self.tables[table_type]
+
+ # ==> uuids.csv <==
+ # index,uuid
+ # 0,f03fd921-2d56-4e83-8115-f658d6a72287
+ def uuid_table(self):
+ class UUID(self.base_model):
+ __tablename__ = self.name + "_uuid"
+ id = Column(Integer, primary_key=True)
+ uuid = Column(String(36), nullable=False)
+ return UUID
+
+ # ==> roi.csv <==
+ # index,h,image_height,image_index,image_width,w,x,y
+ # 0,0.33000000000000007,250,0,250,0.32999999999999996,0.33666666666666667,0.35
+ def roi_table(self):
+ class ROI(self.base_model):
+ __tablename__ = self.name + "_roi"
+ id = Column(Integer, primary_key=True)
+ h = Column(Float, nullable=False)
+ image_height = Column(Integer, nullable=False)
+ image_index = Column(Integer, nullable=False)
+ image_width = Column(Integer, nullable=False)
+ w = Column(Float, nullable=False)
+ x = Column(Float, nullable=False)
+ y = Column(Float, nullable=False)
+ return ROI
+
+ # ==> identity_meta.csv <==
+ # index,fullname,description,gender,images,image_index
+ # 0,A. J. Cook,Canadian actress,f,1,0
+ def identity_table(self):
+ class Identity(self.base_model):
+ __tablename__ = self.name + "_identity"
+ id = Column(Integer, primary_key=True)
+ fullname = Column(String(36), nullable=False)
+ description = Column(String(36), nullable=False)
+ gender = Column(String(1), nullable=False)
+ images = Column(Integer, nullable=False)
+ image_id = Column(Integer, nullable=False)
+ return Identity
+
+ # ==> pose.csv <==
+ # index,image_index,pitch,roll,yaw
+ # 0,0,11.16264458441435,10.415885631337728,22.99719032415318
+ def pose_table(self):
+ class Pose(self.base_model):
+ __tablename__ = self.name + "_pose"
+ id = Column(Integer, primary_key=True)
+ image_id = Column(Integer, nullable=False)
+ pitch = Column(Float, nullable=False)
+ roll = Column(Float, nullable=False)
+ yaw = Column(Float, nullable=False)
+ return Pose
+
+
+# Session = sessionmaker(bind=engine)
+# session = Session()
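Usage sketch (not part of the diff): querying one of the generated models after
the CSVs have been imported, assuming the DB_* env vars are set; the dataset
name "megaface" is hypothetical:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    from app.models.sql_factory import connection_url, load_sql_datasets, get_table

    load_sql_datasets()                    # builds one model class per metadata CSV
    UUID = get_table('megaface', 'uuids')  # returns None for unknown dataset/table
    session = sessionmaker(bind=create_engine(connection_url))()
    print(session.query(UUID).filter_by(id=0).first().uuid)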
diff --git a/megapixels/app/server/api.py b/megapixels/app/server/api.py
new file mode 100644
index 00000000..c5e27dd2
--- /dev/null
+++ b/megapixels/app/server/api.py
@@ -0,0 +1,55 @@
+import os
+import re
+import time
+from flask import Blueprint, request, jsonify
+from PIL import Image # todo: try to remove PIL dependency
+
+from app.models.sql_factory import list_datasets, get_dataset, get_table
+
+sanitize_re = re.compile(r'[\W]+')
+valid_exts = ['.gif', '.jpg', '.jpeg', '.png']
+
+api = Blueprint('api', __name__)
+
+@api.route('/')
+def index():
+ return jsonify({ 'datasets': list_datasets() })
+
+@api.route('/dataset/<name>')
+def show(name):
+ dataset = get_dataset(name)
+ if dataset:
+ return jsonify(dataset.describe())
+ else:
+ return jsonify({ 'status': 404 })
+
+@api.route('/dataset/<name>/face', methods=['POST'])
+def upload(name):
+ start = time.time()
+ file = request.files['query_img']
+ fn = file.filename
+ if fn.endswith('blob'):
+ fn = 'filename.jpg'
+
+ basename, ext = os.path.splitext(fn)
+ print("got {}, type {}".format(basename, ext))
+ if ext.lower() not in valid_exts:
+ return jsonify({ 'error': 'not an image' })
+
+ img = Image.open(file.stream).convert('RGB')
+
+ # vec = db.load_feature_vector_from_file(uploaded_img_path)
+ # vec = fe.extract(img)
+ # print(vec.shape)
+ # results = db.search(vec, limit=limit)
+
+ query = {
+ 'timing': time.time() - start,
+ }
+ results = []
+
+ print(results)
+ return jsonify({
+ 'query': query,
+ 'results': results,
+ })
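A quick way to exercise the upload route, sketched with requests (host, port,
filename, and dataset name are assumptions; the 'query_img' field name comes
from the handler above, which still returns empty results while the FAISS
search is stubbed out):

    import requests

    with open('face.jpg', 'rb') as f:  # 'face.jpg' is a placeholder
        r = requests.post('http://localhost:5000/api/dataset/megaface/face',
                          files={'query_img': f})
    print(r.json())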
diff --git a/megapixels/app/server/create.py b/megapixels/app/server/create.py
new file mode 100644
index 00000000..9efed669
--- /dev/null
+++ b/megapixels/app/server/create.py
@@ -0,0 +1,36 @@
+from flask import Flask, Blueprint, jsonify
+from flask_sqlalchemy import SQLAlchemy
+from app.models.sql_factory import connection_url, load_sql_datasets
+
+from app.server.api import api
+
+db = SQLAlchemy()
+
+def create_app(script_info=None):
+ app = Flask(__name__, static_url_path='')
+ app.config['SQLALCHEMY_DATABASE_URI'] = connection_url
+ app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+
+ db.init_app(app)
+ datasets = load_sql_datasets(replace=False, base_model=db.Model)
+
+ app.register_blueprint(api, url_prefix='/api')
+
+ @app.route('/', methods=['GET'])
+ def index():
+ return app.send_static_file('index.html')
+
+ @app.shell_context_processor
+ def shell_context():
+ return { 'app': app, 'db': db }
+
+ @app.route("/site-map")
+ def site_map():
+ links = []
+ for rule in app.url_map.iter_rules():
+ # url = url_for(rule.endpoint, **(rule.defaults or {}))
+ # print(url)
+ links.append(rule.endpoint)
+ return jsonify(links)
+
+ return app
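Sketch of driving the factory directly for local development (cli_flask.py is
the intended entry point, via FlaskGroup):

    from app.server.create import create_app

    app = create_app()
    app.run(debug=True)  # roughly what `flask run` does with FLASK_APP=cli_flask.py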
diff --git a/megapixels/app/server/static b/megapixels/app/server/static
new file mode 120000
index 00000000..1dc7a639
--- /dev/null
+++ b/megapixels/app/server/static
@@ -0,0 +1 @@
+../../../site/public
\ No newline at end of file
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 7406caad..0507366f 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -2,6 +2,7 @@ import os
from os.path import join
import logging
import collections
+from dotenv import load_dotenv
import cv2 as cv
@@ -48,6 +49,11 @@ DIR_MODELS_DLIB_5PT = join(DIR_MODELS_DLIB, 'shape_predictor_5_face_landmarks.da
DIR_MODELS_DLIB_68PT = join(DIR_MODELS_DLIB, 'shape_predictor_68_face_landmarks.dat')
DIR_MODELS_DLIB_FACEREC_RESNET = join(DIR_MODELS_DLIB, 'dlib_face_recognition_resnet_model_v1.dat')
+DIR_FAISS = join(DIR_APP, 'faiss')
+DIR_FAISS_INDEXES = join(DIR_FAISS, 'indexes')
+DIR_FAISS_METADATA = join(DIR_FAISS, 'metadata')
+DIR_FAISS_RECIPES = join(DIR_FAISS, 'recipes')
+
# Test images
DIR_TEST_IMAGES = join(DIR_APP, 'test', 'images')
@@ -64,6 +70,7 @@ FP_FONT = join(DIR_ASSETS, 'font')
DIR_COMMANDS_CV = 'commands/cv'
DIR_COMMANDS_ADMIN = 'commands/admin'
DIR_COMMANDS_DATASETS = 'commands/datasets'
+DIR_COMMANDS_FAISS = 'commands/faiss'
DIR_COMMANDS_MISC = 'commands/misc'
# -----------------------------------------------------------------------------
@@ -111,3 +118,10 @@ LOGFILE_FORMAT = "%(log_color)s%(levelname)-8s%(reset)s %(cyan)s%(filename)s:%(l
# -----------------------------------------------------------------------------
S3_MEDIA_ROOT = 's3://megapixels/v1/media/'
S3_METADATA_ROOT = 's3://megapixels/v1/metadata/'
+
+# -----------------------------------------------------------------------------
+# .env config for keys
+# -----------------------------------------------------------------------------
+
+DIR_DOTENV = join(DIR_APP, '.env')
+load_dotenv(dotenv_path=DIR_DOTENV)
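The variables read in sql_factory.py suggest a .env at DIR_DOTENV of roughly
this shape (all values are placeholders):

    # .env -- placeholder values
    DB_USER=megapixels
    DB_PASS=changeme
    DB_HOST=localhost
    DB_NAME=megapixels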
diff --git a/megapixels/app/utils/file_utils.py b/megapixels/app/utils/file_utils.py
index 99282bd0..80239fe2 100644
--- a/megapixels/app/utils/file_utils.py
+++ b/megapixels/app/utils/file_utils.py
@@ -77,7 +77,7 @@ def load_csv(fp_in, as_list=True):
:returns: list of all CSV data
"""
if not Path(fp_in).exists():
- log.info('loading {}'.format(fp_in))
+ log.info('not found: {}'.format(fp_in))
log.info('loading: {}'.format(fp_in))
with open(fp_in, 'r') as fp:
items = csv.DictReader(fp)
@@ -86,6 +86,50 @@ def load_csv(fp_in, as_list=True):
log.info('returning {:,} items'.format(len(items)))
return items
+def unfussy_csv_reader(reader):
+ """Loads a CSV while ignoring possible data errors
+ :param reader: Special reader for load_csv_safe which ignores CSV parse errors
+ """
+ while True:
+ try:
+ yield next(reader)
+ except StopIteration:
+ return
+ except csv.Error as e:
+ # skip malformed rows rather than failing the whole load
+ log.warning('csv parse error: {}'.format(e))
+ continue
+
+def load_csv_safe(fp_in, keys=True, create=False):
+ """Loads a CSV while ignoring possible data errors
+ :param fp_in: string filepath to CSV file
+ :param keys: boolean, set to False if the first line is not a header row
+ :param create: boolean, set to True to return empty keys/values if the CSV does not exist
+ """
+ try:
+ with open(fp_in, 'r', newline='', encoding='utf-8') as f:
+ # reader = csv.reader( (line.replace('\0','') for line in f) )
+ reader = csv.reader(f)
+ lines = list(unfussy_csv_reader(reader))
+ if keys:
+ keys = lines[0]
+ lines = lines[1:]
+ return keys, lines
+ return lines
+ except (OSError, IndexError):
+ if create:
+ if keys:
+ return [], []
+ return []
+ raise
+
+def load_recipe(fp_in):
+ """Loads a JSON file as an object with properties accessible with dot syntax
+ :param fp_in: string filepath to JSON file
+ """
+ with open(fp_in) as fh:
+ return json.load(fh, object_hook=lambda d: collections.namedtuple('X', d.keys())(*d.values()))
+
def lazywrite(data, fp_out, sort_keys=True):
"""Writes JSON or Pickle data"""
diff --git a/megapixels/cli_faiss.py b/megapixels/cli_faiss.py
new file mode 100644
index 00000000..9953d9b7
--- /dev/null
+++ b/megapixels/cli_faiss.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------
+# add/edit commands in commands/faiss directory
+# --------------------------------------------------------
+
+import click
+
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+from app.models.click_factory import ClickSimple
+
+# click cli factory
+cc = ClickSimple.create(cfg.DIR_COMMANDS_FAISS)
+
+# --------------------------------------------------------
+# CLI
+# --------------------------------------------------------
+@click.group(cls=cc, chain=False)
+@click.option('-v', '--verbose', 'verbosity', count=True, default=4,
+ show_default=True,
+ help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL')
+@click.pass_context
+def cli(ctx, **kwargs):
+ """\033[1m\033[94mMegaPixels: FAISS Data Scripts\033[0m
+ """
+ ctx.opts = {}
+ # init logger
+ logger_utils.Logger.create(verbosity=kwargs['verbosity'])
+
+
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+ cli()
+
diff --git a/megapixels/cli_flask.py b/megapixels/cli_flask.py
new file mode 100644
index 00000000..369bec01
--- /dev/null
+++ b/megapixels/cli_flask.py
@@ -0,0 +1,19 @@
+# --------------------------------------------------------
+# wrapper for flask CLI API
+# --------------------------------------------------------
+
+import click
+
+from flask.cli import FlaskGroup
+from app.server.create import create_app
+
+# from app.settings import app_cfg as cfg
+# from app.utils import logger_utils
+
+cli = FlaskGroup(create_app=create_app)
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+ cli()
diff --git a/megapixels/commands/faiss/build_db.py b/megapixels/commands/faiss/build_db.py
new file mode 100644
index 00000000..0f979e41
--- /dev/null
+++ b/megapixels/commands/faiss/build_db.py
@@ -0,0 +1,15 @@
+"""
+Load all the CSV files into MySQL
+"""
+
+import click
+
+from app.models.sql_factory import load_sql_datasets
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """import the various CSVs into MySQL
+ """
+ print('Loading CSV datasets into SQL...')
+ load_sql_datasets(replace=True)
diff --git a/megapixels/commands/faiss/build_faiss.py b/megapixels/commands/faiss/build_faiss.py
new file mode 100644
index 00000000..96d3f99e
--- /dev/null
+++ b/megapixels/commands/faiss/build_faiss.py
@@ -0,0 +1,56 @@
+"""
+Index all of the FAISS datasets
+"""
+
+import os
+import glob
+import click
+import faiss
+import time
+import numpy as np
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+class DefaultRecipe:
+ def __init__(self):
+ self.dim = 128
+ self.factory_type = 'Flat'
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """build the FAISS index.
+ - looks for all datasets in faiss/metadata/
+ - uses the recipe above by default
+ - however you can override this by adding a new recipe in faiss/recipes/{name}.json
+ """
+ datasets = []
+ for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+ name = os.path.basename(fn)
+ recipe_fn = os.path.join(cfg.DIR_FAISS_RECIPES, name + ".json")
+ if os.path.exists(recipe_fn):
+ build_faiss(name, load_recipe(recipe_fn))
+ else:
+ build_faiss(name, DefaultRecipe())
+
+def build_faiss(name, recipe):
+ vec_fn = os.path.join(cfg.DIR_FAISS_METADATA, name, "vecs.csv")
+ index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index")
+
+ index = faiss.index_factory(recipe.dim, recipe.factory_type)
+
+ keys, rows = load_csv_safe(vec_fn)
+ feats = np.array([ list(map(float, row[3].split(","))) for row in rows ]).astype('float32')
+ n, d = feats.shape
+
+ print("{}: training {} x {} dim vectors".format(name, n, d))
+ print(recipe.factory_type)
+
+ add_start = time.time()
+ index.add(feats)
+ add_end = time.time()
+ add_time = add_end - add_start
+ print("{}: add time: {:.1f}s".format(name, add_time))
+
+ faiss.write_index(index, index_fn)
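A written index can then be queried along these lines (dataset name and query
vector are placeholders; the dim matches DefaultRecipe):

    import faiss
    import numpy as np

    index = faiss.read_index('faiss/indexes/megaface.index')
    query = np.random.rand(1, 128).astype('float32')  # one 128-dim query vector
    dists, ids = index.search(query, 5)               # top-5 nearest neighbours
    print(ids[0], dists[0])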
diff --git a/megapixels/commands/faiss/sync_metadata.py b/megapixels/commands/faiss/sync_metadata.py
new file mode 100644
index 00000000..b01211b4
--- /dev/null
+++ b/megapixels/commands/faiss/sync_metadata.py
@@ -0,0 +1,18 @@
+"""
+Sync the FAISS metadata
+"""
+
+import subprocess
+import click
+
+from app.settings import app_cfg as cfg
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """synchronize metadata files from s3"""
+ sts = subprocess.call([
+ "s3cmd", "sync",
+ "s3://megapixels/v1/metadata/",
+ cfg.DIR_FAISS_METADATA + '/',
+ ])