Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/app/models/sql_factory.py        152
-rw-r--r--  megapixels/app/server/api.py                  55
-rw-r--r--  megapixels/app/server/create.py               36
l---------  megapixels/app/server/static                   1
-rw-r--r--  megapixels/app/settings/app_cfg.py            14
-rw-r--r--  megapixels/app/utils/file_utils.py            46
-rw-r--r--  megapixels/cli_faiss.py                       36
-rw-r--r--  megapixels/cli_flask.py                       19
-rw-r--r--  megapixels/commands/faiss/build_db.py         15
-rw-r--r--  megapixels/commands/faiss/build_faiss.py      56
-rw-r--r--  megapixels/commands/faiss/sync_metadata.py    18
11 files changed, 447 insertions(+), 1 deletion(-)
diff --git a/megapixels/app/models/sql_factory.py b/megapixels/app/models/sql_factory.py
new file mode 100644
index 00000000..e35c3e15
--- /dev/null
+++ b/megapixels/app/models/sql_factory.py
@@ -0,0 +1,152 @@
+import os
+import glob
+import time
+import pandas as pd
+
+from sqlalchemy import create_engine, Table, Column, String, Integer, DateTime, Float
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+connection_url = "mysql+mysqldb://{}:{}@{}/{}".format(
+ os.getenv("DB_USER"),
+ os.getenv("DB_PASS"),
+ os.getenv("DB_HOST"),
+ os.getenv("DB_NAME")
+)
+
+datasets = {}
+loaded = False
+
+def list_datasets():
+ return [dataset.describe() for dataset in datasets.values()]
+
+def get_dataset(name):
+ return datasets[name] if name in datasets else None
+
+def get_table(name, table_name):
+ dataset = get_dataset(name)
+ return dataset.get_table(table_name) if dataset else None
+
+def load_sql_datasets(replace=False, base_model=None):
+ global datasets, loaded
+ if loaded:
+ return datasets
+ engine = create_engine(connection_url) if replace else None
+ for path in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+ dataset = load_sql_dataset(path, replace, engine, base_model)
+ datasets[dataset.name] = dataset
+ loaded = True
+ return datasets
+
+def load_sql_dataset(path, replace=False, engine=None, base_model=None):
+ name = os.path.basename(path)
+ dataset = SqlDataset(name, base_model=base_model)
+
+ for fn in glob.iglob(os.path.join(path, "*.csv")):
+ key = os.path.basename(fn).replace(".csv", "")
+ table = dataset.get_table(key)
+ if table is None:
+ continue
+ if replace:
+ print('loading dataset {}'.format(fn))
+ df = pd.read_csv(fn)
+ # fix columns that are named "index", a sql reserved word
+ df.columns = table.__table__.columns.keys()
+ df.to_sql(name=table.__tablename__, con=engine, if_exists='replace', index=False)
+ return dataset
+
+class SqlDataset:
+ """
+ Bridge between the per-dataset facial information CSVs and MySQL
+ - each dataset should have files that can be loaded into these database models
+ - names will be fixed to work in SQL (index -> id)
+ - we can then have more generic models for fetching this info after doing a FAISS query
+ """
+ def __init__(self, name, engine=None, base_model=None):
+ self.name = name
+ self.tables = {}
+ if base_model is None:
+ self.engine = engine or create_engine(connection_url)
+ base_model = declarative_base(bind=self.engine)
+ self.base_model = base_model
+
+ def describe(self):
+ return {
+ 'name': self.name,
+ 'tables': list(self.tables.keys()),
+ }
+
+ def get_table(self, table_type):
+ if table_type in self.tables:
+ return self.tables[table_type]
+ elif table_type == 'uuids':
+ self.tables[table_type] = self.uuid_table()
+ elif table_type == 'roi':
+ self.tables[table_type] = self.roi_table()
+ elif table_type == 'identity_meta':
+ self.tables[table_type] = self.identity_table()
+ elif table_type == 'pose':
+ self.tables[table_type] = self.pose_table()
+ else:
+ return None
+ return self.tables[table_type]
+
+ # ==> uuids.csv <==
+ # index,uuid
+ # 0,f03fd921-2d56-4e83-8115-f658d6a72287
+ def uuid_table(self):
+ class UUID(self.base_model):
+ __tablename__ = self.name + "_uuid"
+ id = Column(Integer, primary_key=True)
+ uuid = Column(String(36), nullable=False)
+ return UUID
+
+ # ==> roi.csv <==
+ # index,h,image_height,image_index,image_width,w,x,y
+ # 0,0.33000000000000007,250,0,250,0.32999999999999996,0.33666666666666667,0.35
+ def roi_table(self):
+ class ROI(self.base_model):
+ __tablename__ = self.name + "_roi"
+ id = Column(Integer, primary_key=True)
+ h = Column(Float, nullable=False)
+ image_height = Column(Integer, nullable=False)
+ image_index = Column(Integer, nullable=False)
+ image_width = Column(Integer, nullable=False)
+ w = Column(Float, nullable=False)
+ x = Column(Float, nullable=False)
+ y = Column(Float, nullable=False)
+ return ROI
+
+ # ==> identity_meta.csv <==
+ # index,fullname,description,gender,images,image_index
+ # 0,A. J. Cook,Canadian actress,f,1,0
+ def identity_table(self):
+ class Identity(self.base_model):
+ __tablename__ = self.name + "_identity"
+ id = Column(Integer, primary_key=True)
+ fullname = Column(String(36), nullable=False)
+ description = Column(String(36), nullable=False)
+ gender = Column(String(1), nullable=False)
+ images = Column(Integer, nullable=False)
+ image_id = Column(Integer, nullable=False)
+ return Identity
+
+ # ==> pose.csv <==
+ # index,image_index,pitch,roll,yaw
+ # 0,0,11.16264458441435,10.415885631337728,22.99719032415318
+ def pose_table(self):
+ class Pose(self.base_model):
+ __tablename__ = self.name + "_pose"
+ id = Column(Integer, primary_key=True)
+ image_id = Column(Integer, nullable=False)
+ pitch = Column(Float, nullable=False)
+ roll = Column(Float, nullable=False)
+ yaw = Column(Float, nullable=False)
+ return Pose
+
+
+# Session = sessionmaker(bind=engine)
+# session = Session()
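Usage sketch (not part of the diff): querying one of the generated models after
the CSVs have been imported, assuming the DB_* env vars are set; the dataset
name "megaface" is hypothetical:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    from app.models.sql_factory import connection_url, load_sql_datasets, get_table

    load_sql_datasets()                    # builds one model class per metadata CSV
    UUID = get_table('megaface', 'uuids')  # returns None for unknown dataset/table
    session = sessionmaker(bind=create_engine(connection_url))()
    print(session.query(UUID).filter_by(id=0).first().uuid)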
diff --git a/megapixels/app/server/api.py b/megapixels/app/server/api.py
new file mode 100644
index 00000000..c5e27dd2
--- /dev/null
+++ b/megapixels/app/server/api.py
@@ -0,0 +1,55 @@
+import os
+import re
+import time
+from flask import Blueprint, request, jsonify
+from PIL import Image # todo: try to remove PIL dependency
+
+from app.models.sql_factory import list_datasets, get_dataset, get_table
+
+sanitize_re = re.compile(r'[\W]+')
+valid_exts = ['.gif', '.jpg', '.jpeg', '.png']
+
+api = Blueprint('api', __name__)
+
+@api.route('/')
+def index():
+ return jsonify({ 'datasets': list_datasets() })
+
+@api.route('/dataset/<name>')
+def show(name):
+ dataset = get_dataset(name)
+ if dataset:
+ return jsonify(dataset.describe())
+ else:
+ return jsonify({ 'status': 404 })
+
+@api.route('/dataset/<name>/face', methods=['POST'])
+def upload(name):
+ start = time.time()
+ file = request.files['query_img']
+ fn = file.filename
+ if fn.endswith('blob'):
+ fn = 'filename.jpg'
+
+ basename, ext = os.path.splitext(fn)
+ print("got {}, type {}".format(basename, ext))
+ if ext.lower() not in valid_exts:
+ return jsonify({ 'error': 'not an image' })
+
+ img = Image.open(file.stream).convert('RGB')
+
+ # vec = db.load_feature_vector_from_file(uploaded_img_path)
+ # vec = fe.extract(img)
+ # print(vec.shape)
+ # results = db.search(vec, limit=limit)
+
+ query = {
+ 'timing': time.time() - start,
+ }
+ results = []
+
+ print(results)
+ return jsonify({
+ 'query': query,
+ 'results': results,
+ })
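A quick way to exercise the upload route, sketched with requests (host, port,
filename, and dataset name are assumptions; the 'query_img' field name comes
from the handler above, which still returns empty results while the FAISS
search is stubbed out):

    import requests

    with open('face.jpg', 'rb') as f:  # 'face.jpg' is a placeholder
        r = requests.post('http://localhost:5000/api/dataset/megaface/face',
                          files={'query_img': f})
    print(r.json())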
diff --git a/megapixels/app/server/create.py b/megapixels/app/server/create.py
new file mode 100644
index 00000000..9efed669
--- /dev/null
+++ b/megapixels/app/server/create.py
@@ -0,0 +1,36 @@
+from flask import Flask, Blueprint, jsonify
+from flask_sqlalchemy import SQLAlchemy
+from app.models.sql_factory import connection_url, load_sql_datasets
+
+from app.server.api import api
+
+db = SQLAlchemy()
+
+def create_app(script_info=None):
+ app = Flask(__name__, static_url_path='')
+ app.config['SQLALCHEMY_DATABASE_URI'] = connection_url
+ app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+
+ db.init_app(app)
+ datasets = load_sql_datasets(replace=False, base_model=db.Model)
+
+ app.register_blueprint(api, url_prefix='/api')
+
+ @app.route('/', methods=['GET'])
+ def index():
+ return app.send_static_file('index.html')
+
+ @app.shell_context_processor
+ def shell_context():
+ return { 'app': app, 'db': db }
+
+ @app.route("/site-map")
+ def site_map():
+ links = []
+ for rule in app.url_map.iter_rules():
+ # url = url_for(rule.endpoint, **(rule.defaults or {}))
+ # print(url)
+ links.append(rule.endpoint)
+ return jsonify(links)
+
+ return app
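Sketch of driving the factory directly for local development (cli_flask.py is
the intended entry point, via FlaskGroup):

    from app.server.create import create_app

    app = create_app()
    app.run(debug=True)  # roughly what `flask run` does with FLASK_APP=cli_flask.py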
diff --git a/megapixels/app/server/static b/megapixels/app/server/static
new file mode 120000
index 00000000..1dc7a639
--- /dev/null
+++ b/megapixels/app/server/static
@@ -0,0 +1 @@
+../../../site/public
\ No newline at end of file
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 7406caad..0507366f 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -2,6 +2,7 @@ import os
from os.path import join
import logging
import collections
+from dotenv import load_dotenv
import cv2 as cv
@@ -48,6 +49,11 @@ DIR_MODELS_DLIB_5PT = join(DIR_MODELS_DLIB, 'shape_predictor_5_face_landmarks.da
DIR_MODELS_DLIB_68PT = join(DIR_MODELS_DLIB, 'shape_predictor_68_face_landmarks.dat')
DIR_MODELS_DLIB_FACEREC_RESNET = join(DIR_MODELS_DLIB, 'dlib_face_recognition_resnet_model_v1.dat')
+DIR_FAISS = join(DIR_APP, 'faiss')
+DIR_FAISS_INDEXES = join(DIR_FAISS, 'indexes')
+DIR_FAISS_METADATA = join(DIR_FAISS, 'metadata')
+DIR_FAISS_RECIPES = join(DIR_FAISS, 'recipes')
+
# Test images
DIR_TEST_IMAGES = join(DIR_APP, 'test', 'images')
@@ -64,6 +70,7 @@ FP_FONT = join(DIR_ASSETS, 'font')
DIR_COMMANDS_CV = 'commands/cv'
DIR_COMMANDS_ADMIN = 'commands/admin'
DIR_COMMANDS_DATASETS = 'commands/datasets'
+DIR_COMMANDS_FAISS = 'commands/faiss'
DIR_COMMANDS_MISC = 'commands/misc'
# -----------------------------------------------------------------------------
@@ -111,3 +118,10 @@ LOGFILE_FORMAT = "%(log_color)s%(levelname)-8s%(reset)s %(cyan)s%(filename)s:%(l
# -----------------------------------------------------------------------------
S3_MEDIA_ROOT = 's3://megapixels/v1/media/'
S3_METADATA_ROOT = 's3://megapixels/v1/metadata/'
+
+# -----------------------------------------------------------------------------
+# .env config for keys
+# -----------------------------------------------------------------------------
+
+DIR_DOTENV = join(DIR_APP, '.env')
+load_dotenv(dotenv_path=DIR_DOTENV)
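The variables read in sql_factory.py suggest a .env at DIR_DOTENV of roughly
this shape (all values are placeholders):

    # .env -- placeholder values
    DB_USER=megapixels
    DB_PASS=changeme
    DB_HOST=localhost
    DB_NAME=megapixels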
diff --git a/megapixels/app/utils/file_utils.py b/megapixels/app/utils/file_utils.py
index 99282bd0..80239fe2 100644
--- a/megapixels/app/utils/file_utils.py
+++ b/megapixels/app/utils/file_utils.py
@@ -77,7 +77,7 @@ def load_csv(fp_in, as_list=True):
:returns: list of all CSV data
"""
if not Path(fp_in).exists():
- log.info('loading {}'.format(fp_in))
+ log.info('not found: {}'.format(fp_in))
log.info('loading: {}'.format(fp_in))
with open(fp_in, 'r') as fp:
items = csv.DictReader(fp)
@@ -86,6 +86,50 @@ def load_csv(fp_in, as_list=True):
log.info('returning {:,} items'.format(len(items)))
return items
+def unfussy_csv_reader(reader):
+ """Loads a CSV while ignoring possible data errors
+ :param reader: Special reader for load_csv_safe which ignores CSV parse errors
+ """
+ while True:
+ try:
+ yield next(reader)
+ except StopIteration:
+ return
+ except csv.Error as e:
+ # skip malformed rows rather than failing the whole load
+ log.warning('csv parse error: {}'.format(e))
+ continue
+
+def load_csv_safe(fp_in, keys=True, create=False):
+ """Loads a CSV while ignoring possible data errors
+ :param fp_in: string filepath to CSV file
+ :param keys: boolean, set to False if the first line is not a header row
+ :param create: boolean, set to True to return empty keys/values if the CSV does not exist
+ """
+ try:
+ with open(fp_in, 'r', newline='', encoding='utf-8') as f:
+ # reader = csv.reader( (line.replace('\0','') for line in f) )
+ reader = csv.reader(f)
+ lines = list(unfussy_csv_reader(reader))
+ if keys:
+ keys = lines[0]
+ lines = lines[1:]
+ return keys, lines
+ return lines
+ except (OSError, IndexError):
+ if create:
+ if keys:
+ return [], []
+ return []
+ raise
+
+def load_recipe(fp_in):
+ """Loads a JSON file as an object with properties accessible with dot syntax
+ :param fp_in: string filepath to JSON file
+ """
+ with open(fp_in) as fh:
+ return json.load(fh, object_hook=lambda d: collections.namedtuple('X', d.keys())(*d.values()))
+
def lazywrite(data, fp_out, sort_keys=True):
"""Writes JSON or Pickle data"""
diff --git a/megapixels/cli_faiss.py b/megapixels/cli_faiss.py
new file mode 100644
index 00000000..9953d9b7
--- /dev/null
+++ b/megapixels/cli_faiss.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------
+# add/edit commands in commands/faiss directory
+# --------------------------------------------------------
+
+import click
+
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+from app.models.click_factory import ClickSimple
+
+# click cli factory
+cc = ClickSimple.create(cfg.DIR_COMMANDS_FAISS)
+
+# --------------------------------------------------------
+# CLI
+# --------------------------------------------------------
+@click.group(cls=cc, chain=False)
+@click.option('-v', '--verbose', 'verbosity', count=True, default=4,
+ show_default=True,
+ help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL')
+@click.pass_context
+def cli(ctx, **kwargs):
+ """\033[1m\033[94mMegaPixels: FAISS Data Scripts\033[0m
+ """
+ ctx.opts = {}
+ # init logger
+ logger_utils.Logger.create(verbosity=kwargs['verbosity'])
+
+
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+ cli()
+
diff --git a/megapixels/cli_flask.py b/megapixels/cli_flask.py
new file mode 100644
index 00000000..369bec01
--- /dev/null
+++ b/megapixels/cli_flask.py
@@ -0,0 +1,19 @@
+# --------------------------------------------------------
+# wrapper for flask CLI API
+# --------------------------------------------------------
+
+import click
+
+from flask.cli import FlaskGroup
+from app.server.create import create_app
+
+# from app.settings import app_cfg as cfg
+# from app.utils import logger_utils
+
+cli = FlaskGroup(create_app=create_app)
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+ cli()
diff --git a/megapixels/commands/faiss/build_db.py b/megapixels/commands/faiss/build_db.py
new file mode 100644
index 00000000..0f979e41
--- /dev/null
+++ b/megapixels/commands/faiss/build_db.py
@@ -0,0 +1,15 @@
+"""
+Load all the CSV files into MySQL
+"""
+
+import click
+
+from app.models.sql_factory import load_sql_datasets
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """import the various CSVs into MySQL
+ """
+ print('Loading CSV datasets into SQL...')
+ load_sql_datasets(replace=True)
diff --git a/megapixels/commands/faiss/build_faiss.py b/megapixels/commands/faiss/build_faiss.py
new file mode 100644
index 00000000..96d3f99e
--- /dev/null
+++ b/megapixels/commands/faiss/build_faiss.py
@@ -0,0 +1,56 @@
+"""
+Index all of the FAISS datasets
+"""
+
+import os
+import glob
+import click
+import faiss
+import time
+import numpy as np
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+class DefaultRecipe:
+ def __init__(self):
+ self.dim = 128
+ self.factory_type = 'Flat'
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """build the FAISS index.
+ - looks for all datasets in faiss/metadata/
+ - uses the recipe above by default
+ - however you can override this by adding a new recipe in faiss/recipes/{name}.json
+ """
+ datasets = []
+ for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+ name = os.path.basename(fn)
+ recipe_fn = os.path.join(cfg.DIR_FAISS_RECIPES, name + ".json")
+ if os.path.exists(recipe_fn):
+ build_faiss(name, load_recipe(recipe_fn))
+ else:
+ build_faiss(name, DefaultRecipe())
+
+def build_faiss(name, recipe):
+ vec_fn = os.path.join(cfg.DIR_FAISS_METADATA, name, "vecs.csv")
+ index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index")
+
+ index = faiss.index_factory(recipe.dim, recipe.factory_type)
+
+ keys, rows = load_csv_safe(vec_fn)
+ feats = np.array([ list(map(float, row[3].split(","))) for row in rows ]).astype('float32')
+ n, d = feats.shape
+
+ print("{}: training {} x {} dim vectors".format(name, n, d))
+ print(recipe.factory_type)
+
+ add_start = time.time()
+ index.add(feats)
+ add_end = time.time()
+ add_time = add_end - add_start
+ print("{}: add time: {:.1f}s".format(name, add_time))
+
+ faiss.write_index(index, index_fn)
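A written index can then be queried along these lines (dataset name and query
vector are placeholders; the dim matches DefaultRecipe):

    import faiss
    import numpy as np

    index = faiss.read_index('faiss/indexes/megaface.index')
    query = np.random.rand(1, 128).astype('float32')  # one 128-dim query vector
    dists, ids = index.search(query, 5)               # top-5 nearest neighbours
    print(ids[0], dists[0])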
diff --git a/megapixels/commands/faiss/sync_metadata.py b/megapixels/commands/faiss/sync_metadata.py
new file mode 100644
index 00000000..b01211b4
--- /dev/null
+++ b/megapixels/commands/faiss/sync_metadata.py
@@ -0,0 +1,18 @@
+"""
+Sync the FAISS metadata
+"""
+
+import subprocess
+import click
+
+from app.settings import app_cfg as cfg
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """synchronize metadata files from s3"""
+ sts = subprocess.call([
+ "s3cmd", "sync",
+ "s3://megapixels/v1/metadata/",
+ cfg.DIR_FAISS_METADATA + '/',
+ ])