Diffstat (limited to 'megapixels')
 -rw-r--r--  megapixels/app/models/sql_factory.py       | 152
 -rw-r--r--  megapixels/app/server/api.py                |  54
 -rw-r--r--  megapixels/app/server/create.py             |  36
 l---------  megapixels/app/server/static               |   1
 -rw-r--r--  megapixels/app/settings/app_cfg.py          |  14
 -rw-r--r--  megapixels/app/utils/file_utils.py          |  46
 -rw-r--r--  megapixels/cli_faiss.py                     |  36
 -rw-r--r--  megapixels/cli_flask.py                     |  19
 -rw-r--r--  megapixels/commands/faiss/build_db.py       |  15
 -rw-r--r--  megapixels/commands/faiss/build_faiss.py    |  58
 -rw-r--r--  megapixels/commands/faiss/sync_metadata.py  |  18

11 files changed, 448 insertions(+), 1 deletion(-)
diff --git a/megapixels/app/models/sql_factory.py b/megapixels/app/models/sql_factory.py
new file mode 100644
index 00000000..e35c3e15
--- /dev/null
+++ b/megapixels/app/models/sql_factory.py
@@ -0,0 +1,152 @@
+import os
+import glob
+import time
+import pandas as pd
+
+from sqlalchemy import create_engine, Table, Column, String, Integer, DateTime, Float
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+connection_url = "mysql+mysqldb://{}:{}@{}/{}".format(
+  os.getenv("DB_USER"),
+  os.getenv("DB_PASS"),
+  os.getenv("DB_HOST"),
+  os.getenv("DB_NAME")
+)
+
+datasets = {}
+loaded = False
+
+def list_datasets():
+  return [dataset.describe() for dataset in datasets.values()]
+
+def get_dataset(name):
+  return datasets[name] if name in datasets else None
+
+def get_table(name, table_name):
+  dataset = get_dataset(name)
+  return dataset.get_table(table_name) if dataset else None
+
+def load_sql_datasets(replace=False, base_model=None):
+  global datasets, loaded
+  if loaded:
+    return datasets
+  engine = create_engine(connection_url) if replace else None
+  for path in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+    dataset = load_sql_dataset(path, replace, engine, base_model)
+    datasets[dataset.name] = dataset
+  loaded = True
+  return datasets
+
+def load_sql_dataset(path, replace=False, engine=None, base_model=None):
+  name = os.path.basename(path)
+  dataset = SqlDataset(name, base_model=base_model)
+
+  for fn in glob.iglob(os.path.join(path, "*.csv")):
+    key = os.path.basename(fn).replace(".csv", "")
+    table = dataset.get_table(key)
+    if table is None:
+      continue
+    if replace:
+      print('loading dataset {}'.format(fn))
+      df = pd.read_csv(fn)
+      # rename columns such as "index" (a SQL reserved word) to the model's column names
+      df.columns = table.__table__.columns.keys()
+      df.to_sql(name=table.__tablename__, con=engine, if_exists='replace', index=False)
+  return dataset
+
+class SqlDataset:
+  """
+  Bridge between the per-dataset facial-information CSVs and MySQL
+  - each dataset should provide files that can be loaded into these database models
+  - column names are fixed to work in SQL (index -> id)
+  - generic models can then fetch this info after a FAISS query
+  """
+  def __init__(self, name, engine=None, base_model=None):
+    self.name = name
+    self.tables = {}
+    if base_model is None:
+      self.engine = create_engine(connection_url)
+      base_model = declarative_base(self.engine)
+    self.base_model = base_model
+
+  def describe(self):
+    return {
+      'name': self.name,
+      'tables': list(self.tables.keys()),
+    }
+
+  def get_table(self, type):
+    if type in self.tables:
+      return self.tables[type]
+    elif type == 'uuids':
+      self.tables[type] = self.uuid_table()
+    elif type == 'roi':
+      self.tables[type] = self.roi_table()
+    elif type == 'identity_meta':
+      self.tables[type] = self.identity_table()
+    elif type == 'pose':
+      self.tables[type] = self.pose_table()
+    else:
+      return None
+    return self.tables[type]
+
+  # ==> uuids.csv <==
+  # index,uuid
+  # 0,f03fd921-2d56-4e83-8115-f658d6a72287
+  def uuid_table(self):
+    class UUID(self.base_model):
+      __tablename__ = self.name + "_uuid"
+      id = Column(Integer, primary_key=True)
+      uuid = Column(String(36), nullable=False)
+    return UUID
+
+  # ==> roi.csv <==
+  # index,h,image_height,image_index,image_width,w,x,y
+  # 0,0.33000000000000007,250,0,250,0.32999999999999996,0.33666666666666667,0.35
+  def roi_table(self):
+    class ROI(self.base_model):
+      __tablename__ = self.name + "_roi"
+      id = Column(Integer, primary_key=True)
+      h = Column(Float, nullable=False)
+      image_height = Column(Integer, nullable=False)
+      image_index = Column(Integer, nullable=False)
+      image_width = Column(Integer, nullable=False)
+      w = Column(Float, nullable=False)
+      x = Column(Float, nullable=False)
+      y = Column(Float, nullable=False)
+    return ROI
+
+  # ==> identity.csv <==
+  # index,fullname,description,gender,images,image_index
+  # 0,A. J. Cook,Canadian actress,f,1,0
+  def identity_table(self):
+    class Identity(self.base_model):
+      __tablename__ = self.name + "_identity"
+      id = Column(Integer, primary_key=True)
+      fullname = Column(String(36), nullable=False)
+      description = Column(String(36), nullable=False)
+      gender = Column(String(1), nullable=False)
+      images = Column(Integer, nullable=False)
+      image_id = Column(Integer, nullable=False)
+    return Identity
+
+  # ==> pose.csv <==
+  # index,image_index,pitch,roll,yaw
+  # 0,0,11.16264458441435,10.415885631337728,22.99719032415318
+  def pose_table(self):
+    class Pose(self.base_model):
+      __tablename__ = self.name + "_pose"
+      id = Column(Integer, primary_key=True)
+      image_id = Column(Integer, primary_key=True)
+      pitch = Column(Float, nullable=False)
+      roll = Column(Float, nullable=False)
+      yaw = Column(Float, nullable=False)
+    return Pose
+
+
+# Session = sessionmaker(bind=engine)
+# session = Session()
diff --git a/megapixels/app/server/api.py b/megapixels/app/server/api.py
new file mode 100644
index 00000000..c5e27dd2
--- /dev/null
+++ b/megapixels/app/server/api.py
@@ -0,0 +1,54 @@
+import os
+import re
+import time
+from flask import Blueprint, request, jsonify
+from PIL import Image  # todo: try to remove PIL dependency
+
+from app.models.sql_factory import list_datasets, get_dataset, get_table
+
+sanitize_re = re.compile(r'[\W]+')
+valid_exts = ['.gif', '.jpg', '.jpeg', '.png']
+
+api = Blueprint('api', __name__)
+
+@api.route('/')
+def index():
+  return jsonify({ 'datasets': list_datasets() })
+
+@api.route('/dataset/<name>')
+def show(name):
+  dataset = get_dataset(name)
+  if dataset:
+    return jsonify(dataset.describe())
+  else:
+    return jsonify({ 'status': 404 })
+
+@api.route('/dataset/<name>/face', methods=['POST'])
+def upload(name):
+  start = time.time()
+  file = request.files['query_img']
+  fn = file.filename
+  if fn.endswith('blob'):
+    fn = 'filename.jpg'
+
+  basename, ext = os.path.splitext(fn)
+  print("got {}, type {}".format(basename, ext))
+  if ext.lower() not in valid_exts:
+    return jsonify({ 'error': 'not an image' })
+
+  img = Image.open(file.stream).convert('RGB')
+
+  # vec = db.load_feature_vector_from_file(uploaded_img_path)
+  # vec = fe.extract(img)
+  # print(vec.shape)
+  # results = db.search(vec, limit=limit)
+
+  query = {
+    'timing': time.time() - start,
+  }
+  results = []
+
+  print(results)
+  return jsonify({
+    'query': query,
+    'results': results,
+  })
diff --git a/megapixels/app/server/create.py b/megapixels/app/server/create.py
new file mode 100644
index 00000000..9efed669
--- /dev/null
+++ b/megapixels/app/server/create.py
@@ -0,0 +1,36 @@
+from flask import Flask, Blueprint, jsonify
+from flask_sqlalchemy import SQLAlchemy
+from app.models.sql_factory import connection_url, load_sql_datasets
+
+from app.server.api import api
+
+db = SQLAlchemy()
+
+def create_app(script_info=None):
+  app = Flask(__name__, static_url_path='')
+  app.config['SQLALCHEMY_DATABASE_URI'] = connection_url
+  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+
+  db.init_app(app)
+  datasets = load_sql_datasets(replace=False, base_model=db.Model)
+
+  app.register_blueprint(api, url_prefix='/api')
+
+  @app.route('/', methods=['GET'])
+  def index():
+    return app.send_static_file('index.html')
+
+  @app.shell_context_processor
+  def shell_context():
+    return { 'app': app, 'db': db }
+
+  @app.route("/site-map")
+  def site_map():
+    links = []
+    for rule in app.url_map.iter_rules():
+      # url = url_for(rule.endpoint, **(rule.defaults or {}))
+      # print(url)
+      links.append(rule.endpoint)
+    return jsonify(links)
+
+  return app
diff --git a/megapixels/app/server/static b/megapixels/app/server/static
new file mode 120000
index 00000000..1dc7a639
--- /dev/null
+++ b/megapixels/app/server/static
@@ -0,0 +1 @@
+../../../site/public
\ No newline at end of file
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index e915b5d4..9ea4b72b 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -2,6 +2,7 @@ import os
 from os.path import join
 import logging
 import collections
+from dotenv import load_dotenv
 
 import cv2 as cv
 
@@ -49,6 +50,11 @@ DIR_MODELS_DLIB_5PT = join(DIR_MODELS_DLIB, 'shape_predictor_5_face_landmarks.da
 DIR_MODELS_DLIB_68PT = join(DIR_MODELS_DLIB, 'shape_predictor_68_face_landmarks.dat')
 DIR_MODELS_DLIB_FACEREC_RESNET = join(DIR_MODELS_DLIB, 'dlib_face_recognition_resnet_model_v1.dat')
 
+DIR_FAISS = join(DIR_APP, 'faiss')
+DIR_FAISS_INDEXES = join(DIR_FAISS, 'indexes')
+DIR_FAISS_METADATA = join(DIR_FAISS, 'metadata')
+DIR_FAISS_RECIPES = join(DIR_FAISS, 'recipes')
+
 # Test images
 DIR_TEST_IMAGES = join(DIR_APP, 'test', 'images')
 
@@ -65,6 +71,7 @@ FP_FONT = join(DIR_ASSETS, 'font')
 DIR_COMMANDS_CV = 'commands/cv'
 DIR_COMMANDS_ADMIN = 'commands/admin'
 DIR_COMMANDS_DATASETS = 'commands/datasets'
+DIR_COMMANDS_FAISS = 'commands/faiss'
 DIR_COMMANDS_MISC = 'commands/misc'
 
 # -----------------------------------------------------------------------------
@@ -112,3 +119,10 @@ LOGFILE_FORMAT = "%(log_color)s%(levelname)-8s%(reset)s %(cyan)s%(filename)s:%(l
 # -----------------------------------------------------------------------------
 S3_MEDIA_ROOT = 's3://megapixels/v1/media/'
 S3_METADATA_ROOT = 's3://megapixels/v1/metadata/'
+
+# -----------------------------------------------------------------------------
+# .env config for keys
+# -----------------------------------------------------------------------------
+
+DIR_DOTENV = join(DIR_APP, '.env')
+load_dotenv(dotenv_path=DIR_DOTENV)
diff --git a/megapixels/app/utils/file_utils.py b/megapixels/app/utils/file_utils.py
index 99282bd4..80239fe2 100644
--- a/megapixels/app/utils/file_utils.py
+++ b/megapixels/app/utils/file_utils.py
@@ -77,7 +77,7 @@ def load_csv(fp_in, as_list=True):
   :returns: list of all CSV data
   """
   if not Path(fp_in).exists():
-    log.info('loading {}'.format(fp_in))
+    log.info('not found: {}'.format(fp_in))
   log.info('loading: {}'.format(fp_in))
   with open(fp_in, 'r') as fp:
     items = csv.DictReader(fp)
@@ -86,6 +86,50 @@
   log.info('returning {:,} items'.format(len(items)))
   return items
 
+def unfussy_csv_reader(reader):
+  """Wraps a csv.reader, skipping rows that raise CSV parse errors
+  :param reader: a csv.reader instance; used by load_csv_safe
+  """
+  while True:
+    try:
+      yield next(reader)
+    except StopIteration:
+      return
+    except csv.Error as e:
+      # log the problem and skip the bad row
+      print(e)
+      continue
+
+def load_csv_safe(fp_in, keys=True, create=False):
+  """Loads a CSV while ignoring possible data errors
+  :param fp_in: string filepath to CSV file
+  :param keys: boolean, set to False if the first line is not a header row
+  :param create: boolean, set to True to return empty keys/values if the CSV does not exist
+  """
+  try:
+    with open(fp_in, 'r', newline='', encoding='utf-8') as f:
+      # reader = csv.reader( (line.replace('\0','') for line in f) )
+      reader = csv.reader(f)
+      lines = list(unfussy_csv_reader(reader))
+      if keys:
+        keys = lines[0]
+        lines = lines[1:]
+        return keys, lines
+      return lines
+  except Exception:
+    if create:
+      if keys:
+        return {}, []
+      return []
+    raise
+
+def load_recipe(fp_in):
+  """Loads a JSON file as an object with properties accessible with dot syntax
+  :param fp_in: string filepath to JSON file
+  """
+  with open(fp_in) as fh:
+    return json.load(fh, object_hook=lambda d: collections.namedtuple('X', d.keys())(*d.values()))
+
 def lazywrite(data, fp_out, sort_keys=True):
   """Writes JSON or Pickle data"""
 
diff --git a/megapixels/cli_faiss.py b/megapixels/cli_faiss.py
new file mode 100644
index 00000000..9953d9b7
--- /dev/null
+++ b/megapixels/cli_faiss.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------
+# add/edit commands in commands/faiss directory
+# --------------------------------------------------------
+
+import click
+
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+from app.models.click_factory import ClickSimple
+
+# click cli factory
+cc = ClickSimple.create(cfg.DIR_COMMANDS_FAISS)
+
+# --------------------------------------------------------
+# CLI
+# --------------------------------------------------------
+@click.group(cls=cc, chain=False)
+@click.option('-v', '--verbose', 'verbosity', count=True, default=4,
+  show_default=True,
+  help='Verbosity: -v DEBUG, -vv INFO, -vvv WARN, -vvvv ERROR, -vvvvv CRITICAL')
+@click.pass_context
+def cli(ctx, **kwargs):
+  """\033[1m\033[94mMegaPixels: FAISS Data Scripts\033[0m
+  """
+  ctx.opts = {}
+  # init logger
+  logger_utils.Logger.create(verbosity=kwargs['verbosity'])
+
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+  cli()
+
diff --git a/megapixels/cli_flask.py b/megapixels/cli_flask.py
new file mode 100644
index 00000000..369bec01
--- /dev/null
+++ b/megapixels/cli_flask.py
@@ -0,0 +1,19 @@
+# --------------------------------------------------------
+# wrapper for flask CLI API
+# --------------------------------------------------------
+
+import click
+
+from flask.cli import FlaskGroup
+from app.server.create import create_app
+
+# from app.settings import app_cfg as cfg
+# from app.utils import logger_utils
+
+cli = FlaskGroup(create_app=create_app)
+
+# --------------------------------------------------------
+# Entrypoint
+# --------------------------------------------------------
+if __name__ == '__main__':
+  cli()
diff --git a/megapixels/commands/faiss/build_db.py b/megapixels/commands/faiss/build_db.py
new file mode 100644
index 00000000..0f979e41
--- /dev/null
+++ b/megapixels/commands/faiss/build_db.py
@@ -0,0 +1,15 @@
+"""
+Load all the CSV files into MySQL
+"""
+
+import click
+
+from app.models.sql_factory import load_sql_datasets
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+  """import the various CSVs into MySQL
+  """
+  print('Loading CSV datasets into SQL...')
+  load_sql_datasets(replace=True)
diff --git a/megapixels/commands/faiss/build_faiss.py b/megapixels/commands/faiss/build_faiss.py
new file mode 100644
index 00000000..96d3f99e
--- /dev/null
+++ b/megapixels/commands/faiss/build_faiss.py
@@ -0,0 +1,58 @@
+"""
+Index all of the FAISS datasets
+"""
+
+import os
+import glob
+import click
+import faiss
+import time
+import numpy as np
+
+from app.utils.file_utils import load_recipe, load_csv_safe
+from app.settings import app_cfg as cfg
+
+class DefaultRecipe:
+  def __init__(self):
+    self.dim = 128
+    self.factory_type = 'Flat'
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+  """build the FAISS indexes
+
+  - looks for all datasets in faiss/metadata/
+  - uses DefaultRecipe by default
+  - override this by adding a recipe at faiss/recipes/{name}.json
+  """
+  for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")):
+    name = os.path.basename(fn)
+    recipe_fn = os.path.join(cfg.DIR_FAISS_RECIPES, name + ".json")
+    if os.path.exists(recipe_fn):
+      build_faiss(name, load_recipe(recipe_fn))
+    else:
+      build_faiss(name, DefaultRecipe())
+
+def build_faiss(name, recipe):
+  vec_fn = os.path.join(cfg.DIR_FAISS_METADATA, name, "vecs.csv")
+  index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index")
+
+  index = faiss.index_factory(recipe.dim, recipe.factory_type)
+
+  keys, rows = load_csv_safe(vec_fn)
+  feats = np.array([ list(map(float, row[3].split(","))) for row in rows ]).astype('float32')
+  n, d = feats.shape
+
+  print("{}: indexing {} x {} dim vectors ({})".format(name, n, d, recipe.factory_type))
+
+  # factory types other than 'Flat' (e.g. IVF) must be trained before adding vectors
+  if not index.is_trained:
+    index.train(feats)
+
+  add_start = time.time()
+  index.add(feats)
+  add_end = time.time()
+  add_time = add_end - add_start
+  print("{}: add time: {:.1f}s".format(name, add_time))
+
+  faiss.write_index(index, index_fn)
diff --git a/megapixels/commands/faiss/sync_metadata.py b/megapixels/commands/faiss/sync_metadata.py
new file mode 100644
index 00000000..b01211b4
--- /dev/null
+++ b/megapixels/commands/faiss/sync_metadata.py
@@ -0,0 +1,18 @@
+"""
+Sync the FAISS metadata
+"""
+
+import subprocess
+import click
+
+from app.settings import app_cfg as cfg
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+  """synchronize metadata files from s3"""
+  sts = subprocess.call([
+    "s3cmd", "sync",
+    "s3://megapixels/v1/metadata/",
+    cfg.DIR_FAISS_METADATA + '/',
+  ])
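Note on the search flow: the upload handler in api.py stops short of an actual query; the feature-extraction and FAISS search calls are left commented out in this commit. Below is a minimal sketch of how the pieces added here could be wired together, assuming a 128-dim face embedding for the uploaded image is already available as vec. The function search_dataset and the embedding source are hypothetical, not part of this commit.

    # sketch: query a FAISS index built by commands/faiss/build_faiss.py and
    # map the hits back to the per-dataset SQL tables from sql_factory.py
    import os
    import numpy as np
    import faiss
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    from app.settings import app_cfg as cfg
    from app.models.sql_factory import connection_url, get_dataset

    def search_dataset(name, vec, limit=10):
        # vec: hypothetical 128-dim face embedding (the extractor is not in this diff)
        index = faiss.read_index(os.path.join(cfg.DIR_FAISS_INDEXES, name + '.index'))
        dists, ids = index.search(np.asarray([vec], dtype='float32'), limit)

        # FAISS returns row positions, which match the `id` column loaded from the CSVs
        uuid_table = get_dataset(name).get_table('uuids')
        session = sessionmaker(bind=create_engine(connection_url))()
        rows = session.query(uuid_table).filter(
            uuid_table.id.in_([int(i) for i in ids[0]])).all()
        by_id = {row.id: row.uuid for row in rows}
        return [
            {'distance': float(d), 'uuid': by_id[int(i)]}
            for d, i in zip(dists[0], ids[0]) if int(i) in by_id
        ]

The dict lookup (by_id) rather than a direct zip is deliberate: an IN filter does not preserve the order of the requested ids, and FAISS pads missing results with -1.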

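Note on recipes: build_faiss.py reads an optional per-dataset recipe from faiss/recipes/{name}.json via load_recipe, which exposes the JSON keys as attributes, so a recipe needs the same two fields as DefaultRecipe. A hypothetical recipe that switches a dataset from the default flat index to an IVF index (values illustrative only):

    {
      "dim": 128,
      "factory_type": "IVF256,Flat"
    }

Any string accepted by faiss.index_factory should work as factory_type; non-flat types such as IVF require the index to be trained before vectors are added, which build_faiss handles with the is_trained check.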