""" Index all of the FAISS datasets """ import os import glob import click import faiss import time import numpy as np from app.utils.file_utils import load_recipe, load_csv_safe from app.settings import app_cfg as cfg engine = create_engine('sqlite:///:memory:') class DefaultRecipe: def __init__(self): self.dim = 128 self.factory_type = 'Flat' @click.command() @click.pass_context def cli(ctx): """build the FAISS index. - looks for all datasets in faiss/metadata/ - uses the recipe above by default - however you can override this by adding a new recipe in faiss/recipes/{name}.json """ datasets = [] for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")): name = os.path.basename(fn) recipe_fn = os.path.join(cfg.DIR_FAISS_RECIPES, name + ".json") if os.path.exists(recipe_fn): build_faiss(name, load_recipe(recipe_fn)) else: build_faiss(name, DefaultRecipe()) # index identities # certain CSV files should be loaded into mysql # User.__table__.drop() SQLemployees.create(engine) def build_faiss(name, recipe): vec_fn = os.path.join(cfg.DIR_FAISS_METADATA, name, "vecs.csv") index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index") index = faiss.index_factory(recipe.dim, recipe.factory_type) keys, rows = load_csv_safe(vec_fn) feats = np.array([ list(map(float, row[3].split(","))) for row in rows ]).astype('float32') n, d = feats.shape print("{}: training {} x {} dim vectors".format(name, n, d)) print(recipe.factory_type) add_start = time.time() index.add(feats) add_end = time.time() add_time = add_end - add_start print("{}: add time: {:.1f}s".format(name, add_time)) faiss.write_index(index, index_fn)