diff options
Diffstat (limited to 'megapixels/app/processors/faiss.py')
| -rw-r--r-- | megapixels/app/processors/faiss.py | 58 |
1 files changed, 58 insertions, 0 deletions
diff --git a/megapixels/app/processors/faiss.py b/megapixels/app/processors/faiss.py new file mode 100644 index 00000000..5156ad71 --- /dev/null +++ b/megapixels/app/processors/faiss.py @@ -0,0 +1,58 @@ +""" +Index all of the FAISS datasets +""" + +import os +import glob +import faiss +import time +import numpy as np + +from app.utils.file_utils import load_recipe, load_csv_safe +from app.settings import app_cfg as cfg + +class DefaultRecipe: + def __init__(self): + self.dim = 128 + self.factory_type = 'Flat' + +def build_all_faiss_databases(): + datasets = [] + for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")): + name = os.path.basename(fn) + recipe_fn = os.path.join(cfg.DIR_FAISS_RECIPES, name + ".json") + if os.path.exists(recipe_fn): + build_faiss_database(name, load_recipe(recipe_fn)) + else: + build_faiss_database(name, DefaultRecipe()) + +def build_faiss_database(name, recipe): + vec_fn = os.path.join(cfg.DIR_FAISS_METADATA, name, "vecs.csv") + index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index") + + index = faiss.index_factory(recipe.dim, recipe.factory_type) + + keys, rows = load_csv_safe(vec_fn) + feats = np.array([ list(map(float, row[3].split(","))) for row in rows ]).astype('float32') + n, d = feats.shape + + print("{}: training {} x {} dim vectors".format(name, n, d)) + print(recipe.factory_type) + + add_start = time.time() + index.add(feats) + add_end = time.time() + add_time = add_end - add_start + print("{}: add time: {:.1f}s".format(name, add_time)) + + faiss.write_index(index, index_fn) + +def load_faiss_databases(): + faiss_datasets = {} + for fn in glob.iglob(os.path.join(cfg.DIR_FAISS_METADATA, "*")): + name = os.path.basename(fn) + index_fn = os.path.join(cfg.DIR_FAISS_INDEXES, name + ".index") + if os.path.exists(index_fn): + index = faiss.read_index(index_fn) + faiss_datasets[name] = index + return faiss_datasets |
