summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/fix_identity_key.py
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2019-01-28 18:11:36 +0100
committer adamhrv <adam@ahprojects.com> 2019-01-28 18:11:36 +0100
commit dd2c36288aa1e8af14588f9258f6785879b8638c (patch)
tree 543564ff7cc9b83ae1ecbc5b0d89bca9a6c17742 /megapixels/commands/datasets/fix_identity_key.py
parent b0b06be0defe97ef19cf4d0f3328db40d299e110 (diff)
add utils for analyzing identities
Diffstat (limited to 'megapixels/commands/datasets/fix_identity_key.py')
-rw-r--r-- megapixels/commands/datasets/fix_identity_key.py 58
1 files changed, 58 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/fix_identity_key.py b/megapixels/commands/datasets/fix_identity_key.py
new file mode 100644
index 00000000..ad2b555f
--- /dev/null
+++ b/megapixels/commands/datasets/fix_identity_key.py
@@ -0,0 +1,58 @@
+'''
+
+'''
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()  # module-level logger; cli() reports the written path through it
+
+
+identity_sources = ['subdir', 'numeric']  # NOTE(review): not referenced anywhere in the visible code - confirm it is still needed
+
+@click.command()
+@click.option('--data_store', 'opt_data_store',
+ type=cfg.DataStoreVar,
+ default=click_utils.get_default(types.DataStore.HDD),
+ show_default=True,
+ help=click_utils.show_help(types.Dataset))
+@click.option('--dataset', 'opt_dataset',
+ type=cfg.DatasetVar,
+ required=True,
+ show_default=True,
+ help=click_utils.show_help(types.Dataset))
+@click.pass_context
+def cli(ctx, opt_dataset, opt_data_store):
+ """Fix identity key to be slug"""
+
+ import sys, os
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+
+ import pandas as pd
+ from glob import glob
+ from slugify import slugify
+ from tqdm import tqdm
+
+ from app.models.data_store import DataStore
+
+
+ data_store = DataStore(opt_data_store, opt_dataset)
+ fp_records = data_store.metadata(types.Metadata.FILE_RECORD)
+
+
+ # ----------------------------------------------------------------
+ # load csv and slugify
+
+ df_records = pd.read_csv(fp_records, dtype=cfg.FILE_RECORD_DTYPES).set_index('index')
+ records = df_records.to_dict('records')
+ for r in tqdm(records):
+ r['identity_key'] = slugify(r['identity_key'], separator='_')
+ df_records = pd.DataFrame.from_dict(records)
+ df_records.index.name = 'index'
+ df_records.to_csv(fp_records)
+ log.info(f'wrote: {fp_records}') \ No newline at end of file