summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/fix_identity_key.py
blob: ad2b555f03cb4ca5dacbff39a80129d7564ad149 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
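'''
Slugify the identity_key column of a dataset's file-record metadata CSV
(underscore-separated) and write the result back to the same file.
'''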
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

# Module-level logger obtained from the project's Logger helper; used by cli()
# to report the path of the rewritten CSV.
log = Logger.getLogger()


# NOTE(review): not referenced anywhere in the visible part of this file --
# presumably carried over from a sibling command; confirm before removing.
identity_sources = ['subdir', 'numeric']

@click.command()
@click.option('--data_store', 'opt_data_store',
  type=cfg.DataStoreVar,
  default=click_utils.get_default(types.DataStore.HDD),
  show_default=True,
  # help must describe the data store choices (was types.Dataset, a copy-paste
  # of the --dataset option below)
  help=click_utils.show_help(types.DataStore))
@click.option('--dataset', 'opt_dataset',
  type=cfg.DatasetVar,
  required=True,
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.pass_context
def cli(ctx, opt_dataset, opt_data_store):
  """Fix identity key to be slug"""
  # NOTE: the docstring above doubles as the click --help text, so the longer
  # description lives in these comments instead.
  #
  # Reads the dataset's FILE_RECORD metadata CSV, replaces every value of the
  # 'identity_key' column with its underscore-separated slug and overwrites the
  # same CSV in place.
  #
  # ctx:            click context (passed by @click.pass_context, unused here)
  # opt_dataset:    value of --dataset, forwarded to DataStore
  # opt_data_store: value of --data_store, forwarded to DataStore

  # heavy imports are deferred to call time, matching the original layout
  import pandas as pd
  from slugify import slugify
  from tqdm import tqdm

  from app.models.data_store import DataStore

  data_store = DataStore(opt_data_store, opt_dataset)
  fp_records = data_store.metadata(types.Metadata.FILE_RECORD)

  # ----------------------------------------------------------------
  # load csv and slugify

  df_records = pd.read_csv(fp_records, dtype=cfg.FILE_RECORD_DTYPES).set_index('index')

  # Update the column in place rather than round-tripping through a list of
  # dicts: to_dict('records') drops the index (silently renumbering rows) and,
  # for an empty file, from_dict([]) would also drop every column header.
  df_records['identity_key'] = [
    slugify(key, separator='_') for key in tqdm(df_records['identity_key'])
  ]

  # the index is still named 'index' from set_index(), so the CSV layout is kept
  df_records.to_csv(fp_records)
  log.info(f'wrote: {fp_records}')