diff options
Diffstat (limited to 'megapixels/commands/datasets/records.py')
| -rw-r--r-- | megapixels/commands/datasets/records.py | 40 |
1 files changed, 24 insertions, 16 deletions
diff --git a/megapixels/commands/datasets/records.py b/megapixels/commands/datasets/records.py index 80de5040..b6ef618b 100644 --- a/megapixels/commands/datasets/records.py +++ b/megapixels/commands/datasets/records.py @@ -107,10 +107,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, # convert data to dict data = [] + indentity_count = 0 for sha256, fp_im in zip(sha256s, fp_ims): fpp_im = Path(fp_im) subdir = str(fpp_im.parent.relative_to(fp_in)) + if opt_identity: subdirs = subdir.split('/') if not len(subdirs) > 0: @@ -124,7 +126,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, elif opt_identity == 'subdir_tail': identity = subdirs[-1] # use last part of subdir path else: - identity = '' + identity = indentity_count # use incrementing number + indentity_count += 1 data.append({ 'subdir': subdir, @@ -135,22 +138,27 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, 'identity_key': identity }) - log.info(f'adding identity index using: "{opt_identity}". This may take a while...') - # convert dict to DataFrame df_records = pd.DataFrame.from_dict(data) - # sort based on identity_key - df_records = df_records.sort_values(by=['identity_key'], ascending=True) - # add new column for identity - df_records['identity_index'] = [-1] * len(df_records) - # populate the identity_index - df_records_identity_groups = df_records.groupby('identity_key') - # enumerate groups to create identity indices - for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): - identity_key, df_records_identity_group = df_records_identity_group_tuple - for ds_record in df_records_identity_group.itertuples(): - df_records.at[ds_record.Index, 'identity_index'] = identity_index - # reset index after being sorted - df_records = df_records.reset_index(drop=True) + if opt_identity: + log.info(f'adding identity index using: "{opt_identity}". This may take a while...') + # convert dict to DataFrame + # sort based on identity_key + df_records = df_records.sort_values(by=['identity_key'], ascending=True) + # add new column for identity + df_records['identity_index'] = [-1] * len(df_records) + # populate the identity_index + df_records_identity_groups = df_records.groupby('identity_key') + # enumerate groups to create identity indices + for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups): + identity_key, df_records_identity_group = df_records_identity_group_tuple + for ds_record in df_records_identity_group.itertuples(): + df_records.at[ds_record.Index, 'identity_index'] = identity_index + # reset index after being sorted + df_records = df_records.reset_index(drop=True) + else: + # name everyone person 1, 2, 3... + pass + df_records.index.name = 'index' # reassign 'index' as primary key column # write to CSV file_utils.mkdirs(fp_out) |
