diff options
Diffstat (limited to 'check/commands')
| -rw-r--r-- | check/commands/phash/report.py | 101 | ||||
| -rw-r--r-- | check/commands/phash/report_html.py | 80 |
2 files changed, 181 insertions, 0 deletions
# Source file: check/commands/phash/report.py
"""
Generate a duplicate-image report (CSV) from a directory of images
This demo does not use SQL storage

"""

from pathlib import Path

import click

# Lower-case file extensions that are treated as images
_IMAGE_EXTS = ('.jpg', '.png', '.jpeg')

# Column order of the CSV report (one row per detected duplicate)
_DUPLICATE_COLUMNS = ['sha256_a', 'fname_a', 'sha256_b', 'fname_b', 'score']


def _find_duplicates(ims_meta, thresh, progress=None):
    """Greedily group near-identical images by perceptual-hash distance.

    Items are visited in insertion order. The first image of a group is kept
    as the reference ("A"); every later image ("B") whose hash distance to it
    is ``<= thresh`` gets ``'duplicate'`` set to A's sha256 and is never used
    as a reference or compared again. Because the distance is symmetric, only
    items after A need to be checked.

    :param ims_meta: dict of sha256 -> meta dict with the keys 'imhash',
        'fname' and 'duplicate' (mutated in place)
    :param thresh: maximum hash distance that still counts as a duplicate
    :param progress: optional wrapper for the outer iterable (e.g. tqdm)
    :returns: list of row dicts with the keys in ``_DUPLICATE_COLUMNS``
    """
    duplicates = []
    items = list(ims_meta.items())
    outer = progress(items) if progress is not None else items
    for idx, (sha256_a, im_obj_a) in enumerate(outer):
        if im_obj_a['duplicate'] is not None:
            # already assigned to an earlier reference image
            continue
        for sha256_b, im_obj_b in items[idx + 1:]:
            if im_obj_b['duplicate'] is not None:
                continue
            d = abs(im_obj_a['imhash'] - im_obj_b['imhash'])
            if d <= thresh:
                # mark B as a duplicate of A
                im_obj_b['duplicate'] = sha256_a
                duplicates.append({
                    'sha256_a': sha256_a, 'fname_a': im_obj_a['fname'],
                    'sha256_b': sha256_b, 'fname_b': im_obj_b['fname'],
                    'score': d,
                })
    return duplicates


@click.command('')
@click.option('-i', '--input', 'opt_fp_in', required=True,
    help='Path to input dir')
@click.option('-o', '--output', 'opt_fp_out', required=True,
    help='Path to output CSV file')
@click.option('--recursive', 'opt_recursive', is_flag=True,
    help='Recursive globbing')
@click.option('-t', '--thresh', 'opt_thresh', default=3,
    help='Perceptual hash threshold')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None))
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_recursive, opt_thresh, opt_slice):
    """Deduplicate images"""
    # NOTE: the docstring above doubles as the click --help text, so the
    # details live in comments instead.
    #
    # Steps: list image files in opt_fp_in, compute a sha256 and a perceptual
    # hash per file, group images whose hash distance is <= opt_thresh, and
    # write one CSV row per duplicate to opt_fp_out.

    # ------------------------------------------------
    # imports (kept inside the command so the heavy dependencies are only
    # loaded when it actually runs)
    from os.path import join
    from glob import glob

    import pandas as pd
    from tqdm import tqdm
    import cv2 as cv

    from app.utils import logger_utils, im_utils, file_utils

    log = logger_utils.Logger.getLogger()
    log.info(f'De-duplicating: {opt_fp_in}')

    # get list of all images; '**' also matches the top-level directory
    if opt_recursive:
        fp_candidates = glob(join(opt_fp_in, '**', '*'), recursive=True)
    else:
        fp_candidates = glob(join(opt_fp_in, '*'))
    log.info(f'Found {len(fp_candidates):,} files')

    # sorted so that --slice selects the same files on every run
    fp_ims = sorted(x for x in fp_candidates
                    if Path(x).suffix.lower() in _IMAGE_EXTS)
    if opt_slice:
        # (None, None) is the default and selects everything
        fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]

    log.info(f'Processing {len(fp_ims):,} images')

    # Create image meta objects
    ims_meta = {}
    duplicates = []
    log.info('Computing sha256 and perceptual hashes...')
    for fp_im in tqdm(fp_ims):
        sha256 = file_utils.sha256(fp_im)
        fname = Path(fp_im).name
        if sha256 in ims_meta:
            # byte-identical file: report it instead of silently overwriting
            # the earlier entry that has the same key
            duplicates.append({
                'sha256_a': sha256, 'fname_a': ims_meta[sha256]['fname'],
                'sha256_b': sha256, 'fname_b': fname,
                'score': 0,
            })
            continue
        im = cv.imread(fp_im)
        if im is None:
            # cv.imread returns None for unreadable/corrupt files
            log.warning(f'Could not read image, skipping: {fp_im}')
            continue
        im_hash = im_utils.compute_phash(im)  # uses PIL
        ims_meta[sha256] = {
            'imhash': im_hash,
            'filepath': fp_im,
            'fname': fname,
            'sha256': sha256,
            'duplicate': None,
        }

    # Deduplicate the list of images
    log.info('Deduplicating images...')
    duplicates.extend(_find_duplicates(ims_meta, opt_thresh, progress=tqdm))

    n_dupes = len(duplicates)
    log.info(f'Found {n_dupes}')

    # explicit columns keep the CSV header intact when nothing was found
    df_items = pd.DataFrame(duplicates, columns=_DUPLICATE_COLUMNS)
    # NOTE(review): ensure_dir is handed the CSV file path here, as in the
    # original; presumably it creates the parent directory - confirm.
    file_utils.ensure_dir(opt_fp_out)
    log.info(f'Writing: {opt_fp_out}')
    df_items.to_csv(opt_fp_out, index=False)

    # TODO: optionally copy the de-duplicated images to another directory
# Source file: check/commands/phash/report_html.py
"""
Generate an HTML report from a CSV of duplicate image pairs
(expects the columns fname_a and fname_b)
This demo does not use SQL storage

"""

from pathlib import Path

import click


@click.command('')
@click.option('--csv', 'opt_fp_in_csv', required=True,
    help='Path to input CSV')
@click.option('--images', 'opt_fp_in_img', required=True,
    help='Path to images')
@click.option('-o', '--output', 'opt_fp_out_dir', required=True,
    type=click.Path(file_okay=False, dir_okay=True),
    help='Path to output directory')
@click.pass_context
def cli(ctx, opt_fp_in_csv, opt_fp_in_img, opt_fp_out_dir):
    """Generate HTML report"""
    # NOTE: the docstring above doubles as the click --help text, so the
    # details live in comments instead.
    #
    # Steps: render static/perceptual_hash_report.html with the CSV rows
    # grouped by 'fname_a', write it to <output>/index.html, then copy the
    # stylesheet to <output>/assets and the referenced images to
    # <output>/images.

    # ------------------------------------------------
    # imports (kept inside the command so the heavy dependencies are only
    # loaded when it actually runs)
    import shutil
    from os.path import join

    import pandas as pd
    import jinja2

    from app.utils import logger_utils, file_utils

    log = logger_utils.Logger.getLogger()
    log.info(f'Generating HTML report from: {opt_fp_in_csv}')

    # NOTE: template and css paths are relative to the current working dir.
    # NOTE(review): autoescape is off (jinja2 default), so file names from the
    # CSV reach the HTML unescaped - enable it once the template is checked.
    template_loader = jinja2.FileSystemLoader(searchpath="./static/")
    template_env = jinja2.Environment(loader=template_loader)
    TEMPLATE_FILE = "perceptual_hash_report.html"
    template = template_env.get_template(TEMPLATE_FILE)

    # create project output dir
    fp_out_dir_assets = join(opt_fp_out_dir, 'assets')
    fp_out_dir_images = join(opt_fp_out_dir, 'images')

    file_utils.ensure_dir(opt_fp_out_dir)
    file_utils.ensure_dir(fp_out_dir_assets)
    file_utils.ensure_dir(fp_out_dir_images)

    try:
        df_dupes = pd.read_csv(opt_fp_in_csv)
    except pd.errors.EmptyDataError:
        # a CSV without even a header row: render an empty report
        log.warning(f'No data in: {opt_fp_in_csv}')
        df_dupes = pd.DataFrame(columns=['fname_a', 'fname_b'])
    image_groups = df_dupes.groupby('fname_a')

    log.info(f'Saving HTML report to: {opt_fp_out_dir}')
    fp_out_html = join(opt_fp_out_dir, 'index.html')
    html_text = template.render(
        image_groups=image_groups,
        dir_ims=Path(fp_out_dir_images).name,
        dir_assets=Path(fp_out_dir_assets).name)
    # explicit encoding so non-ASCII file names do not depend on the locale
    with open(fp_out_html, 'w', encoding='utf-8') as f_html:
        f_html.write(html_text)

    # copy css
    fp_src = 'static/assets/css.css'
    fp_dst = join(fp_out_dir_assets, Path(fp_src).name)
    shutil.copy(fp_src, fp_dst)

    # copy images: every referenced file exactly once (image A would
    # otherwise be re-copied for each row of its group)
    fnames = set(df_dupes['fname_a'].dropna()) | set(df_dupes['fname_b'].dropna())
    for fname in sorted(str(x) for x in fnames):
        fp_src = join(opt_fp_in_img, fname)
        fp_dst = join(fp_out_dir_images, fname)
        try:
            shutil.copy(fp_src, fp_dst)
        except FileNotFoundError:
            # one missing image should not abort the whole report
            log.warning(f'Image not found, skipping: {fp_src}')
\ No newline at end of file |
