def ensure_posixpath(fp):
    """Coerce a filepath-like value to a ``pathlib.Path``.

    :param fp: a str, a ``pathlib.Path`` or a ``click.utils.LazyFile``
    :returns: a ``pathlib.Path`` filepath object (``fp`` itself when it
        already is one)
    :raises TypeError: if ``fp`` is none of the accepted types
    """
    # isinstance() rather than ``type(fp) == ...``: the exact-type comparison
    # rejected str/Path subclasses, and comparing against PosixPath rejected
    # the WindowsPath objects that Path() itself returns on Windows.
    if isinstance(fp, Path):
        return fp
    if isinstance(fp, str):
        return Path(fp)
    if isinstance(fp, click.utils.LazyFile):
        # LazyFile wraps a not-yet-opened file; its ``name`` is the path
        return Path(fp.name)
    raise TypeError('{} is not a valid filepath type'.format(type(fp)))


def ensure_dir(fp):
    """Create the directory for ``fp`` unless ``fp`` is already a directory.

    Creation is delegated to ``mkdirs``, which creates the parent directory
    when the path has a file suffix and the path itself otherwise.

    :param fp: a filepath accepted by ``pathlib.Path`` and ``mkdirs``
    """
    if not Path(fp).is_dir():
        mkdirs(fp)
"""
Generate a perceptual-hash duplicate report (CSV) from a directory of images
This demo does not use SQL storage

"""

from pathlib import Path

import click

# Column order of the duplicates CSV; fixed so that an empty result still
# produces a header row that downstream readers can group on.
_DUPLICATE_COLUMNS = ['sha256_a', 'fname_a', 'sha256_b', 'fname_b', 'score']


def _find_duplicates(ims_meta, thresh, progress=None):
    """Remove near-duplicate entries from ``ims_meta`` and describe them.

    :param ims_meta: dict mapping sha256 -> image meta dict holding at least
        ``imhash`` and ``fname``; duplicate entries are popped in place
    :param thresh: maximum hash distance for two images to count as duplicates
    :param progress: optional callable wrapping the outer iterable (e.g. tqdm)
    :returns: list of dicts with the keys in ``_DUPLICATE_COLUMNS``
    """
    duplicates = []
    candidates = list(ims_meta.items())
    if progress is not None:
        candidates = progress(candidates)
    for sha256_a, im_obj_a in candidates:
        if sha256_a not in ims_meta:
            # Already removed as a duplicate of an earlier image. Without this
            # guard the removed image would in turn mark and remove its own
            # original, dropping both and reporting the pair twice.
            continue
        # snapshot: entries are popped from ims_meta while we iterate
        for sha256_b, im_obj_b in list(ims_meta.items()):
            if sha256_a == sha256_b:
                continue
            d = abs(im_obj_a['imhash'] - im_obj_b['imhash'])
            if d <= thresh:
                # mark B as a duplicate of A and take it out of the pool
                im_obj_b['duplicate'] = sha256_a
                duplicates.append({
                    'sha256_a': sha256_a, 'fname_a': im_obj_a['fname'],
                    'sha256_b': sha256_b, 'fname_b': im_obj_b['fname'],
                    'score': d,
                })
                ims_meta.pop(sha256_b)
    return duplicates


@click.command('')
@click.option('-i', '--input', 'opt_fp_in', required=True,
    help='Path to input dir')
@click.option('-o', '--output', 'opt_fp_out', required=True,
    help='Path to output CSV file')
@click.option('--recursive', 'opt_recursive', is_flag=True,
    help='Recursive globbing')
@click.option('-t', '--thresh', 'opt_thresh', default=3,
    help='Perceptual hash threshold')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None))
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_recursive, opt_thresh, opt_slice):
    """Deduplicate images"""

    # ------------------------------------------------
    # imports
    from glob import glob
    from os.path import join

    import cv2 as cv
    import pandas as pd
    from tqdm import tqdm

    from app.utils import logger_utils, im_utils, file_utils

    log = logger_utils.Logger.getLogger()
    log.info(f'De-duplicating: {opt_fp_in}')

    # get list of all images; --recursive was previously parsed but ignored
    if opt_recursive:
        fp_ims = glob(join(opt_fp_in, '**', '*'), recursive=True)
    else:
        fp_ims = glob(join(opt_fp_in, '*'))
    exts = {'.jpg', '.png', '.jpeg'}
    # compare suffixes case-insensitively so e.g. "IMG_1.JPG" is not skipped;
    # sort because glob order is arbitrary and --slice should be reproducible
    fp_ims = sorted(x for x in fp_ims if Path(x).suffix.lower() in exts)
    if opt_slice:
        fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]

    log.info(f'Processing {len(fp_ims):,} images')

    # Create image meta objects
    # NOTE: byte-identical files share a sha256 key, so a later file replaces
    # the earlier entry here and is not reported as a duplicate.
    ims_meta = {}
    log.info('Computing sha256 and perceptual hashes...')
    for fp_im in tqdm(fp_ims):
        im = cv.imread(fp_im)
        if im is None:
            # imread signals an unreadable/corrupt file by returning None
            log.warning(f'Could not read image, skipping: {fp_im}')
            continue
        sha256 = file_utils.sha256(fp_im)
        im_hash = im_utils.compute_phash(im)  # uses PIL
        ims_meta[sha256] = {
            'imhash': im_hash,
            'filepath': fp_im,
            'fname': Path(fp_im).name,
            'sha256': sha256,
            'duplicate': None,
        }

    # Deduplicate the list of images
    log.info('Deduplicating images...')
    duplicates = _find_duplicates(ims_meta, opt_thresh, progress=tqdm)

    # duplicates are popped from ims_meta, so count the recorded pairs instead
    n_dupes = len(duplicates)
    log.info(f'Found {n_dupes}')

    df_items = pd.DataFrame(duplicates, columns=_DUPLICATE_COLUMNS)
    # opt_fp_out is a file path: ensure_dir creates its parent directory
    file_utils.ensure_dir(opt_fp_out)
    log.info(f'Writing: {opt_fp_out}')
    df_items.to_csv(opt_fp_out, index=False)
"""
Generate an HTML report from a duplicates CSV and its directory of images
This demo does not use SQL storage

"""

from pathlib import Path

import click


@click.command('')
@click.option('--csv', 'opt_fp_in_csv', required=True,
    help='Path to input CSV')
@click.option('--images', 'opt_fp_in_img', required=True,
    help='Path to images')
@click.option('-o', '--output', 'opt_fp_out_dir', required=True,
    type=click.Path(file_okay=False, dir_okay=True),
    help='Path to output directory')
@click.pass_context
def cli(ctx, opt_fp_in_csv, opt_fp_in_img, opt_fp_out_dir):
    """Generate HTML report"""

    # ------------------------------------------------
    # imports
    from os.path import join
    import shutil

    import jinja2
    import pandas as pd

    from app.utils import logger_utils, file_utils

    log = logger_utils.Logger.getLogger()
    log.info(f'Generating HTML report from: {opt_fp_in_csv}')

    # NOTE(review): template and CSS paths are relative to the current working
    # directory, so the command presumably has to be run from the directory
    # that contains "static/" - confirm against how the CLI is launched.
    template_loader = jinja2.FileSystemLoader(searchpath="./static/")
    # autoescape: filenames from the CSV are rendered into the page verbatim
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    TEMPLATE_FILE = "perceptual_hash_report.html"
    template = template_env.get_template(TEMPLATE_FILE)

    # create project output dir
    fp_out_dir_assets = join(opt_fp_out_dir, 'assets')
    fp_out_dir_images = join(opt_fp_out_dir, 'images')

    file_utils.ensure_dir(opt_fp_out_dir)
    file_utils.ensure_dir(fp_out_dir_assets)
    file_utils.ensure_dir(fp_out_dir_images)

    df_dupes = pd.read_csv(opt_fp_in_csv)
    # one report section per "image A", listing every image matched to it
    image_groups = df_dupes.groupby('fname_a')

    log.info(f'Saving HTML report to: {opt_fp_out_dir}')
    fp_out_html = join(opt_fp_out_dir, 'index.html')
    html_text = template.render(
        image_groups=image_groups,
        dir_ims=Path(fp_out_dir_images).name,
        dir_assets=Path(fp_out_dir_assets).name)
    # explicit encoding so non-ASCII filenames do not depend on the locale
    with open(fp_out_html, 'w', encoding='utf-8') as fp:
        fp.write(html_text)

    # copy css
    fp_src = 'static/assets/css.css'
    fp_dst = join(fp_out_dir_assets, Path(fp_src).name)
    shutil.copy(fp_src, fp_dst)

    # copy images: each referenced file once, instead of re-copying image A
    # for every row of its group
    fnames = set(df_dupes['fname_a']) | set(df_dupes['fname_b'])
    for fname in fnames:
        shutil.copy(join(opt_fp_in_img, fname), join(fp_out_dir_images, fname))
} + +h4 { + font-size: 16px; } + +h5 { + font-size: 14px; } + +h6 { + color: #777777; + font-size: 14px; } + +p, blockquote, ul, ol, dl, li, table, pre { + margin: 15px 0; } + +hr { + background: transparent url() repeat-x 0 0; + border: 0 none; + color: #cccccc; + height: 4px; + padding: 0; +} + +body > h2:first-child { + margin-top: 0; + padding-top: 0; } +body > h1:first-child { + margin-top: 0; + padding-top: 0; } + body > h1:first-child + h2 { + margin-top: 0; + padding-top: 0; } +body > h3:first-child, body > h4:first-child, body > h5:first-child, body > h6:first-child { + margin-top: 0; + padding-top: 0; } + +a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 { + margin-top: 0; + padding-top: 0; } + +h1 p, h2 p, h3 p, h4 p, h5 p, h6 p { + margin-top: 0; } + +li p.first { + display: inline-block; } +li { + margin: 0; } +ul, ol { + padding-left: 30px; } + +ul :first-child, ol :first-child { + margin-top: 0; } + +dl { + padding: 0; } + dl dt { + font-size: 14px; + font-weight: bold; + font-style: italic; + padding: 0; + margin: 15px 0 5px; } + dl dt:first-child { + padding: 0; } + dl dt > :first-child { + margin-top: 0; } + dl dt > :last-child { + margin-bottom: 0; } + dl dd { + margin: 0 0 15px; + padding: 0 15px; } + dl dd > :first-child { + margin-top: 0; } + dl dd > :last-child { + margin-bottom: 0; } + +blockquote { + border-left: 4px solid #dddddd; + padding: 0 15px; + color: #777777; } + blockquote > :first-child { + margin-top: 0; } + blockquote > :last-child { + margin-bottom: 0; } + +table { + font-size:11px; + padding: 0;border-collapse: collapse; } + table tr { + border-top: 1px solid #cccccc; + background-color: white; + margin: 0; + padding: 0; } + table tr:nth-child(2n) { + background-color: #f8f8f8; } + table tr th { + font-weight: bold; + border: 1px solid #cccccc; + margin: 0; + padding: 6px 13px; } + table tr td { + border: 1px solid #cccccc; + margin: 0; + padding: 6px 13px; } + table 
tr th :first-child, table tr td :first-child { + margin-top: 0; } + table tr th :last-child, table tr td :last-child { + margin-bottom: 0; } + +img { + max-width: 100%; } +img + em{ + font-size:.675rem; + color:#999; + display: block; +} +span.frame { + display: block; + overflow: hidden; } + span.frame > span { + border: 1px solid #dddddd; + display: block; + float: left; + overflow: hidden; + margin: 13px 0 0; + padding: 7px; + width: auto; } + span.frame span img { + display: block; + float: left; } + span.frame span span { + clear: both; + color: #333333; + display: block; + padding: 5px 0 0; } +span.align-center { + display: block; + overflow: hidden; + clear: both; } + span.align-center > span { + display: block; + overflow: hidden; + margin: 13px auto 0; + text-align: center; } + span.align-center span img { + margin: 0 auto; + text-align: center; } +span.align-right { + display: block; + overflow: hidden; + clear: both; } + span.align-right > span { + display: block; + overflow: hidden; + margin: 13px 0 0; + text-align: right; } + span.align-right span img { + margin: 0; + text-align: right; } +span.float-left { + display: block; + margin-right: 13px; + overflow: hidden; + float: left; } + span.float-left span { + margin: 13px 0 0; } +span.float-right { + display: block; + margin-left: 13px; + overflow: hidden; + float: right; } + span.float-right > span { + display: block; + overflow: hidden; + margin: 13px auto 0; + text-align: right; } + +code, tt { + margin: 0 2px; + padding: 0 5px; + white-space: nowrap; + border: 1px solid #eaeaea; + background-color: #f8f8f8; + border-radius: 3px; } + +pre code { + margin: 0; + padding: 0; + white-space: pre; + border: none; + background: transparent; } + +.highlight pre { + background-color: #f8f8f8; + border: 1px solid #cccccc; + font-size: 13px; + line-height: 19px; + overflow: auto; + padding: 6px 10px; + border-radius: 3px; } + +pre { + background-color: #f8f8f8; + border: 1px solid #cccccc; + font-size: 13px; + 
line-height: 19px; + overflow: auto; + padding: 6px 10px; + border-radius: 3px; } + pre code, pre tt { + background-color: transparent; + border: none; } + +sup { + font-size: 0.83em; + vertical-align: super; + line-height: 0; +} + +kbd { + display: inline-block; + padding: 3px 5px; + font-size: 11px; + line-height: 10px; + color: #555; + vertical-align: middle; + background-color: #fcfcfc; + border: solid 1px #ccc; + border-bottom-color: #bbb; + border-radius: 3px; + box-shadow: inset 0 -1px 0 #bbb +} + +* { + -webkit-print-color-adjust: exact; +} +@media screen and (min-width: 914px) { + body { + width: 854px; + margin:0 auto; + } +} +@media print { + table, pre { + page-break-inside: avoid; + } + pre { + word-wrap: break-word; + } +} +.caption{ + font-size:.83em; + color:#999; + margin-bottom:10px; +} + +.pagebreak { page-break-inside:avoid; page-break-after:always; } + +/* image match results */ + +.img_match{ + max-width:300px +} +td.result_txt{ + vertical-align: top; +} +td.result_txt ul{ + margin-left: 0; + padding-left: 0; + list-style: none; +} +td.result_txt ul li{ + margin-left:0; + padding-bottom: 5px; + list-style: none; +} \ No newline at end of file diff --git a/check/static/perceptual_hash_report.html b/check/static/perceptual_hash_report.html new file mode 100644 index 0000000..566a058 --- /dev/null +++ b/check/static/perceptual_hash_report.html @@ -0,0 +1,39 @@ + + + + + + +

Image Duplicates

+ + {% for fname_a, image_group in image_groups %} +

Duplicates for {{ fname_a }}

+ + + + + + + {% for df in image_group.itertuples() %} + + + + + + + {% endfor %} +
Image AImage BScore
+ + + + +
    +
  • Filename A: {{ df.fname_a }}
  • +
  • Filename B:
    {{ df.fname_b }}
  • +
  • Score: {{ df.score }}
  • +
+
+ {% endfor %} + + + \ No newline at end of file -- cgit v1.2.3-70-g09d2