summary | refs | log | tree | commit | diff
path: root/check/commands
diff options
context:
space:
mode:
Diffstat (limited to 'check/commands')
-rw-r--r-- check/commands/phash/dedupe.py 46
1 files changed, 46 insertions, 0 deletions
diff --git a/check/commands/phash/dedupe.py b/check/commands/phash/dedupe.py
new file mode 100644
index 0000000..3cf60d4
--- /dev/null
+++ b/check/commands/phash/dedupe.py
@@ -0,0 +1,46 @@
+"""
+Dedupe a folder of images
+"""
+
+import click
+import os
+import glob
+
+from PIL import Image
+
+from app.utils.im_utils import compute_phash
+
@click.command()
@click.option('-i', '--input', 'opt_input_glob',
    required=True,
    help="Input glob to search -- e.g. '../docs/images/*.jpg'")
@click.option('-t', '--threshold', 'opt_threshold',
    required=True,
    default=6,
    type=int,
    help="Threshold for PHash hamming distance comparison (0-64, default=6)")
@click.pass_context
def cli(ctx, opt_input_glob, opt_threshold):
    """
    Dedupe a folder of images
    """
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Every path matched by the glob is hashed with compute_phash() and
    # compared against the hashes kept so far. A file whose hash is not
    # reported as a near-duplicate by is_phash_new() is remembered as
    # "unique". Nothing is deleted or moved; the command only prints
    # near-duplicate pairs and a final summary.
    #
    # ctx is injected by @click.pass_context and is currently unused.

    seen = []   # one {'phash': ..., 'fn': ...} dict per unique image, in path order
    total = 0   # number of files visited

    # sorted() makes the "first seen wins" choice deterministic across runs.
    for fn in sorted(glob.iglob(os.path.expanduser(opt_input_glob))):
        total += 1

        # Image.open() is lazy and keeps the underlying file handle open.
        # The context manager releases it as soon as the RGB copy exists,
        # so long runs do not accumulate open descriptors.
        with Image.open(fn) as source:
            im = source.convert('RGB')

        # compute_phash is a project helper; its result is presumably a
        # perceptual-hash object whose subtraction yields a distance
        # (see is_phash_new) -- TODO confirm against app.utils.im_utils.
        phash = compute_phash(im)

        if is_phash_new(fn, phash, seen, opt_threshold):
            seen.append({
                'phash': phash,
                'fn': fn,
            })

    print("checked {} files, found {} unique".format(total, len(seen)))
+
def is_phash_new(fn, phash, seen, opt_threshold):
    """
    Tell whether `phash` differs enough from every hash already in `seen`.

    `seen` is a list of dicts with the keys 'phash' and 'fn'. Entries are
    checked in list order; for each one the difference
    `entry['phash'] - phash` is computed, and the first difference that is
    strictly below `opt_threshold` counts as a match. On a match the pair of
    file names and the difference are printed and False is returned. If no
    entry matches, True is returned. `seen` is never modified here.
    """
    # Lazy pairs of (file name, difference) so that scanning stops at the
    # first match, exactly like an explicit loop with an early return.
    candidates = ((entry['fn'], entry['phash'] - phash) for entry in seen)
    first_match = next(
        (pair for pair in candidates if pair[1] < opt_threshold),
        None,
    )

    if first_match is None:
        return True

    other_fn, distance = first_match
    print("{} === {} (diff: {})".format(fn, other_fn, distance))
    return False