diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-04-23 13:10:45 +0200 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-04-23 13:10:45 +0200 |
| commit | f5f30912a2efd9c3d47cc9aeaf6a542211b00474 (patch) | |
| tree | 094c4d1e006711b3f7724f2aaf869e21ac5cb406 /check/commands/phash | |
| parent | 79f0e696f3f6067a0841a37404fb546dedaa07cb (diff) | |
edits
Diffstat (limited to 'check/commands/phash')
| -rw-r--r-- | check/commands/phash/dedupe.py | 46 |
1 files changed, 46 insertions, 0 deletions
# Origin: check/commands/phash/dedupe.py (new file, mode 100644, index 0000000..3cf60d4)
"""
Dedupe a folder of images
"""

import click
import os
import glob

from PIL import Image

from app.utils.im_utils import compute_phash


@click.command()
@click.option('-i', '--input', 'opt_input_glob',
  required=True,
  help="Input glob to search -- e.g. '../docs/images/*.jpg'")
@click.option('-t', '--threshold', 'opt_threshold',
  required=True,
  default=6,
  type=int,
  help="Threshold for PHash hamming distance comparison (0-64, default=6)")
@click.pass_context
def cli(ctx, opt_input_glob, opt_threshold):
  """
  Dedupe a folder of images

  Walks every file matched by ``opt_input_glob`` (after ``~`` expansion) in
  sorted order, computes a perceptual hash for each one and reports files
  whose hash is closer than ``opt_threshold`` to a file seen earlier.
  Because the files are visited in sorted order, the first file of a group
  of near-duplicates is the one that is kept as "unique".

  Nothing is deleted or moved; the command only prints its findings and a
  final summary line.

  :param ctx: click context (required by ``@click.pass_context``; unused here)
  :param opt_input_glob: glob pattern selecting the images to check
  :param opt_threshold: a file counts as a duplicate when its distance to a
    previously seen hash is strictly less than this value
  """
  seen = []   # one {'phash': ..., 'fn': ...} record per unique image so far
  total = 0
  for fn in sorted(glob.iglob(os.path.expanduser(opt_input_glob))):
    total += 1
    # Image.open() is lazy and can keep the underlying file open (notably
    # for multi-frame formats), so open it in a context manager to release
    # the handle as soon as the RGB copy has been made.
    with Image.open(fn) as src:
      im = src.convert('RGB')
    phash = compute_phash(im)
    if is_phash_new(fn, phash, seen, opt_threshold):
      seen.append({
        'phash': phash,
        'fn': fn,
      })
  print("checked {} files, found {} unique".format(total, len(seen)))


def is_phash_new(fn, phash, seen, opt_threshold):
  """
  Tell whether ``phash`` differs enough from every hash in ``seen``.

  The first entry of ``seen`` whose distance to ``phash`` is strictly below
  ``opt_threshold`` is reported on stdout and ends the search.

  :param fn: filename the hash belongs to (only used in the printed report)
  :param phash: hash of the file being checked
  :param seen: list of dicts with keys ``'phash'`` and ``'fn'``
  :param opt_threshold: exclusive upper bound on the distance for a match
  :returns: ``False`` if a close-enough earlier hash exists, else ``True``
  """
  for item in seen:
    # NOTE(review): presumably compute_phash returns an imagehash-style
    # object whose subtraction yields the Hamming distance -- confirm
    # against app.utils.im_utils.
    diff = item['phash'] - phash
    if diff < opt_threshold:
      print("{} === {} (diff: {})".format(fn, item['fn'], diff))
      return False
  return True
