1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
"""
Dedupe a folder of images (uses phash directly, does not use database)
"""
import click
import os
import glob
from PIL import Image
from app.utils.im_utils import compute_phash
from app.utils.file_utils import write_json, sha256
@click.command()
@click.option('-i', '--input', 'opt_input_glob',
              required=True,
              help="Input glob to search -- e.g. '../docs/images/*.jpg'")
@click.option('-o', '--output', 'opt_output_fn',
              required=False,
              help="Output filename for the JSON list of unique images (optional)")
@click.option('-t', '--threshold', 'opt_threshold',
              required=True,
              default=6,
              type=int,
              help="Threshold for PHash hamming distance comparison (0-64, default=6)")
@click.pass_context
def cli(ctx, opt_input_glob, opt_output_fn, opt_threshold):
    """
    Dedupe a folder of images.

    Keeps the first image (in sorted filename order) of every group of
    images whose perceptual hashes differ by less than the threshold, and
    optionally writes the kept records to the output file.
    """
    # Parameters (all supplied by click):
    #   ctx            -- click context; not used by this command.
    #   opt_input_glob -- glob pattern of images to scan ('~' is expanded).
    #   opt_output_fn  -- where to write the unique records, or None to skip.
    #   opt_threshold  -- phash differences below this count as duplicates.
    seen = []   # one record per unique image: sha256, phash and filename
    total = 0   # number of files examined
    for fn in sorted(glob.iglob(os.path.expanduser(opt_input_glob))):
        total += 1
        # Image.open keeps the underlying file open; the context manager
        # releases the handle once the hash has been computed, so scanning a
        # large folder does not exhaust file descriptors.
        with Image.open(fn) as im:
            phash = compute_phash(im.convert('RGB'))
        if is_phash_new(fn, phash, seen, opt_threshold):
            # Only hash file contents for images we keep.
            file_hash = sha256(fn)
            seen.append({
                'sha256': file_hash,
                'phash': phash,
                'fn': fn,
            })
    if opt_output_fn:
        write_json(seen, opt_output_fn)
    print("checked {} files, found {} unique".format(total, len(seen)))
def is_phash_new(fn, phash, seen, opt_threshold):
    """
    Report whether `phash` differs enough from every hash already seen.

    fn            -- filename of the candidate image (used only for logging).
    phash         -- candidate hash; must support `-` against stored hashes
                     (presumably yielding a Hamming distance, per the CLI
                     help text -- confirm against compute_phash).
    seen          -- list of dicts with at least 'phash' and 'fn' keys.
    opt_threshold -- differences strictly below this mark a duplicate.

    Returns False (after printing the matching pair) on the first stored
    hash that is closer than the threshold; True if none is.
    """
    for known in seen:
        distance = known['phash'] - phash
        is_duplicate = distance < opt_threshold
        if not is_duplicate:
            continue
        # First close-enough match wins; later entries are not examined.
        print("{} === {} (diff: {})".format(fn, known['fn'], distance))
        return False
    return True
|