summaryrefslogtreecommitdiff
path: root/check/commands/phash/dedupe.py
blob: 28266f42a3b3765234784af6a4dab7cab2117f6e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Dedupe a folder of images (uses phash directly, does not use database)
"""

import click
import os
import glob

from PIL import Image

from app.utils.im_utils import compute_phash
from app.utils.file_utils import write_json, sha256

@click.command()
@click.option('-i', '--input', 'opt_input_glob',
  required=True,
  help="Input glob to search -- e.g. '../docs/images/*.jpg'")
@click.option('-o', '--output', 'opt_output_fn',
  required=False,
  help="Output JSON file for the list of unique images -- e.g. 'unique.json'")
@click.option('-t', '--threshold', 'opt_threshold',
  required=True,
  default=6,
  type=int,
  help="Threshold for PHash hamming distance comparison (0-64, default=6)")
@click.pass_context
def cli(ctx, opt_input_glob, opt_output_fn, opt_threshold):
  """
  Dedupe a folder of images

  Files matching the input glob are visited in sorted order. A file is kept
  when its perceptual hash is not within the threshold of any file already
  kept, so the first file of each group of near-duplicates wins.

  If an output filename is given, the kept files (sha256, phash, fn) are
  written to it with write_json. A summary line is always printed.
  """
  seen = []
  total = 0
  for fn in sorted(glob.iglob(os.path.expanduser(opt_input_glob))):
    total += 1
    # Open inside a context manager so the underlying file handle is released
    # as soon as the RGB copy has been made, instead of waiting for GC.
    with Image.open(fn) as im_file:
      im = im_file.convert('RGB')
    phash = compute_phash(im)
    if is_phash_new(fn, phash, seen, opt_threshold):
      # The sha256 is only computed for files we keep; duplicates are skipped.
      file_hash = sha256(fn)
      seen.append({
        'sha256': file_hash,
        'phash': phash,
        'fn': fn,
      })
  if opt_output_fn:
    write_json(seen, opt_output_fn)
  print("checked {} files, found {} unique".format(total, len(seen)))

def is_phash_new(fn, phash, seen, opt_threshold):
  """
  Decide whether `phash` is far enough from every hash already recorded.

  Each entry of `seen` is a dict with at least 'phash' and 'fn' keys. The
  distance to an entry is `entry['phash'] - phash` (presumably a hamming
  distance when the hashes come from compute_phash -- confirm against
  app.utils.im_utils). The first entry whose distance is strictly below
  `opt_threshold` is reported on stdout and makes the function return False;
  if no entry is that close, the function returns True.
  """
  # Lazily pair every known entry with its distance to the candidate so the
  # scan stops at the first near-duplicate, just like an early-exit loop.
  scored = ((known, known['phash'] - phash) for known in seen)
  hit = next((pair for pair in scored if pair[1] < opt_threshold), None)
  if hit is None:
    return True
  known, distance = hit
  print("{} === {} (diff: {})".format(fn, known['fn'], distance))
  return False