summaryrefslogtreecommitdiff
path: root/check/commands/phash/import.py
blob: 5e8dc6ff1847e3e1bdbded01733596e4b2b28b5f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Import a folder of images, deduping them first
"""

import click
import os
import glob

from PIL import Image

from app.models.sql_factory import add_phash
from app.utils.im_utils import compute_phash, phash2int
from app.utils.file_utils import write_json, sha256

# File extensions (leading dot included) that the import command accepts.
# `cli` compares the result of os.path.splitext() against this list with an
# exact match, so the check is case-sensitive.
valid_exts = ['.gif', '.jpg', '.jpeg', '.png']

@click.command()
@click.option('-i', '--input', 'opt_input_glob',
  required=True,
  help="Input glob to search -- e.g. 'static/sample_set_test_01/images/*'")
@click.option('-t', '--threshold', 'opt_threshold',
  required=True,
  default=6,
  type=int,
  help="Threshold for hamming distance comparison (0-64, default=6)")
@click.pass_context
def cli(ctx, opt_input_glob, opt_threshold):
  """
  Import a folder of images, deduping them first
  """
  # The docstring above is left untouched on purpose: click uses it as the
  # command's --help text, so implementation notes live in comments instead.
  #
  # Flow:
  #   1. Refuse globs that do not start with 'static/'.
  #   2. Visit the matched paths in sorted order, skipping anything whose
  #      extension is not listed in `valid_exts`.
  #   3. Compute a perceptual hash per image and compare it with the hashes
  #      already accepted during this run (`is_phash_new`).
  #   4. For each image judged new, record it in `seen` and store it through
  #      `add_phash` with its sha256, integer phash, extension and URL.
  #
  # Only hashes accepted during this invocation are compared; nothing here
  # reads previously stored hashes back.
  # `ctx` is supplied by @click.pass_context and is not used in this body.
  if not opt_input_glob.startswith('static/'):
    print("Please move your files into the static folder to make them accessible")
    return

  seen = []   # one dict per accepted image: sha256, phash, fn
  total = 0   # number of files with an accepted extension

  for fn in sorted(glob.iglob(os.path.expanduser(opt_input_glob))):
    ext = os.path.splitext(fn)[1]
    # NOTE(review): exact match, so upper-case extensions such as '.JPG' are
    # skipped -- confirm whether that is intended.
    if ext not in valid_exts:
      continue
    total += 1

    # Close the source file as soon as the pixels are converted; convert()
    # returns a separate image, so `im` stays usable after the block exits.
    with Image.open(fn) as src:
      im = src.convert('RGB')

    phash = compute_phash(im)
    if not is_phash_new(fn, phash, seen, opt_threshold):
      continue

    digest = sha256(fn)
    seen.append({
      'sha256': digest,
      'phash': phash,
      'fn': fn,
    })
    # The extension is stored without its leading dot; the URL is the
    # matched path prefixed with '/'.
    add_phash(sha256=digest, phash=phash2int(phash), ext=ext[1:], url='/' + fn)

  print("checked {} files, found {} unique".format(total, len(seen)))

def is_phash_new(fn, phash, seen, opt_threshold):
  """
  Tell whether `phash` differs enough from every hash recorded in `seen`.

  For each entry in `seen` (dicts carrying a 'phash' key) the difference
  `entry['phash'] - phash` is taken; if any difference is strictly below
  `opt_threshold` the hash counts as already seen and False is returned.
  An empty `seen` always yields True.

  `fn` is accepted but not used.

  NOTE(review): presumably the subtraction yields a hamming distance for the
  hash objects produced by compute_phash -- confirm against that helper.
  """
  has_close_match = any(
    (entry['phash'] - phash) < opt_threshold
    for entry in seen
  )
  return not has_close_match