path: root/check/commands/phash/import_csv.py
"""
Import a CSV of URLs
"""

import io
import os
import random

import click

from app.models.sql_factory import add_phash
from app.utils.im_utils import compute_phash_int
from app.utils.file_utils import load_csv, sha256_stream
from app.utils.process_utils import parallelize
from app.server.api import fetch_url

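# The imported helpers are project-internal; the signatures below are
# assumptions inferred from how they are used in this script:
#   fetch_url(url)           -> (raw_bytes, PIL.Image.Image)
#   compute_phash_int(im)    -> perceptual hash of a PIL image, as an int
#   sha256_stream(fileobj)   -> SHA-256 digest of a file-like object
#   load_csv(path)           -> iterable of dict rows (csv.DictReader-style)
#   parallelize(items, fn)   -> apply fn to each item across a worker pool
#   add_phash(**kw)          -> insert a (sha256, phash, ext, url) record
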
@click.command()
@click.option('-i', '--input', 'opt_input_fn',
  required=True,
  help="Input path to CSV")
@click.option('-b', '--base_href', 'opt_base_href',
  required=False,
  default="",
  help="Base href prepended to each URL (default: empty string)")
@click.option('-e', '--field', 'opt_field',
  required=False,
  default="address",
  help="Field in CSV containing URL")
@click.pass_context
def cli(ctx, opt_input_fn, opt_base_href, opt_field):
  """
  Import a folder of images, deduping them first
  """
  def add_url(url):
    # Only handle URLs that end in a known image extension
    _, ext = os.path.splitext(url)
    if ext.lower() not in ['.gif', '.jpg', '.jpeg', '.png']:
      return
    ext = ext[1:].lower()  # drop the leading dot
    try:
      # fetch_url returns the raw bytes and a decoded PIL image
      raw, im = fetch_url(url)
    except Exception:
      # skip URLs that fail to download or decode (e.g. 404s)
      return
    print(url)
    # perceptual hash for near-duplicate matching, SHA-256 for exact identity
    phash = compute_phash_int(im)
    sha256 = sha256_stream(io.BytesIO(raw))
    add_phash(sha256=sha256, phash=phash, ext=ext, url=url)

  # Build the URL list from the configured CSV column, then process it in
  # parallel in randomized order
  rows = load_csv(opt_input_fn)
  urls = [opt_base_href + row[opt_field] for row in rows]
  random.shuffle(urls)
  parallelize(urls, add_url)
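
# For reference, a minimal sketch of what parallelize() is assumed to do:
# fan the items out to a small worker pool and apply fn to each one. The
# real helper lives in app.utils.process_utils and may differ (process-based
# workers, progress reporting, configurable pool size, etc.):
#
#   from multiprocessing.dummy import Pool  # thread-based pool
#
#   def parallelize(items, fn, num_workers=8):
#     with Pool(num_workers) as pool:
#       pool.map(fn, items)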