1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
"""
Import a CSV of URLs
"""
import click
import os
import glob
import io
import random
from PIL import Image
from app.models.sql_factory import add_phash
from app.utils.im_utils import compute_phash_int
from app.utils.file_utils import load_csv, sha256_stream
from app.utils.process_utils import parallelize
from app.server.api import fetch_url
@click.command()
@click.option('-i', '--input', 'opt_input_fn',
              required=True,
              help="Input path to CSV")
@click.option('-b', '--base_href', 'opt_base_href',
              required=False,
              default="",
              help="Base href, default is empty string")
@click.option('-e', '--field', 'opt_field',
              required=False,
              default="address",
              help="Field in CSV containing URL")
@click.pass_context
def cli(ctx, opt_input_fn, opt_base_href, opt_field):
    """
    Import images listed by URL in a CSV, storing their hashes
    """
    # Only URLs ending in one of these extensions are fetched.
    allowed_exts = ('.gif', '.jpg', '.jpeg', '.png')

    def add_url(url):
        """Fetch one image URL and record its sha256 and perceptual hash.

        URLs with an unsupported extension, and URLs whose fetch fails,
        are skipped silently (best-effort import).
        """
        _, ext = os.path.splitext(url)
        # Compare case-insensitively so e.g. ".JPG" is accepted; the stored
        # extension stays lower-case, as it always was for accepted URLs.
        ext = ext.lower()
        if ext not in allowed_exts:
            return
        ext = ext[1:]  # drop the leading dot
        try:
            # fetch_url is unpacked into the raw payload and an image object
            # -- presumably bytes and a PIL image; confirm in app.server.api.
            raw, im = fetch_url(url)
        except Exception:
            # Unreachable or invalid URLs are skipped on purpose. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            return
        print(url)
        phash = compute_phash_int(im)
        sha256 = sha256_stream(io.BytesIO(raw))
        add_phash(sha256=sha256, phash=phash, ext=ext, url=url)

    rows = load_csv(opt_input_fn)
    # Read the URL from the column chosen with --field (default "address")
    # rather than a hard-coded column name.
    urls = [opt_base_href + row[opt_field] for row in rows]
    # Randomize the processing order before handing the work off.
    random.shuffle(urls)
    parallelize(urls, add_url)
|