"""
Import a CSV of URLs
"""

import click
import os
import glob
import io
import random

from PIL import Image

from app.models.sql_factory import add_phash
from app.utils.im_utils import compute_phash_int
from app.utils.file_utils import load_csv, sha256_stream
from app.utils.process_utils import parallelize
from app.server.api import fetch_url


@click.command()
@click.option('-i', '--input', 'opt_input_fn',
  required=True,
  help="Input path to CSV")
@click.option('-b', '--base_href', 'opt_base_href',
  required=False,
  default="",
  help="Base href, default is empty string")
@click.option('-e', '--field', 'opt_field',
  required=False,
  default="address",
  help="Field in CSV containing URL")
@click.pass_context
def cli(ctx, opt_input_fn, opt_base_href, opt_field):
  """
  Import images listed in a CSV of URLs, hashing each one
  """
  # NOTE: click prints the docstring above as the command's --help text, so
  # parameter details are kept in comments rather than in the docstring.
  #
  # ctx           -- click context (unused here; supplied by @click.pass_context).
  # opt_input_fn  -- path of the CSV file handed to load_csv().
  # opt_base_href -- string prepended to every URL value read from the CSV.
  # opt_field     -- name of the CSV column holding the URL (default "address").
  #
  # A row that lacks the opt_field column raises KeyError while the URL list
  # is being built.

  # Only URLs whose extension matches one of these exactly (case-sensitive)
  # are imported; anything else is skipped without being fetched.
  image_exts = ('.gif', '.jpg', '.jpeg', '.png')

  def add_url(url):
    """
    Fetch one image URL, hash it, and record it via add_phash().

    URLs with a non-image extension, and URLs that fail to fetch, are
    skipped silently.
    """
    ext = os.path.splitext(url)[1]
    if ext not in image_exts:
      return
    try:
      raw, im = fetch_url(url)
    except Exception:
      # Best-effort import: a URL that cannot be fetched (e.g. a 404) is
      # skipped. Catching Exception rather than using a bare except lets
      # KeyboardInterrupt / SystemExit still stop the run.
      return
    print(url)
    phash = compute_phash_int(im)
    # sha256_stream takes a stream, so wrap the fetched payload
    # (presumably raw bytes -- confirm against fetch_url).
    sha256 = sha256_stream(io.BytesIO(raw))
    # The extension is stored without its leading dot.
    add_phash(sha256=sha256, phash=phash, ext=ext[1:], url=url)

  rows = load_csv(opt_input_fn)
  # Honour --field: read the URL from the requested column instead of a
  # hard-coded 'address' key.
  urls = [opt_base_href + row[opt_field] for row in rows]
  # Randomise the processing order before handing the list to parallelize().
  random.shuffle(urls)
  parallelize(urls, add_url)