diff options
Diffstat (limited to 'check/commands/phash/import_csv.py')
| -rw-r--r-- | check/commands/phash/import_csv.py | 54 |
1 files changed, 54 insertions, 0 deletions
"""
Import a CSV of URLs
"""

import click
import os
import glob
import io
import random

from PIL import Image

from app.models.sql_factory import add_phash
from app.utils.im_utils import compute_phash_int
from app.utils.file_utils import load_csv, sha256_stream
from app.utils.process_utils import parallelize
from app.server.api import fetch_url


@click.command()
@click.option('-i', '--input', 'opt_input_fn',
  required=True,
  help="Input path to CSV")
@click.option('-b', '--base_href', 'opt_base_href',
  required=False,
  default="",
  help="Base href, default is empty string")
@click.option('-e', '--field', 'opt_field',
  required=False,
  default="address",
  help="Field in CSV containing URL")
@click.pass_context
def cli(ctx, opt_input_fn, opt_base_href, opt_field):
  """
  Import a CSV of image URLs, fetching each one and storing its hashes
  """
  # Parameters (all supplied by click):
  #   ctx           -- click context; not used by this command.
  #   opt_input_fn  -- path of the CSV file handed to load_csv.
  #   opt_base_href -- string prepended to every URL value read from the CSV.
  #   opt_field     -- name of the CSV column holding the URL.

  # Extensions (leading dot included, as os.path.splitext returns them)
  # that are accepted for import.
  # NOTE(review): the match is case-sensitive and a URL carrying a query
  # string (e.g. "a.jpg?w=100") is skipped -- confirm whether that is intended.
  allowed_exts = ('.gif', '.jpg', '.jpeg', '.png')

  def add_url(url):
    """
    Fetch one URL and record its sha256 / perceptual hash via add_phash.

    URLs whose extension is not in allowed_exts, or whose fetch raises,
    are skipped silently (best-effort import).
    """
    ext = os.path.splitext(url)[1]
    if ext not in allowed_exts:
      return
    ext = ext[1:]  # drop the leading dot before storing
    try:
      raw, im = fetch_url(url)
    except Exception:
      # Best-effort: an unreachable or invalid URL must not abort the whole
      # import. Exception (not a bare except) so Ctrl-C still interrupts.
      return
    print(url)
    phash = compute_phash_int(im)
    sha256 = sha256_stream(io.BytesIO(raw))
    add_phash(sha256=sha256, phash=phash, ext=ext, url=url)

  rows = load_csv(opt_input_fn)
  # Honour --field instead of a hard-coded column name; the option's default
  # ("address") keeps existing invocations working unchanged.
  urls = [opt_base_href + row[opt_field] for row in rows]
  # Randomise the processing order before handing the list to parallelize.
  random.shuffle(urls)
  parallelize(urls, add_url)
