diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-04-29 01:36:27 +0200 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-04-29 01:36:27 +0200 |
| commit | 86e34a1bc43d1995e20c52fa639412c46105d400 (patch) | |
| tree | 35623620556e6cfac2ca67e2b6f4f61cd2329e2a /check/commands/phash | |
| parent | dbfaa9024c844dd5c14259c858564e32149afd87 (diff) | |
Diffstat (limited to 'check/commands/phash')
| -rw-r--r-- | check/commands/phash/dedupe.py | 5 | ||||
| -rw-r--r-- | check/commands/phash/import_csv.py | 54 |
2 files changed, 58 insertions, 1 deletions
```diff
diff --git a/check/commands/phash/dedupe.py b/check/commands/phash/dedupe.py
index 28266f4..6b8194b 100644
--- a/check/commands/phash/dedupe.py
+++ b/check/commands/phash/dedupe.py
@@ -17,7 +17,7 @@ from app.utils.file_utils import write_json, sha256
   help="Input glob to search -- e.g. '../docs/images/*.jpg'")
 @click.option('-o', '--output', 'opt_output_fn',
   required=False,
-  help="Input glob to search -- e.g. '../docs/images/*.jpg'")
+  help="Output filename")
 @click.option('-t', '--threshold', 'opt_threshold',
   required=True,
   default=6,
@@ -36,10 +36,13 @@ def cli(ctx, opt_input_glob, opt_output_fn, opt_threshold):
     phash = compute_phash(im)
     if is_phash_new(fn, phash, seen, opt_threshold):
       hash = sha256(fn)
+      fpart, ext = os.path.splitext(fn)
+      ext = ext[1:]
       seen.append({
         'sha256': hash,
         'phash': phash,
         'fn': fn,
+        'ext': ext,
       })
   if opt_output_fn:
     write_json(seen, opt_output_fn)
diff --git a/check/commands/phash/import_csv.py b/check/commands/phash/import_csv.py
new file mode 100644
index 0000000..5e09aa8
--- /dev/null
+++ b/check/commands/phash/import_csv.py
@@ -0,0 +1,54 @@
+"""
+Import a CSV of URLs
+"""
+
+import click
+import os
+import glob
+import io
+import random
+
+from PIL import Image
+
+from app.models.sql_factory import add_phash
+from app.utils.im_utils import compute_phash_int
+from app.utils.file_utils import load_csv, sha256_stream
+from app.utils.process_utils import parallelize
+from app.server.api import fetch_url
+
+@click.command()
+@click.option('-i', '--input', 'opt_input_fn',
+  required=True,
+  help="Input path to CSV")
+@click.option('-b', '--base_href', 'opt_base_href',
+  required=False,
+  default="",
+  help="Base href, default is empty string")
+@click.option('-e', '--field', 'opt_field',
+  required=False,
+  default="address",
+  help="Field in CSV containing URL")
+@click.pass_context
+def cli(ctx, opt_input_fn, opt_base_href, opt_field):
+  """
+  Import a folder of images, deduping them first
+  """
+  def add_url(url):
+    fname, ext = os.path.splitext(url)
+    if ext not in ['.gif', '.jpg', '.jpeg', '.png']:
+      return
+    ext = ext[1:]
+    try:
+      raw, im = fetch_url(url)
+    except:
+      # print('404 {}'.format(url))
+      return
+    print(url)
+    phash = compute_phash_int(im)
+    hash = sha256_stream(io.BytesIO(raw))
+    add_phash(sha256=hash, phash=phash, ext=ext, url=url)
+
+  rows = load_csv(opt_input_fn)
+  urls = [opt_base_href + row['address'] for row in rows]
+  random.shuffle(urls)
+  parallelize(urls, add_url)
```
