summary | refs | log | tree | commit | diff
path: root/check/commands/phash/import_csv.py
diff options
context:
space:
mode:
Diffstat (limited to 'check/commands/phash/import_csv.py')
-rw-r--r-- check/commands/phash/import_csv.py 54
1 files changed, 54 insertions, 0 deletions
diff --git a/check/commands/phash/import_csv.py b/check/commands/phash/import_csv.py
new file mode 100644
index 0000000..5e09aa8
--- /dev/null
+++ b/check/commands/phash/import_csv.py
@@ -0,0 +1,54 @@
+"""
+Import a CSV of URLs
+"""
+
+import click
+import os
+import glob
+import io
+import random
+
+from PIL import Image
+
+from app.models.sql_factory import add_phash
+from app.utils.im_utils import compute_phash_int
+from app.utils.file_utils import load_csv, sha256_stream
+from app.utils.process_utils import parallelize
+from app.server.api import fetch_url
+
+@click.command()
+@click.option('-i', '--input', 'opt_input_fn',
+              required=True,
+              help="Input path to CSV")
+@click.option('-b', '--base_href', 'opt_base_href',
+              required=False,
+              default="",
+              help="Base href, default is empty string")
+@click.option('-e', '--field', 'opt_field',
+              required=False,
+              default="address",
+              help="Field in CSV containing URL")
+@click.pass_context
+def cli(ctx, opt_input_fn, opt_base_href, opt_field):
+    """
+    Import images listed by URL in a CSV file
+    """
+    # NOTE: the docstring above doubles as the command's --help text, so it
+    # is kept short; details live in comments instead.
+    #
+    # ctx           -- click context injected by @click.pass_context (unused)
+    # opt_input_fn  -- path of the CSV file handed to load_csv
+    # opt_base_href -- string prepended to every value read from the CSV
+    # opt_field     -- name of the CSV column holding the URL
+
+    # A URL is only fetched when os.path.splitext() yields exactly one of
+    # these suffixes. The comparison is case-sensitive and is done on the
+    # raw URL string, so e.g. '.JPG' or '.jpg?x=1' are skipped.
+    allowed_exts = ('.gif', '.jpg', '.jpeg', '.png')
+
+    def add_url(url):
+        """Fetch one URL, hash its contents and record it via add_phash."""
+        ext = os.path.splitext(url)[1]
+        if ext not in allowed_exts:
+            return
+        try:
+            raw, im = fetch_url(url)
+        except Exception:
+            # Best effort: a URL that cannot be fetched is skipped silently.
+            # Catching Exception rather than using a bare except lets
+            # KeyboardInterrupt / SystemExit still abort the import.
+            return
+        print(url)
+        phash = compute_phash_int(im)
+        # Named sha256 (not hash) to avoid shadowing the builtin.
+        sha256 = sha256_stream(io.BytesIO(raw))
+        # The extension is stored without its leading dot.
+        add_phash(sha256=sha256, phash=phash, ext=ext[1:], url=url)
+
+    rows = load_csv(opt_input_fn)
+    # Read the URL from the column selected with --field; this used to be
+    # hard-coded to 'address', which made the option a no-op.
+    urls = [opt_base_href + row[opt_field] for row in rows]
+    # Process the URLs in random order.
+    # NOTE(review): presumably to avoid hitting one host with consecutive
+    # requests -- confirm.
+    random.shuffle(urls)
+    parallelize(urls, add_url)