Diffstat (limited to 'check/app')

 check/app/server/api.py       |   6 +-
 check/app/server/static       |   1 +
 check/app/utils/file_utils.py | 452 ++++++
 check/app/utils/im_utils.py   |  14 +-

 4 files changed, 467 insertions, 6 deletions
diff --git a/check/app/server/api.py b/check/app/server/api.py
index 5c5b4bf..12d955b 100644
--- a/check/app/server/api.py
+++ b/check/app/server/api.py
@@ -6,6 +6,7 @@
 from flask import Blueprint, request, jsonify
 from PIL import Image
 
 # from app.utils.im_utils import pil2np
+from app.models.sql_factory import search_by_phash, add_phash
 sanitize_re = re.compile('[\W]+')
 valid_exts = ['.gif', '.jpg', '.jpeg', '.png']
@@ -19,8 +20,9 @@ def index():
   """API status noop"""
   return jsonify({ 'status': 'ok' })
 
-@api.route('/v1/match/')
-def match():
+@api.route('/v1/match/', methods=['POST'])
+def upload():
   """Search by image"""
+  start = time.time()
   # get threshold
   return jsonify({ 'status': 'ok' })
diff --git a/check/app/server/static b/check/app/server/static
new file mode 120000
index 0000000..382349a
--- /dev/null
+++ b/check/app/server/static
@@ -0,0 +1 @@
+../../static/
\ No newline at end of file
diff --git a/check/app/utils/file_utils.py b/check/app/utils/file_utils.py
new file mode 100644
index 0000000..5c7b39d
--- /dev/null
+++ b/check/app/utils/file_utils.py
@@ -0,0 +1,452 @@
+"""
+File utilities
+"""
+import sys
+import os
+from os.path import join
+import stat
+
+from glob import glob
+from pprint import pprint
+import shutil
+import distutils
+import pathlib
+from pathlib import Path
+import json
+import csv
+import pickle
+import threading
+from queue import Queue
+import time
+import logging
+import itertools
+import collections
+
+import hashlib
+import pymediainfo
+import click
+from tqdm import tqdm
+import cv2 as cv
+from PIL import Image
+import imutils
+
+from app.settings import app_cfg as cfg
+from app.settings import types
+
+log = logging.getLogger(cfg.LOGGER_NAME)
+
+
+# ------------------------------------------
+# File I/O read/write little helpers
+# ------------------------------------------
+
+def glob_multi(dir_in, exts, recursive=False):
+  files = []
+  for ext in exts:
+    if recursive:
+      fp_glob = join(dir_in, '**/*.{}'.format(ext))
+      log.info(f'glob {fp_glob}')
+      files += glob(fp_glob, recursive=True)
+    else:
+      fp_glob = join(dir_in, '*.{}'.format(ext))
+      files += glob(fp_glob)
+  return files
+
+
+def zpad(x, zeros=cfg.ZERO_PADDING):
+  return str(x).zfill(zeros)
+
+def get_ext(fpp, lower=True):
+  """Returns the file extension w/o dot
+  :param fpp: (pathlib.Path) filepath
+  :param lower: (bool) force lowercase
+  :returns: (str) file extension (ie 'jpg')
+  """
+  fpp = ensure_posixpath(fpp)
+  ext = fpp.suffix.replace('.', '')
+  return ext.lower() if lower else ext
+
+
+def convert(fp_in, fp_out):
+  """Converts between JSON and Pickle formats
+  Pickle files are about 30-40% smaller filesize
+  """
+  if get_ext(fp_in) == get_ext(fp_out):
+    log.error('Input {} and output {} have the same format. Nothing to convert.'.format(fp_in, fp_out))
+    return
+  lazywrite(lazyload(fp_in), fp_out)
+
+
+def load_csv(fp_in, as_list=True):
+  """Loads CSV and returns list of items
+  :param fp_in: string filepath to CSV
+  :returns: list of all CSV data
+  """
+  if not Path(fp_in).exists():
+    log.info('not found: {}'.format(fp_in))
+    return []
+  log.info('loading: {}'.format(fp_in))
+  with open(fp_in, 'r') as fp:
+    items = csv.DictReader(fp)
+    if as_list:
+      items = [x for x in items]
+  log.info('returning {:,} items'.format(len(items)))
+  return items
+
+def unfussy_csv_reader(reader):
+  """Yields CSV rows while ignoring possible data errors
+  :param reader: csv.reader used by load_csv_safe; rows with parse errors are skipped
+  """
+  while True:
+    try:
+      yield next(reader)
+    except StopIteration:
+      return
+    except csv.Error as e:
+      # log the problem and skip the bad row
+      log.warning('csv error: {}'.format(e))
+      continue
+
+def load_csv_safe(fp_in, keys=True, create=False):
+  """Loads a CSV while ignoring possible data errors
+  :param fp_in: string filepath to CSV file
+  :param keys: boolean, set to False if the first line is not a header row
+  :param create: boolean, set to True to return empty keys/values if the CSV does not exist
+  """
+  try:
+    with open(fp_in, 'r', newline='', encoding='utf-8') as f:
+      # reader = csv.reader( (line.replace('\0','') for line in f) )
+      reader = csv.reader(f)
+      lines = list(unfussy_csv_reader(reader))
+    if keys:
+      keys = lines[0]
+      lines = lines[1:]
+      return keys, lines
+    return lines
+  except:
+    if create:
+      if keys:
+        return {}, []
+      return []
+    raise
+
+def load_recipe(fp_in):
+  """Loads a JSON file as an object with properties accessible with dot syntax
+  :param fp_in: string filepath to JSON file
+  """
+  with open(fp_in) as fh:
+    return json.load(fh, object_hook=lambda d: collections.namedtuple('X', d.keys())(*d.values()))
+
+
+def lazywrite(data, fp_out, sort_keys=True):
+  """Writes JSON or Pickle data"""
+  ext = get_ext(fp_out)
+  if ext == 'json':
+    return write_json(data, fp_out, sort_keys=sort_keys)
+  elif ext == 'pkl':
+    return write_pickle(data, fp_out)
+  else:
+    raise NotImplementedError('[!] {} is not yet supported. Use .pkl or .json'.format(ext))
+
+
+def lazyload(fp_in, ordered=True):
+  """Loads JSON or Pickle serialized data"""
+  if not Path(fp_in).exists():
+    log.error('file does not exist: {}'.format(fp_in))
+    return {}
+  ext = get_ext(fp_in)
+  if ext == 'json':
+    items = load_json(fp_in)
+  elif ext == 'pkl':
+    items = load_pickle(fp_in)
+  else:
+    raise NotImplementedError('[!] {} is not yet supported. Use .pkl or .json'.format(ext))
+
+  if ordered:
+    return collections.OrderedDict(sorted(items.items(), key=lambda t: t[0]))
+  else:
+    return items
+
+
+def load_text(fp_in):
+  with open(fp_in, 'rt') as fp:
+    lines = fp.read().rstrip('\n').split('\n')
+  return lines
+
+def load_json(fp_in):
+  """Loads JSON and returns items
+  :param fp_in: (str) filepath
+  :returns: data from JSON
+  """
+  if not Path(fp_in).exists():
+    log.error('file does not exist: {}'.format(fp_in))
+    return {}
+  with open(str(fp_in), 'r') as fp:
+    data = json.load(fp)
+  return data
+
+
+def load_pickle(fp_in):
+  """Loads Pickle and returns items
+  :param fp_in: (str) filepath
+  :returns: data from Pickle
+  """
+  if not Path(fp_in).exists():
+    log.error('file does not exist: {}'.format(fp_in))
+    return {}
+  with open(str(fp_in), 'rb') as fp:
+    data = pickle.load(fp)
+  return data
+
+
+def order_items(records):
+  """Orders records by ASC SHA256"""
+  return collections.OrderedDict(sorted(records.items(), key=lambda t: t[0]))
+
+def write_text(data, fp_out, ensure_path=True):
+  if not data:
+    log.error('no data')
+    return
+
+  if ensure_path:
+    mkdirs(fp_out)
+  with open(fp_out, 'w') as fp:
+    if type(data) == list:
+      fp.write('\n'.join(data))
+    else:
+      fp.write(data)
+
+
+def write_pickle(data, fp_out, ensure_path=True):
+  """Writes Pickle file, optionally creating parent directories"""
+  if ensure_path:
+    mkdirs(fp_out)  # mkdir
+  with open(fp_out, 'wb') as fp:
+    pickle.dump(data, fp)
+
+
+def write_json(data, fp_out, minify=True, ensure_path=True, sort_keys=True, verbose=False):
+  """Writes JSON file, minified by default"""
+  if ensure_path:
+    mkdirs(fp_out)
+  with open(fp_out, 'w') as fp:
+    if minify:
+      json.dump(data, fp, separators=(',', ':'), sort_keys=sort_keys)
+    else:
+      json.dump(data, fp, indent=2, sort_keys=sort_keys)
+  if verbose:
+    log.info('Wrote JSON: {}'.format(fp_out))
+
+def write_csv(data, fp_out, header=None):
+  """Writes dict data as key,value rows with an optional header row"""
+  with open(fp_out, 'w') as fp:
+    writer = csv.writer(fp)
+    if header:
+      writer.writerow(header)
+    if type(data) is dict:
+      for k, v in data.items():
+        writer.writerow([k, v])
+
+
+def write_serialized_items(items, fp_out, ensure_path=True, minify=True, sort_keys=True):
+  """Writes serialized data
+  :param items: (dict) a sha256 dict of MappingItems
+  :param ensure_path: ensure the parent directories exist
+  :param minify: reduces JSON file size
+  """
+  log.info('Writing serialized data...')
+  fpp_out = ensure_posixpath(fp_out)
+  serialized_items = {k: v.serialize() for k, v in tqdm(items.items())}
+  # write data
+  ext = get_ext(fpp_out)
+  if ext == 'json':
+    write_json(serialized_items, fp_out, ensure_path=ensure_path, minify=minify, sort_keys=sort_keys)
+  elif ext == 'pkl':
+    write_pickle(serialized_items, fp_out)
+  else:
+    raise NotImplementedError('[!] {} is not yet supported. Use .pkl or .json'.format(ext))
+  log.info('Wrote {:,} items to {}'.format(len(items), fp_out))
+
+
+def write_modeled_data(data, fp_out, ensure_path=False):
+  """Writes modeled data as Pickle"""
+  fpp_out = ensure_posixpath(fp_out)
+  if ensure_path:
+    mkdirs(fpp_out)
+  ext = get_ext(fpp_out)
+  if ext == 'pkl':
+    write_pickle(data, str(fp_out))
+  else:
+    raise NotImplementedError('[!] {} is not yet supported. Use .pkl or .json'.format(ext))
+
+
+# ---------------------------------------------------------------------
+# Filepath utilities
+# ---------------------------------------------------------------------
+
+def ensure_posixpath(fp):
+  """Ensures filepath is pathlib.Path
+  :param fp: a (str, LazyFile, PosixPath)
+  :returns: a PosixPath filepath object
+  """
+  if type(fp) == str:
+    fpp = Path(fp)
+  elif type(fp) == click.utils.LazyFile:
+    fpp = Path(fp.name)
+  elif type(fp) == pathlib.PosixPath:
+    fpp = fp
+  else:
+    raise TypeError('{} is not a valid filepath type'.format(type(fp)))
+  return fpp
+
+
+def mkdirs(fp):
+  """Ensure parent directories exist for a filepath
+  :param fp: string, Path, or click.File
+  """
+  fpp = ensure_posixpath(fp)
+  fpp = fpp.parent if fpp.suffix else fpp
+  fpp.mkdir(parents=True, exist_ok=True)
+
+
+def ext_media_format(ext):
+  """Converts file extension into Enum MediaType
+  :param ext: (str) file extension
+  """
+  for media_format, exts in cfg.VALID_MEDIA_EXTS.items():
+    if ext in exts:
+      return media_format
+  raise ValueError('{} is not a valid option'.format(ext))
+
+
+def sha256(fp_in, block_size=65536):
+  """Generates SHA256 hash for a file
+  :param fp_in: (str) filepath
+  :param block_size: (int) byte size of block
+  :returns: (str) hash
+  """
+  sha256 = hashlib.sha256()
+  with open(fp_in, 'rb') as fp:
+    for block in iter(lambda: fp.read(block_size), b''):
+      sha256.update(block)
+  return sha256.hexdigest()
+
+
+def sha256_tree(sha256):
+  """Split hash into branches with tree-depth for faster file indexing
+  :param sha256: str of a sha256 hash
+  :returns: str with sha256 tree with '/' delimiter
+  """
+  branch_size = cfg.HASH_BRANCH_SIZE
+  tree_size = cfg.HASH_TREE_DEPTH * branch_size
+  sha256_tree = [sha256[i:(i + branch_size)] for i in range(0, tree_size, branch_size)]
+  return '/'.join(sha256_tree)
+
+
+def migrate(fmaps, threads=1, action='copy', force=False):
+  """Copy/move/symlink files from src to dst directory
+  :param fmaps: (list) of dicts with 'src' and 'dst' filepaths
+  :param threads: (int) number of threads
+  :param action: (str) copy/move/symlink
+  :param force: (bool) force overwrite existing files
+  """
+  num_items = len(fmaps)
+
+  def copytree(src, dst, symlinks=False, ignore=None):
+    # ozxyqk: https://stackoverflow.com/questions/22588225/how-do-you-merge-two-directories-or-move-with-replace-from-the-windows-command
+    if not os.path.exists(dst):
+      mkdirs(dst)
+      # os.makedirs(dst)
+      shutil.copystat(src, dst)
+    lst = os.listdir(src)
+    if ignore:
+      excl = ignore(src, lst)
+      lst = [x for x in lst if x not in excl]
+    for item in lst:
+      s = os.path.join(src, item)
+      d = os.path.join(dst, item)
+      if symlinks and os.path.islink(s):
+        if os.path.exists(d):
+          os.remove(d)
+        os.symlink(os.readlink(s), d)
+        try:
+          st = os.lstat(s)
+          mode = stat.S_IMODE(st.st_mode)
+          os.lchmod(d, mode)
+        except:
+          pass  # lchmod not available
+      elif os.path.isdir(s):
+        copytree(s, d, symlinks, ignore)
+      else:
+        shutil.copy(s, d)
+
+  assert(action in ['copy', 'move', 'symlink'])
+
+  if threads > 1:
+    # threaded
+    task_queue = Queue()
+    print_lock = threading.Lock()
+
+    def migrate_action(fmap):
+      data_local = threading.local()
+      data_local.src, data_local.dst = (fmap['src'], fmap['dst'])
+      data_local.src_path = Path(data_local.src)
+      data_local.dst_path = Path(data_local.dst)
+
+      if force or not data_local.dst_path.exists():
+        if action == 'copy':
+          shutil.copy(data_local.src, data_local.dst)
+          # if data_local.src_path.is_dir():
+          #   copytree(data_local.src, data_local.dst)
+          # else:
+        elif action == 'move':
+          shutil.move(data_local.src, data_local.dst)
+        elif action == 'symlink':
+          if force:
+            data_local.dst_path.unlink()
+          data_local.dst_path.symlink_to(data_local.src)
+
+    def process_queue(num_items):
+      # TODO: progress bar
+      while True:
+        fmap = task_queue.get()
+        migrate_action(fmap)
+        log.info('migrate: {:.2f} {:,}/{:,}'.format(
+          (task_queue.qsize() / num_items) * 100, task_queue.qsize(), num_items))
+        task_queue.task_done()
+
+    # avoid race conditions by creating dir structure here
+    log.info('create directory structure')
+    for fmap in tqdm(fmaps):
+      mkdirs(fmap['dst'])
+
+    # init threads
+    for i in range(threads):
+      t = threading.Thread(target=process_queue, args=(num_items,))
+      t.daemon = True
+      t.start()
+
+    # process threads
+    start = time.time()
+    for fmap in fmaps:
+      task_queue.put(fmap)
+
+    task_queue.join()
+
+  else:
+    # non-threaded
+    for fmap in tqdm(fmaps):
+      mkdirs(fmap['dst'])
+      if action == 'copy':
+        shutil.copy(fmap['src'], fmap['dst'])
+      elif action == 'move':
+        shutil.move(fmap['src'], fmap['dst'])
+      elif action == 'symlink':
+        if force:
+          Path(fmap['dst']).unlink()
+        Path(fmap['dst']).symlink_to(fmap['src'])
+  return
diff --git a/check/app/utils/im_utils.py b/check/app/utils/im_utils.py
index eae7bc1..dfd5739 100644
--- a/check/app/utils/im_utils.py
+++ b/check/app/utils/im_utils.py
@@ -208,16 +208,22 @@ def compute_phash(im):
   """
   return imagehash.phash(ensure_pil(im))
 
-def compute_phash_int(im):
+def phash2int(phash):
   """Compute perceptual hash using ImageHash library and convert to binary
-  :param im: Numpy.ndarray
-  :returns: ImageHash as binary
+  :param phash: imagehash.ImageHash
+  :returns: binary-encoded bigint
+  """
-  phash = imagehash.phash(ensure_pil(im))
   phash.hash[-1] = False
   phash_as_bigint = struct.unpack('Q', numpy.packbits(phash.hash))[0]
   return phash_as_bigint
 
+def compute_phash_int(im):
+  """Compute perceptual hash using ImageHash library and convert to binary
+  :param im: Numpy.ndarray
+  :returns: binary-encoded bigint
+  """
+  return phash2int(compute_phash(im))
+
 def compute_dhash(im):
   """Compute difference hash using ImageHash library
   :param im: Numpy.ndarray
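
For reference, the hash helpers introduced above can be exercised outside the Flask app roughly as follows. This is a minimal standalone sketch, not part of the diff: the image path is a placeholder, and the commented-out search_by_phash call only refers to the import added in api.py above; the packing logic mirrors phash2int() from im_utils.py.

import struct
import numpy
import imagehash
from PIL import Image

# Compute the perceptual hash of an image (PIL input), as compute_phash() does
phash = imagehash.phash(Image.open('example.jpg'))  # 'example.jpg' is a placeholder path

# Convert to a 64-bit integer, mirroring phash2int(): zero the last row of bits,
# pack the 8x8 boolean hash into 8 bytes, and read them back as an unsigned long
phash.hash[-1] = False
phash_bigint = struct.unpack('Q', numpy.packbits(phash.hash))[0]
print(phash_bigint)

# The POST /v1/match/ handler would presumably look this value up via
# search_by_phash(phash_bigint) from app.models.sql_factory (not shown in this diff)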