summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- check/app/models/sql_factory.py 12
-rw-r--r-- check/app/server/api.py 59
-rw-r--r-- check/app/utils/file_utils.py 10
-rw-r--r-- check/commands/phash/dedupe.py 2
-rw-r--r-- check/commands/phash/query.py 42
-rw-r--r-- check/commands/phash/test.py 49
-rw-r--r-- check/static/demo.html 7
-rw-r--r-- check/static/index.html 2
-rw-r--r-- docs/specifications.md 2
9 files changed, 124 insertions, 61 deletions
diff --git a/check/app/models/sql_factory.py b/check/app/models/sql_factory.py
index 1d32a68..68c2e30 100644
--- a/check/app/models/sql_factory.py
+++ b/check/app/models/sql_factory.py
@@ -44,11 +44,17 @@ class FileTable(Base):
Base.metadata.create_all(engine)
-def search_by_phash(phash, threshold=6):
+def search_by_phash(phash, threshold=6, limit=1):
"""Search files for a particular phash"""
connection = engine.connect()
- cmd = 'SELECT files.*, BIT_COUNT(phash ^ :phash) as hamming_distance FROM files HAVING hamming_distance < :threshold ORDER BY hamming_distance ASC LIMIT 1'
- matches = connection.execute(text(cmd), phash=phash, threshold=threshold).fetchall()
+ cmd = """
+ SELECT files.*, BIT_COUNT(phash ^ :phash)
+ AS hamming_distance FROM files
+ HAVING hamming_distance < :threshold
+ ORDER BY hamming_distance ASC
+ LIMIT :limit
+ """
+ matches = connection.execute(text(cmd), phash=phash, threshold=threshold, limit=limit).fetchall()
keys = ('id', 'sha256', 'phash', 'ext', 'score')
results = [ dict(zip(keys, values)) for values in matches ]
return results
diff --git a/check/app/server/api.py b/check/app/server/api.py
index c4f9f80..322d899 100644
--- a/check/app/server/api.py
+++ b/check/app/server/api.py
@@ -2,11 +2,13 @@ import os
import re
import time
import numpy as np
+import logging
from flask import Blueprint, request, jsonify
from PIL import Image
from app.models.sql_factory import search_by_phash, add_phash
-from app.utils.im_utils import pil2np
+from app.utils.im_utils import compute_phash_int
+from app.utils.file_utils import sha256_stream
sanitize_re = re.compile('[\W]+')
valid_exts = ['.gif', '.jpg', '.jpeg', '.png']
@@ -22,29 +24,72 @@ def index():
"""
return jsonify({ 'status': 'ok' })
-@api.route('/v1/match/', methods=['POST'])
-def upload():
+@api.route('/v1/match', methods=['POST'])
+def match():
"""
Search by uploading an image
"""
start = time.time()
+ logging.debug(start)
- file = request.files['query_img']
+ file = request.files['q']
fn = file.filename
if fn.endswith('blob'): # FIX PNG IMAGES?
fn = 'filename.jpg'
+ logging.debug(fn)
basename, ext = os.path.splitext(fn)
if ext.lower() not in valid_exts:
return jsonify({
+ 'success': False,
+ 'match': False,
'error': 'not_an_image'
})
+ ext = ext[1:].lower()
+
im = Image.open(file.stream).convert('RGB')
phash = compute_phash_int(im)
- threshold = request.args.get('threshold') || 6
+ logging.debug(phash)
+ try:
+ threshold = int(request.args.get('threshold') or 6)
+ limit = int(request.args.get('limit') or 1)
+ add = str(request.args.get('add') or 'true') == 'true'
+ except:
+ return jsonify({
+ 'success': False,
+ 'match': False,
+ 'error': 'param_error'
+ })
+
+ results = search_by_phash(phash=phash, threshold=threshold, limit=limit)
- res = search_by_phash(phash, threshold)
+ if len(results) == 0:
+ if add:
+ hash = sha256_stream(file)
+ add_phash(sha256=hash, phash=phash, ext=ext)
+ if limit == 1:
+ return jsonify({
+ 'success': True,
+ 'match': False,
+ })
+ else:
+ return jsonify({
+ 'success': True,
+ 'match': False,
+ 'results': [],
+ })
+
+ if limit > 1:
+ return jsonify({
+ 'success': True,
+ 'match': True,
+ 'results': results,
+ })
- return jsonify({ 'res': res })
+ return jsonify({
+ 'success': True,
+ 'match': True,
+ 'closest_match': results[0],
+ })
diff --git a/check/app/utils/file_utils.py b/check/app/utils/file_utils.py
index 1ed1833..a185cf4 100644
--- a/check/app/utils/file_utils.py
+++ b/check/app/utils/file_utils.py
@@ -352,6 +352,16 @@ def sha256(fp_in, block_size=65536):
sha256.update(block)
return sha256.hexdigest()
+def sha256_stream(stream, block_size=65536):
+ """Generates SHA256 hash for a file stream (from Flask)
+ :param stream: (FileStream) stream object
+ :param block_size: (int) byte size of block
+ :returns: (str) hash
+ """
+ sha256 = hashlib.sha256()
+ for block in iter(lambda: stream.read(block_size), b''):
+ sha256.update(block)
+ return sha256.hexdigest()
def sha256_tree(sha256):
"""Split hash into branches with tree-depth for faster file indexing
diff --git a/check/commands/phash/dedupe.py b/check/commands/phash/dedupe.py
index 3cf60d4..2e99f62 100644
--- a/check/commands/phash/dedupe.py
+++ b/check/commands/phash/dedupe.py
@@ -1,5 +1,5 @@
"""
-Dedupe a folder of images
+Dedupe a folder of images (uses phash directly, does not use database)
"""
import click
diff --git a/check/commands/phash/query.py b/check/commands/phash/query.py
index 8fc8c61..7fe2ae3 100644
--- a/check/commands/phash/query.py
+++ b/check/commands/phash/query.py
@@ -1,9 +1,10 @@
"""
-Search the database for an image
+Query the database with a test set
"""
import click
import os
+import glob
from PIL import Image
@@ -12,34 +13,29 @@ from app.utils.im_utils import compute_phash_int
from app.utils.file_utils import sha256
@click.command()
-@click.option('-i', '--input', 'opt_fn',
+@click.option('-i', '--input', 'opt_input_glob',
required=True,
- help="File to search")
+ help="Input glob to search -- e.g. '../docs/images/*.jpg'")
@click.pass_context
-def cli(ctx, opt_fn):
+def cli(ctx, opt_input_glob):
"""
- Search the database for an image
+ Query the database with a test set
"""
- print('Searching for a file...')
+ for fn in sorted(glob.iglob(opt_input_glob)):
+ im = Image.open(fn).convert('RGB')
+ phash = compute_phash_int(im)
- if not os.path.exists(opt_fn):
- print("File does not exist")
- return
+ hash = sha256(fn)
- im = Image.open(opt_fn).convert('RGB')
- phash = compute_phash_int(im)
+ phash_match = search_by_phash(phash)
+ hash_match = search_by_hash(hash)
- hash = sha256(opt_fn)
+ hash_result = 'NO'
+ if hash_match:
+ hash_result = 'YES'
- phash_match = search_by_phash(phash)
- hash_match = search_by_hash(hash)
+ phash_result = 'NO'
+ if len(phash_match):
+ phash_result = 'YES, score={}'.format(phash_match[0]['score'])
- hash_result = 'NO'
- if hash_match:
- hash_result = 'YES'
-
- phash_result = 'NO'
- if len(phash_match):
- phash_result = 'YES, score={}'.format(phash_match[0]['score'])
-
- print("{} - hash={}, phash={}".format(opt_fn, hash_result, phash_result))
+ print("{} - hash={}, phash={}".format(fn, hash_result, phash_result))
diff --git a/check/commands/phash/test.py b/check/commands/phash/test.py
index 7fe2ae3..77c4c69 100644
--- a/check/commands/phash/test.py
+++ b/check/commands/phash/test.py
@@ -1,41 +1,38 @@
"""
-Query the database with a test set
+Test the API
"""
import click
import os
import glob
+import requests
-from PIL import Image
-
-from app.models.sql_factory import search_by_phash, search_by_hash
-from app.utils.im_utils import compute_phash_int
-from app.utils.file_utils import sha256
+mime_types = {
+ '.png': 'image/png',
+ '.gif': 'image/gif',
+ '.jpg': 'image/jpeg',
+ '.jpeg': 'image/jpeg',
+}
@click.command()
-@click.option('-i', '--input', 'opt_input_glob',
+@click.option('-i', '--input', 'opt_input_fn',
required=True,
- help="Input glob to search -- e.g. '../docs/images/*.jpg'")
+ help="Image to test the API with")
@click.pass_context
-def cli(ctx, opt_input_glob):
+def cli(ctx, opt_input_fn):
"""
- Query the database with a test set
+ Query the API with a test image
"""
- for fn in sorted(glob.iglob(opt_input_glob)):
- im = Image.open(fn).convert('RGB')
- phash = compute_phash_int(im)
-
- hash = sha256(fn)
-
- phash_match = search_by_phash(phash)
- hash_match = search_by_hash(hash)
-
- hash_result = 'NO'
- if hash_match:
- hash_result = 'YES'
+ with open(opt_input_fn, 'rb') as f:
+ fn = os.path.basename(opt_input_fn)
+ fpart, ext = os.path.splitext(fn)
+ if ext not in mime_types:
+ print("Invalid filetype: {}".format(ext))
- phash_result = 'NO'
- if len(phash_match):
- phash_result = 'YES, score={}'.format(phash_match[0]['score'])
+ query = [
+ ('q', (fn, f, mime_types[ext]))
+ ]
- print("{} - hash={}, phash={}".format(fn, hash_result, phash_result))
+ print("Testing match API")
+ r = requests.post('http://0.0.0.0:5000/api/v1/match', files=query)
+ print(r.json())
diff --git a/check/static/demo.html b/check/static/demo.html
new file mode 100644
index 0000000..c8de017
--- /dev/null
+++ b/check/static/demo.html
@@ -0,0 +1,7 @@
+<html>
+<head>
+ <title>PHash Check API Demo</title>
+</head>
+<body>
+</body>
+</html> \ No newline at end of file
diff --git a/check/static/index.html b/check/static/index.html
index a17fa53..6c70bcf 100644
--- a/check/static/index.html
+++ b/check/static/index.html
@@ -1 +1 @@
-deebeephunky \ No newline at end of file
+<html></html> \ No newline at end of file
diff --git a/docs/specifications.md b/docs/specifications.md
index 48106dd..2a4e4ec 100644
--- a/docs/specifications.md
+++ b/docs/specifications.md
@@ -42,6 +42,8 @@ User story:
## Example Requests
+All requests are standard multipart POST requests. Specify the image with the parameter `q`.
+
Example response for a successful image upload with no match:
`check.vframe.io/v1/match/`