summaryrefslogtreecommitdiff
path: root/check
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2019-04-23 13:10:52 +0200
committerJules Laplace <julescarbon@gmail.com>2019-04-23 13:10:52 +0200
commit1b30746ee4d38cd6acdbf339b4605700f07fc02a (patch)
tree74c2bbbde7d04bdabf76dd64bfcf652fbb713c05 /check
parentf5f30912a2efd9c3d47cc9aeaf6a542211b00474 (diff)
parent0cb9209d52604e5bf7366b74447ca79576335091 (diff)
Merge branch 'master' of github.com:adamhrv/vframe_check_api
Diffstat (limited to 'check')
-rw-r--r--check/app/utils/file_utils.py19
-rw-r--r--check/app/utils/im_utils.py5
-rw-r--r--check/commands/phash/report.py101
-rw-r--r--check/commands/phash/report_html.py80
-rwxr-xr-xcheck/static/assets/css.css341
-rw-r--r--check/static/perceptual_hash_report.html39
6 files changed, 580 insertions, 5 deletions
diff --git a/check/app/utils/file_utils.py b/check/app/utils/file_utils.py
index 5c7b39d..1ed1833 100644
--- a/check/app/utils/file_utils.py
+++ b/check/app/utils/file_utils.py
@@ -310,6 +310,25 @@ def mkdirs(fp):
fpp = fpp.parent if fpp.suffix else fpp
fpp.mkdir(parents=True, exist_ok=True)
+def ensure_posixpath(fp):
+    """Coerce a filepath-like value to a pathlib.Path
+    :param fp: a (str, LazyFile, Path)
+    :returns: a Path filepath object
+    :raises TypeError: if fp is none of the supported types
+    """
+    # isinstance (instead of an exact type comparison) also accepts str
+    # subclasses and whichever concrete Path class the platform provides
+    if isinstance(fp, Path):
+        return fp
+    if isinstance(fp, str):
+        return Path(fp)
+    if isinstance(fp, click.utils.LazyFile):
+        # the LazyFile is not opened here, only its .name is read
+        return Path(fp.name)
+    raise TypeError('{} is not a valid filepath type'.format(type(fp)))
+
+def ensure_dir(fp):
+    """Create the directory for a filepath unless it already is a directory
+    :param fp: a filepath accepted by pathlib.Path
+    """
+    already_a_dir = Path(fp).is_dir()
+    if already_a_dir:
+        return
+    # creation itself is delegated to mkdirs
+    mkdirs(fp)
+
def ext_media_format(ext):
"""Converts file extension into Enum MediaType
diff --git a/check/app/utils/im_utils.py b/check/app/utils/im_utils.py
index 747e900..1d1affb 100644
--- a/check/app/utils/im_utils.py
+++ b/check/app/utils/im_utils.py
@@ -11,12 +11,7 @@ from skimage import feature
import imutils
import time
import numpy as np
-import torch
-import torch.nn as nn
-import torchvision.models as models
-import torchvision.transforms as transforms
import struct
-from torch.autograd import Variable
from sklearn.metrics.pairwise import cosine_similarity
import datetime
diff --git a/check/commands/phash/report.py b/check/commands/phash/report.py
new file mode 100644
index 0000000..362480d
--- /dev/null
+++ b/check/commands/phash/report.py
@@ -0,0 +1,101 @@
+"""
+Generate a test report from a directory of images
+This demo does not use SQL storage
+
+"""
+
+from pathlib import Path
+
+import click
+
+@click.command('')
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Path to input dir')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Path to output CSV file')
+@click.option('--recursive', 'opt_recursive', is_flag=True,
+    help='Recursive globbing')
+@click.option('-t', '--thresh', 'opt_thresh', default=3,
+    help='Perceptual hash threshold')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None))
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_recursive, opt_thresh, opt_slice):
+    """Deduplicate images"""
+
+    # Overview:
+    #   1. collect .jpg/.jpeg/.png files from the input dir
+    #   2. compute a sha256 and a perceptual hash per image
+    #   3. pair every kept image with the images whose hash distance is
+    #      <= opt_thresh and write those pairs to the output CSV
+
+    # ------------------------------------------------
+    # imports
+    from os.path import join
+    from glob import glob
+
+    import pandas as pd
+    from tqdm import tqdm
+    import cv2 as cv
+
+    from app.utils import logger_utils, im_utils, file_utils
+
+    log = logger_utils.Logger.getLogger()
+    log.info(f'De-duplicating: {opt_fp_in}')
+
+    # get list of all images
+    if opt_recursive:
+        # '**' only descends into subdirectories when recursive=True
+        fp_ims = glob(join(opt_fp_in, '**', '*'), recursive=True)
+    else:
+        fp_ims = glob(join(opt_fp_in, '*'))
+    # compare suffixes case-insensitively so e.g. IMG_0001.JPG is kept
+    exts = ('.jpg', '.png', '.jpeg')
+    fp_ims = [x for x in fp_ims if Path(x).suffix.lower() in exts]
+    if opt_slice:
+        fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]
+
+    log.info(f'Processing {len(fp_ims):,} images')
+
+    # Create image meta objects
+    # NOTE(review): entries are keyed by sha256, so byte-identical files
+    # overwrite each other and never show up as a duplicate pair - confirm
+    # that this is intended
+    ims_meta = {}
+    log.info('Computing sha256 and perceptual hashes...')
+    for fp_im in tqdm(fp_ims):
+        im = cv.imread(fp_im)
+        if im is None:
+            # cv.imread returns None instead of raising when decoding fails
+            log.info(f'Skipping unreadable image: {fp_im}')
+            continue
+        sha256 = file_utils.sha256(fp_im)
+        im_hash = im_utils.compute_phash(im)  # uses PIL
+        ims_meta[sha256] = {
+            'imhash': im_hash,
+            'filepath': fp_im,
+            'fname': Path(fp_im).name,
+            'sha256': sha256,
+            'duplicate': None,
+        }
+
+    # Deduplicate the list of images
+    log.info('Deduplicating images...')
+    duplicates = []
+    removed = set()  # sha256 of every image already flagged as a duplicate
+    im_items = list(ims_meta.items())
+    for sha256_a, im_obj_a in tqdm(im_items):
+        if sha256_a in removed:
+            # a flagged duplicate must not act as the reference image,
+            # otherwise the reverse pair (B, A) would be reported as well
+            continue
+        for sha256_b, im_obj_b in im_items:
+            if sha256_b == sha256_a or sha256_b in removed:
+                continue
+            d = abs(im_obj_a['imhash'] - im_obj_b['imhash'])
+            if d <= opt_thresh:
+                # mark B as a duplicate of A
+                im_obj_b['duplicate'] = sha256_a
+                duplicates.append({
+                    'sha256_a': sha256_a, 'fname_a': im_obj_a['fname'],
+                    'sha256_b': sha256_b, 'fname_b': im_obj_b['fname'],
+                    'score': d,
+                })
+                removed.add(sha256_b)
+
+    n_dupes = len(removed)
+    log.info(f'Found {n_dupes}')
+
+    # explicit columns keep the CSV header present when nothing was found
+    cols = ['sha256_a', 'fname_a', 'sha256_b', 'fname_b', 'score']
+    df_items = pd.DataFrame(duplicates, columns=cols)
+    file_utils.ensure_dir(opt_fp_out)
+    log.info(f'Writing: {opt_fp_out}')
+    df_items.to_csv(opt_fp_out, index=False)
\ No newline at end of file
diff --git a/check/commands/phash/report_html.py b/check/commands/phash/report_html.py
new file mode 100644
index 0000000..61a8b48
--- /dev/null
+++ b/check/commands/phash/report_html.py
@@ -0,0 +1,80 @@
+"""
+Generate an HTML report from a duplicates CSV and the directory of source images
+This demo does not use SQL storage
+
+"""
+
+from pathlib import Path
+
+import click
+
+@click.command('')
+@click.option('--csv', 'opt_fp_in_csv', required=True,
+    help='Path to input CSV')
+@click.option('--images', 'opt_fp_in_img', required=True,
+    help='Path to images')
+@click.option('-o', '--output', 'opt_fp_out_dir', required=True,
+    type=click.Path(file_okay=False, dir_okay=True),
+    help='Path to output directory')
+@click.pass_context
+def cli(ctx, opt_fp_in_csv, opt_fp_in_img, opt_fp_out_dir):
+    """Generate HTML report"""
+
+    # Overview:
+    #   1. render static/perceptual_hash_report.html with the CSV rows
+    #      grouped by fname_a and write it to <output>/index.html
+    #   2. copy the stylesheet to <output>/assets
+    #   3. copy every image named in the CSV to <output>/images
+
+    # ------------------------------------------------
+    # imports
+    from os.path import join
+    import shutil
+
+    import pandas as pd
+    import jinja2
+
+    from app.utils import logger_utils, file_utils
+
+    log = logger_utils.Logger.getLogger()
+    log.info(f'Generating HTML report from: {opt_fp_in_csv}')
+
+    # NOTE: template and stylesheet paths are relative to the working dir
+    template_loader = jinja2.FileSystemLoader(searchpath="./static/")
+    # filenames from the CSV end up inside the markup, so escape them
+    template_env = jinja2.Environment(loader=template_loader,
+        autoescape=jinja2.select_autoescape(['html']))
+    TEMPLATE_FILE = "perceptual_hash_report.html"
+    template = template_env.get_template(TEMPLATE_FILE)
+
+    # create project output dir
+    fp_out_dir_assets = join(opt_fp_out_dir, 'assets')
+    fp_out_dir_images = join(opt_fp_out_dir, 'images')
+
+    file_utils.ensure_dir(opt_fp_out_dir)
+    file_utils.ensure_dir(fp_out_dir_assets)
+    file_utils.ensure_dir(fp_out_dir_images)
+
+    df_dupes = pd.read_csv(opt_fp_in_csv)
+    image_groups = df_dupes.groupby('fname_a')
+
+    log.info(f'Saving HTML report to: {opt_fp_out_dir}')
+    fp_out_html = join(opt_fp_out_dir, 'index.html')
+    html_text = template.render(image_groups=image_groups,
+        dir_ims=Path(fp_out_dir_images).name,
+        dir_assets=Path(fp_out_dir_assets).name)
+    with open(fp_out_html, 'w', encoding='utf-8') as fp:
+        fp.write(html_text)
+
+    # copy css
+    fp_src = 'static/assets/css.css'
+    fp_dst = join(fp_out_dir_assets, Path(fp_src).name)
+    shutil.copy(fp_src, fp_dst)
+
+    # copy images: a filename can appear in many rows, copy each file once
+    fnames = pd.concat([df_dupes['fname_a'], df_dupes['fname_b']]).unique()
+    for fname in fnames:
+        fp_src = join(opt_fp_in_img, fname)
+        fp_dst = join(fp_out_dir_images, fname)
+        shutil.copy(fp_src, fp_dst)
\ No newline at end of file
diff --git a/check/static/assets/css.css b/check/static/assets/css.css
new file mode 100755
index 0000000..9e8a59f
--- /dev/null
+++ b/check/static/assets/css.css
@@ -0,0 +1,341 @@
+body {
+ font-family: Helvetica, arial, sans-serif;
+ font-size: 14px;
+ line-height: 1.6;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ background-color: white;
+ padding: 45px; }
+
+body > *:first-child {
+ margin-top: 0 !important; }
+body > *:last-child {
+ margin-bottom: 0 !important; }
+
+a {
+ color: #4183C4; }
+a.absent {
+ color: #cc0000; }
+a.anchor {
+ display: block;
+ padding-left: 30px;
+ margin-left: -30px;
+ cursor: pointer;
+ position: absolute;
+ top: 0;
+ left: 0;
+ bottom: 0; }
+
+h1, h2, h3, h4, h5, h6 {
+ margin: 25px 0 8px;
+ padding: 0;
+ font-weight: bold;
+ -webkit-font-smoothing: antialiased;
+ cursor: text;
+ position: relative; }
+
+h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, h5:hover a.anchor, h6:hover a.anchor {
+ background: url() no-repeat 10px center;
+ text-decoration: none; }
+
+h1 tt, h1 code {
+ font-size: inherit; }
+
+h2 tt, h2 code {
+ font-size: inherit; }
+
+h3 tt, h3 code {
+ font-size: inherit; }
+
+h4 tt, h4 code {
+ font-size: inherit; }
+
+h5 tt, h5 code {
+ font-size: inherit; }
+
+h6 tt, h6 code {
+ font-size: inherit; }
+
+h1 {
+ font-size: 28px;
+ color: black; }
+
+h2 {
+ font-size: 24px; }
+
+h3 {
+ font-size: 18px; }
+
+h4 {
+ font-size: 16px; }
+
+h5 {
+ font-size: 14px; }
+
+h6 {
+ color: #777777;
+ font-size: 14px; }
+
+p, blockquote, ul, ol, dl, li, table, pre {
+ margin: 15px 0; }
+
+hr {
+ background: transparent url() repeat-x 0 0;
+ border: 0 none;
+ color: #cccccc;
+ height: 4px;
+ padding: 0;
+}
+
+body > h2:first-child {
+ margin-top: 0;
+ padding-top: 0; }
+body > h1:first-child {
+ margin-top: 0;
+ padding-top: 0; }
+ body > h1:first-child + h2 {
+ margin-top: 0;
+ padding-top: 0; }
+body > h3:first-child, body > h4:first-child, body > h5:first-child, body > h6:first-child {
+ margin-top: 0;
+ padding-top: 0; }
+
+a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 {
+ margin-top: 0;
+ padding-top: 0; }
+
+h1 p, h2 p, h3 p, h4 p, h5 p, h6 p {
+ margin-top: 0; }
+
+li p.first {
+ display: inline-block; }
+li {
+ margin: 0; }
+ul, ol {
+ padding-left: 30px; }
+
+ul :first-child, ol :first-child {
+ margin-top: 0; }
+
+dl {
+ padding: 0; }
+ dl dt {
+ font-size: 14px;
+ font-weight: bold;
+ font-style: italic;
+ padding: 0;
+ margin: 15px 0 5px; }
+ dl dt:first-child {
+ padding: 0; }
+ dl dt > :first-child {
+ margin-top: 0; }
+ dl dt > :last-child {
+ margin-bottom: 0; }
+ dl dd {
+ margin: 0 0 15px;
+ padding: 0 15px; }
+ dl dd > :first-child {
+ margin-top: 0; }
+ dl dd > :last-child {
+ margin-bottom: 0; }
+
+blockquote {
+ border-left: 4px solid #dddddd;
+ padding: 0 15px;
+ color: #777777; }
+ blockquote > :first-child {
+ margin-top: 0; }
+ blockquote > :last-child {
+ margin-bottom: 0; }
+
+table {
+ font-size:11px;
+ padding: 0;border-collapse: collapse; }
+ table tr {
+ border-top: 1px solid #cccccc;
+ background-color: white;
+ margin: 0;
+ padding: 0; }
+ table tr:nth-child(2n) {
+ background-color: #f8f8f8; }
+ table tr th {
+ font-weight: bold;
+ border: 1px solid #cccccc;
+ margin: 0;
+ padding: 6px 13px; }
+ table tr td {
+ border: 1px solid #cccccc;
+ margin: 0;
+ padding: 6px 13px; }
+ table tr th :first-child, table tr td :first-child {
+ margin-top: 0; }
+ table tr th :last-child, table tr td :last-child {
+ margin-bottom: 0; }
+
+img {
+ max-width: 100%; }
+img + em{
+ font-size:.675rem;
+ color:#999;
+ display: block;
+}
+span.frame {
+ display: block;
+ overflow: hidden; }
+ span.frame > span {
+ border: 1px solid #dddddd;
+ display: block;
+ float: left;
+ overflow: hidden;
+ margin: 13px 0 0;
+ padding: 7px;
+ width: auto; }
+ span.frame span img {
+ display: block;
+ float: left; }
+ span.frame span span {
+ clear: both;
+ color: #333333;
+ display: block;
+ padding: 5px 0 0; }
+span.align-center {
+ display: block;
+ overflow: hidden;
+ clear: both; }
+ span.align-center > span {
+ display: block;
+ overflow: hidden;
+ margin: 13px auto 0;
+ text-align: center; }
+ span.align-center span img {
+ margin: 0 auto;
+ text-align: center; }
+span.align-right {
+ display: block;
+ overflow: hidden;
+ clear: both; }
+ span.align-right > span {
+ display: block;
+ overflow: hidden;
+ margin: 13px 0 0;
+ text-align: right; }
+ span.align-right span img {
+ margin: 0;
+ text-align: right; }
+span.float-left {
+ display: block;
+ margin-right: 13px;
+ overflow: hidden;
+ float: left; }
+ span.float-left span {
+ margin: 13px 0 0; }
+span.float-right {
+ display: block;
+ margin-left: 13px;
+ overflow: hidden;
+ float: right; }
+ span.float-right > span {
+ display: block;
+ overflow: hidden;
+ margin: 13px auto 0;
+ text-align: right; }
+
+code, tt {
+ margin: 0 2px;
+ padding: 0 5px;
+ white-space: nowrap;
+ border: 1px solid #eaeaea;
+ background-color: #f8f8f8;
+ border-radius: 3px; }
+
+pre code {
+ margin: 0;
+ padding: 0;
+ white-space: pre;
+ border: none;
+ background: transparent; }
+
+.highlight pre {
+ background-color: #f8f8f8;
+ border: 1px solid #cccccc;
+ font-size: 13px;
+ line-height: 19px;
+ overflow: auto;
+ padding: 6px 10px;
+ border-radius: 3px; }
+
+pre {
+ background-color: #f8f8f8;
+ border: 1px solid #cccccc;
+ font-size: 13px;
+ line-height: 19px;
+ overflow: auto;
+ padding: 6px 10px;
+ border-radius: 3px; }
+ pre code, pre tt {
+ background-color: transparent;
+ border: none; }
+
+sup {
+ font-size: 0.83em;
+ vertical-align: super;
+ line-height: 0;
+}
+
+kbd {
+ display: inline-block;
+ padding: 3px 5px;
+ font-size: 11px;
+ line-height: 10px;
+ color: #555;
+ vertical-align: middle;
+ background-color: #fcfcfc;
+ border: solid 1px #ccc;
+ border-bottom-color: #bbb;
+ border-radius: 3px;
+ box-shadow: inset 0 -1px 0 #bbb
+}
+
+* {
+ -webkit-print-color-adjust: exact;
+}
+@media screen and (min-width: 914px) {
+ body {
+ width: 854px;
+ margin:0 auto;
+ }
+}
+@media print {
+ table, pre {
+ page-break-inside: avoid;
+ }
+ pre {
+ word-wrap: break-word;
+ }
+}
+.caption{
+ font-size:.83em;
+ color:#999;
+ margin-bottom:10px;
+}
+
+.pagebreak { page-break-inside:avoid; page-break-after:always; }
+
+/* image match results */
+
+.img_match{
+ max-width:300px
+}
+td.result_txt{
+ vertical-align: top;
+}
+td.result_txt ul{
+ margin-left: 0;
+ padding-left: 0;
+ list-style: none;
+}
+td.result_txt ul li{
+ margin-left:0;
+ padding-bottom: 5px;
+ list-style: none;
+} \ No newline at end of file
diff --git a/check/static/perceptual_hash_report.html b/check/static/perceptual_hash_report.html
new file mode 100644
index 0000000..566a058
--- /dev/null
+++ b/check/static/perceptual_hash_report.html
@@ -0,0 +1,39 @@
+<html>
+<head>
+ <link rel="stylesheet" href="{{ dir_assets }}/css.css">
+</head>
+<body>
+
+ <h1>Image Duplicates</h1>
+
+ {% for fname_a, image_group in image_groups %}
+ <h3>Duplicates for {{ fname_a }}</h3>
+ <table>
+ <tr>
+ <th>Image A</th>
+ <th>Image B</th>
+ <th>Score</th>
+ </tr>
+ {% for df in image_group.itertuples() %}
+ <tr>
+ <td>
+ <img class="img_match img_match_a" src="{{ dir_ims }}/{{ df.fname_a }}" />
+ </td>
+ <td>
+ <img class="img_match img_match_b" src="{{ dir_ims }}/{{ df.fname_b }}" />
+ </td>
+
+ <td class="result_txt">
+ <ul>
+ <li>Filename A:<br> {{ df.fname_a }}</li>
+ <li>Filename B:<br> {{ df.fname_b }}</li>
+ <li>Score: {{ df.score }}</li>
+ </ul>
+ </td>
+ </tr>
+ {% endfor %}
+ </table>
+ {% endfor %}
+
+</body>
+</html> \ No newline at end of file