1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import os
import glob
import time
import pandas as pd
from sqlalchemy import create_engine, Table, Column, String, BigInteger
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from app.settings import app_cfg as cfg
connection_url = "mysql+mysqlconnector://{}:{}@{}/{}?charset=utf8mb4".format(
os.getenv("DB_USER"),
os.getenv("DB_PASS"),
os.getenv("DB_HOST"),
os.getenv("DB_NAME")
)
loaded = False
engine = create_engine(connection_url, encoding="utf-8", pool_recycle=3600)
Session = sessionmaker(bind=engine)
Base = declarative_base()
class PHashTable(Base):
__tablename__ = 'phashes'
id = Column(Integer, primary_key=True)
sha256 = Column(String(36, convert_unicode=True), nullable=False)
phash = Column(BigInteger(blank=True), nullable=False)
ext = Column(String(3, convert_unicode=True), nullable=False)
def toJSON(self):
return {
'id': self.id,
'sha256': self.sha256,
'phash': self.phash,
'ext': self.ext,
}
def search_by_phash(phash, threshold=6):
connection = engine.connect()
cmd = 'SELECT phashes.*, BIT_COUNT(phash ^ :phash) as hamming_distance FROM images_image HAVING hamming_distance < :threshold ORDER BY hamming_distance ASC LIMIT 1'
matches = connection.execute(text(cmd), phash=phash, threshold=threshold)
return matches
def add_phash(sha256, phash, ext):
rec = PHashTable(
sha256=sha256, phash=phash, ext=ext,
)
session = Session()
session.add(rec)
session.flush()
|