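# Finding similar images in Python with hashing algorithms (covers images that differ in resolution, file size, or file format)
# Original article date: 2023-07-14 · views: 2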

# -*- coding: utf-8 -*-
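'''
Find similar images with hashing algorithms and move them into a "<dir>_df" folder.

Images are treated as duplicates whenever they look alike, even if they
differ in resolution, file size, or file format.

Requires OpenCV (cv2):
    pip install opencv-python

'''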
import os
import cv2
import numpy as np
import shutil
import random

class DuplicateFiles(object):
    """Find visually similar images under a directory and move them aside.

    Images are compared pairwise with three perceptual hashes (average,
    difference, DCT based) and two histogram overlap scores.  As soon as any
    single score exceeds ``SIMILARITY_THRESHOLD`` the pair is treated as
    duplicates and both files are moved into a folder named ``<dir>_df``.
    """

    # Root directory to scan; replaced per instance in __init__.
    dir = ''

    # Lower-cased extensions (without the dot) that are considered images.
    # Files that OpenCV cannot decode are skipped at read time.
    IMAGE_EXTENSIONS = frozenset(['jpg', 'bmp', 'png', 'jpeg', 'gif', 'jfif'])

    # A pair is a duplicate when any one metric is strictly above this value.
    SIMILARITY_THRESHOLD = 0.90

    def __init__(self, dir):
        """Remember *dir* as the root directory that ``mvPhoto`` scans."""
        self.dir = dir

    # ------------------------------------------------------------------
    # Hashes
    # ------------------------------------------------------------------
    def aHash(self, img, shape=(10, 10)):
        """Return the average hash of *img* as a string of '0'/'1'.

        Args:
            img: colour image accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
            shape: ``(width, height)`` passed to ``cv2.resize``.

        Returns:
            A string with ``width * height`` characters, row by row; a
            character is '1' where the grey value is above the mean grey
            value of the resized image.
        """
        img = cv2.resize(img, shape)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # ndarray.mean() accumulates in floating point, so the sum cannot
        # wrap around the way repeated uint8 scalar additions can.
        avg = gray.mean()
        return ''.join('1' if bit else '0' for bit in (gray > avg).ravel())

    def dHash(self, img, shape=(10, 10)):
        """Return the difference hash of *img* as a string of '0'/'1'.

        The image is resized one pixel wider than ``shape`` so that every
        row yields ``width`` left/right comparisons.

        Args:
            img: colour image accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
            shape: ``(width, height)`` of the comparison grid.

        Returns:
            A string with ``width * height`` characters, row by row; a
            character is '1' where a pixel is brighter than its right
            neighbour.
        """
        img = cv2.resize(img, (shape[0] + 1, shape[1]))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        brighter_than_right = gray[:, :-1] > gray[:, 1:]
        return ''.join('1' if bit else '0' for bit in brighter_than_right.ravel())

    def pHash(self, img, shape=(10, 10)):
        """Return the DCT based perceptual hash of *img* as a list of 0/1.

        The image is resized to 32x32, converted to grey, transformed with
        ``cv2.dct`` and the top-left ``shape`` corner of the coefficients is
        thresholded against its own mean.

        Args:
            img: colour image accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
            shape: ``(width, height)`` of the coefficient block that is kept
                (at most 32 in each direction).

        Returns:
            A list of ints (0 or 1), row by row.
        """
        img = cv2.resize(img, (32, 32))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # cv2.dct needs floating point input.
        dct = cv2.dct(np.float32(gray))
        dct_roi = dct[0:shape[1], 0:shape[0]]
        average = np.mean(dct_roi)
        return [1 if coeff > average else 0 for coeff in dct_roi.ravel()]

    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Return the fraction of positions at which two hashes agree.

        Args:
            hash1, hash2: equally long sequences (strings or lists).
            shape: unused; kept so existing callers that pass it still work.

        Returns:
            ``-1`` when the lengths differ, ``0.0`` for two empty hashes,
            otherwise a float between 0 and 1.
        """
        if len(hash1) != len(hash2):
            return -1
        if not hash1:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        # Normalise by the real hash length so non-default sizes are correct.
        return matches / len(hash1)

    # ------------------------------------------------------------------
    # Histogram similarity
    # ------------------------------------------------------------------
    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Return the mean per-channel histogram overlap of two images.

        Both images are resized to *size*, split into channels with
        ``cv2.split`` and each channel pair is scored with ``calculate``.

        Returns:
            The average of the per-channel scores as a float.
        """
        image1 = cv2.resize(image1, size)
        image2 = cv2.resize(image2, size)
        scores = [
            self.calculate(channel1, channel2)
            for channel1, channel2 in zip(cv2.split(image1), cv2.split(image2))
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)

    def calculate(self, image1, image2):
        """Return the histogram overlap of channel 0 of two images.

        A 256-bin histogram of channel 0 is computed for each image and the
        two histograms are compared bin by bin.

        Returns:
            A float; 1.0 means the two histograms are identical.
        """
        # The upper bound of the range is exclusive, so 256 is needed for
        # pixels of value 255 to be counted.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0.0, 256.0])
        hist2 = cv2.calcHist([image2], [0], None, [256], [0.0, 256.0])
        return self._hist_overlap(hist1, hist2)

    def _hist_overlap(self, hist1, hist2):
        """Return the mean of ``1 - |a - b| / max(a, b)`` over all bins.

        Bins whose two counts are equal (including both zero) score 1.
        """
        counts1 = np.asarray(hist1, dtype=np.float64).ravel()
        counts2 = np.asarray(hist2, dtype=np.float64).ravel()
        if counts1.size == 0:
            return 0.0
        overlap = np.ones_like(counts1)
        differs = counts1 != counts2
        larger = np.maximum(counts1, counts2)
        overlap[differs] = 1.0 - np.abs(counts1 - counts2)[differs] / larger[differs]
        return float(overlap.mean())

    # ------------------------------------------------------------------
    # File helpers
    # ------------------------------------------------------------------
    def mymovefile(self, srcfile, dstpath, ffname):
        """Move *srcfile* into directory *dstpath*.

        Args:
            srcfile: path of the file to move; a message is printed and
                nothing happens when it is not an existing file.
            dstpath: destination directory, created when missing.  A
                trailing path separator is optional.
            ffname: file name to use at the destination; when empty the
                original base name is kept.
        """
        if not os.path.isfile(srcfile):
            print("%s not exist!" % (srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        shutil.move(srcfile, os.path.join(dstpath, fname))

    def list_all_files(self, rootdir):
        """Return the paths of all files below *rootdir*, recursively.

        Entries of each directory are visited in sorted name order so the
        result is deterministic.
        """
        found = []
        for entry in sorted(os.listdir(rootdir)):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                found.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                found.append(path)
        return found

    def _is_image(self, filename):
        """Return True when *filename* has one of ``IMAGE_EXTENSIONS``."""
        extension = os.path.splitext(filename)[1]
        return extension[1:].lower() in self.IMAGE_EXTENSIONS

    def _read_image(self, path):
        """Decode *path* as a colour image, or return None on failure.

        ``np.fromfile`` plus ``cv2.imdecode`` is used instead of
        ``cv2.imread`` so that the path is opened by NumPy rather than by
        OpenCV.
        """
        try:
            data = np.fromfile(path, dtype=np.uint8)
        except OSError:
            return None
        if data.size == 0:
            return None
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    def _is_similar(self, img1, hashes1, img2):
        """Return True when any metric rates the two images as duplicates.

        *hashes1* is the ``(aHash, dHash, pHash)`` tuple of *img1*, passed in
        so it is computed once per master image instead of once per pair.
        """
        scores = (
            self.cmpHash(hashes1[0], self.aHash(img2)),
            self.cmpHash(hashes1[1], self.dHash(img2)),
            self.cmpHash(hashes1[2], self.pHash(img2)),
            self.classify_hist_with_split(img1, img2),
            # Channel 0 histograms of the unresized images.
            self.calculate(img1, img2),
        )
        return any(score > self.SIMILARITY_THRESHOLD for score in scores)

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def mvPhoto(self):
        """Move every group of similar images under ``self.dir`` aside.

        Each image ("master") is compared with all images that follow it in
        the file list.  Matching followers are moved to ``<self.dir>_df``
        and renamed ``<master stem>_<own name>``; a master that had at least
        one match is moved there afterwards under its own name.
        """
        photoList = self.list_all_files(self.dir)
        dst_dir = self.dir + '_df'

        for i, photo in enumerate(photoList):
            # Files already moved as a duplicate of an earlier master are gone.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:' + fname)
            if not self._is_image(fname):
                continue
            img1 = self._read_image(photo)
            if img1 is None:
                continue

            stem = os.path.splitext(fname)[0]
            master_hashes = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            move_master = False

            for candidate in photoList[i + 1:]:
                if not os.path.isfile(candidate):
                    continue
                sname = os.path.basename(candidate)
                if not self._is_image(sname):
                    continue
                img2 = self._read_image(candidate)
                if img2 is None:
                    continue
                if self._is_similar(img1, master_hashes, img2):
                    move_master = True
                    print('    move file:' + candidate)
                    print('ffname[0]:' + stem)
                    self.mymovefile(candidate, dst_dir, stem + '_' + sname)

            # The master is moved last so it stays readable during the scan.
            if move_master:
                self.mymovefile(photo, dst_dir, fname)

if __name__ == "__main__":
    # Directory to scan.  os.getcwd() is the process working directory, which
    # is not necessarily the folder that contains this script.  To scan a
    # specific folder instead, assign its path here, for example
    # r'E:\python\photoCompare\328'.
    dir = os.getcwd()
    duplicateFiles = DuplicateFiles(dir)
    duplicateFiles.mvPhoto()