# -*- coding: utf-8 -*-
'''
Python用哈希算法查找相似图片并放入[_df]的文件夹中
相似图片包括不同分辨率,不同大小,不同格式,只要图片相似就会算重复文件
安装cv2
pip install opencv-python
'''
import os
import cv2
import numpy as np
import shutil
import random
class DuplicateFiles (object):
    """Find visually similar images under a directory and move them aside.

    Every image below ``dir`` is compared with every later image using three
    perceptual hashes (average, difference, DCT based) and two histogram
    overlap scores.  When any score exceeds the threshold the later image is
    treated as a duplicate and moved, together with the image it matched,
    into a sibling folder named ``<dir>_df``.
    """

    # Root directory to scan; replaced per instance in __init__.
    dir = ''

    # Lower-case extensions (without the dot) that are treated as images.
    # One shared set is used for both sides of every comparison.
    IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'bmp', 'png', 'gif', 'jfif'})

    def __init__(self, dir):
        """Remember ``dir`` as the root directory to scan."""
        self.dir = dir

    @staticmethod
    def _bits_to_str(bits):
        """Flatten a boolean array row by row into a string of '1'/'0'."""
        return ''.join('1' if bit else '0' for bit in np.asarray(bits).ravel())

    def aHash(self, img, shape=(10, 10)):
        """Return the average hash of a BGR image as a '0'/'1' string.

        The image is resized to ``shape`` (passed to ``cv2.resize`` as its
        size argument), converted to grayscale, and each pixel is encoded as
        '1' when it is brighter than the mean gray level, else '0'.
        """
        gray = cv2.cvtColor(cv2.resize(img, shape), cv2.COLOR_BGR2GRAY)
        # gray.mean() accumulates in floating point and divides by the real
        # pixel count, so the result neither wraps around in uint8 nor
        # depends on the default 10x10 size.
        return self._bits_to_str(gray > gray.mean())

    def dHash(self, img, shape=(10, 10)):
        """Return the difference hash of a BGR image as a '0'/'1' string.

        The image is resized one column wider than ``shape`` and each bit is
        '1' when a pixel is brighter than its right-hand neighbour.
        """
        resized = cv2.resize(img, (shape[0] + 1, shape[1]))
        gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        # Compare every column with the next one in a single array operation.
        return self._bits_to_str(gray[:, :-1] > gray[:, 1:])

    def pHash(self, img, shape=(10, 10)):
        """Return the perceptual (DCT) hash of a BGR image as a list of 0/1.

        The image is resized to 32x32, converted to grayscale and
        transformed with ``cv2.dct``.  The top-left ``shape[0]`` x
        ``shape[1]`` block of coefficients is kept and each coefficient is
        encoded as 1 when it is greater than the block mean, else 0.
        """
        gray = cv2.cvtColor(cv2.resize(img, (32, 32)), cv2.COLOR_BGR2GRAY)
        dct = cv2.dct(np.float32(gray))
        dct_roi = dct[0:shape[0], 0:shape[1]]
        return (dct_roi > dct_roi.mean()).astype(int).ravel().tolist()

    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Return the mean per-channel histogram overlap of two images.

        Both images are resized to ``size`` and split into channels; the
        score of each channel pair comes from :meth:`calculate`.
        """
        image1 = cv2.resize(image1, size)
        image2 = cv2.resize(image2, size)
        scores = [
            self.calculate(channel1, channel2)
            for channel1, channel2 in zip(cv2.split(image1), cv2.split(image2))
        ]
        return sum(scores) / len(scores)

    @staticmethod
    def _hist_overlap(hist1, hist2):
        """Return the mean per-bin overlap of two equally long histograms.

        A bin scores ``1 - |a - b| / max(a, b)``; bins with equal counts
        (including two empty bins) score 1.  Returns 0.0 for empty input.
        """
        counts1 = np.asarray(hist1, dtype=np.float64).ravel()
        counts2 = np.asarray(hist2, dtype=np.float64).ravel()
        if counts1.size == 0:
            return 0.0
        largest = np.maximum(counts1, counts2)
        gap = np.abs(counts1 - counts2)
        # Where both bins are empty the gap is 0, so dividing by 1 instead of
        # 0 yields the intended score of 1 without a 0/0 warning.
        safe_largest = np.where(largest > 0, largest, 1.0)
        return float(np.mean(1.0 - gap / safe_largest))

    def calculate(self, image1, image2):
        """Return the histogram overlap of channel 0 of two images as a float.

        A 256-bin histogram is computed for each image.  The range is
        ``[0, 256]`` because the upper bound of ``cv2.calcHist`` is
        exclusive; ``[0, 255]`` would leave pixel value 255 uncounted.
        """
        hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256])
        return self._hist_overlap(hist1, hist2)

    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Return the fraction of positions at which two hashes agree.

        Returns -1 when the hashes differ in length and 0.0 when both are
        empty.  ``shape`` is accepted for backward compatibility only: the
        result is normalised by the actual hash length so it always lies in
        [0, 1].
        """
        if len(hash1) != len(hash2):
            return -1
        if len(hash1) == 0:
            return 0.0
        matches = sum(1 for bit1, bit2 in zip(hash1, hash2) if bit1 == bit2)
        return matches / len(hash1)

    def mymovefile(self, srcfile, dstpath, ffname=None):
        """Move ``srcfile`` into directory ``dstpath``, creating it if needed.

        ``ffname`` is the file name to use at the destination; when empty
        the source file name is kept.  If the destination name is already
        taken, ``_1``, ``_2``, ... is appended before the extension so that
        no existing file is overwritten.

        Returns the final destination path, or None when ``srcfile`` is not
        an existing file (a message is printed in that case).
        """
        if not os.path.isfile(srcfile):
            print("%s not exist!" % (srcfile))
            return None
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        # os.path.join works whether or not dstpath ends with a separator.
        target = os.path.join(dstpath, fname)
        stem, ext = os.path.splitext(fname)
        suffix = 1
        while os.path.exists(target):
            target = os.path.join(dstpath, "%s_%d%s" % (stem, suffix, ext))
            suffix += 1
        shutil.move(srcfile, target)
        return target

    def list_all_files(self, rootdir):
        """Return the paths of all regular files below ``rootdir``.

        Sub-directories are searched recursively.  Entries are visited in
        sorted name order so the result is the same on every run.
        """
        found = []
        for entry in sorted(os.listdir(rootdir)):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                found.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                found.append(path)
        return found

    @classmethod
    def _is_image(cls, filename):
        """Return True when ``filename`` has a known image extension.

        Only the final extension is examined and case is ignored.
        """
        extension = os.path.splitext(filename)[1]
        return extension[1:].lower() in cls.IMAGE_EXTENSIONS

    @staticmethod
    def _load_image(path):
        """Decode ``path`` into a BGR image; return None if that fails.

        The bytes are read with ``np.fromfile`` and decoded with
        ``cv2.imdecode`` rather than ``cv2.imread`` -- presumably so that
        paths ``cv2.imread`` cannot open still work (TODO confirm).
        """
        try:
            data = np.fromfile(path, dtype=np.uint8)
        except OSError:
            return None
        if data.size == 0:
            # An empty buffer cannot be decoded.
            return None
        return cv2.imdecode(data, cv2.IMREAD_COLOR)

    def _is_similar(self, img1, hashes1, img2, threshold):
        """Return True when any of the five scores exceeds ``threshold``.

        ``hashes1`` holds the aHash, dHash and pHash of ``img1`` in that
        order, so they are computed once per master image.
        """
        hashers = (self.aHash, self.dHash, self.pHash)
        for reference, hasher in zip(hashes1, hashers):
            if self.cmpHash(reference, hasher(img2)) > threshold:
                return True
        if self.classify_hist_with_split(img1, img2) > threshold:
            return True
        return bool(self.calculate(img1, img2) > threshold)

    def mvPhoto(self, threshold=0.90):
        """Move groups of similar images into the ``<dir>_df`` folder.

        Each image under ``self.dir`` is taken in turn as the master and
        compared with every image after it.  A later image whose similarity
        to the master exceeds ``threshold`` on any measure is moved and
        renamed ``<master stem>_<its own name>``; the master itself is moved
        last if at least one duplicate was found.  Files that are not
        images, or that cannot be decoded, are skipped.
        """
        # abspath drops a trailing separator, so the target is always a
        # sibling of the scanned directory and never a folder inside it.
        dst_dir = os.path.abspath(self.dir) + '_df'
        photos = self.list_all_files(self.dir)
        for index, photo in enumerate(photos):
            # The file may already have been moved as a duplicate of an
            # earlier master.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:' + fname)
            if not self._is_image(fname):
                continue
            img1 = self._load_image(photo)
            if img1 is None:
                continue
            stem = os.path.splitext(fname)[0]
            master_hashes = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            found_duplicate = False
            for candidate in photos[index + 1:]:
                if not os.path.isfile(candidate):
                    continue
                sname = os.path.basename(candidate)
                if not self._is_image(sname):
                    continue
                img2 = self._load_image(candidate)
                if img2 is None:
                    continue
                if not self._is_similar(img1, master_hashes, img2, threshold):
                    continue
                found_duplicate = True
                print(' move file:' + candidate)
                print('ffname[0]:' + stem)
                self.mymovefile(candidate, dst_dir, stem + '_' + sname)
            # Move the master last so it stays readable during comparison.
            if found_duplicate:
                self.mymovefile(photo, dst_dir, fname)
if __name__ == "__main__":
    # Script entry point: scan a directory tree for similar images and move
    # each group of duplicates out of it.
    # To scan a fixed folder instead, assign its path here, e.g.
    #   dir = r'E:\python\photoCompare\328'
    # The scanned directory is the current working directory, i.e. where the
    # script is launched from, which is not necessarily where this file is.
    # NOTE(review): the name `dir` shadows the builtin; it is kept unchanged
    # in case other code in this module reads this global.
    dir = os.getcwd()
    duplicateFiles = DuplicateFiles(dir)
    duplicateFiles.mvPhoto()
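# Web-page footer text captured with the listing (not part of the program):
#   "Scan with your phone" / "Easier to read on mobile" /
#   "Articles you may be interested in"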