from PIL import Image
import imagehash
import os
import numpy as np
import csv


def writeCSV(dirname, i1, i2):
    # Append the duplicate pair (as full paths) to duplicates.csv.
    with open("duplicates.csv", "a", newline="") as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([os.path.join(dirname, i1), os.path.join(dirname, i2)])


class DuplicateRemover:
    extensions = ['png', 'jpg', 'jpeg', 'gif', 'webp']

    def __init__(self, dirname, hash_size=8):
        self.dirname = dirname
        self.hash_size = hash_size

    def find_duplicates(self, verbose=True, exportCSV=True):
        """
        Find exact-hash duplicate images in self.dirname and offer to delete them.
        """
        fnames = os.listdir(self.dirname)
        hashes = {}
        duplicates = []
        if verbose:
            print("Finding Duplicates Now!\n")
        for image in fnames:
            if image.lower().split('.')[-1] in self.extensions:
                try:
                    with Image.open(os.path.join(self.dirname, image)) as img:
                        # Perceptual average hash: identical hashes mark duplicates.
                        temp_hash = imagehash.average_hash(img, self.hash_size)
                        if temp_hash in hashes:
                            if verbose:
                                print("Duplicate {} \nfound for Image {}!\n".format(image, hashes[temp_hash]))
                            if exportCSV:
                                writeCSV(self.dirname, image, hashes[temp_hash])
                            duplicates.append(image)
                        else:
                            hashes[temp_hash] = image
                except Exception as error:
                    print("Error: The following error occurred:\n", image, "\n", error, "\n")

        if len(duplicates) != 0:
            if verbose:
                a = input("Do you want to delete these {} Images? Press Y or N: ".format(len(duplicates)))
            else:
                # Non-verbose mode deletes without prompting.
                a = 'y'
            space_saved = 0
            if a.strip().lower() == "y":
                for duplicate in duplicates:
                    # Record the file size before removing it.
                    space_saved += os.path.getsize(os.path.join(self.dirname, duplicate))
                    os.remove(os.path.join(self.dirname, duplicate))
                    if verbose:
                        print("{} Deleted Successfully!".format(duplicate))

                if verbose:
                    print("\n\nYou saved {} MB of Space!".format(round(space_saved/1000000, 2)))
            else:
                if verbose:
                    print("Thank you for Using Duplicate Remover")
        else:
            if verbose:
                print("No Duplicates Found :(")

    def find_similar(self, location, similarity=80, verbose=True):
        fnames = os.listdir(self.dirname)
        # Allow up to (1 - similarity/100) of the hash bits to differ.
        # With hash_size=8 and similarity=80, that is int(0.2 * 64) = 12 bits.
        threshold = 1 - similarity/100
        diff_limit = int(threshold*(self.hash_size**2))

        with Image.open(location) as img:
            hash1 = imagehash.average_hash(img, self.hash_size).hash

        if verbose:
            print("Finding Similar Images to {} Now!\n".format(location))
        for image in fnames:
            if image.lower().split('.')[-1] in self.extensions:
                with Image.open(os.path.join(self.dirname, image)) as img:
                    hash2 = imagehash.average_hash(img, self.hash_size).hash

                    # Hamming distance between the two boolean hash arrays.
                    if np.count_nonzero(hash1 != hash2) <= diff_limit:
                        if verbose:
                            print("{} image found {}% similar to {}".format(image, similarity, location))
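

# Example usage: a minimal, assumed entry point (not part of the class above).
# The directory "images/" and the query file "images/sample.jpg" are
# hypothetical placeholders; point them at real data before running.
if __name__ == "__main__":
    dr = DuplicateRemover("images/")
    dr.find_duplicates(verbose=True, exportCSV=True)      # interactive duplicate removal
    dr.find_similar("images/sample.jpg", similarity=70)   # report images >= 70% similar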