SimilarImages/DuplicateRemover.py

import csv
import os

import numpy as np
import requests
from PIL import Image
import imagehash


def writeCSV(i1, i2):
    # Append one duplicate pair per row; newline="" prevents blank rows on Windows.
    with open("duplicates.csv", "a", newline="") as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([i1, i2])

class DuplicateRemover:
    extensions = ['png', 'jpg', 'jpeg', 'gif', 'webp']

    def __init__(self, dirname, hash_size=8):
        self.dirname = dirname
        self.hash_size = hash_size
    def find_duplicates(self, verbose=True, exportCSV=False, exportAPI=True):
        """
        Find and delete duplicate images under self.dirname.
        """
        hashes = {}
        duplicates = []
        if verbose:
            print("Finding Duplicates Now!\n")
        for path, subdirs, fnames in os.walk(self.dirname):
            for image in fnames:
                if image.lower().split('.')[-1] in self.extensions:
                    try:
                        with Image.open(os.path.join(path, image)) as img:
                            temp_hash = imagehash.average_hash(img, self.hash_size)
                        if temp_hash in hashes:
                            # hashes[temp_hash] already holds the full path of the first occurrence.
                            if verbose:
                                print("Duplicate {} \nfound for Image {}!\n".format(
                                    os.path.join(path, image), hashes[temp_hash]))
                            if exportCSV:
                                writeCSV(os.path.join(path, image), hashes[temp_hash])
                            if exportAPI:
                                requests.post('http://your-server.domain/duplicate_api.php?insertDuplicate',
                                              json={"file1": os.path.join(path, image),
                                                    "file2": hashes[temp_hash]})
                            duplicates.append(os.path.join(path, image))
                        else:
                            hashes[temp_hash] = os.path.join(path, image)
                    except Exception as error:
                        print("Error: The following error occurred:\n", os.path.join(path, image), "\n", error, "\n")
        if len(duplicates) != 0:
            if verbose:
                a = input("Do you want to delete these {} Images? Press Y or N: ".format(len(duplicates)))
            else:
                a = 'y'
            space_saved = 0
            if a.strip().lower() == "y":
                for duplicate in duplicates:
                    space_saved += os.path.getsize(duplicate)
                    os.remove(duplicate)
                    if verbose:
                        print("{} Deleted Successfully!".format(duplicate))
                if verbose:
                    print("\n\nYou saved {} MB of Space!".format(round(space_saved / 1000000, 2)))
            else:
                if verbose:
                    print("Thank you for Using Duplicate Remover")
        else:
            if verbose:
                print("No Duplicates Found :(")
    def find_similar(self, location, similarity=80, verbose=True):
        """
        Print images in self.dirname whose average hash differs from the image
        at `location` by at most (1 - similarity/100) of the hash bits.
        """
        fnames = os.listdir(self.dirname)
        threshold = 1 - similarity / 100
        diff_limit = int(threshold * (self.hash_size ** 2))
        with Image.open(location) as img:
            hash1 = imagehash.average_hash(img, self.hash_size).hash
        if verbose:
            print("Finding Similar Images to {} Now!\n".format(location))
        for image in fnames:
            if image.lower().split('.')[-1] in self.extensions:
                with Image.open(os.path.join(self.dirname, image)) as img:
                    hash2 = imagehash.average_hash(img, self.hash_size).hash
                if np.count_nonzero(hash1 != hash2) <= diff_limit:
                    if verbose:
                        print("{} image found {}% similar to {}".format(image, similarity, location))