Thursday, June 12, 2025

The following example illustrates how to remove duplicates from a continuous stream of aerial images:

import cv2

import imagehash

import numpy as np

from PIL import Image

from collections import deque

class ImageDeduplicator:
    """Detect repeated items in an image stream using bounded history buffers.

    Two independent checks are offered: exact perceptual-hash matching
    (``is_duplicate``) and cosine similarity between vectors (``is_visited``).
    Each check keeps at most ``buffer_size`` recent entries; once a buffer is
    full, appending a new entry discards the oldest one.
    """

    def __init__(self, buffer_size=100, threshold=0.97):
        """Initialize ring buffers for tracking image hashes and vectors.

        Args:
            buffer_size: Maximum number of entries retained in each buffer.
            threshold: Cosine-similarity score that a stored vector must
                exceed for a new vector to count as already visited.
        """
        self.buffer_size = buffer_size
        self.hash_buffer = deque(maxlen=buffer_size)
        self.vector_buffer = deque(maxlen=buffer_size)
        # Kept close to 1.0 so that only near-exact matches are rejected.
        self.threshold = threshold

    def compute_hash(self, image):
        """Compute the perceptual hash of an image supplied as an array."""
        return imagehash.phash(Image.fromarray(image))

    def is_duplicate(self, image):
        """Check if the image is a duplicate.

        Returns True when the image's perceptual hash equals one already in
        the hash buffer. Otherwise the hash is recorded and False is returned.
        """
        img_hash = self.compute_hash(image)
        if img_hash in self.hash_buffer:
            return True
        self.hash_buffer.append(img_hash)
        return False

    def is_visited(self, vector):
        """Check if a vector closely matches one that was seen recently.

        Returns True when any buffered vector has a cosine similarity with
        ``vector`` greater than ``self.threshold``. Otherwise the vector is
        recorded and False is returned.
        """
        # Scan newest entries first and stop at the first match.
        already_seen = any(
            self.cosine_similarity(existing, vector) > self.threshold
            for existing in reversed(self.vector_buffer)
        )
        if already_seen:
            return True
        self.vector_buffer.append(vector)
        return False

    def get_hash_buffer_len(self):
        """Return the number of hashes currently buffered."""
        return len(self.hash_buffer)

    def get_vector_buffer_len(self):
        """Return the number of vectors currently buffered."""
        return len(self.vector_buffer)

    def cosine_similarity(self, vec1, vec2):
        """Computes cosine similarity between two vectors.

        Returns 0.0 when either vector has zero length, because the cosine is
        undefined there and the plain division would produce NaN.
        """
        dot_product = np.dot(vec1, vec2)
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return dot_product / denominator

    def euclidean_distance(self, vec1, vec2):
        """Computes Euclidean distance between two vectors.

        The computed value is also printed to standard output.
        """
        value = np.linalg.norm(np.array(vec1) - np.array(vec2))
        print(f"Euclidean={value}")
        return value

def is_duplicate(destination_client, vector, threshold=0.8):
    """Check whether a similar vector is already present in a search index.

    Runs a vector query through ``destination_client.search`` and compares
    each returned match's ``"vector"`` field with ``vector`` using cosine
    similarity.

    Args:
        destination_client: Client object exposing a ``search`` method
            (presumably an Azure AI Search ``SearchClient`` -- confirm).
        vector: Query vector to look up.
        threshold: Cosine-similarity score a match must exceed to count as a
            duplicate.

    Returns:
        True when at least one returned match scores above ``threshold``,
        otherwise False.
    """
    # NOTE(review): VectorizedQuery is not imported in the visible file;
    # presumably it comes from azure.search.documents.models -- confirm.
    vector_query = VectorizedQuery(
        vector=vector,
        k_nearest_neighbors=3,
        exhaustive=True,
        fields="vector",
    )
    # Query the client that was passed in rather than an undefined global.
    results = destination_client.search(
        search_text=None,
        vector_queries=[vector_query],
        select=["id", "description", "vector"],
        include_total_count=True,
        top=4,
    )
    if results is not None and results.get_count() > 0:
        query_norm = np.linalg.norm(vector)
        for match in results:
            match_vector = match["vector"]
            denominator = query_norm * np.linalg.norm(match_vector)
            if denominator == 0:
                # Cosine similarity is undefined for zero-length vectors.
                continue
            if np.dot(vector, match_vector) / denominator > threshold:
                return True
    return False

Reference: previous posts for context.

#codingexercise: https://1drv.ms/w/c/d609fb70e39b65c8/Echlm-Nw-wkggNYlIwEAAAABD8nSsN--hM7kfA-W_mzuWw?e=vGBXc1 

No comments:

Post a Comment