Cluster computing

Thursday, June 5, 2025

Image retrieval enhancement:

The following is a technique to enhance image retrieval for drone images. It follows the vectorize-and-analyze method described in the references and is helpful regardless of whether zero-agent, one-agent, or multiple-agent retrieval is used:

import json

from azure.search.documents import SearchClient

from azure.core.credentials import AzureKeyCredential

import os

import re

# Connection settings come from the environment. The endpoint is mandatory
# (KeyError if unset); the index names fall back to the defaults shown here.
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# Read for completeness; it is not passed to either client below.
api_version = os.getenv("AZURE_SEARCH_API_VERSION")
search_api_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME", "index00")
dest_index_name = os.getenv("AZURE_SEARCH_1024_INDEX_NAME", "index1024")

# One credential object shared by both clients instead of rebuilding it
# from the same key for each of them.
credential = AzureKeyCredential(search_api_key)

# Client for the source index that the descriptions are read from.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=credential,
)

# Client for the destination index that the extracted fields are merged into.
destination_client = SearchClient(
    endpoint=search_endpoint,
    index_name=dest_index_name,
    credential=credential,
)

def prepare_json_string_for_load(text):
    """Rewrite a single-quoted, dict-like string so ``json.loads`` can parse it.

    Every double quote is first turned into a single quote, then single
    quotes that sit next to structural characters (braces, colons, commas,
    spaces) are turned back into double quotes. Finally each newline and the
    whitespace following it is removed.

    This is a textual heuristic, not a parser: a single quote that is not
    adjacent to one of those characters is left untouched.

    Args:
        text: The raw description string.

    Returns:
        The rewritten string with newlines and their trailing whitespace
        stripped.
    """
    # Normalise to single quotes first so the rewrites below see one style.
    text = text.replace("\"", "'")

    # Applied strictly in this order; each entry is (pattern, replacement).
    quote_rewrites = (
        ("{'", "{\""),
        ("'}", "\"}"),
        (" '", " \""),
        ("' ", "\" "),
        (":'", ":\""),
        ("':", "\":"),
        (",'", ",\""),
        ("',", "\","),
    )
    for pattern, replacement in quote_rewrites:
        text = text.replace(pattern, replacement)

    return re.sub(r'\n\s*', '', text)

def to_string(bounding_box):
    """Format a bounding box as the comma-separated string ``"x,y,w,h"``.

    Args:
        bounding_box: Mapping with the keys ``"x"``, ``"y"``, ``"w"`` and
            ``"h"``.

    Returns:
        The four values joined by commas, in that order.

    Raises:
        KeyError: If any of the four keys is missing.
    """
    return f"{bounding_box['x']},{bounding_box['y']},{bounding_box['w']},{bounding_box['h']}"

# Walk the source index page by page, extract the title and tags from each
# entry's JSON description, and merge them into the destination index.
page_size = 10
skip = 0

while True:
    # Retrieve the next page of entries from the source index. Only the
    # fields that are actually read below are selected.
    search_results = search_client.search(
        "*",
        select=["id", "description"],
        top=page_size,
        skip=skip,
    )

    # Process entries and shred descriptions
    flat_list = []
    fetched = 0
    for entry in search_results:
        fetched += 1
        entry_id = entry["id"]

        description_text = prepare_json_string_for_load(entry["description"]).replace('""', '')
        try:
            description_json = json.loads(description_text)
        except json.JSONDecodeError as exc:
            # One malformed description should not abort the whole run.
            print(f"Skipping entry with id:{entry_id}, description is not valid JSON: {exc}")
            continue
        if not isinstance(description_json, dict):
            description_json = {}

        # Missing or empty keys fall back to empty strings rather than
        # raising KeyError.
        title = description_json.get("description") or ""
        data = description_json.get("_data") or {}
        tag_values = (data.get("tagsResult") or {}).get("values") or []
        tags = ','.join([tag["name"] for tag in tag_values]).strip(",")

        # Alternative: add entries at object level instead of image level.
        # dense_values = (data.get("denseCaptionsResult") or {}).get("values")
        # if dense_values:
        #     for item in dense_values:
        #         text = item.get("text", "")
        #         bounding_box = item.get("boundingBox", {
        #             "x": 0,
        #             "y": 0,
        #             "w": 0,
        #             "h": 0
        #         })
        #         flat_list.append({
        #             "id": entry_id,
        #             "text": text,
        #             "bounding_box": to_string(bounding_box),
        #             "tags" : tags,
        #             "title": title
        #         })
        # else:
        #     print(f"Nothing found in entry with id:{entry_id}")

        flat_list.append({
            "id": entry_id,
            "tags": tags,
            "title": title,
        })

    # Stop once a page comes back empty. The total reported by get_count()
    # is the number of matches for the whole query, not for this page, so it
    # cannot be used to detect the end of the results.
    if fetched == 0:
        break

    if flat_list:
        merge_results = destination_client.merge_documents(flat_list)
        error = ','.join([merge_result.error_message for merge_result in merge_results if merge_result.error_message]).strip(",")
        if error:
            print(error)
        # Compare against what was actually sent so a short final page is
        # still reported as a success.
        succeeded = sum(1 for merge_result in merge_results if merge_result.succeeded)
        if succeeded == len(flat_list):
            print(f"success in merging entries with id: {skip} to {skip + fetched}")

    skip += page_size

References:

Vectorize and Analyze: https://1drv.ms/w/c/d609fb70e39b65c8/Eb6vxQeXGE9MsVwwdsvLSskBLgFNNuClDqAepem73pMcbQ?e=LtQasJ

Cluster computing

Thursday, June 5, 2025

No comments:

Post a Comment