Cluster computing

The following sample shows how to vectorize and describe images with Azure AI Vision and then index them in Azure AI Search for lexical and vector search.

#! /usr/bin/python

#from azure.ai.vision import VisionClient

from azure.core.credentials import AzureKeyCredential

from azure.core.rest import HttpRequest, HttpResponse

from azure.core.exceptions import HttpResponseError

from azure.identity import DefaultAzureCredential

from azure.search.documents import SearchClient

from azure.ai.vision.imageanalysis import ImageAnalysisClient

from azure.ai.vision.imageanalysis.models import VisualFeatures

from tenacity import retry, stop_after_attempt, wait_fixed

from dotenv import load_dotenv

import json

import requests

import http.client, urllib.parse

import os

# Module configuration: read service settings from the environment, build the
# credentials and clients used by the functions below, and define the blob URL
# template and batch constants used by the driver loop at the bottom of the file.
load_dotenv()

# Azure AI Search settings.
search_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")

index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")

search_api_version = os.getenv("AZURE_SEARCH_API_VERSION")

search_api_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure AI Vision settings.
vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY")

vision_api_version = os.getenv("AZURE_AI_VISION_API_VERSION")

vision_region = os.getenv("AZURE_AI_VISION_REGION")

vision_endpoint = os.getenv("AZURE_AI_VISION_ENDPOINT")

# The search client authenticates with DefaultAzureCredential; the key-based
# search credential below is left disabled.
credential = DefaultAzureCredential()

#search_credential = AzureKeyCredential(search_api_key)

# The vision client authenticates with the API key.
# NOTE(review): os.getenv returns None when the variable is unset — confirm
# AZURE_AI_VISION_API_KEY is always provided before this line runs.
vision_credential = AzureKeyCredential(vision_api_key)

# Initialize Azure clients

#vision_client = VisionClient(endpoint=vision_endpoint, credential=AzureKeyCredential(vision_api_key))

search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)

analysis_client = ImageAnalysisClient(vision_endpoint, vision_credential)

# Define SAS URL template
# `{file}` is filled in with the zero-padded image number by the driver loop.
# NOTE(review): the SAS signature is embedded in source, and its `st`/`se`
# query parameters presumably bound its validity to 2025-05-11 — consider
# loading the template from the environment instead.

sas_template = "https://saravinoteblogs.blob.core.windows.net/playground/vision/main/main/{file}.jpg?sp=rle&st=2025-05-11T00:36:41Z&se=2025-05-11T08:36:41Z&spr=https&sv=2024-11-04&sr=d&sig=vjCrqWLo3LbmkXwCyIKWtAtFnYO2uBSxEWNgGKbeS00%3D&sdd=3"

# Process images in batches of 100

batch_size = 100

# Images are numbered 1..total_images by the driver loop.
total_images = 2 # 17853 # Adjust this as needed

def get_description(id, image_url):
    """Analyze one image and collect its caption, tags and objects in a dict.

    Args:
        id: Identifier stored under the "id" key of the returned dict.
        image_url: URL of the image passed to ``analyze_image_from_sdk``.

    Returns:
        A dict holding "id" and, when the analysis result provides them,
        "caption" / "caption_confidence", "tags" (list of name/confidence
        dicts) and "objects" (list of name/confidence/bounding_box dicts).
    """
    # The module-level ImageAnalysisClient is `analysis_client`; the previous
    # reference to an undefined `client` raised NameError on every call.
    result = analyze_image_from_sdk(analysis_client, image_url)
    description = {"id": id}

    if result.caption:
        print(f"Caption: {result.caption.text}")
        print(f"Caption Confidence: {result.caption.confidence}")
        description["caption"] = f"{result.caption.text}"
        description["caption_confidence"] = result.caption.confidence

    if result.tags:
        print("Tags:")
        tags = []
        for tag in _result_items(result.tags):
            print(f" {tag.name}: {tag.confidence}")
            # Build a separate dict: rebinding `tag` here used to discard the
            # SDK object before its attributes were read.
            tags.append({"name": f"{tag.name}", "confidence": tag.confidence})
        description["tags"] = tags

    if result.objects:
        print("Objects:")
        object_items = []
        for obj in _result_items(result.objects):
            name, confidence = _object_label(obj)
            print(f" {name}: {confidence}")
            object_item = {"name": f"{name}", "confidence": confidence}
            if obj.bounding_box:
                print(f" Bounding Box: {obj.bounding_box}")
                object_item["bounding_box"] = f"{obj.bounding_box}"
            object_items.append(object_item)
        description["objects"] = object_items

    return description


def _result_items(section):
    """Return the iterable of items held by a tags/objects result section."""
    # NOTE(review): the Image Analysis SDK presumably wraps items in a `.list`
    # attribute; fall back to the section itself when it is already iterable.
    return getattr(section, "list", section)


def _object_label(obj):
    """Return ``(name, confidence)`` for a detected object."""
    name = getattr(obj, "name", None)
    if name is not None:
        return name, getattr(obj, "confidence", None)
    # NOTE(review): SDK objects appear to carry their label in `obj.tags[0]`
    # rather than on the object itself — confirm against the installed version.
    labels = getattr(obj, "tags", None) or []
    if labels:
        return labels[0].name, labels[0].confidence
    return None, None

#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def vectorize_image(client, blob_url):
    """Request an image embedding for `blob_url` via the client's REST pipeline.

    Builds a POST to the relative ``/retrieval:vectorizeImage`` route and sends
    it with ``client.send_request``.

    Args:
        client: Object exposing ``send_request(HttpRequest)``; the driver loop
            passes the module-level ``analysis_client``.
        blob_url: URL of the image, sent as ``{"url": blob_url}``.

    Returns:
        The decoded JSON body of the response, or ``None`` when an
        ``HttpResponseError`` is raised (the error is printed).
    """
    headers = {
        'Ocp-Apim-Subscription-Key': vision_api_key,
        'Content-Type': 'application/json',
    }
    params = {
        'model-version': '2023-04-15',
        'language': 'en',
    }
    request = HttpRequest(
        method="POST",
        url=f"/retrieval:vectorizeImage?api-version={vision_api_version}",
        json={"url": blob_url},
        params=params,
        headers=headers,
    )

    try:
        # Sending is inside the try so an HttpResponseError raised while the
        # request is in flight is handled the same way as a bad status.
        response = client.send_request(request)
        print(repr(response))
        response.raise_for_status()
    except HttpResponseError as e:
        # Bind the exception: the handler previously printed an unbound `e`
        # and failed with NameError instead of returning None.
        print(str(e))
        return None

    payload = response.json()
    print(f"vectorize returned {payload}")
    return payload

#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def get_image_vector(image_path, key, region):
    """Fetch the embedding vector for an image over a raw HTTPS connection.

    Args:
        image_path: Either an ``http(s)://`` URL (sent as JSON) or a local
            file path (its bytes are sent as ``application/octet-stream``).
        key: Subscription key placed in ``Ocp-Apim-Subscription-Key``.
        region: Unused; kept so existing callers keep working. The host is
            hard-coded below.

    Returns:
        The value stored under ``"vector"`` in the JSON response.

    Raises:
        OSError: The local file cannot be read, or the connection fails or
            times out (re-raised after a message is printed).
        http.client.HTTPException: Protocol-level failure (re-raised).
        Exception: The service answered with a status other than 200.
    """
    headers = {'Ocp-Apim-Subscription-Key': key}
    params = urllib.parse.urlencode({'model-version': 'latest'})

    # Prepare the request body before opening any connection, so a missing
    # local file is reported as-is rather than as a network problem.
    if image_path.startswith(('http://', 'https://')):
        headers['Content-Type'] = 'application/json'
        body = json.dumps({"url": image_path})
    else:
        headers['Content-Type'] = 'application/octet-stream'
        with open(image_path, "rb") as filehandler:
            body = filehandler.read()

    # NOTE(review): host and route are hard-coded and ignore `region`; the
    # route also lacks the `/computervision` prefix used elsewhere — confirm.
    conn = http.client.HTTPSConnection("img01.cognitiveservices.azure.com", timeout=3)
    try:
        conn.request(
            "POST",
            f"/retrieval:vectorizeImage?api-version=2023-04-01-preview&{params}",
            body,
            headers,
        )
        response = conn.getresponse()
        print(repr(response))
        data = json.load(response)
        print(repr(data))
    except (OSError, http.client.HTTPException):
        # http.client reports timeouts and connection failures as OSError
        # subclasses; the former `requests` exception type was never raised here.
        print(f"Timeout/Error for {image_path}. Retrying...")
        raise
    finally:
        # Always release the socket, including when the request fails.
        conn.close()

    if response.status != 200:
        detail = data.get('message', '')
        nested = data.get('error')
        # Presumably the service nests the text under {"error": {"message": ...}};
        # fall back to it when no top-level message is present.
        if not detail and isinstance(nested, dict):
            detail = nested.get('message', '')
        raise Exception(f"Error processing image {image_path}: {detail}")

    return data.get("vector")

#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def analyze_image(client, blob_url):
    """Request an image analysis for `blob_url` via the client's REST pipeline.

    Builds a POST to the relative ``/computervision/imageanalysis:analyze``
    route and sends it with ``client.send_request``.

    Args:
        client: Object exposing ``send_request(HttpRequest)``.
        blob_url: URL of the image, sent as ``{"url": blob_url}``.

    Returns:
        The decoded JSON body of the response, or ``None`` when an
        ``HttpResponseError`` is raised (the error is printed).
    """
    headers = {
        # This is a Vision request, so it carries the Vision key (as
        # vectorize_image does), not the search admin key.
        'Ocp-Apim-Subscription-Key': vision_api_key,
        'Content-Type': 'application/json',
    }
    params = {
        'model-version': '2023-04-15',
        'language': 'en',
    }
    # NOTE(review): vectorize_image uses a route without the `/computervision`
    # prefix; confirm which form the client's base URL expects.
    request = HttpRequest(
        method="POST",
        url=f"/computervision/imageanalysis:analyze?api-version={vision_api_version}",
        json={"url": blob_url},
        params=params,
        headers=headers,
    )

    try:
        response = client.send_request(request)
        response.raise_for_status()
    except HttpResponseError as e:
        # Bind the exception: the handler previously printed an unbound `e`
        # and failed with NameError instead of returning None.
        print(str(e))
        return None

    payload = response.json()
    print(f"analyze returned {payload}")
    return payload

def analyze_image_from_sdk(client, blob_url):
    """Run ``client.analyze`` on `blob_url` with every visual feature enabled.

    Args:
        client: Object exposing ``analyze(...)``; callers pass the
            module-level image analysis client.
        blob_url: URL of the image, forwarded as ``image_url``.

    Returns:
        Whatever ``client.analyze`` returns.
    """
    # Mandatory: one or more visual features to analyze.
    requested_features = [
        VisualFeatures.TAGS,
        VisualFeatures.OBJECTS,
        VisualFeatures.CAPTION,
        VisualFeatures.DENSE_CAPTIONS,
        VisualFeatures.READ,
        VisualFeatures.SMART_CROPS,
        VisualFeatures.PEOPLE,
    ]
    return client.analyze(
        image_url=blob_url,
        visual_features=requested_features,
        # Optional; only relevant because SMART_CROPS is requested.
        smart_crops_aspect_ratios=[0.9, 1.33],
        # Optional; only relevant because CAPTION / DENSE_CAPTIONS are requested.
        gender_neutral_caption=True,
        # Optional; only relevant because TAGS is requested.
        # See https://aka.ms/cv-languages for supported languages.
        language="en",
        # Optional analysis model version.
        model_version="latest",
    )

def vectorize_image_from_sdk(client, blob_url):
    """Call ``client.vectorize`` for `blob_url` and return its result.

    Args:
        client: Object exposing ``vectorize(...)``.
            NOTE(review): confirm the client type in use provides this method.
        blob_url: URL of the image, forwarded as ``image_url``.

    Returns:
        Whatever ``client.vectorize`` returns.
    """
    options = {
        "image_url": blob_url,
        "language": "en",
        "model_version": "latest",
    }
    return client.vectorize(**options)

# Driver: vectorize and describe the images batch by batch, building one
# search document per image, then hand each batch to Azure AI Search.

# The upload call was disabled in this sample; keep that as the default and
# report honestly. Set to True to actually write documents to the index.
upload_enabled = False

for batch_start in range(1, total_images + 1, batch_size):
    vectorized_images = {}
    documents = []

    # batch_end is the exclusive range() bound; batch_last is the number of
    # the final image in this batch and is what progress messages report.
    batch_end = min(batch_start + batch_size, total_images + 1)
    batch_last = batch_end - 1

    for i in range(batch_start, batch_end):
        file_name = f"{i:06}"
        blob_url = sas_template.format(file=file_name)
        try:
            #response = get_image_vector(blob_url, vision_api_key, "eastus")
            response = vectorize_image(analysis_client, blob_url)
            print(repr(response))
            if response:
                vectorized_images[file_name] = response
                # vectorize_image returns the whole JSON payload; store only
                # the embedding (the key get_image_vector reads) when present.
                vector = response.get("vector", response) if isinstance(response, dict) else response
                documents.append(
                    {
                        "id": file_name,
                        "description": repr(get_description(file_name, blob_url)),
                        "vector": vector,
                    }
                )
        except Exception as e:
            # Per-image boundary: report the failure and continue with the
            # remaining images in the batch.
            print(f"Error processing {file_name}.jpg: {e}")

    print(f"Vectorization complete for images {batch_start} to {batch_last}")

    # Upload batch to Azure AI Search
    if documents:
        if upload_enabled:
            search_client.upload_documents(documents)
            print(f"Uploaded {len(documents)} images {batch_start} to {batch_last} to {index_name}.")
        else:
            print(f"Prepared {len(documents)} images {batch_start} to {batch_last} for {index_name} (upload disabled).")

if upload_enabled:
    print(f"Vectorized images successfully added to {index_name}!")
else:
    print(f"Vectorization finished; nothing was uploaded to {index_name}.")

Cluster computing

Sunday, May 11, 2025

No comments:

Post a Comment