Sunday, May 11, 2025

 The following is a sample of how to index images in Azure AI Search for lexical and vector search.

#! /usr/bin/python


#from azure.ai.vision import VisionClient

from azure.core.credentials import AzureKeyCredential

from azure.core.rest import HttpRequest, HttpResponse

from azure.core.exceptions import HttpResponseError

from azure.identity import DefaultAzureCredential

from azure.search.documents import SearchClient

from azure.ai.vision.imageanalysis import ImageAnalysisClient

from azure.ai.vision.imageanalysis.models import VisualFeatures

from tenacity import retry, stop_after_attempt, wait_fixed

from dotenv import load_dotenv  

import json  

import requests

import http.client, urllib.parse

import os


# Load settings from a .env file into the process environment so that the
# os.getenv() lookups below can see them.
load_dotenv()  

# --- Azure AI Search settings -------------------------------------------------
# os.getenv() returns None when a variable is unset; nothing here validates that.

search_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  

index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")

# NOTE(review): search_api_version is not referenced again in the visible source.
search_api_version = os.getenv("AZURE_SEARCH_API_VERSION")

search_api_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  

# --- Azure AI Vision settings -------------------------------------------------

vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY")

# Interpolated into the REST URLs built by vectorize_image() and analyze_image().
vision_api_version = os.getenv("AZURE_AI_VISION_API_VERSION")

# NOTE(review): vision_region is not referenced again in the visible source.
vision_region = os.getenv("AZURE_AI_VISION_REGION")

vision_endpoint =  os.getenv("AZURE_AI_VISION_ENDPOINT")

# The search client authenticates with DefaultAzureCredential (the key-based
# alternative is left commented out below); the vision client uses the API key.
credential = DefaultAzureCredential()

#search_credential = AzureKeyCredential(search_api_key)

# NOTE(review): presumably fails at import time if AZURE_AI_VISION_API_KEY is
# unset (key would be None) - confirm and consider an explicit check.
vision_credential = AzureKeyCredential(vision_api_key)


# Initialize Azure clients

#vision_client = VisionClient(endpoint=vision_endpoint, credential=AzureKeyCredential(vision_api_key))

search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)

analysis_client = ImageAnalysisClient(vision_endpoint, vision_credential)


# Define SAS URL template
#
# "{file}" is replaced with the zero-padded image number by the batch loop.
# NOTE(review): the SAS query string (including its "sig" signature) is embedded
# in source, and its "se" expiry is 2025-05-11T08:36:41Z, so requests made after
# that time will presumably be rejected. Consider loading it from the environment.

sas_template = "https://saravinoteblogs.blob.core.windows.net/playground/vision/main/main/{file}.jpg?sp=rle&st=2025-05-11T00:36:41Z&se=2025-05-11T08:36:41Z&spr=https&sv=2024-11-04&sr=d&sig=vjCrqWLo3LbmkXwCyIKWtAtFnYO2uBSxEWNgGKbeS00%3D&sdd=3"


# Process images in batches of 100

batch_size = 100

# Images are numbered 1..total_images; currently limited to 2 (the trailing
# comment keeps 17853 as the alternative value).
total_images = 2 # 17853  # Adjust this as needed


def _result_items(section):
    """Return the iterable of items held by an analysis result section.

    Some SDK versions wrap tags/objects in a container exposing the items as
    ``.list``; others return the items directly. Both shapes are accepted.
    """
    return getattr(section, "list", section)


def _object_label(obj):
    """Return ``(name, confidence)`` for a detected object.

    Prefers the first entry of ``obj.tags`` when present and falls back to
    ``obj.name`` / ``obj.confidence`` otherwise.
    """
    labels = getattr(obj, "tags", None)
    if labels:
        return labels[0].name, labels[0].confidence
    return obj.name, obj.confidence


def get_description(id, image_url):
    """Analyze an image and summarize the result as a plain dictionary.

    Args:
        id: Identifier stored under the ``"id"`` key of the result. (The name
            shadows the builtin but is kept for caller compatibility.)
        image_url: URL of the image passed to ``analyze_image_from_sdk``.

    Returns:
        A dict with ``"id"`` and, when the analysis provides them,
        ``"caption"``, ``"caption_confidence"``, ``"tags"`` (list of
        ``{"name", "confidence"}`` dicts with string values) and ``"objects"``
        (list of dicts with ``"name"``, ``"confidence"`` and, when available,
        ``"bounding_box"`` as a string).
    """
    # Use the module-level vision client; the previous code referenced an
    # undefined name `client` here.
    result = analyze_image_from_sdk(analysis_client, image_url)

    description = {"id": id}

    caption = result.caption
    if caption:
        print(f"Caption: {caption.text}")
        print(f"Caption Confidence: {caption.confidence}")
        description["caption"] = f"{caption.text}"
        description["caption_confidence"] = caption.confidence

    if result.tags:
        print("Tags:")
        tags = []
        for tag in _result_items(result.tags):
            print(f"  {tag.name}: {tag.confidence}")
            # Build a separate dict; reusing the loop variable for it used to
            # destroy the tag before its attributes were read.
            tags.append({"name": f"{tag.name}", "confidence": f"{tag.confidence}"})
        description["tags"] = tags

    if result.objects:
        print("Objects:")
        object_items = []
        for obj in _result_items(result.objects):
            name, confidence = _object_label(obj)
            print(f"  {name}: {confidence}")
            object_item = {"name": f"{name}", "confidence": confidence}
            bounding_box = getattr(obj, "bounding_box", None)
            if bounding_box:
                print(f"    Bounding Box: {bounding_box}")
                object_item["bounding_box"] = f"{bounding_box}"
            object_items.append(object_item)
        description["objects"] = object_items

    return description


#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def vectorize_image(client, blob_url):
    """Request an image embedding via the ``retrieval:vectorizeImage`` REST route.

    Args:
        client: Object exposing ``send_request(HttpRequest)``; the batch loop
            passes the module-level ``analysis_client``.
        blob_url: URL of the image, sent as ``{"url": blob_url}`` in the JSON body.

    Returns:
        The parsed JSON response body, or ``None`` when the service answers
        with an error status (the error is printed).
    """
    headers = {
        'Ocp-Apim-Subscription-Key': vision_api_key,
        'Content-Type': 'application/json',
    }

    params = {
        'model-version': '2023-04-15',
        'language': 'en',
    }

    request = HttpRequest(
        method="POST",
        url=f"/retrieval:vectorizeImage?api-version={vision_api_version}",
        json={"url": blob_url},
        params=params,
        headers=headers,
    )

    response = client.send_request(request)
    try:
        print(repr(response))
        response.raise_for_status()
        # Parse once and reuse for both the log line and the return value.
        payload = response.json()
        print(f"vectorize returned {payload}")
        return payload
    except HttpResponseError as e:
        # The exception must be bound to a name; printing an unbound `e`
        # raised NameError and hid the real service error.
        print(str(e))
        return None


#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def get_image_vector(image_path, key, region, host="img01.cognitiveservices.azure.com"):
    """Fetch an image embedding over a raw HTTPS connection.

    Args:
        image_path: Either an ``http(s)://`` URL (sent as JSON) or a local file
            path (sent as ``application/octet-stream`` bytes).
        key: Subscription key sent in the ``Ocp-Apim-Subscription-Key`` header.
        region: Unused; retained so existing callers keep working.
        host: Host name of the vision resource. Defaults to the previously
            hard-coded value.

    Returns:
        The value of the ``"vector"`` key of the JSON response (``None`` if absent).

    Raises:
        OSError: If the local file cannot be read, or on socket errors and
            timeouts (re-raised after logging).
        http.client.HTTPException: On protocol errors (re-raised after logging).
        Exception: If the service responds with a status other than 200.
    """
    headers = {
        'Ocp-Apim-Subscription-Key': key,
    }

    params = urllib.parse.urlencode({
        'model-version': 'latest',
    })

    # Prepare the request body before touching the network so that file errors
    # surface as-is and are not reported as connection problems.
    if image_path.startswith(('http://', 'https://')):
        headers['Content-Type'] = 'application/json'
        body = json.dumps({"url": image_path})
    else:
        headers['Content-Type'] = 'application/octet-stream'
        with open(image_path, "rb") as filehandler:
            body = filehandler.read()

    conn = http.client.HTTPSConnection(host, timeout=3)
    try:
        conn.request(
            "POST",
            f"/retrieval:vectorizeImage?api-version=2023-04-01-preview&{params}",
            body,
            headers,
        )
        response = conn.getresponse()
        print(repr(response))
        data = json.load(response)
        print(repr(data))
    except (OSError, http.client.HTTPException):
        # http.client reports timeouts and socket failures as OSError
        # subclasses; it never raises the `requests` timeout type.
        print(f"Timeout/Error for {image_path}. Retrying...")
        raise
    finally:
        # Always release the socket, including on failure.
        conn.close()

    if response.status != 200:
        raise Exception(f"Error processing image {image_path}: {data.get('message', '')}")

    return data.get("vector")

#@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))

def analyze_image(client, blob_url):
    """Request an image analysis via the ``imageanalysis:analyze`` REST route.

    Args:
        client: Object exposing ``send_request(HttpRequest)``.
        blob_url: URL of the image, sent as ``{"url": blob_url}`` in the JSON body.

    Returns:
        The parsed JSON response body, or ``None`` when the service answers
        with an error status (the error is printed).
    """
    headers = {
        # This is a Vision call, so it needs the Vision key; the Search admin
        # key was sent here before.
        'Ocp-Apim-Subscription-Key': vision_api_key,
        'Content-Type': 'application/json',
    }

    # NOTE(review): no "features" query parameter is sent; the service
    # presumably requires one - confirm against the REST reference.
    params = {
        'model-version': '2023-04-15',
        'language': 'en',
    }

    # NOTE(review): unlike vectorize_image(), this path starts with
    # "/computervision"; verify it is not duplicated when the client prepends
    # its own base URL.
    request = HttpRequest(
        method="POST",
        url=f"/computervision/imageanalysis:analyze?api-version={vision_api_version}",
        json={"url": blob_url},
        params=params,
        headers=headers,
    )

    response = client.send_request(request)
    try:
        response.raise_for_status()
        # Parse once and reuse for both the log line and the return value.
        payload = response.json()
        print(f"analyze returned {payload}")
        return payload
    except HttpResponseError as e:
        # Bind the exception so it can actually be printed.
        print(str(e))
        return None


def analyze_image_from_sdk(client, blob_url):
    """Run ``client.analyze`` on an image URL with a fixed set of visual features.

    Args:
        client: Object exposing an ``analyze`` method that accepts the keyword
            arguments used below.
        blob_url: URL of the image, forwarded as ``image_url``.

    Returns:
        Whatever ``client.analyze`` returns.
    """
    # NOTE(review): whether `analyze` accepts `image_url` presumably depends on
    # the installed SDK version - confirm.
    requested_features = [
        VisualFeatures.TAGS,
        VisualFeatures.OBJECTS,
        VisualFeatures.CAPTION,
        VisualFeatures.DENSE_CAPTIONS,
        VisualFeatures.READ,
        VisualFeatures.SMART_CROPS,
        VisualFeatures.PEOPLE,
    ]

    return client.analyze(
        image_url=blob_url,
        visual_features=requested_features,
        # Only relevant because SMART_CROPS is requested.
        smart_crops_aspect_ratios=[0.9, 1.33],
        # Only relevant because CAPTION / DENSE_CAPTIONS are requested.
        gender_neutral_caption=True,
        # Only relevant because TAGS is requested.
        language="en",
        model_version="latest",
    )


def vectorize_image_from_sdk(client, blob_url):
    """Call ``client.vectorize`` for an image URL and return its result.

    Args:
        client: Object exposing a ``vectorize`` method that accepts
            ``image_url``, ``language`` and ``model_version`` keywords.
        blob_url: URL of the image, forwarded as ``image_url``.

    Returns:
        Whatever ``client.vectorize`` returns.
    """
    options = {
        "image_url": blob_url,
        "language": "en",
        "model_version": "latest",
    }
    return client.vectorize(**options)


# Walk the 1-based image numbers in batches, vectorize and describe each image,
# and collect one search document per image that was vectorized successfully.
for batch_start in range(1, total_images + 1, batch_size):
    vectorized_images = {}
    documents = []

    # batch_end is exclusive (it feeds range()); batch_last is the inclusive
    # number of the final image in this batch and is what the log lines report.
    batch_end = min(batch_start + batch_size, total_images + 1)
    batch_last = batch_end - 1

    for i in range(batch_start, batch_end):
        file_name = f"{i:06}"
        blob_url = sas_template.format(file=file_name)

        try:
            # Alternative raw-HTTP path: get_image_vector(blob_url, vision_api_key, "eastus")
            response = vectorize_image(analysis_client, blob_url)
            print(repr(response))
            if not response:
                continue

            # vectorize_image() returns the whole JSON body; the embedding
            # itself lives under "vector". A bare list (as returned by
            # get_image_vector) is used as-is.
            vector = response.get("vector") if isinstance(response, dict) else response
            if not vector:
                print(f"Error processing {file_name}.jpg: response contained no vector")
                continue

            vectorized_images[file_name] = response
            documents.append(
                {
                    "id": file_name,
                    "description": repr(get_description(file_name, blob_url)),
                    "vector": vector,
                }
            )
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next image.
            print(f"Error processing {file_name}.jpg: {e}")

    print(f"Vectorization complete for images {batch_start} to {batch_last}")

    # Upload batch to Azure AI Search
    if documents:
        # NOTE: the upload call is currently disabled, so nothing is sent.
        # search_client.upload_documents(documents)
        print(f"Uploaded {len(documents)} images {batch_start} to {batch_last} to {index_name}.")


print(f"Vectorized images successfully added to {index_name}!")


No comments:

Post a Comment