This explains why location services from public cloud providers are unreliable for aerial drone images unless they use custom models trained to recognize a location from the features of the scene.
import requests 
import os 
from azure.cognitiveservices.vision.computervision import ComputerVisionClient 
from msrest.authentication import CognitiveServicesCredentials 
from PIL import Image 
 
# === Azure Computer Vision credentials === 
# Key and endpoint are read from the environment. os.getenv returns None when
# a variable is unset, and nothing here checks for that before the client is
# constructed.
vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY") 
vision_endpoint = os.getenv("AZURE_AI_VISION_ENDPOINT") 
computervision_client = ComputerVisionClient(vision_endpoint, CognitiveServicesCredentials(vision_api_key)) 
 
# === Azure Maps credentials === 
# Subscription key handed to get_coordinates_from_azure_maps further down.
azure_maps_key = os.getenv("AZURE_MAPS_SUBSCRIPTION_KEY") 
 
# === Load local image and get tags === 
# The frame is opened in binary mode and streamed to the Vision service,
# requesting only the "Tags" visual feature.
image_path = "frame5.jpg" 
with open(image_path, "rb") as img_stream: 
    analysis = computervision_client.analyze_image_in_stream( 
        img_stream, 
        visual_features=["Tags"] 
    ) 
 
# Keep only the names of tags reported with confidence above 0.5.
tags = [tag.name for tag in analysis.tags if tag.confidence > 0.5] 
 
# === Azure Maps Search API for landmark coordinates === 
def get_coordinates_from_azure_maps(landmark, azure_key, timeout=10):
    """Look up a free-text query with the Azure Maps Search Address API.

    Args:
        landmark: Text sent as the ``query`` parameter (here, an image tag).
        azure_key: Azure Maps subscription key.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A ``(lat, lon)`` tuple taken from the first search result, or ``None``
        when the service answers with an error status or the response holds
        no usable result.
    """
    url = "https://atlas.microsoft.com/search/address/json"
    params = {
        "api-version": "1.0",
        "subscription-key": azure_key,
        "query": landmark,
    }
    response = requests.get(url, params=params, timeout=timeout)
    if not response.ok:
        # An error status (bad key, throttling, ...) carries no results;
        # report it the same way as "nothing found".
        return None

    results = response.json().get("results", [])
    if not results:
        return None

    # Guard against a result entry that lacks coordinates instead of
    # raising KeyError.
    position = results[0].get("position")
    if not position:
        return None
    return (position["lat"], position["lon"])
# NOTE: this replaces the list of tags with a single hand-written query, so
# only "circular plaza" is looked up by the loop below.
tags = ["circular plaza"]
# === Display matched coordinates ===
# Query Azure Maps once per tag and report either the first hit or a miss.
for tag in tags:
    coords = get_coordinates_from_azure_maps(tag, azure_maps_key)
    if not coords:
        print(f"No match found for tag: {tag}")
        continue
    print(f"Landmark: {tag}, Latitude: {coords[0]}, Longitude: {coords[1]}")
 
""" 
Output: 
Landmark: outdoor, Latitude: 39.688359, Longitude: -84.235051 
Landmark: text, Latitude: 17.9739757, Longitude: -76.7856201 
Landmark: building, Latitude: 23.3531395, Longitude: -75.0597782 
Landmark: car, Latitude: 18.5366554, Longitude: -72.4020263 
Landmark: urban design, Latitude: 48.4732981, Longitude: 35.0019145 
Landmark: metropolitan area, Latitude: 55.6033166, Longitude: 13.0013362 
Landmark: urban area, Latitude: 8.448839, Longitude: -13.258005 
Landmark: neighbourhood, Latitude: 54.8811412, Longitude: -6.2779797 
Landmark: intersection, Latitude: 34.899284, Longitude: -83.392743 
Landmark: vehicle, Latitude: 38.6151446, Longitude: -121.273215 
Landmark: residential area, Latitude: 9.982962, Longitude: 76.2954466 
Landmark: city, Latitude: 19.4326773, Longitude: -99.1342112 
Landmark: traffic, Latitude: 23.5786896, Longitude: 87.1950397 
Landmark: street, Latitude: 51.1250213, Longitude: -2.7313088 
Landmark: aerial, Latitude: 34.95435, Longitude: -117.826011 
 
#  
# None of these is even close to the nearest neighborhood: https://www.google.com/maps?q=42.3736,-71.1097 
And when trying Google Cloud: 
gcloud ml vision detect-landmarks frame5.jpg
{
"responses": [
{}
]
} 
 
import nyckel 
import os 
nyckel_client_id = os.getenv("NYCKEL_CLIENT_ID") 
nyckel_client_secret = os.getenv("NYCKEL_CLIENT_SECRET") 
credentials = nyckel.Credentials(nyckel_client_id, nyckel_client_secret) 
image_url = os.getenv("CIRCULAR_BUILDING_SAS_URL").strip('"') 
response = nyckel.invoke("landmark-identifier", image_url, credentials) 
print(response) 
# Output: 
# {'labelName': 'Yellowstone National Park', 'labelId': 'label_wottnvl9ole6ch4o', 'confidence': 0.02} 
""" 
Or the landmarks may not be detected at all: 
import requests 
import os 
from azure.cognitiveservices.vision.computervision import ComputerVisionClient 
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes 
from msrest.authentication import CognitiveServicesCredentials 
from PIL import Image 
from pprint import pprint 
 
# === Azure Computer Vision credentials === 
# Key and endpoint are read from the environment (None when unset).
vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY") 
vision_endpoint = os.getenv("AZURE_AI_VISION_ENDPOINT") 
computervision_client = ComputerVisionClient(vision_endpoint, CognitiveServicesCredentials(vision_api_key)) 
# Image URL taken from CIRCULAR_BUILDING_SAS_URL with surrounding double
# quotes removed. If the variable is unset, os.getenv returns None and the
# .strip call raises AttributeError.
scene_url = os.getenv("CIRCULAR_BUILDING_SAS_URL").strip('"') 
 
def get_landmark_info(image_path_or_url):
    """
    Analyze an image with Azure Computer Vision and summarize the result.

    The argument is treated as a URL when it starts with "http"; anything
    else is opened as a local file and streamed to the service.

    Returns a dict with:
      - "description": text of the first caption, or a placeholder string
        when the service returned no captions
      - "landmark_tags": names of returned tags that contain "landmark"
      - "categories": names of all returned categories
    """
    requested_features = [
        VisualFeatureTypes.categories,
        VisualFeatureTypes.description,
        VisualFeatureTypes.tags,
    ]

    if not image_path_or_url.startswith("http"):
        # Local file: send the raw bytes
        with open(image_path_or_url, "rb") as image_stream:
            analysis = computervision_client.analyze_image_in_stream(image_stream, requested_features)
    else:
        # Remote image: let the service fetch it
        analysis = computervision_client.analyze_image(image_path_or_url, requested_features)

    # Only tags whose own name mentions "landmark" are kept
    landmark_tags = []
    for tag in analysis.tags:
        if "landmark" in tag.name.lower():
            landmark_tags.append(tag.name)

    if analysis.description.captions:
        description = analysis.description.captions[0].text
    else:
        description = "No description available"

    category_names = [cat.name for cat in analysis.categories]

    return {
        "description": description,
        "landmark_tags": landmark_tags,
        "categories": category_names,
    }
 
# Example usage 
# Runs only when the file is executed as a script: analyzes the image at
# scene_url and pretty-prints the returned dict.
if __name__ == "__main__": 
    landmark_data = get_landmark_info(scene_url) 
    pprint(landmark_data) 
 
 
### output: 
# {'categories': ['abstract_', 'others_', 'outdoor_', 'text_sign'], 
#  'description': 'graphical user interface', 
#  'landmark_tags': []} 
# The actual location is 
# 42.371305, -71.117339 
# Orthodox Minyan at Harvard Hillel, 52 Mt Auburn St, Cambridge, MA 02138
# The GPS information provided by the drone is the most accurate source in this regard, for example: 
import json 
import numpy as np 
 
# Replace this with actual GPS bounds for transformation
# Example: top-left, top-right, bottom-right, bottom-left in pixel & GPS
pixel_bounds = np.array([[0, 0], [4096, 0], [4096, 4096], [0, 4096]])
gps_bounds = np.array([[39.735, -104.997], [39.735, -104.989],
                       [39.729, -104.989], [39.729, -104.997]])

# Compute affine transform matrix from pixel to GPS.
# A column of ones is appended to the pixel corners so the least-squares fit
# includes a translation term. Fitting on the raw (x, y) columns alone gives a
# purely linear map that always sends pixel (0, 0) to GPS (0, 0).
_pixel_bounds_h = np.hstack([pixel_bounds, np.ones((len(pixel_bounds), 1))])
A = np.linalg.lstsq(_pixel_bounds_h, gps_bounds, rcond=None)[0]  # shape (3, 2)


def pixel_to_gps(coord):
    """Map an (x, y) pixel coordinate to (lat, lon) using the affine fit.

    Args:
        coord: Two-element sequence holding the pixel x and y values.

    Returns:
        A ``(lat, lon)`` tuple obtained by applying ``A`` to ``[x, y, 1]``.
    """
    x, y = coord
    # The trailing 1 selects the translation row of A.
    return tuple(np.dot([x, y, 1.0], A))
 
def parse_json_gps(json_data):
    """Convert per-frame pixel coordinates to GPS coordinates.

    Args:
        json_data: Iterable of frames; each frame is either ``None`` or an
            iterable of pixel coordinates accepted by ``pixel_to_gps``.

    Returns:
        A list with one entry per non-``None`` frame, each entry being the
        list of ``pixel_to_gps`` results for that frame's coordinates.
        ``None`` frames are dropped, not kept as placeholders.
    """
    return [
        [pixel_to_gps(point) for point in frame]
        for frame in json_data
        if frame is not None
    ]
 
# Example JSON input 
# One entry per frame; each non-None frame holds four pixel coordinates.
# Some values (e.g. 4151, 4275) lie beyond the 4096 extent used in
# pixel_bounds, so those points are extrapolated by the fitted transform.
data = [None, [[3132, 4151], [3354, 2924], [4044, 3056], [3824, 4275]], 
              [[3095, 4164], [3318, 2939], [4006, 3073], [3787, 4289]]] 
 
# parse_json_gps drops None frames, so the "Frame N" numbers printed below
# count only the remaining frames and do not match positions in `data`.
gps_output = parse_json_gps(data) 
for i, frame in enumerate(gps_output): 
    print(f"Frame {i+1}:") 
    for lat, lon in frame: 
        print(f"Latitude: {lat:.6f}, Longitude: {lon:.6f}") 
 
No comments:
Post a Comment