This explains why location services from public cloud providers are unreliable for aerial drone images unless they use custom models trained on features of the scene. The following attempt, which feeds Azure AI Vision tags into the Azure Maps Search API, illustrates the problem:
import requests
import os
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

# === Azure Computer Vision credentials ===
vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY")
vision_endpoint = os.getenv("AZURE_AI_VISION_ENDPOINT")
computervision_client = ComputerVisionClient(vision_endpoint, CognitiveServicesCredentials(vision_api_key))

# === Azure Maps credentials ===
azure_maps_key = os.getenv("AZURE_MAPS_SUBSCRIPTION_KEY")

# === Load local image and get tags ===
image_path = "frame5.jpg"
with open(image_path, "rb") as img_stream:
    analysis = computervision_client.analyze_image_in_stream(
        img_stream,
        visual_features=["Tags"]
    )
tags = [tag.name for tag in analysis.tags if tag.confidence > 0.5]

# === Azure Maps Search API for landmark coordinates ===
def get_coordinates_from_azure_maps(landmark, azure_key):
    url = "https://atlas.microsoft.com/search/address/json"
    params = {
        "api-version": "1.0",
        "subscription-key": azure_key,
        "query": landmark
    }
    response = requests.get(url, params=params)
    data = response.json()
    results = data.get("results", [])
    if results:
        position = results[0]["position"]
        return (position["lat"], position["lon"])
    return None
tags = ["circular plaza"]
# === Display matched coordinates ===
for tag in tags:
    coords = get_coordinates_from_azure_maps(tag, azure_maps_key)
    if coords:
        print(f"Landmark: {tag}, Latitude: {coords[0]}, Longitude: {coords[1]}")
    else:
        print(f"No match found for tag: {tag}")
"""
Output:
Landmark: outdoor, Latitude: 39.688359, Longitude: -84.235051
Landmark: text, Latitude: 17.9739757, Longitude: -76.7856201
Landmark: building, Latitude: 23.3531395, Longitude: -75.0597782
Landmark: car, Latitude: 18.5366554, Longitude: -72.4020263
Landmark: urban design, Latitude: 48.4732981, Longitude: 35.0019145
Landmark: metropolitan area, Latitude: 55.6033166, Longitude: 13.0013362
Landmark: urban area, Latitude: 8.448839, Longitude: -13.258005
Landmark: neighbourhood, Latitude: 54.8811412, Longitude: -6.2779797
Landmark: intersection, Latitude: 34.899284, Longitude: -83.392743
Landmark: vehicle, Latitude: 38.6151446, Longitude: -121.273215
Landmark: residential area, Latitude: 9.982962, Longitude: 76.2954466
Landmark: city, Latitude: 19.4326773, Longitude: -99.1342112
Landmark: traffic, Latitude: 23.5786896, Longitude: 87.1950397
Landmark: street, Latitude: 51.1250213, Longitude: -2.7313088
Landmark: aerial, Latitude: 34.95435, Longitude: -117.826011
"""
# None of these is even close to the actual neighborhood: https://www.google.com/maps?q=42.3736,-71.1097
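To put "not even close" in numbers, here is a minimal haversine sketch (the helper name haversine_km is mine, and the true location is taken from the map link above) that measures how far a few of the Azure Maps guesses land from the actual scene:
from math import radians, sin, cos, asin, sqrt

def haversine_km(lat1, lon1, lat2, lon2):
    # Great-circle distance between two (lat, lon) points in kilometers
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371.0 * asin(sqrt(a))

actual = (42.3736, -71.1097)  # true scene location from the map link above
guesses = {
    "outdoor": (39.688359, -84.235051),
    "city": (19.4326773, -99.1342112),
    "street": (51.1250213, -2.7313088),
}
for tag, (lat, lon) in guesses.items():
    print(f"{tag}: {haversine_km(*actual, lat, lon):,.0f} km away")
Every one of these guesses is well over a thousand kilometers off, which is what a tag-to-address lookup should be expected to produce: the tags describe scene categories, not places.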
And when trying Google Cloud Vision's landmark detection:
gcloud ml vision detect-landmarks frame5.jpg
{
  "responses": [
    {}
  ]
}
A pretrained third-party classifier such as Nyckel's landmark-identifier fares no better:
import nyckel
import os
nyckel_client_id = os.getenv("NYCKEL_CLIENT_ID")
nyckel_client_secret = os.getenv("NYCKEL_CLIENT_SECRET")
credentials = nyckel.Credentials(nyckel_client_id, nyckel_client_secret)
image_url = os.getenv("CIRCULAR_BUILDING_SAS_URL").strip('"')
response = nyckel.invoke("landmark-identifier", image_url, credentials)
print(response)
# Output:
# {'labelName': 'Yellowstone National Park', 'labelId': 'label_wottnvl9ole6ch4o', 'confidence': 0.02}
Or the landmarks may not be detected at all:
import os
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from pprint import pprint

# === Azure Computer Vision credentials ===
vision_api_key = os.getenv("AZURE_AI_VISION_API_KEY")
vision_endpoint = os.getenv("AZURE_AI_VISION_ENDPOINT")
computervision_client = ComputerVisionClient(vision_endpoint, CognitiveServicesCredentials(vision_api_key))

scene_url = os.getenv("CIRCULAR_BUILDING_SAS_URL").strip('"')

def get_landmark_info(image_path_or_url):
    """
    Detects landmarks in an aerial image and returns detailed metadata.
    Supports both local file paths and image URLs.
    """
    visual_features = [VisualFeatureTypes.categories, VisualFeatureTypes.description, VisualFeatureTypes.tags]
    if image_path_or_url.startswith("http"):
        analysis = computervision_client.analyze_image(image_path_or_url, visual_features)
    else:
        with open(image_path_or_url, "rb") as image_stream:
            analysis = computervision_client.analyze_image_in_stream(image_stream, visual_features)
    # Extract landmark-related tags and descriptions
    landmark_tags = [tag.name for tag in analysis.tags if "landmark" in tag.name.lower()]
    description = analysis.description.captions[0].text if analysis.description.captions else "No description available"
    result = {
        "description": description,
        "landmark_tags": landmark_tags,
        "categories": [cat.name for cat in analysis.categories]
    }
    return result

# Example usage
if __name__ == "__main__":
    landmark_data = get_landmark_info(scene_url)
    pprint(landmark_data)
### output:
# {'categories': ['abstract_', 'others_', 'outdoor_', 'text_sign'],
# 'description': 'graphical user interface',
# 'landmark_tags': []}
# The actual location is:
# 42.371305, -71.117339
# Orthodox Minyan at Harvard Hillel, 52 Mt Auburn St, Cambridge, MA 02138
The GPS information provided by the drone itself is the most accurate source in this regard; given the GPS bounds of a frame, pixel coordinates can be mapped to GPS coordinates with an affine approximation, for example:
import numpy as np

# Replace this with actual GPS bounds for transformation
# Example: top-left, top-right, bottom-right, bottom-left in pixel & GPS
pixel_bounds = np.array([[0, 0], [4096, 0], [4096, 4096], [0, 4096]])
gps_bounds = np.array([[39.735, -104.997], [39.735, -104.989],
                       [39.729, -104.989], [39.729, -104.997]])

# Compute affine transform from pixel to GPS. A column of ones is appended so the
# least-squares fit includes a translation term; without it, pixel (0, 0) could
# only ever map to GPS (0, 0).
pixel_bounds_h = np.hstack([pixel_bounds, np.ones((len(pixel_bounds), 1))])
A = np.linalg.lstsq(pixel_bounds_h, gps_bounds, rcond=None)[0]

def pixel_to_gps(coord):
    """Map a pixel coordinate to GPS using the affine approximation."""
    return tuple(np.dot(np.append(coord, 1.0), A))

def parse_json_gps(json_data):
    """Convert per-frame lists of pixel coordinates (parsed from JSON) to GPS coordinates."""
    gps_coords = []
    for frame in json_data:
        if frame is None:
            continue
        frame_coords = [pixel_to_gps(coord) for coord in frame]
        gps_coords.append(frame_coords)
    return gps_coords

# Example input, as parsed from the detector's JSON output
data = [None, [[3132, 4151], [3354, 2924], [4044, 3056], [3824, 4275]],
        [[3095, 4164], [3318, 2939], [4006, 3073], [3787, 4289]]]
gps_output = parse_json_gps(data)
for i, frame in enumerate(gps_output):
    print(f"Frame {i+1}:")
    for lat, lon in frame:
        print(f"Latitude: {lat:.6f}, Longitude: {lon:.6f}")
#codingexercise: https://1drv.ms/w/c/d609fb70e39b65c8/EfWTrWDGqOxFvkw4sb48NWUBmxjiu90rja-WxBLPsbgS0Q?e=quyCp2