This is a continuation of a series of articles on the text-to-speech
capabilities of Azure AI services. The earlier article [1] discussed the
production of MP3 audio for short texts of up to 5000 characters, which can
be sent in a single API call. This article discusses the conversion of large
texts with the help of the Batch Synthesis API for text to speech. This is
helpful for the creation of audiobooks: even though Speechify ranks high
among AI voice generators and ElevenLabs ranks high in AI voice cloning, the
approach shown here is good enough for that purpose.
import requests
import json
import time
from docx
import Document
import os
import uuid
# Azure AI Speech (text to speech) batch synthesis configuration.
# "JOBID" in the endpoint is a placeholder; synthesize_text() replaces it
# with a freshly generated job ID before sending the request.
endpoint = "https://<your_region>.api.cognitive.microsoft.com/texttospeech/batchsyntheses/JOBID?api-version=2024-04-01"
api_key = "<your_api_key>"

# Headers sent with the API requests: JSON payload plus the subscription key.
headers = {
    "Content-Type": "application/json",
    "Ocp-Apim-Subscription-Key": api_key,
}
def synthesize_text(inputs):
    """Start a batch synthesis job and return its ID.

    Args:
        inputs: Value sent as the "inputs" field of the request body. The
            main script passes a list of ``{"content": <text>}`` dicts, one
            per text to synthesize.

    Returns:
        The "id" field of the service's JSON response, as a string.

    Raises:
        Exception: If the service answers with an HTTP status of 400 or
            above; the message carries the response text.
    """
    body = {
        "inputKind": "PlainText",  # or SSML
        "synthesisConfig": {
            "voice": "en-US-GuyNeural",
        },
        # Replace with your custom voice name and deployment ID if you want
        # to use custom voice.
        # Multiple voices are supported, the mixture of custom voices and
        # platform voices is allowed.
        # Invalid voice name or deployment ID will be rejected.
        "customVoices": {
            # "YOUR_CUSTOM_VOICE_NAME": "YOUR_CUSTOM_VOICE_ID"
        },
        "inputs": inputs,
        "properties": {
            "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
        },
    }

    # The job ID is chosen on the client side: a fresh UUID takes the place
    # of the JOBID placeholder in the endpoint URL.
    response = requests.put(
        endpoint.replace("JOBID", str(uuid.uuid4())),
        headers=headers,
        json=body,
    )
    if response.status_code >= 400:
        raise Exception(f"Failed to start batch synthesis job: {response.text}")
    return str(response.json()["id"])
def get_synthesis(job_id: str, poll_interval: float = 5):
    """Poll a batch synthesis job until it finishes and return its details.

    Args:
        job_id: ID of the batch synthesis job, as returned by
            synthesize_text().
        poll_interval: Seconds to wait between two status checks.

    Returns:
        The parsed JSON response describing the job once its status is
        "Succeeded".

    Raises:
        Exception: If the job reports the status "Failed", or if the status
            request is rejected with an HTTP error other than 429 or 5xx.
    """
    # Same URL template and headers as the job-creation call, so the region
    # and key only need to be filled in once at the top of the file.
    url = endpoint.replace("JOBID", job_id)

    while True:
        response = requests.get(url, headers=headers)
        code = response.status_code

        if code < 400:
            job = response.json()
            status = job['status']
            if status == "Succeeded":
                return job
            if status == "Failed":
                # A failed job never becomes "Succeeded"; without this check
                # the loop would poll forever.
                raise Exception(f"Batch synthesis job {job_id} failed: {response.text}")
            print(f'batch synthesis job is still running, status [{status}]')
        elif code == 429 or code >= 500:
            # Throttling and server-side errors may clear up, so keep polling
            # but say so instead of retrying silently.
            print(f'batch synthesis status check returned HTTP {code}, retrying')
        else:
            # Other client errors (bad key, unknown job ID, ...) will not fix
            # themselves by retrying.
            raise Exception(f"Failed to get batch synthesis job: {response.text}")

        time.sleep(poll_interval)  # Wait before checking again
def get_text(file_path, encoding="utf-8"):
    """Read a text file, print its length, and return its contents.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The complete contents of the file as a string.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        file_contents = file.read()
    print(f"Length of text: {len(file_contents)}")
    return file_contents
# Main execution
if __name__ == "__main__":
    # Each numbered text file (2.txt through 6.txt) becomes one input of the
    # batch synthesis job.
    inputs = []
    for i in range(2, 7):
        input_file_name = f"{i}.txt"
        print(input_file_name)
        document_text = get_text(input_file_name)
        inputs.append({"content": document_text})

    # Start batch synthesis job
    jobId = synthesize_text(inputs)
    print(jobId)

    # Get synthesis result (blocks until the job has finished)
    audio = get_synthesis(jobId)
    print("Result:")
    print(audio)
Sample result:
Result:
{'id': '8cdbd29d-43f3-4878-b83c-b5326688e302', 'status':
'Succeeded', 'createdDateTime': '2025-03-08T08:16:42.8628654Z',
'lastActionDateTime': '2025-03-08T08:16:55.1972806', 'inputKind': 'PlainText',
'customVoices': {}, 'properties': {'timeToLiveInHours': 744, 'outputFormat':
'audio-48khz-192kbitrate-mono-mp3', 'concatenateResult': False,
'decompressOutputFiles': False, 'wordBoundaryEnabled': False,
'sentenceBoundaryEnabled': False, 'sizeInBytes': 38024640,
'succeededAudioCount': 5, 'failedAudioCount': 0, 'durationInMilliseconds':
1584360, 'billingDetails': {'neuralCharacters': 27710}}, 'synthesisConfig':
{'voice': 'en-US-GuyNeural'}, 'outputs': {'result':
'https://stttssvcproduse.blob.core.windows.net/batchsynthesis-output/da113ddc2b524d9e8b95c6f6b6ab2a61/8cdbd29d-43f3-4878-b83c-b5326688e302/results.zip?skoid=12345678-6c19-4f12-8d9f-57c205aaba10&sktid=33e01921-4d64-4f8c-a055-5bdaffd5e33d&skt=2025-03-07T13%3A02%3A37Z&ske=2025-03-13T13%3A07%3A37Z&sks=b&skv=2023-11-03&sv=2023-11-03&st=2025-03-08T08%3A11%3A59Z&se=2025-03-11T08%3A16%3A59Z&sr=b&sp=rl&sig=s0DIH6g6gryEgmDEHlbd2ilqC5xfuB2J7HJ%2FddOlHcA%3D'}}