The following script can be used to convert the manuscript of a book into its corresponding audio production.
Option 1: individual chapters
import azure.cognitiveservices.speech as speechsdk
import time
def batch_text_to_speech(
    text,
    output_filename,
    *,
    speech_key="<use-your-speech-key>",
    service_region="eastus",
    voice_name="en-US-BrianMultilingualNeural",
):
    """Synthesize ``text`` with the Azure Speech SDK and write it to an MP3 file.

    Args:
        text: Plain text to speak.
        output_filename: Path of the audio file the synthesizer writes to.
        speech_key: Subscription key passed to ``speechsdk.SpeechConfig``.
        service_region: Region passed to ``speechsdk.SpeechConfig``.
        voice_name: Value assigned to ``speech_synthesis_voice_name``.

    Returns:
        True when the result reason is ``SynthesizingAudioCompleted``,
        False otherwise. Cancellation details are printed, not raised.
    """
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region,
    )
    # Output format: 48 kHz, 192 kbit/s, mono MP3.
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3
    )
    speech_config.speech_synthesis_voice_name = voice_name

    # Route the synthesized audio to a file instead of the default speaker.
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_filename)
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    # NOTE: the whole text is sent in a single request; split_large_text()
    # is available if the input has to be chunked first.
    result = synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized to {output_filename}")
        return True

    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

    return False
def split_large_text(text, max_length=9000):
    """Split ``text`` into consecutive chunks of at most ``max_length`` characters.

    The chunks are fixed-width slices, so joining them reproduces ``text``
    exactly. An empty ``text`` yields an empty list.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings of ``text`` in their original order.

    Raises:
        ValueError: If ``max_length`` is zero or negative. A negative step
            would otherwise make ``range`` empty and silently drop the text.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    return [
        text[start:start + max_length]
        for start in range(0, len(text), max_length)
    ]
# Convert every chapter file 1.txt .. 99.txt that exists into 1.mp3 .. 99.mp3.
for i in range(1, 100):
    input_filename = f"{i}.txt"
    output_filename = f"{i}.mp3"
    print(input_filename)
    try:
        # Explicit encoding so the manuscript is decoded the same way on
        # every platform instead of using the locale default.
        with open(input_filename, "r", encoding="utf-8") as fin:
            large_text = fin.read()
    except FileNotFoundError:
        # Not every chapter number has to exist; skip gaps instead of crashing.
        continue
    print(str(len(large_text)) + " " + output_filename)
    batch_text_to_speech(large_text, output_filename)
Option 2: whole manuscript
import json
import os
import time
import uuid

import requests
from docx import Document
# Azure AI Speech batch synthesis configuration.
# "JOBID" in the endpoint is a placeholder that synthesize_text() replaces
# with a freshly generated job identifier.
endpoint = "https://eastus.api.cognitive.microsoft.com/texttospeech/batchsyntheses/JOBID?api-version=2024-04-01"
api_key = "<your_api_key>"
# Headers sent with the request that creates the batch synthesis job.
headers = {
    "Content-Type": "application/json",
    "Ocp-Apim-Subscription-Key": api_key,
}
def synthesize_text(inputs):
    """Create a batch synthesis job for ``inputs`` and return its job ID.

    Args:
        inputs: Value sent as the ``"inputs"`` field of the request body.
            The caller in this file passes a list of
            ``{"content": <text>}`` dicts, one per chapter.

    Returns:
        The ``"id"`` field of the service's JSON response, as a string.

    Raises:
        RuntimeError: If the service responds with an HTTP status >= 400.
    """
    body = {
        "inputKind": "PlainText",  # or SSML
        "synthesisConfig": {
            "voice": "en-US-BrianMultilingualNeural",
        },
        # Replace with your custom voice name and deployment ID if you want to use custom voice.
        # Multiple voices are supported, the mixture of custom voices and platform voices is allowed.
        # Invalid voice name or deployment ID will be rejected.
        "customVoices": {
            # "YOUR_CUSTOM_VOICE_NAME": "YOUR_CUSTOM_VOICE_ID"
        },
        "inputs": inputs,
        "properties": {
            "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
        },
    }

    # The job ID is chosen client-side: a new UUID replaces the JOBID
    # placeholder in the endpoint URL for every call.
    job_url = endpoint.replace("JOBID", str(uuid.uuid4()))
    response = requests.put(job_url, headers=headers, json=body)
    if response.status_code >= 400:
        raise RuntimeError(f"Failed to start batch synthesis job: {response.text}")
    return f'{response.json()["id"]}'
def get_synthesis(job_id: str):
    """Poll a batch synthesis job until it succeeds and return its JSON body.

    The job is queried every 5 seconds while it is still in progress.

    Args:
        job_id: Identifier of the batch synthesis job to poll.

    Returns:
        The decoded JSON response of the first poll whose ``status``
        contains ``"Succeeded"``.

    Raises:
        RuntimeError: If a poll returns an HTTP status >= 400, or if the
            job reports a ``Failed`` status. Without these checks the loop
            would never terminate for a broken or failed job.
    """
    # URL and headers do not change between polls, so build them once.
    url = f'https://eastus.api.cognitive.microsoft.com/texttospeech/batchsyntheses/{job_id}?api-version=2024-04-01'
    request_headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": api_key,
    }

    while True:
        response = requests.get(url, headers=request_headers)
        if response.status_code >= 400:
            raise RuntimeError(f"Failed to get batch synthesis job: {response.text}")

        job = response.json()
        status = job['status']
        if "Succeeded" in status:
            return job
        if "Failed" in status:
            raise RuntimeError(f"Batch synthesis job failed: {response.text}")

        print(f'batch synthesis job is still running, status [{status}]')
        time.sleep(5)  # Wait for 5 seconds before checking again
def get_text(file_path, encoding="utf-8"):
    """Read a text file, print its length in characters, and return its contents.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        The full contents of the file as a string.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, "r", encoding=encoding) as file:
        file_contents = file.read()
    print(f"Length of text: {len(file_contents)}")
    return file_contents
if __name__ == "__main__":
    # Collect every chapter file 1.txt .. 99.txt that exists into one job.
    inputs = []
    for i in range(1, 100):
        input_file_name = f"{i}.txt"
        print(input_file_name)
        try:
            document_text = get_text(input_file_name)
        except FileNotFoundError:
            # Chapter numbers may have gaps; skip missing files.
            continue
        inputs.append({"content": document_text})

    if not inputs:
        # Nothing to synthesize: stop before submitting an empty job.
        raise SystemExit("No chapter files (1.txt .. 99.txt) were found.")

    jobId = synthesize_text(inputs)
    print(jobId)

    # Get audio result
    audio = get_synthesis(jobId)
    print("Result:")
    print(audio)
No comments:
Post a Comment