Monday, April 7, 2025

 The following script can be used to convert the manuscript of a book into its corresponding audio production.

Option 1: individual chapters

import azure.cognitiveservices.speech as speechsdk

import time

def batch_text_to_speech(text, output_filename):
    """Synthesize ``text`` to an MP3 file with the Azure Speech SDK.

    Args:
        text: Plain text to be spoken.
        output_filename: Path of the audio file the synthesizer writes.

    Returns:
        None. The outcome is reported on stdout: a confirmation on success,
        or the cancellation reason (plus the error details when the
        cancellation was caused by an error).
    """
    # Azure Speech Service configuration
    speech_key = "<use-your-speech-key>"
    service_region = "eastus"

    # Configure speech synthesis
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region,
    )

    # Set output format to MP3
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3
    )
    speech_config.speech_synthesis_voice_name = "en-US-BrianMultilingualNeural"

    # Create audio config for file output
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_filename)

    # Create speech synthesizer
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    # NOTE: the whole text is sent in a single request. split_large_text()
    # exists for chunking oversized input, but it is not applied here.
    result = synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized to {output_filename}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

def split_large_text(text, max_length=9000):
    """Cut ``text`` into consecutive slices of at most ``max_length`` characters.

    Slices are taken at fixed offsets (0, max_length, 2 * max_length, ...), so
    only the final slice can be shorter. An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), max_length):
        stop = start + max_length
        pieces.append(text[start:stop])
    return pieces

input_filename = ""
large_text = ""

# Chapters are looked up as 1.txt .. 99.txt in the current working directory;
# each one found is synthesized to the matching <n>.mp3.
for i in range(1, 100):
    input_filename = f"{i}.txt"
    try:
        # Read the chapter and close the file before the (slow) synthesis call.
        with open(input_filename, "r", encoding="utf-8") as fin:
            large_text = fin.read()
    except FileNotFoundError:
        # A manuscript with fewer than 99 chapters should not abort the run.
        continue

    # Built from the chapter number rather than str.replace("txt", "mp3"),
    # which would rewrite "txt" anywhere in the name.
    output_filename = f"{i}.mp3"
    print(input_filename)
    print(f"{len(large_text)} {output_filename}")
    batch_text_to_speech(large_text, output_filename)

Option 2: whole manuscript

import json
import os
import time
import uuid

import requests
from docx import Document

# Azure AI Speech batch synthesis configuration.
# "JOBID" is a placeholder: synthesize_text() replaces it with a fresh UUID
# for every job it submits.
endpoint = "https://eastus.api.cognitive.microsoft.com/texttospeech/batchsyntheses/JOBID?api-version=2024-04-01"
api_key = "<your_api_key>"

# Headers sent with the batch synthesis request.
headers = {
    "Content-Type": "application/json",
    "Ocp-Apim-Subscription-Key": api_key,
}

def synthesize_text(inputs):
    """Submit a batch synthesis job and return its job id.

    Args:
        inputs: The value sent as the request's "inputs" field; the caller in
            this file passes a list of ``{"content": <text>}`` dicts.

    Returns:
        The "id" field of the service's JSON response, as a string.

    Raises:
        RuntimeError: If the service answers with an HTTP status >= 400.
    """
    body = {
        "inputKind": "PlainText",  # or SSML
        "synthesisConfig": {
            "voice": "en-US-BrianMultilingualNeural",
        },
        # Replace with your custom voice name and deployment ID if you want to use custom voice.
        # Multiple voices are supported, the mixture of custom voices and platform voices is allowed.
        # Invalid voice name or deployment ID will be rejected.
        "customVoices": {
            # "YOUR_CUSTOM_VOICE_NAME": "YOUR_CUSTOM_VOICE_ID"
        },
        "inputs": inputs,
        "properties": {
            "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
        },
    }

    # The job id is chosen by the client: a new UUID replaces the JOBID
    # placeholder in the endpoint URL for every submission.
    job_url = endpoint.replace("JOBID", str(uuid.uuid4()))
    response = requests.put(job_url, headers=headers, json=body)

    if response.status_code >= 400:
        raise RuntimeError(f"Failed to start batch synthesis job: {response.text}")

    return str(response.json()["id"])

def get_synthesis(job_id: str):
    """Poll a batch synthesis job until it succeeds and return its description.

    Args:
        job_id: Id of the batch synthesis job to query.

    Returns:
        The parsed JSON body of the status response whose "status" is
        "Succeeded".

    Raises:
        RuntimeError: If a status request returns an HTTP status >= 400, or
            the job reports the status "Failed". Without these exits the
            polling loop would never terminate.
    """
    url = f'https://eastus.api.cognitive.microsoft.com/texttospeech/batchsyntheses/{job_id}?api-version=2024-04-01'
    request_headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": api_key,
    }

    while True:
        response = requests.get(url, headers=request_headers)
        if response.status_code >= 400:
            raise RuntimeError(f"Failed to get batch synthesis job: {response.text}")

        job = response.json()
        status = job['status']
        if status == "Succeeded":
            return job
        if status == "Failed":
            raise RuntimeError(f"Batch synthesis job failed: {response.text}")

        print(f'batch synthesis job is still running, status [{status}]')
        time.sleep(5)  # Wait for 5 seconds before checking again

def get_text(file_path):
    """Read a text file and return its full contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a string. The character count is also printed.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()
    print(f"Length of text: {len(file_contents)}")
    return file_contents

if __name__ == "__main__":
    # Collect every chapter file 1.txt .. 99.txt found in the current working
    # directory into one batch synthesis request, then wait for the result.
    inputs = []
    for i in range(1, 100):
        input_file_name = f"{i}.txt"
        try:
            document_text = get_text(input_file_name)
        except FileNotFoundError:
            # A manuscript with fewer than 99 chapters should not abort the run.
            continue
        print(input_file_name)
        inputs.append({"content": document_text})

    if not inputs:
        # Nothing was read, so there is no text to submit.
        raise SystemExit("No chapter files (1.txt .. 99.txt) were found.")

    jobId = synthesize_text(inputs)
    print(jobId)

    # Get audio result
    audio = get_synthesis(jobId)
    print("Result:")
    print(audio)


No comments:

Post a Comment