Tuesday, March 4, 2025

 This follows up on a previous article about splitting a large text for use with a text-to-speech API:

import azure.cognitiveservices.speech as speechsdk

import io

import wave

def split_text(text, max_chunk_size=5000):
    """Split *text* into whitespace-delimited chunks of at most ~max_chunk_size characters.

    Words are never cut in half; a single word longer than
    ``max_chunk_size`` becomes a chunk of its own.

    Args:
        text: The input string to split.
        max_chunk_size: Soft upper bound on each chunk's length, in characters.

    Returns:
        A list of non-empty chunk strings (an empty list for empty input).
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_size = 0
    for word in words:
        # +1 accounts for the space that will join this word to the chunk.
        if current_size + len(word) + 1 > max_chunk_size:
            # Bug fix: only flush a non-empty chunk. Previously, a first
            # word longer than max_chunk_size produced an empty '' chunk.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = len(word)
        else:
            current_chunk.append(word)
            current_size += len(word) + 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def synthesize_text(speech_synthesizer, text):
    """Synthesize speech for *text*.

    Args:
        speech_synthesizer: A configured ``speechsdk.SpeechSynthesizer``.
        text: The text to speak.

    Returns:
        The synthesized audio bytes, or None if synthesis did not complete.
    """
    result = speech_synthesizer.speak_text_async(text).get()
    # Guard clause: anything other than a completed synthesis is a failure.
    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Speech synthesis failed: {result.reason}")
        return None
    return result.audio_data

def combine_audio(audio_chunks):
    """Combine multiple complete WAV byte strings into one WAV file.

    Each element of *audio_chunks* is expected to be a full RIFF/WAV byte
    string (header + PCM frames), as produced by the speech SDK in Riff
    output format. All chunks must share the same sample rate, sample
    width, and channel count.

    Bug fix: the original passed each chunk's raw bytes to ``writeframes``,
    which embedded every chunk's RIFF header into the audio stream as bogus
    samples (audible clicks, wrong length). Each chunk is now parsed with
    the ``wave`` module and only its PCM frames are written.

    Args:
        audio_chunks: Iterable of WAV byte strings.

    Returns:
        Bytes of a single well-formed WAV file.

    Raises:
        wave.Error: If *audio_chunks* is empty (no stream parameters are
            ever set) or a chunk is not a valid WAV file.
    """
    combined = io.BytesIO()
    with wave.open(combined, 'wb') as wav_file:
        params_set = False
        for audio_chunk in audio_chunks:
            with wave.open(io.BytesIO(audio_chunk), 'rb') as chunk_file:
                if not params_set:
                    # Adopt stream parameters from the first chunk.
                    wav_file.setparams(chunk_file.getparams())
                    params_set = True
                # Write only the PCM frames, never the chunk's header bytes.
                wav_file.writeframes(chunk_file.readframes(chunk_file.getnframes()))
    return combined.getvalue()

def process_large_text(text, speech_key, service_region):
    """Process large text by splitting, synthesizing, and combining audio.

    Args:
        text: Full input text of arbitrary length.
        speech_key: Azure Speech resource subscription key.
        service_region: Azure region of the Speech resource.

    Returns:
        The combined audio bytes for the whole text, or None if no chunk
        was synthesized successfully.
    """
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Riff (WAV) output so the per-chunk results can be stitched together.
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )
    # audio_config=None keeps the audio in memory instead of the speakers.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)

    audio_chunks = []
    for text_chunk in split_text(text):
        audio = synthesize_text(synthesizer, text_chunk)
        if audio:
            audio_chunks.append(audio)

    return combine_audio(audio_chunks) if audio_chunks else None

# Usage example
if __name__ == "__main__":
    speech_key = "YOUR_SPEECH_KEY"
    service_region = "YOUR_SERVICE_REGION"

    # Example of a large text
    large_text = "Your very large text goes here... " * 1000

    result = process_large_text(large_text, speech_key, service_region)
    if not result:
        print("Failed to process the text.")
    else:
        with open("output.wav", "wb") as audio_file:
            audio_file.write(result)
        print("Audio file 'output.wav' has been created.")

A large Word document can be extracted to plain text as shown below:

from docx import Document
import os

input_file = 'Document1.docx'
output_file = 'Text1.txt'


def process_large_file(input_file_path, output_file_path):
    """Extract all paragraph text from a .docx file into a plain-text file.

    Args:
        input_file_path: Path to the source Word document.
        output_file_path: Path to the text file to write (created if missing).

    Errors opening or reading either file are caught and printed rather
    than raised, matching the original best-effort behavior.
    """
    try:
        doc = Document(input_file_path)
        print(f"Number of paragraphs: {len(doc.paragraphs)}")
        # 'a' appends, so repeated runs accumulate text; use 'w' to overwrite.
        with open(output_file_path, 'a', encoding='utf-8') as out:
            for para in doc.paragraphs:
                chunk = para.text
                if chunk:
                    out.write(chunk)
                    out.write("\r\n")
    except Exception as e:
        print(f"An error occurred: {e}")


process_large_file(input_file, output_file)
print(f"Text has been extracted from {input_file} and written to {output_file}")

--

https://ezcloudiac.com/info/index.html


No comments:

Post a Comment