Text-to-Speech

Service

Free Tier

Pricing Model

Docs

Amazon Polly

5M std + 1M neural

~$4 /M (std), ~$16 /M (neural) after free tier

Google Cloud TTS

4M std + 1M WaveNet

~$4 /M (std), ~$16 /M (WaveNet) pay-as-you-go

Azure TTS

500K neural ongoing

~$15 /M (neural), discount at higher volumes

IBM Watson TTS

10K chars Lite plan

~$0.02 /1K (i.e. ~$20 /M). Enterprise options available

ElevenLabs

10K chars monthly

From ~$5/mo (30K chars) up to $330/mo (2M chars). Enterprise plans available

Example Code

1. Amazon Polly

# Requires: pip install boto3
import boto3
import os

def synthesize_polly(text: str, output_filename: str = "polly_output.mp3", region: str | None = None) -> None:
    """Synthesize ``text`` to an MP3 file using AWS Polly.

    Assumes AWS credentials are already configured (e.g. via environment
    variables or ~/.aws/credentials).

    Args:
        text: The text to convert to speech.
        output_filename: Path of the MP3 file to write.
        region: AWS region to use. When omitted, the AWS_REGION environment
            variable is used, falling back to "us-east-1".

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        raised while calling Polly or writing the file are caught and printed.
    """
    # Local import keeps this snippet self-contained.
    from contextlib import closing

    aws_region = region or os.environ.get("AWS_REGION", "us-east-1")
    try:
        polly = boto3.client("polly", region_name=aws_region)
        response = polly.synthesize_speech(
            Text=text,
            OutputFormat="mp3",
            VoiceId="Joanna",  # Example voice
        )

        if "AudioStream" not in response:
            print("Error: Could not stream audio from Polly.")
            return

        # The audio body holds an open HTTP connection; closing() guarantees
        # it is released even if opening or writing the output file fails.
        with closing(response["AudioStream"]) as stream:
            with open(output_filename, "wb") as f:
                f.write(stream.read())
        print(f"Audio saved to {output_filename}")

    except Exception as e:
        print(f"Error calling AWS Polly: {e}")

# Example:
# synthesize_polly("Hello from AWS Polly!")

2. Google Cloud TTS

# Requires: pip install google-cloud-texttospeech
from google.cloud import texttospeech
import os

def synthesize_google_tts(text: str, output_filename: str = "gcloud_tts_output.mp3"):
    """Convert ``text`` to MP3 speech with Google Cloud Text-to-Speech.

    Credentials are expected to be supplied through the
    GOOGLE_APPLICATION_CREDENTIALS environment variable.

    Args:
        text: The text to synthesize.
        output_filename: Path of the MP3 file to write.

    Returns:
        None. Success and failure are both reported with ``print``; any
        exception raised along the way is caught and printed.
    """
    try:
        tts_client = texttospeech.TextToSpeechClient()

        # Build the three request parts in one place; the voice shown here is
        # only an example (US English, neutral gender).
        request_parts = {
            "input": texttospeech.SynthesisInput(text=text),
            "voice": texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
            ),
            "audio_config": texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
            ),
        }
        synthesis = tts_client.synthesize_speech(**request_parts)

        with open(output_filename, "wb") as audio_file:
            audio_file.write(synthesis.audio_content)
        print(f"Audio saved to {output_filename}")

    except Exception as e:
        print(f"Error calling Google Cloud TTS: {e}")

# Example:
# synthesize_google_tts("Hello from Google Cloud Text-to-Speech!")

3. Azure TTS

# Requires: pip install azure-cognitiveservices-speech
import azure.cognitiveservices.speech as speechsdk
import os

def synthesize_azure_tts(text: str, output_filename: str = "azure_tts_output.wav"):
    """Convert ``text`` to speech with the Azure Speech SDK and save it to a file.

    The subscription key and region are read from the AZURE_SPEECH_KEY and
    AZURE_SPEECH_REGION environment variables; if either is missing or empty
    an error is printed and nothing is synthesized.

    Args:
        text: The text to synthesize.
        output_filename: Path of the audio file the SDK writes to.

    Returns:
        None. The outcome (saved, canceled, or error) is reported with ``print``.
    """
    key = os.environ.get("AZURE_SPEECH_KEY")
    region = os.environ.get("AZURE_SPEECH_REGION")

    if not (key and region):
        print("Error: AZURE_SPEECH_KEY or AZURE_SPEECH_REGION not set.")
        return

    try:
        config = speechsdk.SpeechConfig(subscription=key, region=region)
        # To choose a specific voice, set config.speech_synthesis_voice_name
        # (for example 'en-US-JennyNeural'); see the Azure docs for the list.

        # Direct the synthesized audio into the output file.
        file_output = speechsdk.audio.AudioOutputConfig(filename=output_filename)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=config,
            audio_config=file_output,
        )

        outcome = synthesizer.speak_text_async(text).get()

        if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Audio saved to {output_filename}")
            return

        if outcome.reason == speechsdk.ResultReason.Canceled:
            details = outcome.cancellation_details
            print(f"Speech synthesis canceled: {details.reason}")
            # Extra diagnostics are only reported when the cancellation was an error.
            if details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {details.error_details}")

    except Exception as e:
        print(f"Error calling Azure TTS: {e}")

# Example:
# synthesize_azure_tts("Hello from Azure Text-to-Speech!")

4. IBM Watson TTS

# Requires: pip install ibm_watson
from ibm_watson import TextToSpeechV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import os

def synthesize_ibm_tts(text: str, output_filename: str = "ibm_tts_output.mp3"):
    """Convert ``text`` to MP3 speech with IBM Watson Text to Speech.

    The API key and service URL are read from the IBM_API_KEY and
    IBM_SERVICE_URL environment variables; if either is missing or empty an
    error is printed and nothing is synthesized.

    Args:
        text: The text to synthesize.
        output_filename: Path of the MP3 file to write.

    Returns:
        None. Success and failure are both reported with ``print``; any
        exception raised while calling the service or writing the file is
        caught and printed.
    """
    api_key = os.environ.get("IBM_API_KEY")
    service_url = os.environ.get("IBM_SERVICE_URL")

    if not (api_key and service_url):
        print("Error: IBM_API_KEY or IBM_SERVICE_URL not set.")
        return

    try:
        service = TextToSpeechV1(authenticator=IAMAuthenticator(api_key))
        service.set_service_url(service_url)

        detailed_response = service.synthesize(
            text=text,
            voice="en-US_AllisonV3Voice",  # Example voice
            accept="audio/mp3",  # Requested audio format
        )
        audio_response = detailed_response.get_result()

        # The audio bytes are exposed on the result's ``content`` attribute.
        with open(output_filename, "wb") as out:
            out.write(audio_response.content)
        print(f"Audio saved to {output_filename}")

    except Exception as e:
        print(f"Error calling IBM Watson TTS: {e}")

# Example:
# synthesize_ibm_tts("Hello from IBM Watson Text-to-Speech!")

5. ElevenLabs

# Requires: pip install requests
import requests
import os

def synthesize_elevenlabs(
    text: str,
    output_filename: str = "elevenlabs_output.mp3",
    timeout: float = 30.0,
) -> None:
    """Synthesize ``text`` to an MP3 file using the ElevenLabs HTTP API.

    The API key is read from the ELEVENLABS_API_KEY environment variable and
    the voice from ELEVENLABS_VOICE_ID (defaulting to an example voice ID).
    Voice IDs can be found via the ElevenLabs website or API.

    Args:
        text: The text to synthesize.
        output_filename: Path of the MP3 file to write.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the caller.

    Returns:
        None. Success and failure are both reported with ``print``; request
        errors (including timeouts and non-2xx responses) are caught and
        printed together with the response body when one is available.
    """
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Example: Rachel

    if not api_key:
        print("Error: ELEVENLABS_API_KEY not set.")
        return
    # Only reachable when ELEVENLABS_VOICE_ID is explicitly set to an empty value.
    if not voice_id:
        print("Error: ELEVENLABS_VOICE_ID not set.")
        return

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        "Accept": "audio/mpeg",  # Request MP3 format
        "Content-Type": "application/json",
        "xi-api-key": api_key,
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",  # Or other models like eleven_multilingual_v2
        "voice_settings": {
            "stability": 0.5,  # Example settings
            "similarity_boost": 0.75,
        },
    }

    try:
        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise for 4xx/5xx status codes

        # The request is not streamed, so the whole body is already in memory;
        # write it in one call rather than iterating over chunks.
        with open(output_filename, "wb") as f:
            f.write(response.content)
        print(f"Audio saved to {output_filename}")

    except requests.exceptions.RequestException as e:
        print(f"Error calling ElevenLabs API: {e}")
        # The response body usually carries the API's explanation of the failure.
        if e.response is not None:
            print(f"Response body: {e.response.text}")


# Example:
# synthesize_elevenlabs("Hello from ElevenLabs Text-to-Speech!")

Last updated