Text-to-Speech
BrainyFlow does NOT provide built-in utilities.
Instead, we offer examples that you can implement yourself. This approach gives you more flexibility and control over your project's dependencies and functionality.
Service
Free Tier
Pricing Model
Docs
IBM Watson TTS
10K chars Lite plan
~$0.02 per 1K characters (i.e., ~$20 per 1M characters). Enterprise options available.
ElevenLabs
10K chars monthly
From ~$5/mo (30K chars) up to $330/mo (2M chars). Enterprise plans are also available.
Example Code
1. Amazon Polly
# Requires: pip install boto3
import boto3
import os
def synthesize_polly(text: str, output_filename: str = "polly_output.mp3", region: str | None = None):
    """Synthesize speech with AWS Polly and save it as an MP3 file.

    AWS credentials are assumed to be configured outside this function
    (e.g., via environment variables or ~/.aws/credentials).

    Args:
        text: The text to convert to speech.
        output_filename: Path of the MP3 file to write.
        region: AWS region for the Polly client. When omitted (or empty),
            the AWS_REGION environment variable is used, falling back to
            "us-east-1".

    Returns:
        None. Success and failure are reported with print(); any exception
        raised while calling Polly or writing the file is caught and
        printed rather than propagated.
    """
    # Function-scope import so the snippet stays self-contained when copied.
    from contextlib import closing

    aws_region = region or os.environ.get("AWS_REGION", "us-east-1")
    try:
        polly = boto3.client("polly", region_name=aws_region)
        response = polly.synthesize_speech(
            Text=text,
            OutputFormat="mp3",
            VoiceId="Joanna",  # Example voice
        )
        if "AudioStream" not in response:
            print("Error: Could not stream audio from Polly.")
            return
        # AudioStream is a streaming body backed by the HTTP response; close
        # it explicitly so the connection is released right away instead of
        # whenever the object happens to be garbage collected.
        with closing(response["AudioStream"]) as stream:
            with open(output_filename, "wb") as f:
                f.write(stream.read())
        print(f"Audio saved to {output_filename}")
    except Exception as e:
        print(f"Error calling AWS Polly: {e}")
# Example:
# synthesize_polly("Hello from AWS Polly!")
2. Google Cloud TTS
# Requires: pip install google-cloud-texttospeech
from google.cloud import texttospeech
import os
def synthesize_google_tts(text: str, output_filename: str = "gcloud_tts_output.mp3"):
    """Convert text to MP3 speech with Google Cloud Text-to-Speech.

    Assumes the GOOGLE_APPLICATION_CREDENTIALS environment variable is set.

    Args:
        text: The text to convert to speech.
        output_filename: Path of the MP3 file to write.

    Returns:
        None. Success and failure are reported with print(); any exception
        is caught and printed rather than propagated.
    """
    try:
        tts_client = texttospeech.TextToSpeechClient()
        # Arguments are built inline, in the order: input, voice, audio config.
        result = tts_client.synthesize_speech(
            input=texttospeech.SynthesisInput(text=text),
            # Example voice, check documentation for more options
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
            ),
        )
        with open(output_filename, "wb") as out_file:
            out_file.write(result.audio_content)
        print(f"Audio saved to {output_filename}")
    except Exception as err:
        print(f"Error calling Google Cloud TTS: {err}")
# Example:
# synthesize_google_tts("Hello from Google Cloud Text-to-Speech!")
3. Azure TTS
# Requires: pip install azure-cognitiveservices-speech
import azure.cognitiveservices.speech as speechsdk
import os
def synthesize_azure_tts(text: str, output_filename: str = "azure_tts_output.wav"):
    """Convert text to speech with Azure Cognitive Services and save it to a file.

    Reads the subscription key from AZURE_SPEECH_KEY and the service region
    from AZURE_SPEECH_REGION. If either is missing or empty, an error is
    printed and nothing else happens.

    Args:
        text: The text to convert to speech.
        output_filename: Path of the audio file the SDK writes to.

    Returns:
        None. Success, cancellation and errors are reported with print();
        any exception is caught and printed rather than propagated.
    """
    key = os.environ.get("AZURE_SPEECH_KEY")
    region = os.environ.get("AZURE_SPEECH_REGION")
    if not (key and region):
        print("Error: AZURE_SPEECH_KEY or AZURE_SPEECH_REGION not set.")
        return

    try:
        config = speechsdk.SpeechConfig(subscription=key, region=region)
        # Example voice, check documentation for more
        # config.speech_synthesis_voice_name = 'en-US-JennyNeural'

        # Route the synthesized audio to a file.
        file_output = speechsdk.audio.AudioOutputConfig(filename=output_filename)
        tts = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=file_output)
        outcome = tts.speak_text_async(text).get()

        if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Audio saved to {output_filename}")
            return

        # Any reason other than completion or cancellation is not reported.
        if outcome.reason == speechsdk.ResultReason.Canceled:
            details = outcome.cancellation_details
            print(f"Speech synthesis canceled: {details.reason}")
            if details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {details.error_details}")
    except Exception as err:
        print(f"Error calling Azure TTS: {err}")
# Example:
# synthesize_azure_tts("Hello from Azure Text-to-Speech!")
4. IBM Watson TTS
# Requires: pip install ibm_watson
from ibm_watson import TextToSpeechV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import os
def synthesize_ibm_tts(text: str, output_filename: str = "ibm_tts_output.mp3"):
    """Convert text to MP3 speech with IBM Watson Text to Speech.

    Reads the API key from IBM_API_KEY and the service endpoint from
    IBM_SERVICE_URL. If either is missing or empty, an error is printed
    and nothing else happens.

    Args:
        text: The text to convert to speech.
        output_filename: Path of the audio file to write.

    Returns:
        None. Success and failure are reported with print(); any exception
        is caught and printed rather than propagated.
    """
    key = os.environ.get("IBM_API_KEY")
    endpoint = os.environ.get("IBM_SERVICE_URL")
    if not (key and endpoint):
        print("Error: IBM_API_KEY or IBM_SERVICE_URL not set.")
        return

    try:
        tts_service = TextToSpeechV1(authenticator=IAMAuthenticator(key))
        tts_service.set_service_url(endpoint)

        detailed_response = tts_service.synthesize(
            text=text,
            voice='en-US_AllisonV3Voice',  # Example voice
            accept='audio/mp3',  # Specify desired format
        )
        http_result = detailed_response.get_result()

        # The audio bytes are read from the result's 'content' attribute.
        with open(output_filename, 'wb') as out_file:
            out_file.write(http_result.content)
        print(f"Audio saved to {output_filename}")
    except Exception as err:
        print(f"Error calling IBM Watson TTS: {err}")
# Example:
# synthesize_ibm_tts("Hello from IBM Watson Text-to-Speech!")
5. ElevenLabs
# Requires: pip install requests
import requests
import os
def synthesize_elevenlabs(
    text: str,
    output_filename: str = "elevenlabs_output.mp3",
    timeout: float = 30.0,
):
    """Convert text to MP3 speech with the ElevenLabs HTTP API.

    Reads the API key from ELEVENLABS_API_KEY and the voice from
    ELEVENLABS_VOICE_ID (defaulting to an example voice ID). If the API key
    is missing, or the voice ID is set to an empty string, an error is
    printed and no request is made.

    Args:
        text: The text to convert to speech.
        output_filename: Path of the MP3 file to write.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would wait indefinitely on a stalled connection.

    Returns:
        None. Success and failure are reported with print(); request errors
        (including non-2xx responses and timeouts) are caught and printed
        rather than propagated.
    """
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    # Find voice IDs via ElevenLabs website or API
    voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Example: Rachel
    if not api_key:
        print("Error: ELEVENLABS_API_KEY not set.")
        return
    if not voice_id:
        print("Error: ELEVENLABS_VOICE_ID not set.")
        return

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        "Accept": "audio/mpeg",  # Request MP3 format
        "Content-Type": "application/json",
        "xi-api-key": api_key,
    }
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",  # Or other models like eleven_multilingual_v2
        "voice_settings": {
            "stability": 0.5,  # Example settings
            "similarity_boost": 0.75,
        },
    }

    try:
        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for bad status codes
        # The request is not streamed, so the whole body is already in memory;
        # writing it in one call is equivalent to looping over iter_content().
        with open(output_filename, "wb") as f:
            f.write(response.content)
        print(f"Audio saved to {output_filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error calling ElevenLabs API: {e}")
        # Print response body if available for more details
        if e.response is not None:
            print(f"Response body: {e.response.text}")
# Example:
# synthesize_elevenlabs("Hello from ElevenLabs Text-to-Speech!")
Last updated