Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 0 additions & 154 deletions python/.gitignore

This file was deleted.

126 changes: 75 additions & 51 deletions python/text-to-speech/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Synthesize text to speech using Google, Azure and AWS API."""
# Standard library
import base64
import abc
Expand All @@ -9,9 +10,6 @@
import azure.cognitiveservices.speech as speechsdk
import boto3

# Local imports
import secret


class TextToSpeech():
"""Base class for Text to Speech."""
Expand All @@ -20,7 +18,7 @@ def __init__(self, req: requests) -> None:
self.validate_request(req)

@abc.abstractmethod
def validate_request(self, req: requests):
def validate_request(self, req: requests) -> None:
"""Abstract validate request method for providers."""

@abc.abstractmethod
Expand All @@ -47,7 +45,7 @@ def validate_request(self, req: requests) -> None:
self.api_key = req.variables.get("API_KEY")
self.project_id = req.variables.get("PROJECT_ID")

def speech(self, text, language) -> bytes:
def speech(self, text: str, language: str) -> bytes:
"""
Converts the given text into speech with the Google text to speech API.

Expand All @@ -59,25 +57,33 @@ def speech(self, text, language) -> bytes:
bytes: The synthesized speech in bytes.
"""
# Instantiate a client.
client = texttospeech.TextToSpeechClient(client_options={"api_key": self.api_key, "quota_project_id": self.project_id})
client = texttospeech.TextToSpeechClient(client_options={
"api_key": self.api_key,
"quota_project_id": self.project_id,
})
# Set the text input to be synthesized.
synthesis_input = texttospeech.SynthesisInput(text=text)
# Build the voice request, select the language code ("en-US") and the ssml voice gender is neutral.
voice = texttospeech.VoiceSelectionParams(language_code=language, ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL)
# Build the voice request with the requested language code (e.g. "en-US")
# and a neutral SSML voice gender.
voice = texttospeech.VoiceSelectionParams(
language_code=language,
ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
)
# Select the type of audio file you want returned.
audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
# Perform the text-to-speech request on the text input with the selected voice parameters and audio file type.
response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3)
# Perform the text-to-speech request on the text input
# with the selected voice parameters and audio file type.
response = client.synthesize_speech(
input=synthesis_input,
voice=voice,
audio_config=audio_config
)
return response.audio_content


class Azure(TextToSpeech):
"""
This class represents the implementation of Azure text to speech.
"""
api_key = None
region_key = None

"""This class represents the implementation of Azure text to speech."""
def validate_request(self, req: requests) -> None:
"""
This method validates the request data for Azure text to speech.
Expand All @@ -88,13 +94,13 @@ def validate_request(self, req: requests) -> None:
ValueError: If any required value is missing or invalid.
"""
if not req.variables.get("API_KEY"):
raise ValueError("Missing API_KEY")
raise ValueError("Missing API_KEY.")
if not req.variables.get("REGION_KEY"):
raise ValueError("Missing region")
raise ValueError("Missing REGION_KEY.")
self.api_key = req.variables.get("API_KEY")
self.region_key = req.variables.get("REGION_KEY")

def speech(self, text, language) -> bytes:
def speech(self, text: str, language: str) -> bytes:
"""
Converts the given text into speech with the Azure text to speech API.

Expand All @@ -106,22 +112,25 @@ def speech(self, text, language) -> bytes:
bytes: The synthesized speech in bytes.
"""
# Build the speech configuration from the subscription key and region.
speech_config = speechsdk.SpeechConfig(subscription=self.api_key, region=self.region_key)
speech_config = speechsdk.SpeechConfig(
subscription=self.api_key,
region=self.region_key
)
# The language of the voice that speaks.
speech_config.speech_synthesis_language = language
# Create the speech synthesizer.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
speech_synthesizer = speechsdk.SpeechSynthesizer(
speech_config=speech_config,
audio_config=None
)
# Synthesize the text and take the audio data from the result.
response = speech_synthesizer.speak_text_async(text).get().audio_data
return response


class AWS(TextToSpeech):
"""
This class represents the implementation of AWS text to speech.
"""
api_key = None
secret_api_key = None
"""This class represents the implementation of AWS text to speech. """
voice_id = "Joanna"

def validate_request(self, req: requests) -> None:
"""
Expand All @@ -139,7 +148,7 @@ def validate_request(self, req: requests) -> None:
self.api_key = req.payload.get("API_KEY")
self.secret_api_key = req.payload.get("SECRET_API_KEY")

def speech(self, text, language) -> bytes:
def speech(self, text: str, language: str) -> bytes:
"""
Converts the given text into speech with the AWS text to speech API.

Expand All @@ -150,12 +159,27 @@ def speech(self, text, language) -> bytes:
Returns:
bytes: The synthesized speech in bytes.
"""
polly_client = boto3.Session(aws_access_key_id=self.api_key, aws_secret_access_key=self.secret_api_key, region_name="us-west-2").client("polly")
response = polly_client.synthesize_speech(VoiceId="Joanna", OutputFormat="mp3", Text=text, LanguageCode=language)
return response["AudioStream"].read().decode()
# Create a Polly client from a boto3 session.
polly_client = boto3.Session(
aws_access_key_id=self.api_key,
aws_secret_access_key=self.secret_api_key,
region_name="us-west-2"
).client("polly")
# Get response from polly client
response = polly_client.synthesize_speech(
VoiceId=AWS.voice_id,
OutputFormat="mp3",
Text=text,
LanguageCode=language
)
return response["Audiostream"].read()


list_of_providers = ["google", "azure", "aws"]


def validate_common(req: requests) -> tuple[str]:

def validate_common(req: requests) -> tuple:
"""
This function validates the common fields in the request data
that are independent of the text-to-speech provider.
Expand All @@ -176,11 +200,15 @@ def validate_common(req: requests) -> tuple:

# Check if variables is empty.
if not req.variables:
raise ValueError("Missing variables.")
raise ValueError("Missing Variables.")

# Check if provider is empty.
if not req.payload.get("provider"):
raise ValueError("Missing provider")
raise ValueError("Missing Provider.")

# Check if provider is in the list. lower() must be *called*: comparing the
# bound method object itself against the list never matches.
if req.payload.get("provider").lower() not in list_of_providers:
    raise ValueError("Invalid Provider.")

# Check if text is empty.
if not req.payload.get("text"):
Expand All @@ -191,14 +219,8 @@ def validate_common(req: requests) -> tuple:
raise ValueError("Missing Language.")

# Return the provider, text and language.
return (req.payload.get("text"), req.payload.get("language"))


IMPLEMENTATIONS = {
"google": Google,
"azure": Azure,
"aws": AWS,
}
return (req.payload.get("provider").lower(),
req.payload.get("text"), req.payload.get("language"))


def main(req: requests, res: json) -> json:
Expand All @@ -214,26 +236,28 @@ def main(req: requests, res: json) -> json:
containing the synthesized audio in base64 encoded format.
"""
try:
text, language = validate_common(req)
provider_class = IMPLEMENTATIONS[req.payload.get("provider")](req)
provider, text, language = validate_common(req)
if provider == "google":
provider_class = Google(req)
elif provider == "azure":
provider_class = Azure(req)
else:
provider_class = AWS(req)

except (ValueError) as value_error:
return res.json({
"success": False,
"error": f"{value_error}",
"error": str(value_error),
})
try:
audio_stream = provider_class.speech(text, language)
audio_bytes = provider_class.speech(text, language)
except Exception as error:
return res.json({
"success": False,
"error": f"{type(error).__name__}: {error}",
})

# f = open("python/text-to-speech/results/azure.txt", "w")
# f.write(base64.b64encode(audio_stream).decode())

return res.json({
"success": True,
"audio_stream": base64.b64encode(audio_stream).decode(),
})
"audio_bytes": base64.b64encode(audio_bytes).decode(),
})
Loading