Mushmou · rubynguyen1510 · Jul 28, 2023 · Jul 21, 2023 · Jul 21, 2023 · Jul 21, 2023
diff --git a/python/.gitignore b/python/.gitignore
diff --git a/python/text-to-speech/main.py b/python/text-to-speech/main.py
@@ -1,3 +1,4 @@
+"""Synthesize text to speech using Google, Azure and AWS API."""
 # Standard library
 import base64
 import abc
@@ -9,9 +10,6 @@
 import azure.cognitiveservices.speech as speechsdk
 import boto3
 
-# Local imports
-import secret
-
 
 class TextToSpeech():
     """Base class for Text to Speech."""
@@ -20,7 +18,7 @@ def __init__(self, req: requests) -> None:
         self.validate_request(req)
 
     @abc.abstractmethod
-    def validate_request(self, req: requests):
+    def validate_request(self, req: requests) -> None:
         """Abstract validate request method for providers."""
 
     @abc.abstractmethod
@@ -47,7 +45,7 @@ def validate_request(self, req: requests) -> None:
         self.api_key = req.variables.get("API_KEY")
         self.project_id = req.variables.get("PROJECT_ID")
 
-    def speech(self, text, language) -> bytes:
+    def speech(self, text: str, language: str) -> bytes:
         """
         Converts the given text into speech with the Google text to speech API.
 
@@ -59,25 +57,33 @@ def speech(self, text, language) -> bytes:
             bytes: The synthezied speech in bytes.
         """
         # Instantiate a client.
-        client = texttospeech.TextToSpeechClient(client_options={"api_key": self.api_key, "quota_project_id": self.project_id})
+        client = texttospeech.TextToSpeechClient(client_options={
+            "api_key": self.api_key,
+            "quota_project_id": self.project_id,
+        })
         # Set the text input to be synthesized.
         synthesis_input = texttospeech.SynthesisInput(text=text)
-        # Build the voice request, select the language code ("en-US") and the ssml voice gender is neutral.
-        voice = texttospeech.VoiceSelectionParams(language_code=language, ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL)
+        # Build the voice request, select the language code ("en-US")
+        # and the ssml voice gender is neutral.
+        voice = texttospeech.VoiceSelectionParams(
+            language_code=language,
+            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
+        )
         # Select the type of audio file you want returned.
-        audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
-        # Perform the text-to-speech request on the text input with the selected voice parameters and audio file type.
-        response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
+        audio_config = texttospeech.AudioConfig(
+            audio_encoding=texttospeech.AudioEncoding.MP3)
+        # Perform the text-to-speech request on the text input
+        # with the selected voice parameters and audio file type.
+        response = client.synthesize_speech(
+            input=synthesis_input,
+            voice=voice,
+            audio_config=audio_config
+        )
         return response.audio_content
 
 
 class Azure(TextToSpeech):
-    """
-    This class represents the implementation of Azure text to speech.
-    """
-    api_key = None
-    region_key = None
-
+    """This class represents the implementation of Azure text to speech."""
     def validate_request(self, req: requests) -> None:
         """
         This method validates the request data for Azure text to speech.
@@ -88,13 +94,13 @@ def validate_request(self, req: requests) -> None:
             ValueError: If any required value is missing or invalid.
         """
         if not req.variables.get("API_KEY"):
-            raise ValueError("Missing API_KEY")
+            raise ValueError("Missing API_KEY.")
         if not req.variables.get("REGION_KEY"):
-            raise ValueError("Missing region")
+            raise ValueError("Missing REGION_KEY.")
         self.api_key = req.variables.get("API_KEY")
         self.region_key = req.variables.get("REGION_KEY")
 
-    def speech(self, text, language) -> bytes:
+    def speech(self, text: str, language: str) -> bytes:
         """
         Converts the given text into speech with the Google text to speech API.
 
@@ -106,22 +112,25 @@ def speech(self, text, language) -> bytes:
             bytes: The synthezied speech in bytes.
         """
         # Set the speech configuration to speech key and region key.
-        speech_config = speechsdk.SpeechConfig(subscription=self.api_key, region=self.region_key)
+        speech_config = speechsdk.SpeechConfig(
+            subscription=self.api_key,
+            region=self.region_key
+        )
         # The language of the voice that speaks.
         speech_config.speech_synthesis_language = language
         # Set the speech.
-        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+        speech_synthesizer = speechsdk.SpeechSynthesizer(
+            speech_config=speech_config,
+            audio_config=None
+        )
         # Response for the speech synthesizer.
         response = speech_synthesizer.speak_text_async(text).get().audio_data
         return response
 
 
 class AWS(TextToSpeech):
-    """
-    This class represents the implementation of AWS text to speech.
-    """
-    api_key = None
-    secret_api_key = None
+    """This class represents the implementation of AWS text to speech."""
+    voice_id = "Joanna"
 
     def validate_request(self, req: requests) -> None:
         """
@@ -139,7 +148,7 @@ def validate_request(self, req: requests) -> None:
         self.api_key = req.payload.get("API_KEY")
         self.secret_api_key = req.payload.get("SECRET_API_KEY")
 
-    def speech(self, text, language) -> bytes:
+    def speech(self, text: str, language: str) -> bytes:
         """
         Converts the given text into speech with the AWS text to speech API.
 
@@ -150,12 +159,27 @@ def speech(self, text, language) -> bytes:
         Returns:
             bytes: The synthezied speech in bytes.
         """
-        polly_client = boto3.Session(aws_access_key_id=self.api_key, aws_secret_access_key=self.secret_api_key, region_name="us-west-2").client("polly")
-        response = polly_client.synthesize_speech(VoiceId="Joanna", OutputFormat="mp3", Text=text, LanguageCode=language)
-        return response["AudioStream"].read().decode()
+        # Call polly client using boto3.session
+        polly_client = boto3.Session(
+            aws_access_key_id=self.api_key,
+            aws_secret_access_key=self.secret_api_key,
+            region_name="us-west-2"
+        ).client("polly")
+        # Get response from polly client
+        response = polly_client.synthesize_speech(
+            VoiceId=AWS.voice_id,
+            OutputFormat="mp3",
+            Text=text,
+            LanguageCode=language
+        )
+        return response["AudioStream"].read()
+
+
+list_of_providers = ["google", "azure", "aws"]
+
 
+def validate_common(req: requests) -> tuple[str, str, str]:
 
-def validate_common(req: requests) -> tuple:
     """
     This function validates the common fields in the request data
         that are independent of the text-to-speech provider.
@@ -176,11 +200,15 @@ def validate_common(req: requests) -> tuple:
 
     # Check if variables is empty.
     if not req.variables:
-        raise ValueError("Missing variables.")
+        raise ValueError("Missing Variables.")
 
     # Check if provider is empty.
     if not req.payload.get("provider"):
-        raise ValueError("Missing provider")
+        raise ValueError("Missing Provider.")
+
+    # Reject providers outside the supported list (case-insensitive).
+    if req.payload.get("provider").lower() not in list_of_providers:
+        raise ValueError("Invalid Provider.")
 
     # Check if text is empty.
     if not req.payload.get("text"):
@@ -191,14 +219,8 @@ def validate_common(req: requests) -> tuple:
         raise ValueError("Missing Language.")
 
     # Return the text and langage.
-    return (req.payload.get("text"), req.payload.get("language"))
-
-
-IMPLEMENTATIONS = {
-    "google": Google,
-    "azure": Azure,
-    "aws": AWS,
-}
+    return (req.payload.get("provider").lower(),
+            req.payload.get("text"), req.payload.get("language"))
 
 
 def main(req: requests, res: json) -> json:
@@ -214,26 +236,28 @@ def main(req: requests, res: json) -> json:
         containing the synthesized audio in base64 encoded format.
     """
     try:
-        text, language = validate_common(req)
-        provider_class = IMPLEMENTATIONS[req.payload.get("provider")](req)
+        provider, text, language = validate_common(req)
+        if provider == "google":
+            provider_class = Google(req)
+        elif provider == "azure":
+            provider_class = Azure(req)
+        else:
+            provider_class = AWS(req)
 
     except (ValueError) as value_error:
         return res.json({
             "success": False,
-            "error": f"{value_error}",
+            "error": str(value_error),
         })
     try:
-        audio_stream = provider_class.speech(text, language)
+        audio_bytes = provider_class.speech(text, language)
     except Exception as error:
         return res.json({
             "success": False,
             "error": f"{type(error).__name__}: {error}",
         })
 
-    # f = open("python/text-to-speech/results/azure.txt", "w")
-    # f.write(base64.b64encode(audio_stream).decode())
-
     return res.json({
         "success": True,
-        "audio_stream": base64.b64encode(audio_stream).decode(),
-    })
+        "audio_bytes": base64.b64encode(audio_bytes).decode(),
+    })