Decouple encoding from sample_rate in Gradium STT

markbackman · markbackman · commit ef794ff91f3e · 2026-03-18T15:52:11.000-04:00
The encoding parameter now takes just the base type (pcm, wav, opus)
and the sample rate is derived from the pipeline audio_in_sample_rate,
assembled dynamically via input_format_from_encoding(). This fixes the
mismatch where SAMPLE_RATE=24000 was passed to the base class while
encoding defaulted to pcm_16000.
diff --git a/changelog/4066.changed.2.md b/changelog/4066.changed.2.md
@@ -0,0 +1 @@
+- `GradiumSTTService` now takes separate `encoding` and `sample_rate` constructor arguments, which are assembled in the class to form the `input_format`. PCM accepts `8000`, `16000`, and `24000` Hz sample rates.
diff --git a/src/pipecat/services/gradium/stt.py b/src/pipecat/services/gradium/stt.py
@@ -45,12 +45,39 @@
     logger.error('In order to use Gradium, you need to `pip install "pipecat-ai[gradium]"`.')
     raise Exception(f"Missing module: {e}")
 
-SAMPLE_RATE = 24000
 # Seconds to wait after a "flushed" message for trailing text tokens to arrive
 # before finalizing the transcription.
 TRANSCRIPT_AGGREGATION_DELAY = 0.1
 
 
+def input_format_from_encoding(encoding: str, sample_rate: int) -> str:
+    """Build Gradium input_format from encoding type and sample rate.
+
+    For PCM encoding, appends the sample rate (e.g., "pcm_16000").
+    For other encodings (wav, opus), returns the encoding as-is.
+
+    Args:
+        encoding: Base encoding type ("pcm", "wav", or "opus").
+        sample_rate: Audio sample rate in Hz.
+
+    Returns:
+        The full input_format string for the Gradium API.
+    """
+    if encoding == "pcm":
+        match sample_rate:
+            case 8000:
+                return "pcm_8000"
+            case 16000:
+                return "pcm_16000"
+            case 24000:
+                return "pcm_24000"
+        logger.warning(
+            f"GradiumSTTService: unsupported sample rate {sample_rate} for PCM encoding, using pcm_16000"
+        )
+        return "pcm_16000"
+    return encoding
+
+
 def language_to_gradium_language(language: Language) -> Optional[str]:
     """Convert a Language enum to Gradium's language code format.
 
@@ -120,7 +147,8 @@ def __init__(
         *,
         api_key: str,
         api_endpoint_base_url: str = "wss://eu.api.gradium.ai/api/speech/asr",
-        encoding: str = "pcm_16000",
+        encoding: str = "pcm",
+        sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
         json_config: Optional[str] = None,
         settings: Optional[Settings] = None,
@@ -132,8 +160,12 @@ def __init__(
         Args:
             api_key: Gradium API key for authentication.
             api_endpoint_base_url: WebSocket endpoint URL. Defaults to Gradium's streaming endpoint.
-            encoding: Audio input format. One of "pcm", "pcm_16000", "wav", or "opus". Defaults to
-                "pcm_16000".
+            encoding: Base audio encoding type. One of "pcm", "wav", or "opus".
+                For PCM, the sample rate is appended automatically from the
+                pipeline's audio_in_sample_rate (e.g., "pcm" becomes "pcm_16000").
+                Defaults to "pcm".
+            sample_rate: Audio sample rate in Hz. If None, uses the pipeline
+                sample rate.
             params: Configuration parameters for language and delay settings.
 
                 .. deprecated:: 0.0.105
@@ -181,7 +213,7 @@ def __init__(
             default_settings.apply_update(settings)
 
         super().__init__(
-            sample_rate=SAMPLE_RATE,
+            sample_rate=sample_rate,
             ttfs_p99_latency=ttfs_p99_latency,
             settings=default_settings,
             **kwargs,
@@ -195,6 +227,8 @@ def __init__(
 
         self._receive_task = None
 
+        self._input_format = ""
+
         self._audio_buffer = bytearray()
         self._chunk_size_ms = 80
         self._chunk_size_bytes = 0
@@ -240,6 +274,7 @@ async def start(self, frame: StartFrame):
             frame: Start frame to begin processing.
         """
         await super().start(frame)
+        self._input_format = input_format_from_encoding(self._encoding, self.sample_rate)
         self._chunk_size_bytes = int(self._chunk_size_ms * self.sample_rate * 2 / 1000)
         await self._connect()
 
@@ -351,7 +386,7 @@ async def _connect_websocket(self):
             setup_msg = {
                 "type": "setup",
                 "model_name": self._settings.model,
-                "input_format": self._encoding,
+                "input_format": self._input_format,
             }
             # Build json_config: start with deprecated json_config, then override with params
             json_config = {}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+- `GradiumSTTService` now takes separate `encoding` and `sample_rate` constructor arguments, which are assembled in the class to form the `input_format`. PCM accepts `8000`, `16000`, and `24000` Hz sample rates.