Skip to content

Commit a298ce3

Browse files
authored
Merge pull request pipecat-ai#3424 from pipecat-ai/mb/tts-append-trailing-space
Add append_trailing_space to TTSService to prevent vocalizing trailing punctuation
2 parents f6ed7d7 + 31daa88 commit a298ce3

5 files changed

Lines changed: 30 additions & 6 deletions

File tree

changelog/3424.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added an `append_trailing_space` parameter (default `False`) to `TTSService`. When enabled, a trailing space is appended to any text that does not already end with one before it is sent to the TTS service, which helps prevent some services from vocalizing trailing punctuation.

changelog/3424.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- `DeepgramTTSService` and `RimeTTSService` now set `append_trailing_space` to `True` to prevent trailing punctuation from being vocalized (e.g., a final period spoken as “dot”).

src/pipecat/services/deepgram/tts.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def __init__(
8585
sample_rate=sample_rate,
8686
pause_frame_processing=True,
8787
push_stop_frames=True,
88+
append_trailing_space=True,
8889
**kwargs,
8990
)
9091

@@ -291,24 +292,22 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
291292
Yields:
292293
Frame: Audio frames containing the synthesized speech, plus start/stop frames.
293294
"""
294-
# Append trailing space to prevent TTS from vocalizing trailing periods as "dot"
295-
text_with_trailing_space = text + " "
296-
logger.debug(f"{self}: Generating TTS [{text_with_trailing_space}]")
295+
logger.debug(f"{self}: Generating TTS [{text}]")
297296

298297
try:
299298
# Reconnect if the websocket is closed
300299
if not self._websocket or self._websocket.state is State.CLOSED:
301300
await self._connect()
302301

303302
await self.start_ttfb_metrics()
304-
await self.start_tts_usage_metrics(text_with_trailing_space)
303+
await self.start_tts_usage_metrics(text)
305304

306305
yield TTSStartedFrame()
307306

308307
# Send text message to Deepgram
309308
# Note: We don't send Flush here - that should only be sent when the
310309
# LLM finishes a complete response via flush_audio()
311-
speak_msg = {"type": "Speak", "text": text_with_trailing_space}
310+
speak_msg = {"type": "Speak", "text": text}
312311
await self._get_websocket().send(json.dumps(speak_msg))
313312

314313
# The audio frames will be handled in _receive_messages

src/pipecat/services/rime/tts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def __init__(
130130
push_text_frames=False,
131131
push_stop_frames=True,
132132
pause_frame_processing=True,
133+
append_trailing_space=True,
133134
sample_rate=sample_rate,
134135
**kwargs,
135136
)

src/pipecat/services/tts_service.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ def __init__(
101101
silence_time_s: float = 2.0,
102102
# if True, we will pause processing frames while we are receiving audio
103103
pause_frame_processing: bool = False,
104+
# if True, append a trailing space to text before sending to TTS
105+
# (helps prevent some TTS services from vocalizing trailing punctuation)
106+
append_trailing_space: bool = False,
104107
# TTS output sample rate
105108
sample_rate: Optional[int] = None,
106109
# Text aggregator to aggregate incoming tokens and decide when to push to the TTS.
@@ -132,6 +135,8 @@ def __init__(
132135
push_silence_after_stop: Whether to push silence audio after TTSStoppedFrame.
133136
silence_time_s: Duration of silence to push when push_silence_after_stop is True.
134137
pause_frame_processing: Whether to pause frame processing during audio generation.
138+
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
139+
This helps prevent some TTS services from vocalizing trailing punctuation (e.g., "dot").
135140
sample_rate: Output sample rate for generated audio.
136141
text_aggregator: Custom text aggregator for processing incoming text.
137142
@@ -161,6 +166,7 @@ def __init__(
161166
self._push_silence_after_stop: bool = push_silence_after_stop
162167
self._silence_time_s: float = silence_time_s
163168
self._pause_frame_processing: bool = pause_frame_processing
169+
self._append_trailing_space: bool = append_trailing_space
164170
self._init_sample_rate = sample_rate
165171
self._sample_rate = 0
166172
self._voice_id: str = ""
@@ -273,6 +279,19 @@ def language_to_service_language(self, language: Language) -> Optional[str]:
273279
"""
274280
return Language(language)
275281

282+
def _prepare_text_for_tts(self, text: str) -> str:
    """Prepare text for TTS by applying any transformations required by the TTS service.

    The only transformation applied here is an optional trailing space: when
    ``self._append_trailing_space`` is truthy and ``text`` does not already
    end with a space character, a single space is appended. This helps
    prevent some TTS services from vocalizing trailing punctuation
    (e.g., "dot").

    Args:
        text: The text to prepare.

    Returns:
        The prepared text with transformations applied. The input is returned
        unchanged when the option is disabled or the text already ends with
        a space.
    """
    # Only a literal trailing " " counts; this avoids stacking spaces when the
    # text was already prepared or naturally ends with one.
    if self._append_trailing_space and not text.endswith(" "):
        return text + " "
    return text
294+
276295
async def update_setting(self, key: str, value: Any):
277296
"""Update a service-specific setting.
278297
@@ -603,7 +622,10 @@ async def _push_tts_frames(
603622
for aggregation_type, transform in self._text_transforms:
604623
if aggregation_type == type or aggregation_type == "*":
605624
transformed_text = await transform(transformed_text, type)
606-
await self.process_generator(self.run_tts(transformed_text))
625+
626+
# Apply any final text preparation (e.g., trailing space)
627+
prepared_text = self._prepare_text_for_tts(transformed_text)
628+
await self.process_generator(self.run_tts(prepared_text))
607629

608630
await self.stop_processing_metrics()
609631

0 commit comments

Comments
 (0)