Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions gateway/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,10 @@ def __init__(self, config: Optional[GatewayConfig] = None):

# Per-chat voice reply mode: "off" | "voice_only" | "all"
self._voice_mode: Dict[str, str] = self._load_voice_modes()
# Recent normalized voice transcripts keyed by (guild_id, user_id), for duplicate suppression.
# Protects against the same utterance being emitted twice by the voice
# capture / STT pipeline, which otherwise produces a second delayed reply.
self._recent_voice_transcripts: Dict[tuple[int, int], List[tuple[float, str]]] = {}

# Track background tasks to prevent garbage collection mid-execution
self._background_tasks: set = set()
Expand Down Expand Up @@ -8261,6 +8265,47 @@ def _handle_voice_timeout_cleanup(self, chat_id: str) -> None:
adapter = self.adapters.get(Platform.DISCORD)
self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=True)

def _is_duplicate_voice_transcript(self, guild_id: int, user_id: int, transcript: str) -> bool:
    """Return True when *transcript* repeats a recent utterance from the same speaker.

    Voice capture can occasionally emit the same utterance twice a few
    seconds apart, which creates a second queued agent run and overlapping
    spoken replies. Exact and near-exact repeats are suppressed per
    (guild, user) over a short window while genuinely new turns pass through.

    Args:
        guild_id: Guild the utterance was captured in.
        user_id: User the utterance is attributed to.
        transcript: Raw text produced by the STT pipeline.

    Returns:
        True if an equal or near-equal transcript from the same
        (guild_id, user_id) was recorded within the window. False otherwise,
        in which case the transcript is remembered for later comparisons.
        Transcripts that normalize to an empty string are never duplicates
        and are not remembered.
    """
    from difflib import SequenceMatcher

    window_seconds = 12.0   # how long a prior transcript stays comparable
    min_fuzzy_length = 16   # fuzzy matching only applies to longer utterances
    fuzzy_threshold = 0.95  # minimum similarity ratio for a near-duplicate
    max_remembered = 5      # per-speaker history cap

    # Strip punctuation BEFORE collapsing whitespace. Doing it the other way
    # round leaves stray double/trailing spaces wherever a punctuation mark
    # was surrounded by spaces ("hello , world" -> "hello  world"), which
    # defeats the exact-match comparison below.
    normalized = re.sub(r"[^\w\s]", "", transcript or "")
    normalized = re.sub(r"\s+", " ", normalized).strip().lower()
    if not normalized:
        return False

    now = time.monotonic()
    key = (guild_id, user_id)

    # Tolerate instances on which the store attribute is missing or is not
    # a dict: create it lazily.
    store = getattr(self, "_recent_voice_transcripts", None)
    if not isinstance(store, dict):
        store = {}
        self._recent_voice_transcripts = store

    # Drop entries that have aged out of the comparison window.
    recent = [
        (ts, text)
        for ts, text in store.get(key, [])
        if now - ts <= window_seconds
    ]

    for _, prior in recent:
        is_exact = prior == normalized
        is_near = (
            not is_exact
            and len(prior) >= min_fuzzy_length
            and len(normalized) >= min_fuzzy_length
            and SequenceMatcher(None, prior, normalized).ratio() >= fuzzy_threshold
        )
        if is_exact or is_near:
            # Persist the pruned history; the duplicate itself is not recorded.
            store[key] = recent
            return True

    recent.append((now, normalized))
    store[key] = recent[-max_remembered:]
    return False

async def _handle_voice_channel_input(
self, guild_id: int, user_id: int, transcript: str
):
Expand Down Expand Up @@ -8298,6 +8343,15 @@ async def _handle_voice_channel_input(
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
return

if self._is_duplicate_voice_transcript(guild_id, user_id, transcript):
logger.info(
"Suppressing duplicate voice transcript for guild=%s user=%s: %s",
guild_id,
user_id,
transcript[:100],
)
return

# Show transcript in text channel (after auth, with mention sanitization)
try:
channel = adapter._client.get_channel(text_ch_id)
Expand Down
40 changes: 40 additions & 0 deletions tests/gateway/test_voice_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,46 @@ async def test_input_posts_transcript_in_text_channel(self, runner):
assert "Test transcript" in msg
assert "42" in msg # user_id in mention

@pytest.mark.asyncio
async def test_input_suppresses_duplicate_transcript(self, runner):
    """An identical transcript delivered twice in a row is dispatched only once."""
    from gateway.config import Platform

    # Text channel the transcript gets echoed into.
    text_channel = AsyncMock()
    discord_client = MagicMock()
    discord_client.get_channel = MagicMock(return_value=text_channel)

    discord_adapter = AsyncMock()
    discord_adapter._voice_text_channels = {111: 123}
    discord_adapter._voice_sources = {}
    discord_adapter._client = discord_client
    discord_adapter.handle_message = AsyncMock()
    runner.adapters[Platform.DISCORD] = discord_adapter

    # Feed the very same utterance twice, back to back.
    for _ in range(2):
        await runner._handle_voice_channel_input(111, 42, "Hello from VC")

    discord_adapter.handle_message.assert_called_once()
    text_channel.send.assert_called_once()

@pytest.mark.asyncio
async def test_input_suppresses_near_duplicate_transcript(self, runner):
    """A transcript differing only by minor STT drift counts as the same utterance."""
    from gateway.config import Platform

    # Text channel the transcript gets echoed into.
    text_channel = AsyncMock()
    discord_client = MagicMock()
    discord_client.get_channel = MagicMock(return_value=text_channel)

    discord_adapter = AsyncMock()
    discord_adapter._voice_text_channels = {111: 123}
    discord_adapter._voice_sources = {}
    discord_adapter._client = discord_client
    discord_adapter.handle_message = AsyncMock()
    runner.adapters[Platform.DISCORD] = discord_adapter

    # Two renderings of one utterance; only a single word differs.
    utterances = (
        "This is a test of the voice system",
        "This is a test for the voice system",
    )
    for spoken in utterances:
        await runner._handle_voice_channel_input(111, 42, spoken)

    discord_adapter.handle_message.assert_called_once()
    text_channel.send.assert_called_once()

# -- _get_guild_id --

def test_get_guild_id_from_guild(self, runner):
Expand Down
Loading