Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 32 additions & 11 deletions gateway/platforms/discord.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ def __init__(self, voice_client, allowed_user_ids: set = None):

# Debug logging counter (instance-level to avoid cross-instance races)
self._packet_debug_count = 0
self._packet_dump_mode = os.getenv(
"HERMES_DISCORD_VOICE_PACKET_DUMP", "errors"
).strip().lower()
if self._packet_dump_mode not in {"off", "errors", "all"}:
logger.warning(
"Invalid HERMES_DISCORD_VOICE_PACKET_DUMP=%r; using errors",
self._packet_dump_mode,
)
self._packet_dump_mode = "errors"

# ------------------------------------------------------------------
# Lifecycle
Expand Down Expand Up @@ -202,27 +211,37 @@ async def wrapped_hook(ws, msg):
# Packet handler (called from SocketReader thread)
# ------------------------------------------------------------------

def _dump_voice_packets(self, include_errors: bool = False) -> bool:
    """Return True when voice packet dumps should be logged.

    Args:
        include_errors: Passed as True by call sites that report a
            dropped or undecodable packet; those sites also log when
            the dump mode is ``"errors"``.

    Returns:
        True if ``self._packet_dump_mode`` is ``"all"``, or if it is
        ``"errors"`` and ``include_errors`` is truthy. False for any
        other mode, including ``"off"``.
    """
    mode = self._packet_dump_mode
    if mode == "all":
        return True
    # Coerce so the declared ``bool`` return holds even when a caller
    # passes a non-bool truthy/falsy value.
    return bool(include_errors) and mode == "errors"

def _on_packet(self, data: bytes):
if not self._running or self._paused:
return

# Log first few raw packets for debugging
# Log raw packets when packet dumping is enabled.
self._packet_debug_count += 1
if self._packet_debug_count <= 5:
if self._dump_voice_packets():
logger.debug(
"Raw UDP packet: len=%d, first_bytes=%s",
len(data), data[:4].hex() if len(data) >= 4 else "short",
)

if len(data) < 16:
if self._dump_voice_packets(include_errors=True):
logger.warning("Dropped short voice packet: len=%d", len(data))
return

# RTP version check: top 2 bits must be 10 (version 2).
# Lower bits may vary (padding, extension, CSRC count).
# Payload type (byte 1 lower 7 bits) = 0x78 (120) for voice.
if (data[0] >> 6) != 2 or (data[1] & 0x7F) != 0x78:
if self._packet_debug_count <= 5:
logger.debug("Skipped non-RTP: byte0=0x%02x byte1=0x%02x", data[0], data[1])
if self._dump_voice_packets(include_errors=True):
logger.warning("Skipped non-RTP packet: byte0=0x%02x byte1=0x%02x", data[0], data[1])
return

first_byte = data[0]
Expand All @@ -248,7 +267,7 @@ def _on_packet(self, data: bytes):
ext_words = struct.unpack_from(">H", data, ext_preamble_offset + 2)[0]
ext_data_len = ext_words * 4

if self._packet_debug_count <= 10:
if self._dump_voice_packets():
with self._lock:
known_user = self._ssrc_to_user.get(ssrc, "unknown")
logger.debug(
Expand All @@ -271,7 +290,7 @@ def _on_packet(self, data: bytes):
box = nacl.secret.Aead(self._secret_key)
decrypted = box.decrypt(encrypted, header, bytes(nonce))
except Exception as e:
if self._packet_debug_count <= 10:
if self._dump_voice_packets(include_errors=True):
logger.warning("NaCl decrypt failed: %s (hdr=%d, enc=%d)", e, header_size, len(encrypted))
return

Expand All @@ -286,14 +305,14 @@ def _on_packet(self, data: bytes):
# bytes into DAVE/Opus and corrupts inbound audio.
if has_padding:
if not decrypted:
if self._packet_debug_count <= 10:
if self._dump_voice_packets(include_errors=True):
logger.warning(
"RTP padding bit set but no payload (ssrc=%d)", ssrc,
)
return
pad_len = decrypted[-1]
if pad_len == 0 or pad_len > len(decrypted):
if self._packet_debug_count <= 10:
if self._dump_voice_packets(include_errors=True):
logger.warning(
"Invalid RTP padding length %d for payload size %d (ssrc=%d)",
pad_len, len(decrypted), ssrc,
Expand All @@ -317,7 +336,7 @@ def _on_packet(self, data: bytes):
except Exception as e:
# Unencrypted passthrough — use NaCl-decrypted data as-is
if "Unencrypted" not in str(e):
if self._packet_debug_count <= 10:
if self._dump_voice_packets(include_errors=True):
logger.warning("DAVE decrypt failed for ssrc=%d: %s", ssrc, e)
return
# If SSRC unknown (no SPEAKING event yet), skip DAVE and try
Expand All @@ -333,7 +352,8 @@ def _on_packet(self, data: bytes):
self._buffers[ssrc].extend(pcm)
self._last_packet_time[ssrc] = time.monotonic()
except Exception as e:
logger.debug("Opus decode error for SSRC %s: %s", ssrc, e)
if self._dump_voice_packets(include_errors=True):
logger.warning("Opus decode error for SSRC %s: %s", ssrc, e)
return

# ------------------------------------------------------------------
Expand Down Expand Up @@ -1768,8 +1788,9 @@ async def slash_reload_mcp(interaction: discord.Interaction):
await self._run_simple_slash(interaction, "/reload-mcp")

@tree.command(name="voice", description="Toggle voice reply mode")
@discord.app_commands.describe(mode="Voice mode: on, off, tts, channel, leave, or status")
@discord.app_commands.describe(mode="Voice mode: on, off, tts, join, channel, leave, or status")
@discord.app_commands.choices(mode=[
discord.app_commands.Choice(name="join — join your voice channel", value="join"),
discord.app_commands.Choice(name="channel — join your voice channel", value="channel"),
discord.app_commands.Choice(name="leave — leave voice channel", value="leave"),
discord.app_commands.Choice(name="on — voice reply to voice messages", value="on"),
Expand Down
26 changes: 26 additions & 0 deletions tests/gateway/test_voice_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,17 @@ def test_initial_state(self):
assert receiver._paused is False
assert len(receiver._buffers) == 0
assert len(receiver._ssrc_to_user) == 0
assert receiver._packet_dump_mode == "errors"

def test_packet_dump_mode_honors_env(self):
    """A valid HERMES_DISCORD_VOICE_PACKET_DUMP value becomes the dump mode."""
    with patch.dict(os.environ, {"HERMES_DISCORD_VOICE_PACKET_DUMP": "all"}):
        # Build the receiver while the patched environment is active;
        # presumably the variable is read once at construction time.
        receiver = self._make_receiver()
        assert receiver._packet_dump_mode == "all"

def test_packet_dump_mode_invalid_value_falls_back(self):
    """An unrecognised HERMES_DISCORD_VOICE_PACKET_DUMP value falls back to "errors"."""
    with patch.dict(os.environ, {"HERMES_DISCORD_VOICE_PACKET_DUMP": "nope"}):
        # "nope" is not one of the accepted modes (off / errors / all).
        receiver = self._make_receiver()
        assert receiver._packet_dump_mode == "errors"

def test_start_sets_running(self):
receiver = self._make_receiver()
Expand Down Expand Up @@ -1201,6 +1212,21 @@ def reader():
assert len(errors) == 0, f"Race detected: {errors[:3]}"


# =====================================================================
# Slash command metadata
# =====================================================================

class TestVoiceSlashCommandChoices:
    """The /voice slash command should expose join in autocomplete."""

    def test_join_choice_is_present(self):
        """The ``join`` choice literal appears in the slash-command registration source."""
        import inspect
        from gateway.platforms.discord import DiscordAdapter

        # This checks the source text of the registration method, not a
        # live command tree, so it is sensitive to the exact formatting
        # of the Choice(...) literal.
        source = inspect.getsource(DiscordAdapter._register_slash_commands)
        expected = 'Choice(name="join — join your voice channel", value="join")'
        assert expected in source, (
            "/voice slash command is missing the 'join' choice"
        )


# =====================================================================
# Callback wiring order (join)
# =====================================================================
Expand Down
5 changes: 4 additions & 1 deletion website/docs/guides/use-voice-mode-with-hermes.md
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ In addition to the normal text-bot setup, make sure the bot has:

Also enable privileged intents in the Developer Portal:
- Presence Intent
- Server Members Intent
- Server Members Intent (only needed if `DISCORD_ALLOWED_USERS` includes usernames or mixed identifiers)
- Message Content Intent

## Join and leave
Expand All @@ -364,10 +364,13 @@ In a Discord text channel where the bot is present:

```text
/voice join
/voice channel
/voice leave
/voice status
```

`/voice join` and `/voice channel` are aliases that map to the same behavior; both appear in Discord's slash-command autocomplete.

### What happens when joined

- users speak in the VC
Expand Down
2 changes: 2 additions & 0 deletions website/docs/reference/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETI
| `TELEGRAM_PROXY` | Proxy URL for Telegram connections — overrides `HTTPS_PROXY`. Supports `http://`, `https://`, `socks5://` |
| `DISCORD_BOT_TOKEN` | Discord bot token |
| `DISCORD_ALLOWED_USERS` | Comma-separated Discord user IDs (or usernames) allowed to use the bot |
| `DISCORD_PROXY` | Proxy URL for Discord connections — overrides `HTTPS_PROXY` and macOS system proxy auto-detection |
| `HERMES_DISCORD_VOICE_PACKET_DUMP` | Voice packet logging mode for Discord VC debugging: `off`, `errors` (default), or `all` |
| `DISCORD_HOME_CHANNEL` | Default Discord channel for cron delivery |
| `DISCORD_HOME_CHANNEL_NAME` | Display name for the Discord home channel |
| `DISCORD_REQUIRE_MENTION` | Require an @mention before responding in server channels |
Expand Down
6 changes: 3 additions & 3 deletions website/docs/user-guide/features/voice-mode.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,10 +279,10 @@ In the [Developer Portal](https://discord.com/developers/applications) → your
| Intent | Purpose |
|--------|---------|
| **Presence Intent** | Detect user online/offline status |
| **Server Members Intent** | Map voice SSRC identifiers to Discord user IDs |
| **Server Members Intent** | Optional for username resolution; SSRC→user mapping comes from Discord SPEAKING events |
| **Message Content Intent** | Read text message content in channels |

All three are required for full voice channel functionality. **Server Members Intent** is especially critical — without it, the bot cannot identify who is speaking in the voice channel.
Only Message Content Intent is strictly required for text chat; Server Members Intent is only needed if you allow usernames in `DISCORD_ALLOWED_USERS` or want richer member lookup. SSRC→speaker mapping still comes from Discord SPEAKING events, not the members list.

#### 3. Opus Codec

Expand Down Expand Up @@ -360,7 +360,7 @@ When the bot is in a voice channel:

### Echo Prevention

The bot automatically pauses its audio listener while playing TTS replies, preventing it from hearing and re-processing its own output.
The bot does not fully pause during TTS playback. Instead, it switches into a raised-RMS barge-in mode after a short guard window, so users can interrupt mid-sentence without the bot immediately hearing its own output. Tunables: `voice.discord_vc.barge_in_guard` and `voice.discord_vc.barge_in_rms`.

### Access Control

Expand Down
11 changes: 9 additions & 2 deletions website/docs/user-guide/messaging/discord.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,16 @@ On the **Bot** page, scroll down to **Privileged Gateway Intents**. You'll see t
| Intent | Purpose | Required? |
|--------|---------|-----------|
| **Presence Intent** | See user online/offline status | Optional |
| **Server Members Intent** | Access the member list, resolve usernames | **Required** |
| **Server Members Intent** | Access the member list, resolve usernames | Optional* |
| **Message Content Intent** | Read the text content of messages | **Required** |

**Enable both Server Members Intent and Message Content Intent** by toggling them **ON**.
**Enable Message Content Intent** by toggling it **ON**.

- Without **Message Content Intent**, your bot receives message events but the message text is empty — the bot literally cannot see what you typed.
- Without **Server Members Intent**, the bot cannot resolve usernames for the allowed users list and may fail to identify who is messaging it. This only applies when `DISCORD_ALLOWED_USERS` contains usernames.

*If `DISCORD_ALLOWED_USERS` contains only numeric Discord IDs, the bot does not request Server Members Intent. If you allow usernames or mixed identifiers, it will request the intent automatically.

:::warning[This is the #1 reason Discord bots don't work]
If your bot is online but never responds to messages, the **Message Content Intent** is almost certainly disabled. Go back to the [Developer Portal](https://discord.com/developers/applications), select your application → Bot → Privileged Gateway Intents, and make sure **Message Content Intent** is toggled ON. Click **Save Changes**.
:::
Expand Down Expand Up @@ -248,8 +250,13 @@ DISCORD_ALLOWED_USERS=284102345871466496

# Multiple allowed users (comma-separated)
# DISCORD_ALLOWED_USERS=284102345871466496,198765432109876543

# Optional: use a proxy for Discord connections
# DISCORD_PROXY=http://127.0.0.1:7890
```

`DISCORD_PROXY` overrides `HTTPS_PROXY` and the macOS system proxy auto-detection used by the gateway.

Then start the gateway:

```bash
Expand Down