agentscope-ai · xieyxclack · Mar 17, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/console/src/api/modules/agent.ts b/console/src/api/modules/agent.ts
@@ -41,4 +41,45 @@ export const agentApi = {
       method: "PUT",
       body: JSON.stringify({ language }),
     }),
+
+  getAudioMode: () => request<{ audio_mode: string }>("/agent/audio-mode"), // Fetches the audio mode; no options are passed, so request()'s default HTTP method applies (presumably GET — confirm).
+
+  updateAudioMode: (audio_mode: string) => // Sends the given audio mode via PUT to the same path that getAudioMode requests.
+    request<{ audio_mode: string }>("/agent/audio-mode", {
+      method: "PUT",
+      body: JSON.stringify({ audio_mode }), // Payload shape: {"audio_mode": "<value>"}; accepted values are not visible here — confirm against the backend.
+    }),
+
+  getTranscriptionProviders: () => // Requests the provider list plus the id of the configured provider.
+    request<{
+      providers: { id: string; name: string; available: boolean }[]; // What makes `available` true is decided server-side and is not visible here.
+      configured_provider_id: string; // NOTE(review): typed as a plain string; how "nothing configured" is represented is not visible here — confirm.
+    }>("/agent/transcription-providers"),
+
+  updateTranscriptionProvider: (provider_id: string) => // Sends the chosen provider id via PUT.
+    request<{ provider_id: string }>("/agent/transcription-provider", { // Singular path, unlike the plural list path used by getTranscriptionProviders.
+      method: "PUT",
+      body: JSON.stringify({ provider_id }), // Payload shape: {"provider_id": "<id>"}.
+    }),
+
+  getTranscriptionProviderType: () => // Fetches the transcription provider type; no options are passed, so request()'s default HTTP method applies (presumably GET — confirm).
+    request<{ transcription_provider_type: string }>(
+      "/agent/transcription-provider-type",
+    ),
+
+  updateTranscriptionProviderType: (transcription_provider_type: string) => // Sends the given provider type via PUT to the same path that getTranscriptionProviderType requests.
+    request<{ transcription_provider_type: string }>(
+      "/agent/transcription-provider-type",
+      {
+        method: "PUT",
+        body: JSON.stringify({ transcription_provider_type }), // The UI strings suggest disabled / Whisper API / local Whisper options, but the exact wire values are not visible here — confirm against the backend.
+      },
+    ),
+
+  getLocalWhisperStatus: () => // Fetches the local Whisper status flags; no options are passed, so request()'s default HTTP method applies (presumably GET — confirm).
+    request<{
+      available: boolean; // NOTE(review): presumably true only when both flags below are true — confirm against the backend.
+      ffmpeg_installed: boolean;
+      whisper_installed: boolean;
+    }>("/agent/local-whisper-status"),
 };
diff --git a/console/src/layouts/MainLayout/index.tsx b/console/src/layouts/MainLayout/index.tsx
@@ -19,6 +19,7 @@ import ModelsPage from "../../pages/Settings/Models";
 import EnvironmentsPage from "../../pages/Settings/Environments";
 import SecurityPage from "../../pages/Settings/Security";
 import TokenUsagePage from "../../pages/Settings/TokenUsage";
+import VoiceTranscriptionPage from "../../pages/Settings/VoiceTranscription";
 import AgentsPage from "../../pages/Settings/Agents";
 
 const { Content } = Layout;
@@ -39,6 +40,7 @@ const pathToKey: Record<string, string> = {
   "/agent-config": "agent-config",
   "/security": "security",
   "/token-usage": "token-usage",
+  "/voice-transcription": "voice-transcription",
 };
 
 export default function MainLayout() {
@@ -86,6 +88,10 @@ export default function MainLayout() {
                 <Route path="/agent-config" element={<AgentConfigPage />} />
                 <Route path="/security" element={<SecurityPage />} />
                 <Route path="/token-usage" element={<TokenUsagePage />} />
+                <Route
+                  path="/voice-transcription"
+                  element={<VoiceTranscriptionPage />}
+                />
               </Routes>
             )}
           </div>

diff --git a/console/src/layouts/Sidebar.tsx b/console/src/layouts/Sidebar.tsx
@@ -36,6 +36,7 @@ import {
   Copy,
   Check,
   BarChart3,
+  Mic,
   Bot,
 } from "lucide-react";
 import api from "../api";
@@ -69,6 +70,7 @@ const KEY_TO_PATH: Record<string, string> = {
   "agent-config": "/agent-config",
   security: "/security",
   "token-usage": "/token-usage",
+  "voice-transcription": "/voice-transcription",
 };
 
 const UPDATE_MD: Record<string, string> = {
@@ -358,6 +360,11 @@ export default function Sidebar({ selectedKey }: SidebarProps) {
           label: t("nav.tokenUsage"),
           icon: <BarChart3 size={16} />,
         },
+        {
+          key: "voice-transcription",
+          label: t("nav.voiceTranscription"),
+          icon: <Mic size={16} />,
+        },
       ],
     },
   ];

diff --git a/console/src/locales/en.json b/console/src/locales/en.json
@@ -44,7 +44,42 @@
     "models": "Models",
     "environments": "Environments",
     "security": "Security",
-    "tokenUsage": "Token Usage"
+    "tokenUsage": "Token Usage",
+    "voiceTranscription": "Voice Transcription"
+  },
+  "voiceTranscription": {
+    "title": "Voice Transcription",
+    "description": "Configure how incoming audio and voice messages are handled.",
+    "loadFailed": "Failed to load audio mode settings",
+    "saveSuccess": "Audio mode saved",
+    "saveFailed": "Failed to save audio mode",
+    "audioModeLabel": "Audio Mode",
+    "audioModeDescription": "Choose how voice messages from channels (Discord, Telegram, etc.) are processed before being sent to the model.",
+    "modeAuto": "Auto (Recommended)",
+    "modeAutoDesc": "Transcribe audio to text using the selected transcription provider, then send the text to the model. If transcription is unavailable or disabled, a file-uploaded placeholder is shown instead. Audio is never sent directly to the model in this mode. Works with all models.",
+    "modeNative": "Native Audio",
+    "modeNativeDesc": "Send the audio file directly to the model without transcription. This is the only mode that sends audio to the model. Only works with specific audio-capable models (e.g. gpt-4o-audio). Most models do not support this and will reject the message.",
+    "ffmpegReady": "ffmpeg is installed. Audio conversion is available for native mode.",
+    "ffmpegMissing": "ffmpeg is not installed.",
+    "ffmpegMissingDesc": "Native audio mode requires ffmpeg to convert audio formats (e.g. .ogg to .wav). Install ffmpeg as a system package to enable this mode.",
+    "providerTypeLabel": "Transcription Provider",
+    "providerTypeDescription": "Choose the transcription backend. Select Disabled if you do not need voice transcription.",
+    "providerTypeDisabled": "Disabled",
+    "providerTypeDisabledDesc": "No transcription. Voice messages will show a file-uploaded placeholder.",
+    "providerTypeWhisperApi": "Whisper API",
+    "providerTypeWhisperApiDesc": "Use an OpenAI-compatible Whisper API endpoint from a configured provider (e.g. OpenAI, Ollama).",
+    "providerTypeLocalWhisper": "Local Whisper",
+    "providerTypeLocalWhisperDesc": "Run transcription locally using the openai-whisper Python library. Requires both ffmpeg and openai-whisper to be installed.",
+    "localWhisperReady": "Local Whisper is ready. Both ffmpeg and openai-whisper are installed.",
+    "localWhisperMissing": "Local Whisper is not ready. Missing dependencies must be installed.",
+    "localWhisperMissingDesc": "ffmpeg: {{ffmpeg}} | openai-whisper: {{whisper}}. Install missing dependencies: ffmpeg (system package) and openai-whisper (uv pip install openai-whisper, or install CoPaw with the [whisper] extra).",
+    "providerLabel": "Whisper API Provider",
+    "providerDescription": "Select which provider to use for audio transcription via the Whisper API. Only providers with a Whisper-compatible endpoint are shown.",
+    "providerPlaceholder": "Select a provider...",
+    "noProvidersWarning": "No transcription-capable providers found. Configure an OpenAI provider to enable voice transcription.",
+    "transcriptionInfoTitle": "How transcription works",
+    "transcriptionInfoDesc": "Whisper API transcription uses an OpenAI-compatible /v1/audio/transcriptions endpoint. This requires a configured provider with a Whisper-compatible endpoint — for example, an OpenAI provider. Select a specific provider above to enable transcription.",
+    "transcriptionInfoDescLocal": "Local Whisper transcription runs the openai-whisper library directly on your machine. It requires both ffmpeg (for audio decoding) and the openai-whisper Python package to be installed. No API key or network connection is needed. Install with: uv pip install 'copaw[whisper]'."
   },
   "workspace": {
     "title": "WorkSpace",

diff --git a/console/src/locales/ja.json b/console/src/locales/ja.json
@@ -41,6 +41,7 @@
     "security": "セキュリティ",
     "tokenUsage": "トークン使用量",
     "tools": "ツール",
+    "voiceTranscription": "音声文字起こし",
     "agents": "エージェント管理"
   },
   "agent": {
@@ -743,5 +744,39 @@
         "goToWhitelist": "ホワイトリストへ"
       }
     }
+  },
+  "voiceTranscription": {
+    "title": "音声文字起こし",
+    "description": "受信した音声メッセージの処理方法を設定します。",
+    "loadFailed": "音声モード設定の読み込みに失敗しました",
+    "saveSuccess": "音声モードを保存しました",
+    "saveFailed": "音声モードの保存に失敗しました",
+    "audioModeLabel": "音声モード",
+    "audioModeDescription": "チャンネル（Discord、Telegram など）からの音声メッセージをモデルに送信する前にどのように処理するかを選択します。",
+    "modeAuto": "自動（推奨）",
+    "modeAutoDesc": "選択した文字起こしプロバイダーで音声をテキストに変換してからモデルに送信します。文字起こしが無効または利用できない場合、ファイルアップロードのプレースホルダーが表示されます。このモードでは音声はモデルに直接送信されません。すべてのモデルで動作します。",
+    "modeNative": "ネイティブ音声",
+    "modeNativeDesc": "文字起こしせずに音声ファイルをモデルに直接送信します。音声をモデルに送信する唯一のモードです。音声対応モデル（例: gpt-4o-audio）でのみ動作します。ほとんどのモデルはこのモードをサポートしていません。",
+    "ffmpegReady": "ffmpeg がインストールされています。ネイティブモードの音声変換が利用可能です。",
+    "ffmpegMissing": "ffmpeg がインストールされていません。",
+    "ffmpegMissingDesc": "ネイティブ音声モードでは、音声形式の変換（.ogg から .wav など）に ffmpeg が必要です。このモードを有効にするには、ffmpeg をシステムパッケージとしてインストールしてください。",
+    "providerTypeLabel": "文字起こしプロバイダー",
+    "providerTypeDescription": "文字起こしバックエンドを選択します。音声文字起こしが不要な場合は「無効」を選択してください。",
+    "providerTypeDisabled": "無効",
+    "providerTypeDisabledDesc": "文字起こしなし。音声メッセージはファイルアップロードのプレースホルダーとして表示されます。",
+    "providerTypeWhisperApi": "Whisper API",
+    "providerTypeWhisperApiDesc": "設定済みプロバイダー（OpenAI、Ollama など）の OpenAI 互換 Whisper API エンドポイントを使用します。",
+    "providerTypeLocalWhisper": "ローカル Whisper",
+    "providerTypeLocalWhisperDesc": "ローカルにインストールされた openai-whisper Python ライブラリで文字起こしを実行します。ffmpeg と openai-whisper の両方のインストールが必要です。",
+    "localWhisperReady": "ローカル Whisper は準備完了です。ffmpeg と openai-whisper がインストールされています。",
+    "localWhisperMissing": "ローカル Whisper は準備ができていません。不足している依存関係をインストールしてください。",
+    "localWhisperMissingDesc": "ffmpeg: {{ffmpeg}} | openai-whisper: {{whisper}}。不足している依存関係をインストールしてください：ffmpeg（システムパッケージ）と openai-whisper（uv pip install openai-whisper、または CoPaw を [whisper] エクストラ付きでインストール）。",
+    "providerLabel": "Whisper API プロバイダー",
+    "providerDescription": "Whisper API による音声文字起こしに使用するプロバイダーを選択します。Whisper 対応エンドポイントを持つプロバイダーのみ表示されます。",
+    "providerPlaceholder": "プロバイダーを選択...",
+    "noProvidersWarning": "文字起こし対応のプロバイダーが見つかりません。音声文字起こしを有効にするには、OpenAI プロバイダーを設定してください。",
+    "transcriptionInfoTitle": "文字起こしの仕組み",
+    "transcriptionInfoDesc": "Whisper API 文字起こしは OpenAI 互換の /v1/audio/transcriptions エンドポイントを使用します。Whisper 対応エンドポイントを持つプロバイダー（例: OpenAI）の設定が必要です。上記で具体的なプロバイダーを選択して文字起こしを有効にしてください。",
+    "transcriptionInfoDescLocal": "ローカル Whisper 文字起こしは、お使いのマシン上で直接 openai-whisper ライブラリを実行します。ffmpeg（音声デコード用）と openai-whisper Python パッケージの両方のインストールが必要です。API キーやネットワーク接続は不要です。インストール：uv pip install 'copaw[whisper]'。"
   }
 }
diff --git a/console/src/locales/ru.json b/console/src/locales/ru.json
@@ -41,6 +41,7 @@
     "environments": "Окружения",
     "security": "Безопасность",
     "tokenUsage": "Использование токенов",
+    "voiceTranscription": "Транскрипция голоса",
     "agents": "Управление агентами"
   },
   "agent": {
@@ -748,5 +749,39 @@
         "goToWhitelist": "Перейти к белому списку"
       }
     }
+  },
+  "voiceTranscription": {
+    "title": "Транскрипция голоса",
+    "description": "Настройте обработку входящих аудио- и голосовых сообщений.",
+    "loadFailed": "Не удалось загрузить настройки аудиорежима",
+    "saveSuccess": "Аудиорежим сохранён",
+    "saveFailed": "Не удалось сохранить аудиорежим",
+    "audioModeLabel": "Аудиорежим",
+    "audioModeDescription": "Выберите, как голосовые сообщения из каналов (Discord, Telegram и др.) обрабатываются перед отправкой модели.",
+    "modeAuto": "Авто (рекомендуется)",
+    "modeAutoDesc": "Транскрибировать аудио в текст через выбранный провайдер, затем отправить текст модели. Если транскрипция отключена или недоступна, отображается заглушка «файл загружен». В этом режиме аудио не отправляется модели напрямую. Работает со всеми моделями.",
+    "modeNative": "Нативное аудио",
+    "modeNativeDesc": "Отправлять аудиофайл модели напрямую без транскрипции. Это единственный режим, передающий аудио модели. Работает только с моделями, поддерживающими аудио (например, gpt-4o-audio). Большинство моделей не поддерживают этот режим.",
+    "ffmpegReady": "ffmpeg установлен. Конвертация аудио для нативного режима доступна.",
+    "ffmpegMissing": "ffmpeg не установлен.",
+    "ffmpegMissingDesc": "Нативный аудиорежим требует ffmpeg для конвертации аудиоформатов (например, .ogg в .wav). Установите ffmpeg как системный пакет для включения этого режима.",
+    "providerTypeLabel": "Провайдер транскрипции",
+    "providerTypeDescription": "Выберите бэкенд транскрипции. Если голосовая транскрипция не нужна, выберите «Отключено».",
+    "providerTypeDisabled": "Отключено",
+    "providerTypeDisabledDesc": "Без транскрипции. Голосовые сообщения будут отображаться как заглушка «файл загружен».",
+    "providerTypeWhisperApi": "Whisper API",
+    "providerTypeWhisperApiDesc": "Использовать OpenAI-совместимый Whisper API от настроенного провайдера (например, OpenAI, Ollama).",
+    "providerTypeLocalWhisper": "Локальный Whisper",
+    "providerTypeLocalWhisperDesc": "Запускать транскрипцию локально с помощью библиотеки openai-whisper. Требуются ffmpeg и openai-whisper.",
+    "localWhisperReady": "Локальный Whisper готов к работе. ffmpeg и openai-whisper установлены.",
+    "localWhisperMissing": "Локальный Whisper не готов. Необходимо установить недостающие зависимости.",
+    "localWhisperMissingDesc": "ffmpeg: {{ffmpeg}} | openai-whisper: {{whisper}}. Установите недостающие зависимости: ffmpeg (системный пакет) и openai-whisper (uv pip install openai-whisper или установите CoPaw с опцией [whisper]).",
+    "providerLabel": "Провайдер Whisper API",
+    "providerDescription": "Выберите провайдера для транскрипции аудио через Whisper API. Отображаются только провайдеры с совместимым эндпоинтом.",
+    "providerPlaceholder": "Выберите провайдера...",
+    "noProvidersWarning": "Не найдено провайдеров для транскрипции. Настройте провайдер OpenAI для включения голосовой транскрипции.",
+    "transcriptionInfoTitle": "Как работает транскрипция",
+    "transcriptionInfoDesc": "Транскрипция через Whisper API использует OpenAI-совместимый эндпоинт /v1/audio/transcriptions. Необходим провайдер с поддержкой Whisper (например, OpenAI). Выберите конкретного провайдера выше для включения транскрипции.",
+    "transcriptionInfoDescLocal": "Локальная транскрипция Whisper запускает библиотеку openai-whisper непосредственно на вашем устройстве. Требуются ffmpeg (для декодирования аудио) и пакет openai-whisper. API-ключ и подключение к интернету не нужны. Установка: uv pip install 'copaw[whisper]'."
   }
 }