Skip to content

Commit 7b6cf52

Browse files
RichardLiuda authored and KBVsent committed
feat: add xiaomi MiMo TTS & STT providers (AstrBotDevs#6643)
* feat: add mimo tts provider support
* fix: handle empty mimo tts choices
* feat: add mimo stt provider support
1 parent 35c76d8 commit 7b6cf52

11 files changed

Lines changed: 673 additions & 3 deletions

File tree

astrbot/core/config/default.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1451,6 +1451,20 @@ class ChatProviderTemplate(TypedDict):
14511451
"model": "whisper-1",
14521452
"proxy": "",
14531453
},
1454+
"MiMo STT(API)": {
1455+
"id": "mimo_stt",
1456+
"provider": "mimo",
1457+
"type": "mimo_stt_api",
1458+
"provider_type": "speech_to_text",
1459+
"enable": False,
1460+
"api_key": "",
1461+
"api_base": "https://api.xiaomimimo.com/v1",
1462+
"model": "mimo-v2-omni",
1463+
"mimo-stt-system-prompt": "You are a speech transcription assistant. Transcribe the spoken content from the audio exactly and return only the transcription text.",
1464+
"mimo-stt-user-prompt": "Please transcribe the content of the audio and return only the transcription text.",
1465+
"timeout": "20",
1466+
"proxy": "",
1467+
},
14541468
"Whisper(Local)": {
14551469
"provider": "openai",
14561470
"type": "openai_whisper_selfhost",
@@ -1481,6 +1495,23 @@ class ChatProviderTemplate(TypedDict):
14811495
"timeout": "20",
14821496
"proxy": "",
14831497
},
1498+
"MiMo TTS(API)": {
1499+
"id": "mimo_tts",
1500+
"type": "mimo_tts_api",
1501+
"provider": "mimo",
1502+
"provider_type": "text_to_speech",
1503+
"enable": False,
1504+
"api_key": "",
1505+
"api_base": "https://api.xiaomimimo.com/v1",
1506+
"model": "mimo-v2-tts",
1507+
"mimo-tts-voice": "mimo_default",
1508+
"mimo-tts-format": "wav",
1509+
"mimo-tts-style-prompt": "",
1510+
"mimo-tts-dialect": "",
1511+
"mimo-tts-seed-text": "Hello, MiMo, have you had lunch?",
1512+
"timeout": "20",
1513+
"proxy": "",
1514+
},
14841515
"Genie TTS": {
14851516
"id": "genie_tts",
14861517
"provider": "genie_tts",
@@ -2329,11 +2360,46 @@ class ChatProviderTemplate(TypedDict):
23292360
"type": "int",
23302361
"hint": "超时时间,单位为秒。",
23312362
},
2363+
"mimo-stt-system-prompt": {
2364+
"description": "系统提示词",
2365+
"type": "string",
2366+
"hint": "用于指导 MiMo STT 转录行为的 system prompt。",
2367+
},
2368+
"mimo-stt-user-prompt": {
2369+
"description": "用户提示词",
2370+
"type": "string",
2371+
"hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。",
2372+
},
23322373
"openai-tts-voice": {
23332374
"description": "voice",
23342375
"type": "string",
23352376
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'",
23362377
},
2378+
"mimo-tts-voice": {
2379+
"description": "音色",
2380+
"type": "string",
2381+
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
2382+
},
2383+
"mimo-tts-format": {
2384+
"description": "输出格式",
2385+
"type": "string",
2386+
"hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。",
2387+
},
2388+
"mimo-tts-style-prompt": {
2389+
"description": "风格提示词",
2390+
"type": "string",
2391+
"hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
2392+
},
2393+
"mimo-tts-dialect": {
2394+
"description": "方言",
2395+
"type": "string",
2396+
"hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
2397+
},
2398+
"mimo-tts-seed-text": {
2399+
"description": "种子文本",
2400+
"type": "string",
2401+
"hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
2402+
},
23372403
"fishaudio-tts-character": {
23382404
"description": "character",
23392405
"type": "string",

astrbot/core/provider/manager.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,10 @@ def dynamic_import_provider(self, type: str) -> None:
391391
from .sources.whisper_api_source import (
392392
ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
393393
)
394+
case "mimo_stt_api":
395+
from .sources.mimo_stt_api_source import (
396+
ProviderMiMoSTTAPI as ProviderMiMoSTTAPI,
397+
)
394398
case "openai_whisper_selfhost":
395399
from .sources.whisper_selfhosted_source import (
396400
ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost,
@@ -403,6 +407,10 @@ def dynamic_import_provider(self, type: str) -> None:
403407
from .sources.openai_tts_api_source import (
404408
ProviderOpenAITTSAPI as ProviderOpenAITTSAPI,
405409
)
410+
case "mimo_tts_api":
411+
from .sources.mimo_tts_api_source import (
412+
ProviderMiMoTTSAPI as ProviderMiMoTTSAPI,
413+
)
406414
case "genie_tts":
407415
from .sources.genie_tts import (
408416
GenieTTSProvider as GenieTTSProvider,
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import base64
2+
import uuid
3+
from pathlib import Path
4+
from urllib.parse import urlparse
5+
6+
import httpx
7+
8+
from astrbot import logger
9+
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
10+
from astrbot.core.utils.io import download_file
11+
from astrbot.core.utils.tencent_record_helper import (
12+
convert_to_pcm_wav,
13+
tencent_silk_to_wav,
14+
)
15+
16+
# Default base URL for MiMo API requests.
DEFAULT_MIMO_API_BASE = "https://api.xiaomimimo.com/v1"

# Text-to-speech defaults.
DEFAULT_MIMO_TTS_MODEL = "mimo-v2-tts"
DEFAULT_MIMO_TTS_VOICE = "mimo_default"
DEFAULT_MIMO_TTS_SEED_TEXT = "Hello, MiMo, have you had lunch?"

# Speech-to-text defaults.
DEFAULT_MIMO_STT_MODEL = "mimo-v2-omni"
DEFAULT_MIMO_STT_SYSTEM_PROMPT = (
    "You are a speech transcription assistant. "
    "Transcribe the spoken content from the audio exactly "
    "and return only the transcription text."
)
DEFAULT_MIMO_STT_USER_PROMPT = (
    "Please transcribe the content of the audio "
    "and return only the transcription text."
)
28+
29+
30+
class MiMoAPIError(Exception):
    """Raised when a MiMo API request fails or returns an unusable response."""
32+
33+
34+
def normalize_timeout(timeout: int | str | None) -> int | None:
35+
if timeout in (None, ""):
36+
return None
37+
if isinstance(timeout, str):
38+
return int(timeout)
39+
return timeout
40+
41+
42+
def build_headers(api_key: str) -> dict[str, str]:
    """Return JSON request headers, adding a Bearer token when a key is given."""
    auth_header = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    return {"Content-Type": "application/json", **auth_header}
47+
48+
49+
def get_temp_dir() -> Path:
    """Return the AstrBot temp directory as a ``Path``, creating it if needed."""
    directory = Path(get_astrbot_temp_path())
    # Create missing parents too; an already existing directory is not an error.
    directory.mkdir(exist_ok=True, parents=True)
    return directory
53+
54+
55+
def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
    """Build an ``httpx.AsyncClient`` that follows redirects.

    Args:
        timeout: Timeout passed to the client; ``None`` is passed through as is.
        proxy: Proxy URL. When empty, no ``proxy`` argument is passed at all.

    Returns:
        A new ``httpx.AsyncClient``.
    """
    if not proxy:
        return httpx.AsyncClient(timeout=timeout, follow_redirects=True)

    logger.info("[MiMo API] Using proxy: %s", proxy)
    return httpx.AsyncClient(timeout=timeout, follow_redirects=True, proxy=proxy)
64+
65+
66+
def build_api_url(api_base: str) -> str:
    """Return the chat-completions endpoint URL for ``api_base``.

    Trailing slashes are removed first. If the result already ends with
    ``/chat/completions`` it is returned unchanged; otherwise that path is
    appended.
    """
    endpoint = "/chat/completions"
    base = api_base.rstrip("/")
    return base if base.endswith(endpoint) else base + endpoint
71+
72+
73+
async def _detect_audio_format(file_path: Path) -> str | None:
74+
silk_header = b"SILK"
75+
amr_header = b"#!AMR"
76+
77+
try:
78+
with file_path.open("rb") as file:
79+
file_header = file.read(8)
80+
except FileNotFoundError:
81+
return None
82+
83+
if silk_header in file_header:
84+
return "silk"
85+
if amr_header in file_header:
86+
return "amr"
87+
return None
88+
89+
90+
async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
    """Load an audio source and return its content as base64 text.

    Remote sources (``http://`` / ``https://``) are downloaded into the temp
    directory first. Files with an ``.amr`` / ``.silk`` suffix, or downloaded
    from the Tencent multimedia host, are probed and converted to wav when
    they turn out to be silk or amr.

    Args:
        audio_source: Local file path or HTTP(S) URL of the audio.

    Returns:
        A tuple ``(encoded_audio, cleanup_paths)``: the base64-encoded file
        content and the temporary files created along the way, which the
        caller is expected to remove (see ``cleanup_files``).

    Raises:
        FileNotFoundError: If the (possibly downloaded) source file is missing.

    On any failure the temporary files created so far are removed before the
    exception propagates, because the caller never receives ``cleanup_paths``
    in that case.
    """
    cleanup_paths: list[Path] = []
    source_path = Path(audio_source)
    is_remote = audio_source.startswith(("http://", "https://"))
    is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_source

    try:
        if is_remote:
            parsed_url = urlparse(audio_source)
            suffix = Path(parsed_url.path).suffix or ".input"
            download_path = (
                get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}{suffix}"
            )
            # Register before downloading so a partially written file is
            # cleaned up when the download itself fails.
            cleanup_paths.append(download_path)
            await download_file(audio_source, str(download_path))
            source_path = download_path

        if not source_path.exists():
            raise FileNotFoundError(f"File does not exist: {source_path}")

        if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
            file_format = await _detect_audio_format(source_path)
            if file_format in {"silk", "amr"}:
                converted_path = (
                    get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}.wav"
                )
                cleanup_paths.append(converted_path)
                if file_format == "silk":
                    logger.info("Converting silk file to wav for MiMo STT...")
                    await tencent_silk_to_wav(str(source_path), str(converted_path))
                else:
                    logger.info("Converting amr file to wav for MiMo STT...")
                    await convert_to_pcm_wav(str(source_path), str(converted_path))
                source_path = converted_path

        encoded_audio = base64.b64encode(source_path.read_bytes()).decode("utf-8")
    except BaseException:
        # BaseException so task cancellation also triggers the cleanup.
        cleanup_files(cleanup_paths)
        raise

    return encoded_audio, cleanup_paths
122+
123+
124+
def cleanup_files(paths: list[Path]) -> None:
    """Delete the given temporary files on a best-effort basis.

    Missing files are ignored. Any other failure is logged as a warning and
    the remaining paths are still processed.
    """
    for temp_file in paths:
        try:
            temp_file.unlink(missing_ok=True)
        except Exception as error:
            logger.warning(
                "Failed to remove temporary MiMo file %s: %s", temp_file, error
            )
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from ..entities import ProviderType
2+
from ..provider import STTProvider
3+
from ..register import register_provider_adapter
4+
from .mimo_api_common import (
5+
DEFAULT_MIMO_API_BASE,
6+
DEFAULT_MIMO_STT_MODEL,
7+
DEFAULT_MIMO_STT_SYSTEM_PROMPT,
8+
DEFAULT_MIMO_STT_USER_PROMPT,
9+
MiMoAPIError,
10+
build_api_url,
11+
build_headers,
12+
cleanup_files,
13+
create_http_client,
14+
normalize_timeout,
15+
prepare_audio_input,
16+
)
17+
18+
19+
@register_provider_adapter(
    "mimo_stt_api",
    "MiMo STT API",
    provider_type=ProviderType.SPEECH_TO_TEXT,
)
class ProviderMiMoSTTAPI(STTProvider):
    """Speech-to-text provider that transcribes audio via the MiMo chat API.

    The audio is sent base64-encoded as an ``input_audio`` content part of a
    chat-completions request, together with configurable system and user
    prompts; the reply text is returned as the transcription.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read the provider configuration and create the HTTP client.

        Args:
            provider_config: Provider entry from the config. Keys read here:
                ``api_key``, ``api_base``, ``proxy``, ``timeout``, ``model``,
                ``mimo-stt-system-prompt`` and ``mimo-stt-user-prompt``.
            provider_settings: Passed through to the base class.
        """
        super().__init__(provider_config, provider_settings)
        self.chosen_api_key = provider_config.get("api_key", "")
        # Fall back to the default when the key is missing OR left empty:
        # an empty base would produce the invalid URL "/chat/completions".
        self.api_base = provider_config.get("api_base") or DEFAULT_MIMO_API_BASE
        self.proxy = provider_config.get("proxy", "")
        self.timeout = normalize_timeout(provider_config.get("timeout", 20))
        self.system_prompt = provider_config.get(
            "mimo-stt-system-prompt",
            DEFAULT_MIMO_STT_SYSTEM_PROMPT,
        )
        self.user_prompt = provider_config.get(
            "mimo-stt-user-prompt",
            DEFAULT_MIMO_STT_USER_PROMPT,
        )
        # Same empty-value fallback as api_base: an empty model name is unusable.
        self.set_model(provider_config.get("model") or DEFAULT_MIMO_STT_MODEL)
        self.client = create_http_client(self.timeout, self.proxy)

    @staticmethod
    def _extract_transcription(data: object) -> str:
        """Pull the transcription text out of a decoded API response.

        Raises:
            MiMoAPIError: If the response has no non-empty string at
                ``choices[0].message.content``, whatever its actual shape.
        """
        choices = data.get("choices") if isinstance(data, dict) else None
        first_choice = choices[0] if isinstance(choices, list) and choices else None
        # "message" may be absent or null, so never chain .get() on it blindly.
        message = first_choice.get("message") if isinstance(first_choice, dict) else None
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, str) or not content.strip():
            raise MiMoAPIError(f"MiMo STT API returned empty transcription: {data}")
        return content.strip()

    async def get_text(self, audio_url: str) -> str:
        """Transcribe the audio at ``audio_url`` and return the text.

        Args:
            audio_url: Local path or HTTP(S) URL of the audio to transcribe.

        Returns:
            The transcription with surrounding whitespace removed.

        Raises:
            MiMoAPIError: If the request fails with an HTTP error status, the
                body is not valid JSON, or no transcription text is returned.
        """
        # Raw base64 of the audio file (not a data URL).
        audio_base64, cleanup_paths = await prepare_audio_input(audio_url)

        try:
            payload = {
                "model": self.model_name,
                "messages": [
                    {
                        "role": "system",
                        "content": self.system_prompt,
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": audio_base64,
                                },
                            },
                            {
                                "type": "text",
                                "text": self.user_prompt,
                            },
                        ],
                    },
                ],
                "max_completion_tokens": 1024,
            }

            response = await self.client.post(
                build_api_url(self.api_base),
                headers=build_headers(self.chosen_api_key),
                json=payload,
            )
            try:
                response.raise_for_status()
            except Exception as exc:
                # Truncate the body so a large error page does not flood logs.
                error_text = response.text[:1024]
                raise MiMoAPIError(
                    f"MiMo STT API request failed: HTTP {response.status_code}, response: {error_text}"
                ) from exc

            try:
                data = response.json()
            except ValueError as exc:
                # JSON decode errors are ValueError subclasses.
                raise MiMoAPIError(
                    f"MiMo STT API returned a non-JSON response: {response.text[:1024]}"
                ) from exc

            return self._extract_transcription(data)
        finally:
            # Temporary download/conversion files are removed on every path.
            cleanup_files(cleanup_paths)

    async def terminate(self):
        """Close the underlying HTTP client, if one exists."""
        if self.client:
            await self.client.aclose()

0 commit comments

Comments
 (0)