Skip to content

Commit 98288c2

Browse files
authored
feat(voice-agent-example): implement voice agent pipeline integrating STT, Agent, and TTS via spring-ai-alibaba-extensions. (#445)
feat(voice-agent-example): implement voice agent pipeline integrating STT, Agent, and TTS via spring-ai-alibaba-extensions
1 parent 0fdea8f commit 98288c2

22 files changed

Lines changed: 1733 additions & 1 deletion

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
<!-- Spring AI Alibaba -->
4949
<!-- Install Spring AI Alibaba in your local. -->
5050
<spring-ai-alibaba.version>1.1.0.0</spring-ai-alibaba.version>
51-
<spring-ai-alibaba-extensions.version>1.1.2.0</spring-ai-alibaba-extensions.version>
51+
<spring-ai-alibaba-extensions.version>1.1.2.1</spring-ai-alibaba-extensions.version>
5252

5353
<!-- maven plugin -->
5454
<maven-deploy-plugin.version>3.1.1</maven-deploy-plugin.version>

spring-ai-alibaba-agent-example/pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
<module>sql-agent-example</module>
4343
<module>adk-samples-llm-auditor</module>
4444
<module>voice-agent-dashscope-sdk-example</module>
45+
<module>voice-agent-example</module>
4546
</modules>
4647

4748
<build>
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Voice Agent Example
2+
3+
基于 Spring AI Alibaba 的语音对话 Agent 示例,实现 Push-to-Talk 模式的 Audio → STT → Agent → TTS → Audio 全链路。
4+
5+
## 功能
6+
7+
- **Push-to-Talk 语音交互**:WebSocket 双向通信,推送录音、流式播放回复
8+
- **打断控制**:AI 播放中随时点击打断
9+
- **Reactive Pipeline**:STT → Agent → TTS 基于 Reactor 的流式编排
10+
- **工具调用**:示例为航班预订助手(查询/改签)
11+
12+
## 快速开始
13+
14+
### 1. 配置 API Key
15+
16+
```bash
17+
# Linux/Mac
18+
export AI_DASHSCOPE_API_KEY=sk-xxx
19+
20+
# Windows PowerShell
21+
$env:AI_DASHSCOPE_API_KEY="sk-xxx"
22+
```
23+
24+
### 2. 启动
25+
26+
```bash
27+
mvn spring-boot:run
28+
```
29+
30+
### 3. 使用
31+
32+
浏览器打开 `http://localhost:8081`,点击 🎤 录音 → 再次点击发送 → AI 语音回复。
33+
34+
## 架构
35+
36+
```
37+
Browser ──PCM──► VoiceWebSocketHandler ──► VoiceAgentPipeline
38+
◄──PCM+JSON──┘ │
39+
┌──────┴────────────────────────┐
40+
│ STT (paraformer-realtime-v2) │
41+
│ ↓ │
42+
│ ReactAgent + Tools │
43+
│ ↓ │
44+
│ TTS (cosyvoice-v1) │
45+
└───────────────────────────────┘
46+
```
47+
48+
## 项目结构
49+
50+
```
51+
voice-agent-example/src/main/java/.../voice/
52+
├── config/
53+
│ ├── VoiceAgentConfiguration.java # ReactAgent + Tools 组装
54+
│ └── WebSocketConfig.java # WebSocket 端点注册
55+
├── controller/
56+
│ ├── PageController.java # 前端路由
57+
│ └── VoiceWebSocketHandler.java # WebSocket 消息处理
58+
├── service/
59+
│ ├── VoiceAgentPipeline.java # 核心管道 STT→Agent→TTS
60+
│ └── VoiceAgentService.java # ReactAgent 调用封装
61+
├── event/ # Pipeline 事件(sealed interface)
62+
└── tools/ # 航班查询/改签工具
63+
```
64+
65+
## 技术栈
66+
67+
| 组件 | 实现 |
68+
|------|--------------------------------|
69+
| 框架 | Spring Boot 3.x |
70+
| AI 集成 | Spring AI Alibaba (ReactAgent) |
71+
| 流式编排 | Project Reactor |
72+
| 实时通信 | WebSocket (PCM 16kHz) |
73+
74+
## 注意事项
75+
76+
- 端口 `8081`,可在 `application.yml` 修改
77+
- 浏览器麦克风需 HTTPS 或 localhost
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5+
<modelVersion>4.0.0</modelVersion>
6+
<parent>
7+
<groupId>com.alibaba.cloud.ai</groupId>
8+
<artifactId>spring-ai-alibaba-agent-example</artifactId>
9+
<version>${revision}</version>
10+
<relativePath>../pom.xml</relativePath>
11+
</parent>
12+
13+
<artifactId>voice-agent-example</artifactId>
14+
<name>Spring AI Alibaba Voice Agent Example</name>
15+
<description>Voice Agent Example - STT > Agent > TTS Pipeline</description>
16+
17+
<properties>
18+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
19+
</properties>
20+
21+
<dependencies>
22+
<dependency>
23+
<groupId>org.springframework.boot</groupId>
24+
<artifactId>spring-boot-starter-web</artifactId>
25+
</dependency>
26+
27+
<dependency>
28+
<groupId>org.springframework.boot</groupId>
29+
<artifactId>spring-boot-starter-websocket</artifactId>
30+
</dependency>
31+
32+
<!-- Spring AI Alibaba -->
33+
<dependency>
34+
<groupId>com.alibaba.cloud.ai</groupId>
35+
<artifactId>spring-ai-alibaba-agent-framework</artifactId>
36+
</dependency>
37+
<dependency>
38+
<groupId>com.alibaba.cloud.ai</groupId>
39+
<artifactId>spring-ai-alibaba-starter-dashscope</artifactId>
40+
</dependency>
41+
42+
<!-- Thymeleaf for HTML templates -->
43+
<dependency>
44+
<groupId>org.springframework.boot</groupId>
45+
<artifactId>spring-boot-starter-thymeleaf</artifactId>
46+
</dependency>
47+
48+
<dependency>
49+
<groupId>org.springframework.boot</groupId>
50+
<artifactId>spring-boot-starter-test</artifactId>
51+
<scope>test</scope>
52+
</dependency>
53+
54+
</dependencies>
55+
56+
<build>
57+
<plugins>
58+
<plugin>
59+
<groupId>org.springframework.boot</groupId>
60+
<artifactId>spring-boot-maven-plugin</artifactId>
61+
</plugin>
62+
</plugins>
63+
</build>
64+
65+
</project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright 2026-2027 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.cloud.alibaba.ai.example.agent.voice;
17+
18+
import org.springframework.boot.SpringApplication;
19+
import org.springframework.boot.autoconfigure.SpringBootApplication;
20+
21+
/**
22+
* Voice agent bootstrap: Audio -> STT -> Agent -> TTS -> Audio.
23+
*
24+
* @author buvidk
25+
* @since 2026-02-12
26+
*/
27+
@SpringBootApplication
28+
public class Application {
29+
30+
public static void main(String[] args) {
31+
SpringApplication.run(Application.class, args);
32+
}
33+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2026-2027 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.cloud.alibaba.ai.example.agent.voice.config;
17+
18+
import com.alibaba.cloud.ai.graph.agent.ReactAgent;
19+
import com.alibaba.cloud.ai.graph.checkpoint.savers.MemorySaver;
20+
import com.alibaba.cloud.ai.graph.exception.GraphStateException;
21+
import com.cloud.alibaba.ai.example.agent.voice.tools.BookingTool;
22+
import com.cloud.alibaba.ai.example.agent.voice.tools.FlightChangeTool;
23+
import org.springframework.ai.chat.model.ChatModel;
24+
import org.springframework.context.annotation.Bean;
25+
import org.springframework.context.annotation.Configuration;
26+
27+
/**
28+
* Voice Agent Configuration
29+
*
30+
* @author buvidk
31+
* @since 2026-02-12
32+
*/
33+
@Configuration
34+
public class VoiceAgentConfiguration {
35+
36+
private final ChatModel chatModel;
37+
private final BookingTool bookingTool;
38+
private final FlightChangeTool flightChangeTool;
39+
40+
public VoiceAgentConfiguration(ChatModel chatModel,
41+
BookingTool bookingTool,
42+
FlightChangeTool flightChangeTool) {
43+
this.chatModel = chatModel;
44+
this.bookingTool = bookingTool;
45+
this.flightChangeTool = flightChangeTool;
46+
}
47+
48+
@Bean
49+
public ReactAgent voiceReactAgent() throws GraphStateException {
50+
return ReactAgent.builder()
51+
.name("voice-assistant")
52+
.description("""
53+
你是一个专业的航空公司语音助手�?
54+
55+
你的能力�?
56+
1. 查询航班预订详情
57+
2. 更改航班日期
58+
59+
重要输出规则�?
60+
- 只用纯文本,不要�?Markdown、列表符号或表情符号
61+
- 保持回复简短,最�?-3句话
62+
- 用自然口语化的中文回复,像电话交流一�?
63+
- 如果用户没有提供预订号,请礼貌地询问
64+
""")
65+
.model(chatModel)
66+
.saver(new MemorySaver())
67+
.tools(bookingTool.toolCallback(), flightChangeTool.toolCallback())
68+
.build();
69+
}
70+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Copyright 2026-2027 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.cloud.alibaba.ai.example.agent.voice.config;
17+
18+
import com.cloud.alibaba.ai.example.agent.voice.controller.VoiceWebSocketHandler;
19+
import org.springframework.context.annotation.Bean;
20+
import org.springframework.context.annotation.Configuration;
21+
import org.springframework.web.socket.config.annotation.EnableWebSocket;
22+
import org.springframework.web.socket.config.annotation.WebSocketConfigurer;
23+
import org.springframework.web.socket.config.annotation.WebSocketHandlerRegistry;
24+
import org.springframework.web.socket.server.standard.ServletServerContainerFactoryBean;
25+
26+
/**
27+
* WebSocket Configuration
28+
*
29+
* @author buvidk
30+
* @since 2026-02-12
31+
*/
32+
@Configuration
33+
@EnableWebSocket
34+
public class WebSocketConfig implements WebSocketConfigurer {
35+
36+
private final VoiceWebSocketHandler handler;
37+
38+
public WebSocketConfig(VoiceWebSocketHandler handler) {
39+
this.handler = handler;
40+
}
41+
42+
@Override
43+
public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) {
44+
registry.addHandler(handler, "/ws/voice").setAllowedOrigins("*");
45+
}
46+
47+
@Bean
48+
public ServletServerContainerFactoryBean createWebSocketContainer() {
49+
ServletServerContainerFactoryBean container = new ServletServerContainerFactoryBean();
50+
container.setMaxTextMessageBufferSize(1024 * 1024);
51+
container.setMaxBinaryMessageBufferSize(1024 * 1024);
52+
return container;
53+
}
54+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Copyright 2026-2027 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.cloud.alibaba.ai.example.agent.voice.controller;
17+
18+
import org.springframework.stereotype.Controller;
19+
import org.springframework.web.bind.annotation.GetMapping;
20+
21+
/**
22+
* Page Controller for serving frontend pages
23+
*
24+
* @author buvidk
25+
* @since 2026-02-12
26+
*/
27+
@Controller
28+
public class PageController {
29+
30+
@GetMapping("/")
31+
public String index() {
32+
return "index";
33+
}
34+
}

0 commit comments

Comments
 (0)