Skip to content

Commit 1ee365b

Browse files
authored
[Voice Agent] Fix text aggregation, eob handling, logging (#14951)
* fix text aggregation, eob handling, logging Signed-off-by: stevehuang52 <heh@nvidia.com> * improve text segmentation logic Signed-off-by: stevehuang52 <heh@nvidia.com> * improve text segmentation logic Signed-off-by: stevehuang52 <heh@nvidia.com> * revert default cfg Signed-off-by: stevehuang52 <heh@nvidia.com> * revert default config Signed-off-by: stevehuang52 <heh@nvidia.com> * pin stt frame type to interim when turn-taking module is enabled Signed-off-by: stevehuang52 <heh@nvidia.com> * update llm yamls to let server yaml control type and device Signed-off-by: stevehuang52 <heh@nvidia.com> * refactor diar Signed-off-by: stevehuang52 <heh@nvidia.com> * refactor diar Signed-off-by: stevehuang52 <heh@nvidia.com> * refactor ASR and add EOU latency and prob logging Signed-off-by: stevehuang52 <heh@nvidia.com> --------- Signed-off-by: stevehuang52 <heh@nvidia.com>
1 parent 021e187 commit 1ee365b

File tree

25 files changed

+460
-93
lines changed

25 files changed

+460
-93
lines changed

examples/voice_agent/client/package-lock.json

Lines changed: 59 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

examples/voice_agent/client/package.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,17 @@
1414
"devDependencies": {
1515
"@types/node": "^22.15.30",
1616
"@types/protobufjs": "^6.0.0",
17+
"@types/react": "^19.2.2",
18+
"@types/react-dom": "^19.2.2",
1719
"@vitejs/plugin-react-swc": "^3.10.1",
1820
"typescript": "^5.8.3",
1921
"vite": "^6.3.5"
2022
},
2123
"dependencies": {
2224
"@pipecat-ai/client-js": "^0.4.0",
2325
"@pipecat-ai/websocket-transport": "^0.4.1",
24-
"protobufjs": "^7.4.0"
26+
"protobufjs": "^7.4.0",
27+
"react": "^19.2.0",
28+
"react-dom": "^19.2.0"
2529
}
2630
}

examples/voice_agent/client/src/app.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ class WebsocketClientApp {
4646
private readonly serverConfigs = {
4747
websocket: {
4848
name: 'WebSocket Server',
49-
baseUrl: 'http://localhost:7860',
49+
baseUrl: `http://${window.location.hostname}:7860`,
5050
port: 8765
5151
},
5252
fastapi: {
5353
name: 'FastAPI Server',
54-
baseUrl: 'http://localhost:8000',
54+
baseUrl: `http://${window.location.hostname}:8000`,
5555
port: 8000
5656
}
5757
};
@@ -257,6 +257,7 @@ class WebsocketClientApp {
257257

258258
this.log('Initializing devices...');
259259
await this.rtviClient.initDevices();
260+
this.log('Devices initialized successfully');
260261

261262
this.log('Connecting to bot...');
262263
await this.rtviClient.connect();

examples/voice_agent/client/tsconfig.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
1212

1313
/* Language and Environment */
14-
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
15-
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
16-
// "jsx": "preserve", /* Specify what JSX code is generated. */
14+
"target": "ES2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
15+
"lib": ["ES2020", "DOM", "DOM.Iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
16+
"jsx": "react-jsx", /* Specify what JSX code is generated. */
1717
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
1818
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
1919
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
@@ -25,9 +25,9 @@
2525
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
2626

2727
/* Modules */
28-
"module": "commonjs", /* Specify what module code is generated. */
28+
"module": "ESNext", /* Specify what module code is generated. */
2929
// "rootDir": "./", /* Specify the root folder within your source files. */
30-
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
30+
"moduleResolution": "bundler", /* Specify how TypeScript looks up a file from a given module specifier. */
3131
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
3232
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
3333
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
@@ -41,7 +41,7 @@
4141
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
4242
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
4343
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
44-
// "resolveJsonModule": true, /* Enable importing .json files. */
44+
"resolveJsonModule": true, /* Enable importing .json files. */
4545
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
4646
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
4747

@@ -74,10 +74,10 @@
7474
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
7575

7676
/* Interop Constraints */
77-
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
77+
"isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
7878
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
7979
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
80-
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
80+
"allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
8181
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
8282
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
8383
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */

examples/voice_agent/server/backchannel_phrases.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
- "great"
1212
- "great thanks"
1313
- "ha ha"
14-
- "hi"
1514
- "hmm"
1615
- "humm"
1716
- "huh"

examples/voice_agent/server/bot_websocket_server.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@
2828
from pipecat.pipeline.runner import PipelineRunner
2929
from pipecat.pipeline.task import PipelineParams, PipelineTask
3030
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
31-
from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor
31+
from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIProcessor
3232
from pipecat.serializers.protobuf import ProtobufFrameSerializer
3333

34-
from nemo.agents.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService
34+
from nemo.agents.voice_agent.pipecat.processors.frameworks.rtvi import RTVIObserver
35+
from nemo.agents.voice_agent.pipecat.services.nemo.diar import NemoDiarService
3536
from nemo.agents.voice_agent.pipecat.services.nemo.llm import get_llm_service_from_config
36-
from nemo.agents.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService
37+
from nemo.agents.voice_agent.pipecat.services.nemo.stt import NemoSTTService
3738
from nemo.agents.voice_agent.pipecat.services.nemo.tts import KokoroTTSService, NeMoFastPitchHiFiGANTTSService
3839
from nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService
3940
from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import (
@@ -243,7 +244,9 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
243244
assistant_context_aggregator.reset()
244245
user_context_aggregator.set_messages(copy.deepcopy(original_messages))
245246
assistant_context_aggregator.set_messages(copy.deepcopy(original_messages))
246-
247+
text_aggregator.reset()
248+
if diar is not None:
249+
diar.reset()
247250
logger.info("Conversation context reset successfully")
248251
return True
249252
except Exception as e:
@@ -276,6 +279,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
276279

277280
pipeline = Pipeline(pipeline)
278281

282+
rtvi_text_aggregator = SimpleSegmentedTextAggregator("\n?!.", min_sentence_length=5)
279283
task = PipelineTask(
280284
pipeline,
281285
params=PipelineParams(
@@ -286,7 +290,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
286290
report_only_initial_ttfb=True,
287291
idle_timeout=None, # Disable idle timeout
288292
),
289-
observers=[RTVIObserver(rtvi)],
293+
observers=[RTVIObserver(rtvi, text_aggregator=rtvi_text_aggregator)],
290294
idle_timeout_secs=None,
291295
cancel_on_idle_timeout=False,
292296
)

examples/voice_agent/server/example_prompts/fast-bite.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Fast Bites Lunch Menu
22

3-
Burgers and Sandwiches
3+
Burgers and Sandwiches:
44
1. Classic Cheeseburger – $5.99
55
Juicy beef patty, cheddar cheese, pickles, ketchup & mustard on a toasted bun.
66
- Make it a double cheeseburger by adding another patty - $1.50
@@ -14,18 +14,18 @@ Combo Deals (includes small fries and fountain soda)
1414
5. Chicken Sandwich Combo – $9.49
1515
6. Veggie Wrap Combo – $8.49
1616

17-
Sides
17+
Sides:
1818
7. French Fries
1919
- Small - $2.49
2020
- Medium - $3.49
2121
- Large - $4.49
2222
8. Chicken Nuggets
23-
- 4 pcs - $3.29
24-
- 8 pcs - $5.99
25-
- 12 pcs - $8.99
26-
9. Side Salad - $2.99
23+
- 4 pieces - $3.29
24+
- 8 pieces - $5.99
25+
- 12 pieces - $8.99
26+
9. Side Salad - $2.99
2727

28-
Drinks
28+
Drinks:
2929
10. Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99
3030
11. Iced Tea or Lemonade – $2.29
3131
12. Bottled Water – $1.49

examples/voice_agent/server/server_configs/default.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ vad:
1616
stt:
1717
type: nemo # choices in ['nemo'] currently only NeMo is supported
1818
model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
19+
# model: "nvidia/parakeet_realtime_eou_120m-v1"
1920
model_config: "./server_configs/stt_configs/nemo_cache_aware_streaming.yaml"
2021
device: "cuda"
2122

@@ -41,12 +42,12 @@ llm:
4142
# model_config: "./server_configs/llm_configs/qwen2.5-7B.yaml"
4243
# model: "Qwen/Qwen3-8B"
4344
# model_config: "./server_configs/llm_configs/qwen3-8B.yaml"
45+
# model: meta-llama/Llama-3.1-8B-Instruct
4446
device: "cuda"
4547
enable_reasoning: false # it's best to turn-off reasoning for lowest latency
4648
# `system_prompt` is used as the system prompt to the LLM, please refer to the relevant LLM webpage for special functions like enabling/disabling thinking
4749
# system_prompt: /path/to/prompt.txt # or use path to a txt file that contains a long prompt, for example in `../example_prompts/fast_bite.txt`
4850
system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. Do not include any emoji in response."
49-
5051
tts:
5152
type: kokoro # choices in ['nemo', 'kokoro']
5253
model: "hexgrad/Kokoro-82M"

examples/voice_agent/server/server_configs/llm_configs/hf_llm_generic.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# This is an example config for setting up a generic HuggingFace LLM for a NeMo Voice Agent server.
22
# Please refer to https://github.com/NVIDIA-NeMo/NeMo/tree/main/examples/voice_agent/README.md for more details
33

4-
type: auto # choices in ['auto', 'hf', 'vllm']
4+
# type: auto # choices in ['auto', 'hf', 'vllm']
5+
# device: "cuda"
56
dtype: bfloat16 # torch.dtype for LLM
6-
device: "cuda"
77
system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt
88
system_prompt_suffix: "/no_think" # a string that would be appended to the system prompt, used to enable/disable thinking
99

examples/voice_agent/server/server_configs/llm_configs/llama3.1-8B-instruct.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# This is an example config for setting up the Llama-3.1-8B-Instruct model for a NeMo Voice Agent server.
22
# Please refer to https://github.com/NVIDIA-NeMo/NeMo/tree/main/examples/voice_agent/README.md for more details
33

4-
type: auto # choices in ['auto', 'hf', 'vllm']
4+
# type: auto # choices in ['auto', 'hf', 'vllm']
5+
# model: meta-llama/Llama-3.1-8B-Instruct
6+
# device: "cuda"
57
dtype: bfloat16 # torch.dtype for LLM
6-
model: meta-llama/Llama-3.1-8B-Instruct
7-
device: "cuda"
88
system_role: "system" # role for system prompt, set it to `user` for models that do not support system prompt
99
system_prompt_suffix: null # a string that would be appended to the system prompt, used to enable/disable thinking
1010

0 commit comments

Comments
 (0)