1. Add **kwargs to the mocked init_stage_worker in the unit tests to prevent parameter-mismatch errors

natureofnature · natureofnature · commit 52cabf7bd33c · 2025-12-09T10:25:22.000+08:00
2. Fix the YAML stage config for the multi-connector setup

Signed-off-by: wzliu <wzliu@connect.hku.hk>
diff --git a/tests/test_omni_llm.py b/tests/test_omni_llm.py
@@ -104,6 +104,7 @@ def init_stage_worker(
         shm_threshold_bytes: int = 65536,
         ctx=None,
         batch_timeout: int = 10,
+        **kwargs,
     ):
         """Mock init_stage_worker: don't start real process, just send stage_ready message."""
         # Create a mock process object
diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni_multiconnector.yaml
@@ -5,13 +5,13 @@ stage_args:
   - stage_id: 0
     runtime:
       process: true            # Run this stage in a separate process
-      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "5"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       model_stage: thinker
       model_arch: Qwen2_5OmniForConditionalGeneration
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.scheduler.GPUARScheduler
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
       gpu_memory_utilization: 0.8
       enforce_eager: true  # Now we only support eager mode
       trust_remote_code: true
@@ -34,13 +34,13 @@ stage_args:
   - stage_id: 1
     runtime:
       process: true
-      devices: "1"
+      devices: "6"
       max_batch_size: 1
     engine_args:
       model_stage: talker
       model_arch: Qwen2_5OmniForConditionalGeneration
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.scheduler.GPUARScheduler
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
       gpu_memory_utilization: 0.8
       enforce_eager: true
       trust_remote_code: true
@@ -65,17 +65,18 @@ stage_args:
   - stage_id: 2
     runtime:
       process: true
-      devices: "2"            # Example: use a different GPU than the previous stage; use "0" if single GPU
+      devices: "7"            # Example: use a different GPU than the previous stage; use "0" if single GPU
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
       model_arch: Qwen2_5OmniForConditionalGeneration
-      worker_cls: vllm_omni.worker.gpu_diffusion_worker.GPUGenerationWorker
-      scheduler_cls: vllm_omni.core.sched.diffusion_scheduler.GPUGenerationScheduler
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
       gpu_memory_utilization: 0.3
       enforce_eager: true
       trust_remote_code: true
       enable_prefix_caching: false
+      max_num_batched_tokens: 32768
       engine_output_type: audio
     engine_input_source: [1]
     final_output: true