vllm-project · vllm-bot · Feb 26, 2026 · Dec 2, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/tests/basic_correctness/test_v2_offload.py b/tests/basic_correctness/test_v2_offload.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Test V2 offloading correctness with DeepSeek V2 model."""
+
+from ..utils import compare_two_settings
+
+
+def test_v2_offload_deepseek():
+    """Test V2 CPU offloading with DeepSeek-V2-Lite.
+
+    Compares outputs between:
+    1. Baseline (no offloading)
+    2. V2 offloading (group_size=8, num_in_group=2, prefetch_step=1)
+
+    This tests the advanced offloading with prefetching on a MoE model.
+    """
+    compare_two_settings(
+        "deepseek-ai/DeepSeek-V2-Lite",
+        [
+            # V2 offloading configuration
+            "--offload-group-size",
+            "8",
+            "--offload-num-in-group",
+            "2",
+            "--offload-prefetch-step",
+            "1",
+            # torch.compile is automatically disabled when V2 offloading is
+            # enabled (via enable_if in @support_torch_compile decorator)
+        ],
+        [],  # Baseline: no offloading
+    )
@@ -17,6 +17,7 @@
 from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
 from vllm.forward_context import BatchDescriptor, get_forward_context
 from vllm.logger import init_logger
+from vllm.model_executor.offloader.base import get_offloader
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import current_stream, weak_ref_tensors
 
@@ -265,6 +266,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
                     set_graph_pool_id(self.graph_pool)
                 else:
                     set_graph_pool_id(current_platform.graph_pool_handle())
+
+                # Sync offloader's copy stream before capture.
+                # Ensure any pre-capture prefetches from offloader are complete.
+                get_offloader().sync_prev_onload()
+
                 # mind-exploding: carefully manage the reference and memory.
                 with torch.cuda.graph(
                     cudagraph,
@@ -273,6 +279,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
                 ):
                     # `output` is managed by pytorch's cudagraph pool
                     output = self.runnable(*args, **kwargs)
+                    # Join offloader's copy stream after forward to avoid an
+                    # unjoined-stream error when capture ends. The last layer's
+                    # start_prefetch forks copy_stream, but its wait_prefetch
+                    # only happens in the next forward pass.
+                    get_offloader().join_after_forward()
                     if self.cudagraph_options.weak_ref_output:
                         # by converting it to weak ref,
                         # the original `output` will immediately be released
@@ -305,5 +316,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
                 f"got {new_input_addresses}"
             )
 
+        # Sync offloader before replay - ensures any external dependencies
+        # from pre-capture prefetches are satisfied.
+        get_offloader().sync_prev_onload()
         entry.cudagraph.replay()
         return entry.output
@@ -24,6 +24,7 @@
 )
 from vllm.config.multimodal import MultiModalConfig
 from vllm.config.observability import ObservabilityConfig
+from vllm.config.offload import OffloadConfig
 from vllm.config.parallel import EPLBConfig, ParallelConfig
 from vllm.config.pooler import PoolerConfig
 from vllm.config.profiler import ProfilerConfig
@@ -85,6 +86,8 @@
     "MultiModalConfig",
     # From vllm.config.observability
     "ObservabilityConfig",
+    # From vllm.config.offload
+    "OffloadConfig",
     # From vllm.config.parallel
     "EPLBConfig",
     "ParallelConfig",

@@ -97,17 +97,15 @@ class CacheConfig:
     load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
     Note that this requires fast CPU-GPU interconnect, as part of the model is
     loaded from CPU memory to GPU memory on the fly in each model forward pass.
+
+    DEPRECATED: This field is deprecated and will be removed in a future
+    release. Please use OffloadConfig.cpu_offload_gb instead.
     """
     cpu_offload_params: set[str] = Field(default_factory=set)
-    """ The set of parameter name segments to target for CPU offloading.
-    Unmatched parameters are not offloaded. If this set is empty, parameters
-    are offloaded non-selectively until the memory limit defined by
-    `cpu_offload_gb` is reached.
-    Examples:
-        - For parameter name "mlp.experts.w2_weight":
-            - "experts" or "experts.w2_weight" will match.
-            - "expert" or "w2" will NOT match (must be exact segments).
-    This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
+    """The set of parameter name segments to target for CPU offloading.
+
+    DEPRECATED: This field is deprecated and will be removed in a future
+    release. Please use OffloadConfig.cpu_offload_params instead.
     """
     calculate_kv_scales: bool = False
     """This enables dynamic calculation of `k_scale` and `v_scale` when

@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Configuration for model weight offloading."""
+
+from pydantic import Field, model_validator
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+
+@config
+@dataclass
+class OffloadConfig:
+    """Configuration for model weight offloading to CPU.
+
+    This controls how model parameters are offloaded to CPU memory to reduce
+    GPU memory usage, at the cost of additional CPU-GPU transfers during
+    inference.
+    """
+
+    cpu_offload_gb: float = Field(default=0, ge=0)
+    """The space in GiB to offload to CPU, per GPU. Default is 0, which means
+    no offloading. Intuitively, this argument can be seen as a virtual way to
+    increase the GPU memory size. For example, if you have one 24 GB GPU and
+    set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
+    load a 13B model with BF16 weights, which requires at least 26 GB GPU memory.
+    Note that this requires fast CPU-GPU interconnect, as part of the model is
+    loaded from CPU memory to GPU memory on the fly in each model forward pass.
+    This uses UVA (Unified Virtual Addressing) for zero-copy access.
+    """
+
+    cpu_offload_params: set[str] = Field(default_factory=set)
+    """The set of parameter name segments to target for CPU offloading.
+    Unmatched parameters are not offloaded. If this set is empty, parameters
+    are offloaded non-selectively until the memory limit defined by
+    `cpu_offload_gb` is reached.
+    Examples:
+        - For parameter name "mlp.experts.w2_weight":
+            - "experts" or "experts.w2_weight" will match.
+            - "expert" or "w2" will NOT match (must be exact segments).
+    This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
+    """
+
+    offload_group_size: int = Field(default=0, ge=0)
+    """Advanced CPU offloading (V2): Group every N layers together. Offload the
+    last `offload_num_in_group` layers of each group. Default is 0 (disabled).
+    Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,...
+    Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer
+    latency.
+    """
+
+    offload_num_in_group: int = Field(default=1, ge=1)
+    """Advanced CPU offloading (V2): Number of layers to offload per group.
+    Must be <= offload_group_size when offload_group_size > 0. Default is 1."""
+
+    offload_prefetch_step: int = Field(default=1, ge=0)
+    """Advanced CPU offloading (V2): Number of layers to prefetch ahead (>= 1 when
+    enabled). More steps hide more latency but use more GPU memory. Default is 1."""
+
+    @model_validator(mode="after")
+    def validate_offload_config(self) -> "OffloadConfig":
+        """Validate offload configuration constraints."""
+        if self.offload_group_size > 0:
+            if self.offload_num_in_group > self.offload_group_size:
+                raise ValueError(
+                    f"offload_num_in_group ({self.offload_num_in_group}) must be "
+                    f"<= offload_group_size ({self.offload_group_size})"
+                )
+            if self.offload_prefetch_step < 1:
+                raise ValueError(
+                    f"offload_prefetch_step ({self.offload_prefetch_step}) must be "
+                    f">= 1 when V2 offloading is enabled (offload_group_size > 0)"
+                )
+        return self
+
+    def compute_hash(self) -> str:
+        """
+        Provide a hash that uniquely identifies all the offload configs.
+
+        All fields are included because OffloaderV2 patches module
+        forwards and inserts custom ops (wait_prefetch, start_prefetch)
+        into the computation graph. Changing any offload setting can
+        alter which layers are hooked and how prefetch indices are
+        computed, so the compilation cache must distinguish them.
+        """
+        # `ignored_factors=set()` is deliberate: no field is excluded from the
+        # hash, so changing any offload setting (including the cpu_offload_*
+        # ones) produces a different cache key and avoids stale compilation
+        # cache hits.
+        from vllm.config.utils import get_hash_factors, hash_factors
+
+        factors = get_hash_factors(self, ignored_factors=set())
+        hash_str = hash_factors(factors)
+        return hash_str
@@ -37,6 +37,7 @@
 from .lora import LoRAConfig
 from .model import ModelConfig
 from .observability import ObservabilityConfig
+from .offload import OffloadConfig
 from .parallel import ParallelConfig
 from .profiler import ProfilerConfig
 from .scheduler import SchedulerConfig
@@ -242,6 +243,8 @@ class VllmConfig:
     """Device configuration."""
     load_config: LoadConfig = Field(default_factory=LoadConfig)
     """Load configuration."""
+    offload_config: OffloadConfig = Field(default_factory=OffloadConfig)
+    """Model weight offloading configuration."""
     attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
     """Attention configuration."""
     kernel_config: KernelConfig = Field(default_factory=KernelConfig)
@@ -344,6 +347,10 @@ def compute_hash(self) -> str:
             vllm_factors.append(self.load_config.compute_hash())
         else:
             vllm_factors.append("None")
+        if self.offload_config:
+            vllm_factors.append(self.offload_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.attention_config:
             vllm_factors.append(self.attention_config.compute_hash())
         else:

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -48,6 +48,7 @@
     ModelConfig,
     MultiModalConfig,
     ObservabilityConfig,
+    OffloadConfig,
     ParallelConfig,
     PoolerConfig,
     ProfilerConfig,
@@ -438,8 +439,11 @@ class EngineArgs:
     disable_sliding_window: bool = ModelConfig.disable_sliding_window
     disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
     swap_space: float = CacheConfig.swap_space
-    cpu_offload_gb: float = CacheConfig.cpu_offload_gb
-    cpu_offload_params: set[str] = get_field(CacheConfig, "cpu_offload_params")
+    cpu_offload_gb: float = OffloadConfig.cpu_offload_gb
+    cpu_offload_params: set[str] = get_field(OffloadConfig, "cpu_offload_params")
+    offload_group_size: int = OffloadConfig.offload_group_size
+    offload_num_in_group: int = OffloadConfig.offload_num_in_group
+    offload_prefetch_step: int = OffloadConfig.offload_prefetch_step
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
     kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
     max_num_batched_tokens: int | None = None
@@ -947,10 +951,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         cache_group.add_argument(
             "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
         )
-        cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
-        cache_group.add_argument(
-            "--cpu-offload-params", **cache_kwargs["cpu_offload_params"]
-        )
         cache_group.add_argument(
             "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
         )
@@ -976,6 +976,28 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"]
         )
 
+        # Model weight offload related configs
+        offload_kwargs = get_kwargs(OffloadConfig)
+        offload_group = parser.add_argument_group(
+            title="OffloadConfig",
+            description=OffloadConfig.__doc__,
+        )
+        offload_group.add_argument(
+            "--cpu-offload-gb", **offload_kwargs["cpu_offload_gb"]
+        )
+        offload_group.add_argument(
+            "--offload-group-size", **offload_kwargs["offload_group_size"]
+        )
+        offload_group.add_argument(
+            "--offload-num-in-group", **offload_kwargs["offload_num_in_group"]
+        )
+        offload_group.add_argument(
+            "--offload-prefetch-step", **offload_kwargs["offload_prefetch_step"]
+        )
+        offload_group.add_argument(
+            "--cpu-offload-params", **offload_kwargs["cpu_offload_params"]
+        )
+
         # Multimodal related configs
         multimodal_kwargs = get_kwargs(MultiModalConfig)
         multimodal_group = parser.add_argument_group(
@@ -1465,8 +1487,6 @@ def create_engine_config(
             sliding_window=sliding_window,
             enable_prefix_caching=self.enable_prefix_caching,
             prefix_caching_hash_algo=self.prefix_caching_hash_algo,
-            cpu_offload_gb=self.cpu_offload_gb,
-            cpu_offload_params=self.cpu_offload_params,
             calculate_kv_scales=self.calculate_kv_scales,
             kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
             mamba_cache_dtype=self.mamba_cache_dtype,
@@ -1824,13 +1844,23 @@ def create_engine_config(
             compilation_config.max_cudagraph_capture_size = (
                 self.max_cudagraph_capture_size
             )
+
+        offload_config = OffloadConfig(
+            cpu_offload_gb=self.cpu_offload_gb,
+            offload_group_size=self.offload_group_size,
+            offload_num_in_group=self.offload_num_in_group,
+            offload_prefetch_step=self.offload_prefetch_step,
+            cpu_offload_params=self.cpu_offload_params,
+        )
+
         config = VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
             parallel_config=parallel_config,
             scheduler_config=scheduler_config,
             device_config=device_config,
             load_config=load_config,
+            offload_config=offload_config,
             attention_config=attention_config,
             kernel_config=kernel_config,
             lora_config=lora_config,

@@ -163,6 +163,14 @@ class LLM:
             the model weights. This virtually increases the GPU memory space
             you can use to hold the model weights, at the cost of CPU-GPU data
             transfer for every forward pass.
+        offload_group_size: Advanced CPU offloading: Group every N layers
+            together. Offload the last `offload_num_in_group` layers of each
+            group. Default is 0 (disabled).
+        offload_num_in_group: Advanced CPU offloading: Number of layers to
+            offload per group (<= offload_group_size when enabled). Default is 1.
+        offload_prefetch_step: Advanced CPU offloading: Number of layers to
+            prefetch ahead. Higher values hide more latency but use more GPU
+            memory. Default is 1.
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
@@ -217,6 +225,9 @@ def __init__(
         gpu_memory_utilization: float = 0.9,
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
+        offload_group_size: int = 0,
+        offload_num_in_group: int = 1,
+        offload_prefetch_step: int = 1,
         enforce_eager: bool = False,
         enable_return_routed_experts: bool = False,
         disable_custom_all_reduce: bool = False,
@@ -326,6 +337,9 @@ def _make_config(value: Any, cls: type[_R]) -> _R:
             kv_cache_memory_bytes=kv_cache_memory_bytes,
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
+            offload_group_size=offload_group_size,
+            offload_num_in_group=offload_num_in_group,
+            offload_prefetch_step=offload_prefetch_step,
             enforce_eager=enforce_eager,
             enable_return_routed_experts=enable_return_routed_experts,
             disable_custom_all_reduce=disable_custom_all_reduce,

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -19,6 +19,7 @@
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoeWeightScaleSupported,
+    find_fused_moe_submodule,
 )
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEActivationFormat,
@@ -68,6 +69,7 @@ def get_config() -> dict[str, Any] | None:
     "SharedFusedMoE",
     "ZeroExpertFusedMoE",
     "activation_without_mul",
+    "find_fused_moe_submodule",
     "apply_moe_activation",
     "override_config",
     "get_config",