[offloader] v2: Hide weight onloading latency via prefetching #29941
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Test V2 offloading correctness with DeepSeek V2 model.""" | ||
|
|
||
| from ..utils import compare_two_settings | ||
|
|
||
|
|
||
| def test_v2_offload_deepseek(): | ||
| """Test V2 CPU offloading with DeepSeek-V2-Lite. | ||
|
|
||
| Compares outputs between: | ||
| 1. Baseline (no offloading) | ||
| 2. V2 offloading (group_size=8, num_in_group=2, prefetch_step=1) | ||
|
|
||
| This tests the advanced offloading with prefetching on a MoE model. | ||
| """ | ||
| compare_two_settings( | ||
| "deepseek-ai/DeepSeek-V2-Lite", | ||
| [], # Baseline: no offloading | ||
| [ | ||
| # V2 offloading configuration | ||
| "--offload-group-size", | ||
| "8", | ||
| "--offload-num-in-group", | ||
| "2", | ||
| "--offload-prefetch-step", | ||
| "1", | ||
| # currently not compatible with torch.compile | ||
| "--enforce-eager", | ||
| ], | ||
| ) | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Configuration for model weight offloading.""" | ||
|
|
||
| from typing import Any | ||
|
|
||
| from pydantic import Field, model_validator | ||
| from pydantic.dataclasses import dataclass | ||
|
|
||
| from vllm.config.utils import config | ||
| from vllm.utils.hashing import safe_hash | ||
|
|
||
|
|
||
| @config | ||
| @dataclass | ||
|
|
||
| class OffloadConfig: | ||
| """Configuration for model weight offloading to CPU. | ||
|
|
||
| This controls how model parameters are offloaded to CPU memory to reduce | ||
| GPU memory usage, at the cost of additional CPU-GPU transfers during | ||
| inference. | ||
| """ | ||
|
|
||
| cpu_offload_gb: float = Field(default=0, ge=0) | ||
| """The space in GiB to offload to CPU, per GPU. Default is 0, which means | ||
| no offloading. Intuitively, this argument can be seen as a virtual way to | ||
| increase the GPU memory size. For example, if you have one 24 GB GPU and | ||
| set this to 10, virtually you can think of it as a 34 GB GPU. Then you can | ||
| load a 13B model with BF16 weight, which requires at least 26GB GPU memory. | ||
| Note that this requires fast CPU-GPU interconnect, as part of the model is | ||
| loaded from CPU memory to GPU memory on the fly in each model forward pass. | ||
| This uses UVA (Unified Virtual Addressing) for zero-copy access. | ||
| """ | ||
|
|
||
| offload_group_size: int = Field(default=0, ge=0) | ||
| """Advanced CPU offloading (V2): Group every N layers together. Offload last | ||
| `offload_num_in_group` layers of each group. Default is 0 (disabled). | ||
| Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,... | ||
| Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer | ||
| latency. | ||
| """ | ||
|
|
||
| offload_num_in_group: int = Field(default=1, ge=1) | ||
| """Advanced CPU offloading (V2): Number of layers to offload per group. | ||
| Must be <= offload_group_size. Default is 1.""" | ||
|
|
||
| offload_prefetch_step: int = Field(default=1, ge=0) | ||
| """Advanced CPU offloading (V2): Number of layers to prefetch ahead. | ||
| Higher values hide more latency but use more GPU memory. Default is 1.""" | ||
|
|
||
| @model_validator(mode="after") | ||
| def validate_offload_config(self) -> "OffloadConfig": | ||
| """Validate that offload_num_in_group <= offload_group_size.""" | ||
| if ( | ||
| self.offload_group_size > 0 | ||
| and self.offload_num_in_group > self.offload_group_size | ||
| ): | ||
| raise ValueError( | ||
| f"offload_num_in_group ({self.offload_num_in_group}) must be " | ||
| f"<= offload_group_size ({self.offload_group_size})" | ||
| ) | ||
| return self | ||
|
|
||
|
|
||
| def compute_hash(self) -> str: | ||
| """ | ||
| WARNING: Whenever a new field is added to this config, | ||
| ensure that it is included in the factors list if | ||
| it affects the computation graph. | ||
|
|
||
| Provide a hash that uniquely identifies all the configs | ||
| that affect the structure of the computation | ||
| graph from input ids/embeddings to the final hidden states, | ||
| excluding anything before input ids/embeddings and after | ||
| the final hidden states. | ||
| """ | ||
| # Offload settings don't affect the computation graph structure, | ||
| # only the memory layout and transfer patterns. | ||
| factors: list[Any] = [] | ||
| hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() | ||
| return hash_str | ||
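To make the grouping semantics above concrete, here is a small illustrative sketch (not part of the PR's diff) that computes which layer indices the `offload_group_size` / `offload_num_in_group` settings select, matching the "layers 6,7,14,15,22,23,..." example in the docstring:

```python
# Illustrative sketch: which layer indices V2 offloading selects, per the
# OffloadConfig docstrings above. The helper name and num_layers value are
# hypothetical, used only to show the grouping rule.
def offloaded_layer_indices(num_layers: int, group_size: int, num_in_group: int) -> list[int]:
    if group_size <= 0:
        return []  # V2 offloading disabled
    # Offload the last `num_in_group` layers of every `group_size`-layer group.
    return [i for i in range(num_layers) if i % group_size >= group_size - num_in_group]

# group_size=8, num_in_group=2 on a 24-layer model -> [6, 7, 14, 15, 22, 23]
print(offloaded_layer_indices(24, 8, 2))
```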
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -155,6 +155,14 @@ class LLM: | |
| the model weights. This virtually increases the GPU memory space | ||
| you can use to hold the model weights, at the cost of CPU-GPU data | ||
| transfer for every forward pass. | ||
| offload_group_size: Advanced CPU offloading: Group every N layers | ||
| together. Offload the last `offload_num_in_group` layers of each group. | ||
| Default is 0 (disabled). | ||
| offload_num_in_group: Advanced CPU offloading: Number of layers to | ||
| offload per group. Default is 1. | ||
| offload_prefetch_step: Advanced CPU offloading: Number of layers to | ||
| prefetch ahead. Higher values hide more latency but use more GPU | ||
| memory. Default is 1. | ||
|
Member: Looks like we are missing
Contributor (Author): added. Thanks!
||
| enforce_eager: Whether to enforce eager execution. If True, we will | ||
| disable CUDA graph and always execute the model in eager mode. | ||
| If False, we will use CUDA graph and eager execution in hybrid. | ||
|
|
@@ -208,6 +216,9 @@ def __init__( | |
| gpu_memory_utilization: float = 0.9, | ||
| swap_space: float = 4, | ||
| cpu_offload_gb: float = 0, | ||
| offload_group_size: int = 0, | ||
| offload_num_in_group: int = 1, | ||
| offload_prefetch_step: int = 1, | ||
| enforce_eager: bool = False, | ||
| disable_custom_all_reduce: bool = False, | ||
| hf_token: bool | str | None = None, | ||
|
|
@@ -316,6 +327,9 @@ def _make_config(value: Any, cls: type[_R]) -> _R: | |
| kv_cache_memory_bytes=kv_cache_memory_bytes, | ||
| swap_space=swap_space, | ||
| cpu_offload_gb=cpu_offload_gb, | ||
| offload_group_size=offload_group_size, | ||
| offload_num_in_group=offload_num_in_group, | ||
| offload_prefetch_step=offload_prefetch_step, | ||
| enforce_eager=enforce_eager, | ||
| disable_custom_all_reduce=disable_custom_all_reduce, | ||
| hf_token=hf_token, | ||
|
|
||
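For reference, a minimal offline-inference sketch using the new `LLM` keyword arguments added above (the model name and prompt are illustrative; `enforce_eager=True` mirrors the test, since V2 offloading is currently not compatible with torch.compile):

```python
# Illustrative usage sketch, not part of this PR's diff.
from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # illustrative model choice
    offload_group_size=8,        # group every 8 layers
    offload_num_in_group=2,      # offload the last 2 layers of each group
    offload_prefetch_step=1,     # prefetch 1 offloaded layer ahead
    enforce_eager=True,          # currently not compatible with torch.compile
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```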
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,7 +49,10 @@ | |
| from vllm.logger import init_logger | ||
| from vllm.model_executor.layers.activation import SiluAndMul | ||
| from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase | ||
| from vllm.model_executor.layers.fused_moe import SharedFusedMoE | ||
| from vllm.model_executor.layers.fused_moe import ( | ||
| SharedFusedMoE, | ||
| find_fused_moe_submodule, | ||
| ) | ||
| from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm | ||
| from vllm.model_executor.layers.linear import ( | ||
| ColumnParallelLinear, | ||
|
|
@@ -1274,6 +1277,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): | |
| vllm_config, prefix, topk_indices_buffer=topk_indices_buffer | ||
| ), | ||
| prefix=f"{prefix}.layers", | ||
| offloader_kwargs=dict( | ||
| # Extract the MLP submodule - for MoE layers, go deeper to the experts | ||
| submodule_accessor=lambda layer: find_fused_moe_submodule(layer.mlp), | ||
| # Specify which parameters to offload | ||
| whitelist_param_names_creator=lambda module: ( | ||
| [ | ||
| # Core MoE expert weights | ||
| "w13_weight", | ||
| "w2_weight", | ||
| ] | ||
| # Only offload from MoE experts (SharedFusedMoE/FusedMoE) | ||
| if hasattr(module, "w13_weight") | ||
| else [] | ||
| ), | ||
| ), | ||
|
Member: Do we need to do it for every model? This looks intrusive 🤔
Contributor (Author): This is a highly targeted optimization, as it requires carefully overlapping weight onloading (memcpy) with computation. It is therefore affected by model weight size, CPU<>GPU bandwidth, and computation latency (per batch size), so it should be configured at least at the model level. Or maybe this should be configurable via CLI args. WDYT?
Contributor: In the current offloading, we use
||
| ) | ||
|
|
||
| if get_pp_group().is_last_rank: | ||
|
|
||
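The discussion above centers on overlapping weight onloading (memcpy) with computation. The following is a minimal sketch of that general prefetch-overlap pattern, under stated assumptions; it is not this PR's actual implementation, and the `offloaded_weights` (pinned CPU tensors) and `gpu_buffers` (preallocated GPU tensors) attributes are hypothetical:

```python
# Sketch: start the copy for a layer `prefetch_step` ahead on a side CUDA
# stream, so the transfer overlaps with the current layer's computation.
import torch

copy_stream = torch.cuda.Stream()
copy_done: dict[int, torch.cuda.Event] = {}

def prefetch(i: int, layer) -> None:
    """Asynchronously copy layer i's offloaded weights to its GPU buffers."""
    with torch.cuda.stream(copy_stream):
        for name, cpu_weight in layer.offloaded_weights.items():  # hypothetical attr
            layer.gpu_buffers[name].copy_(cpu_weight, non_blocking=True)
        event = torch.cuda.Event()
        event.record(copy_stream)
        copy_done[i] = event

def run_layers(layers, hidden_states, prefetch_step: int = 1):
    # Warm up: kick off transfers for the first `prefetch_step` layers.
    for i in range(min(prefetch_step, len(layers))):
        prefetch(i, layers[i])
    for i, layer in enumerate(layers):
        # Launch the copy for the layer `prefetch_step` ahead while the
        # current layer computes on the default stream.
        if i + prefetch_step < len(layers):
            prefetch(i + prefetch_step, layers[i + prefetch_step])
        # Block only on this layer's own copy before using its weights.
        if i in copy_done:
            torch.cuda.current_stream().wait_event(copy_done[i])
        hidden_states = layer(hidden_states)
    return hidden_states
```

With a large enough `prefetch_step`, each layer's weights arrive before its turn, at the cost of extra GPU buffers, which is the trade-off the `offload_prefetch_step` docstring describes.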
Reviewer: Could use an fp8 model here to make it faster, like RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8.
Author: Serving "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" failed at the flashinfer autotuning stage on GB200. 😿
Author: Switched to Llama, as we support any model now.