[offloader] v2: Hide weight onloading latency via prefetching #29941
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Test V2 offloading correctness with DeepSeek V2 model.""" | ||
|
|
||
| from ..utils import compare_two_settings | ||
|
|
||
|
|
||
| def test_v2_offload_deepseek(): | ||
| """Test V2 CPU offloading with DeepSeek-V2-Lite. | ||
|
|
||
| Compares outputs between: | ||
| 1. Baseline (no offloading) | ||
| 2. V2 offloading (group_size=8, num_in_group=2, prefetch_step=1) | ||
|
|
||
| This tests the advanced offloading with prefetching on a MoE model. | ||
| """ | ||
| compare_two_settings( | ||
| "deepseek-ai/DeepSeek-V2-Lite", | ||
| [], # Baseline: no offloading | ||
| [ | ||
| # V2 offloading configuration | ||
| "--offload-group-size", | ||
| "8", | ||
| "--offload-num-in-group", | ||
| "2", | ||
| "--offload-prefetch-step", | ||
| "1", | ||
| # currently not compatible with torch.compile | ||
| "--enforce-eager", | ||
| ], | ||
| ) | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Configuration for model weight offloading.""" | ||
|
|
||
| from typing import Any | ||
|
|
||
| from pydantic import Field, model_validator | ||
| from pydantic.dataclasses import dataclass | ||
|
|
||
| from vllm.config.utils import config | ||
| from vllm.utils.hashing import safe_hash | ||
|
|
||
|
|
||
| @config | ||
| @dataclass | ||
|
|
||
| class OffloadConfig: | ||
| """Configuration for model weight offloading to CPU. | ||
|
|
||
| This controls how model parameters are offloaded to CPU memory to reduce | ||
| GPU memory usage, at the cost of additional CPU-GPU transfers during | ||
| inference. | ||
| """ | ||
|
|
||
| cpu_offload_gb: float = Field(default=0, ge=0) | ||
| """The space in GiB to offload to CPU, per GPU. Default is 0, which means | ||
| no offloading. Intuitively, this argument can be seen as a virtual way to | ||
| increase the GPU memory size. For example, if you have one 24 GB GPU and | ||
| set this to 10, virtually you can think of it as a 34 GB GPU. Then you can | ||
| load a 13B model with BF16 weight, which requires at least 26GB GPU memory. | ||
| Note that this requires fast CPU-GPU interconnect, as part of the model is | ||
| loaded from CPU memory to GPU memory on the fly in each model forward pass. | ||
| This uses UVA (Unified Virtual Addressing) for zero-copy access. | ||
| """ | ||
|
|
||
| offload_group_size: int = Field(default=0, ge=0) | ||
| """Advanced CPU offloading (V2): Group every N layers together. Offload last | ||
| `offload_num_in_group` layers of each group. Default is 0 (disabled). | ||
| Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,... | ||
| Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer | ||
| latency. | ||
| """ | ||
|
|
||
| offload_num_in_group: int = Field(default=1, ge=1) | ||
| """Advanced CPU offloading (V2): Number of layers to offload per group. | ||
| Must be <= offload_group_size. Default is 1.""" | ||
|
|
||
| offload_prefetch_step: int = Field(default=1, ge=0) | ||
| """Advanced CPU offloading (V2): Number of layers to prefetch ahead. | ||
| Higher values hide more latency but use more GPU memory. Default is 1.""" | ||
|
|
||
| @model_validator(mode="after") | ||
| def validate_offload_config(self) -> "OffloadConfig": | ||
| """Validate that offload_num_in_group <= offload_group_size.""" | ||
| if ( | ||
| self.offload_group_size > 0 | ||
| and self.offload_num_in_group > self.offload_group_size | ||
| ): | ||
| raise ValueError( | ||
| f"offload_num_in_group ({self.offload_num_in_group}) must be " | ||
| f"<= offload_group_size ({self.offload_group_size})" | ||
| ) | ||
| return self | ||
|
|
||
|
|
||
| def compute_hash(self) -> str: | ||
| """ | ||
| WARNING: Whenever a new field is added to this config, | ||
| ensure that it is included in the factors list if | ||
| it affects the computation graph. | ||
|
|
||
| Provide a hash that uniquely identifies all the configs | ||
| that affect the structure of the computation | ||
| graph from input ids/embeddings to the final hidden states, | ||
| excluding anything before input ids/embeddings and after | ||
| the final hidden states. | ||
| """ | ||
| # Offload settings don't affect the computation graph structure, | ||
| # only the memory layout and transfer patterns. | ||
| factors: list[Any] = [] | ||
| hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() | ||
| return hash_str | ||
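To make the grouping semantics above concrete, here is a small illustrative sketch (not part of the PR's diff) that computes which layer indices the `offload_group_size` / `offload_num_in_group` settings select, matching the "layers 6,7,14,15,22,23,..." example in the docstring:

```python
# Illustrative sketch: which layer indices V2 offloading selects, per the
# OffloadConfig docstrings above. The helper name and num_layers value are
# hypothetical, used only to show the grouping rule.
def offloaded_layer_indices(num_layers: int, group_size: int, num_in_group: int) -> list[int]:
    if group_size <= 0:
        return []  # V2 offloading disabled
    # Offload the last `num_in_group` layers of every `group_size`-layer group.
    return [i for i in range(num_layers) if i % group_size >= group_size - num_in_group]

# group_size=8, num_in_group=2 on a 24-layer model -> [6, 7, 14, 15, 22, 23]
print(offloaded_layer_indices(24, 8, 2))
```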
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -155,6 +155,14 @@ class LLM: | |
| the model weights. This virtually increases the GPU memory space | ||
| you can use to hold the model weights, at the cost of CPU-GPU data | ||
| transfer for every forward pass. | ||
| offload_group_size: Advanced CPU offloading: Group every N layers | ||
| together. Offload the last `offload_num_in_group` layers of each group. | ||
| Default is 0 (disabled). | ||
| offload_num_in_group: Advanced CPU offloading: Number of layers to | ||
| offload per group. Default is 1. | ||
| offload_prefetch_step: Advanced CPU offloading: Number of layers to | ||
| prefetch ahead. Higher values hide more latency but use more GPU | ||
| memory. Default is 1. | ||
|
Member: Looks like we are missing
Contributor (Author): added. Thanks!
||
| enforce_eager: Whether to enforce eager execution. If True, we will | ||
| disable CUDA graph and always execute the model in eager mode. | ||
| If False, we will use CUDA graph and eager execution in hybrid. | ||
|
|
@@ -208,6 +216,9 @@ def __init__( | |
| gpu_memory_utilization: float = 0.9, | ||
| swap_space: float = 4, | ||
| cpu_offload_gb: float = 0, | ||
| offload_group_size: int = 0, | ||
| offload_num_in_group: int = 1, | ||
| offload_prefetch_step: int = 1, | ||
| enforce_eager: bool = False, | ||
| disable_custom_all_reduce: bool = False, | ||
| hf_token: bool | str | None = None, | ||
|
|
@@ -316,6 +327,9 @@ def _make_config(value: Any, cls: type[_R]) -> _R: | |
| kv_cache_memory_bytes=kv_cache_memory_bytes, | ||
| swap_space=swap_space, | ||
| cpu_offload_gb=cpu_offload_gb, | ||
| offload_group_size=offload_group_size, | ||
| offload_num_in_group=offload_num_in_group, | ||
| offload_prefetch_step=offload_prefetch_step, | ||
| enforce_eager=enforce_eager, | ||
| disable_custom_all_reduce=disable_custom_all_reduce, | ||
| hf_token=hf_token, | ||
|
|
||
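For reference, a minimal offline-inference sketch using the new `LLM` keyword arguments added above (the model name and prompt are illustrative; `enforce_eager=True` mirrors the test, since V2 offloading is currently not compatible with torch.compile):

```python
# Illustrative usage sketch, not part of this PR's diff.
from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # illustrative model choice
    offload_group_size=8,        # group every 8 layers
    offload_num_in_group=2,      # offload the last 2 layers of each group
    offload_prefetch_step=1,     # prefetch 1 offloaded layer ahead
    enforce_eager=True,          # currently not compatible with torch.compile
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```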
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,7 +49,10 @@ | |
| from vllm.logger import init_logger | ||
| from vllm.model_executor.layers.activation import SiluAndMul | ||
| from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase | ||
| from vllm.model_executor.layers.fused_moe import SharedFusedMoE | ||
| from vllm.model_executor.layers.fused_moe import ( | ||
| SharedFusedMoE, | ||
| find_fused_moe_submodule, | ||
| ) | ||
| from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm | ||
| from vllm.model_executor.layers.linear import ( | ||
| ColumnParallelLinear, | ||
|
|
@@ -1274,6 +1277,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): | |
| vllm_config, prefix, topk_indices_buffer=topk_indices_buffer | ||
| ), | ||
| prefix=f"{prefix}.layers", | ||
| offloader_kwargs=dict( | ||
| # Extract the MLP submodule - for MoE layers, go deeper to the experts | ||
| submodule_accessor=lambda layer: find_fused_moe_submodule(layer.mlp), | ||
| # Specify which parameters to offload | ||
| whitelist_param_names_creator=lambda module: ( | ||
| [ | ||
| # Core MoE expert weights | ||
| "w13_weight", | ||
| "w2_weight", | ||
| ] | ||
| # Only offload from MoE experts (SharedFusedMoE/FusedMoE) | ||
| if hasattr(module, "w13_weight") | ||
| else [] | ||
| ), | ||
| ), | ||
|
Member: Do we need to do it for every model? This looks intrusive 🤔
Contributor (Author): This is a highly targeted optimization, as it requires carefully overlapping weight onloading (memcpy) with computation. It is therefore affected by model weight size, CPU<>GPU bandwidth, and computation latency (per batch size), so it should be configured at least at the model level. Or maybe this should be configurable via CLI args. WDYT?
Contributor: In the current offloading, we use
||
| ) | ||
|
|
||
| if get_pp_group().is_last_rank: | ||
|
|
||
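The discussion above centers on overlapping weight onloading (memcpy) with computation. The following is a minimal sketch of that general prefetch-overlap pattern, under stated assumptions; it is not this PR's actual implementation, and the `offloaded_weights` (pinned CPU tensors) and `gpu_buffers` (preallocated GPU tensors) attributes are hypothetical:

```python
# Sketch: start the copy for a layer `prefetch_step` ahead on a side CUDA
# stream, so the transfer overlaps with the current layer's computation.
import torch

copy_stream = torch.cuda.Stream()
copy_done: dict[int, torch.cuda.Event] = {}

def prefetch(i: int, layer) -> None:
    """Asynchronously copy layer i's offloaded weights to its GPU buffers."""
    with torch.cuda.stream(copy_stream):
        for name, cpu_weight in layer.offloaded_weights.items():  # hypothetical attr
            layer.gpu_buffers[name].copy_(cpu_weight, non_blocking=True)
        event = torch.cuda.Event()
        event.record(copy_stream)
        copy_done[i] = event

def run_layers(layers, hidden_states, prefetch_step: int = 1):
    # Warm up: kick off transfers for the first `prefetch_step` layers.
    for i in range(min(prefetch_step, len(layers))):
        prefetch(i, layers[i])
    for i, layer in enumerate(layers):
        # Launch the copy for the layer `prefetch_step` ahead while the
        # current layer computes on the default stream.
        if i + prefetch_step < len(layers):
            prefetch(i + prefetch_step, layers[i + prefetch_step])
        # Block only on this layer's own copy before using its weights.
        if i in copy_done:
            torch.cuda.current_stream().wait_event(copy_done[i])
        hidden_states = layer(hidden_states)
    return hidden_states
```

With a large enough `prefetch_step`, each layer's weights arrive before its turn, at the cost of extra GPU buffers, which is the trade-off the `offload_prefetch_step` docstring describes.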
Reviewer: Could use an fp8 model here to make it faster, like RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8.
Author: Serving "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" failed at the flashinfer autotuning stage on GB200. 😿
Author: Switched to Llama, as we support any model now.