diff --git a/vllm/model_executor/offloader/__init__.py b/vllm/model_executor/offloader/__init__.py
index a6522ff7c0a3..f1b49c69ef93 100644
--- a/vllm/model_executor/offloader/__init__.py
+++ b/vllm/model_executor/offloader/__init__.py
@@ -8,6 +8,7 @@
     create_offloader,
     get_offloader,
     set_offloader,
+    should_pin_memory,
 )
 from vllm.model_executor.offloader.prefetch import PrefetchOffloader
 from vllm.model_executor.offloader.uva import UVAOffloader
@@ -20,4 +21,5 @@
     "create_offloader",
     "get_offloader",
     "set_offloader",
+    "should_pin_memory",
 ]
diff --git a/vllm/model_executor/offloader/base.py b/vllm/model_executor/offloader/base.py
index 7cb0ddfd1848..b8c1b6cfa48a 100644
--- a/vllm/model_executor/offloader/base.py
+++ b/vllm/model_executor/offloader/base.py
@@ -10,7 +10,9 @@
 
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils.platform_utils import is_pin_memory_available
 
 if TYPE_CHECKING:
     from vllm.config import OffloadConfig
@@ -18,6 +20,18 @@
 
 logger = init_logger(__name__)
 
 
+def should_pin_memory() -> bool:
+    """Check if pinned memory should be used for weight offloading.
+
+    Combines the platform capability check with the user override env var.
+    On unified-memory systems (e.g. GH200) pinned memory eats into GPU
+    memory, so users can disable it via VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY.
+    """
+    return (
+        is_pin_memory_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
+    )
+
+
 """
 class relation:
diff --git a/vllm/model_executor/offloader/prefetch.py b/vllm/model_executor/offloader/prefetch.py
index 5bdde8c3a18a..cc04367d54c3 100644
--- a/vllm/model_executor/offloader/prefetch.py
+++ b/vllm/model_executor/offloader/prefetch.py
@@ -20,8 +20,7 @@
 # Import prefetch_ops to register custom ops at module load time
 import vllm.model_executor.offloader.prefetch_ops  # noqa: F401
 from vllm.logger import init_logger
-from vllm.model_executor.offloader.base import BaseOffloader
-from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.model_executor.offloader.base import BaseOffloader, should_pin_memory
 
 logger = init_logger(__name__)
 
@@ -528,7 +527,7 @@ def start_onload_to_static(self):
             gpu_buffer = offloader._gpu_buffer
             assert cpu_storage is not None, "CPU storage not initialized"
             assert gpu_buffer is not None, "GPU buffer not assigned"
-            assert not is_pin_memory_available() or cpu_storage.is_pinned(), (
+            assert not should_pin_memory() or cpu_storage.is_pinned(), (
                 f"CPU storage for {name} is not pinned! "
                 "non_blocking=True H2D copy from non-pinned memory "
                 "causes stream synchronization that breaks "
@@ -629,7 +628,7 @@ def _offload_to_cpu_internal(self):
         original GPU tensor is garbage collected.
         """
         param = self._param
-        pin_memory = is_pin_memory_available()
+        pin_memory = should_pin_memory()
 
         # Create pinned CPU storage and copy current GPU data
         self._cpu_storage = torch.empty_strided(
@@ -666,7 +665,7 @@ def _update_cpu_storage_from_param(self) -> None:
         param = self._param
 
         if param.data.device.type == "cpu":
-            if is_pin_memory_available() and not param.data.is_pinned():
+            if should_pin_memory() and not param.data.is_pinned():
                 pinned = torch.empty_strided(
                     size=param.data.size(),
                     stride=param.data.stride(),
diff --git a/vllm/model_executor/offloader/uva.py b/vllm/model_executor/offloader/uva.py
index c524e43cddae..51eb1a14fcb0 100644
--- a/vllm/model_executor/offloader/uva.py
+++ b/vllm/model_executor/offloader/uva.py
@@ -10,9 +10,9 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.model_executor.offloader.base import BaseOffloader
+from vllm.model_executor.offloader.base import BaseOffloader, should_pin_memory
 from vllm.utils.mem_utils import format_gib
-from vllm.utils.platform_utils import is_pin_memory_available, is_uva_available
+from vllm.utils.platform_utils import is_uva_available
 from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
 
 logger = init_logger(__name__)
@@ -43,10 +43,7 @@ def __init__(
         self.cpu_offload_bytes = 0
         self.cpu_offload_params = cpu_offload_params or set()
 
-        self.pin_memory = (
-            is_pin_memory_available()
-            and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
-        )
+        self.pin_memory = should_pin_memory()
         self.uva_offloading = (
             is_uva_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_UVA
        )