2 changes: 2 additions & 0 deletions vllm/model_executor/offloader/__init__.py
@@ -8,6 +8,7 @@
     create_offloader,
     get_offloader,
     set_offloader,
+    should_pin_memory,
 )
 from vllm.model_executor.offloader.prefetch import PrefetchOffloader
 from vllm.model_executor.offloader.uva import UVAOffloader
@@ -20,4 +21,5 @@
     "create_offloader",
     "get_offloader",
     "set_offloader",
+    "should_pin_memory",
 ]
14 changes: 14 additions & 0 deletions vllm/model_executor/offloader/base.py
@@ -10,14 +10,28 @@
 
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils.platform_utils import is_pin_memory_available
 
 if TYPE_CHECKING:
     from vllm.config import OffloadConfig
 
 logger = init_logger(__name__)
 
 
+def should_pin_memory() -> bool:
+    """Check if pinned memory should be used for weight offloading.
+
+    Combines the platform capability check with the user override env var.
+    On unified-memory systems (e.g. GH200) pinned memory eats into GPU
+    memory, so users can disable it via VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY.
+    """
+    return (
+        is_pin_memory_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
+    )
+
+
 """
 class relation:
 
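Reviewer note: a minimal sketch of how the newly exported helper is meant to be consumed when allocating CPU-side staging buffers (the shape, dtype, and variable name below are illustrative, not from this PR):

```python
import torch

from vllm.model_executor.offloader import should_pin_memory

# should_pin_memory() folds the platform capability check together with the
# VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY override, so callers no longer
# need to combine the two conditions themselves.
cpu_buffer = torch.empty(
    4096, 4096, dtype=torch.float16, pin_memory=should_pin_memory()
)
```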
9 changes: 4 additions & 5 deletions vllm/model_executor/offloader/prefetch.py
@@ -20,8 +20,7 @@
 # Import prefetch_ops to register custom ops at module load time
 import vllm.model_executor.offloader.prefetch_ops  # noqa: F401
 from vllm.logger import init_logger
-from vllm.model_executor.offloader.base import BaseOffloader
-from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.model_executor.offloader.base import BaseOffloader, should_pin_memory
 
 logger = init_logger(__name__)
 
@@ -528,7 +527,7 @@ def start_onload_to_static(self):
         gpu_buffer = offloader._gpu_buffer
         assert cpu_storage is not None, "CPU storage not initialized"
         assert gpu_buffer is not None, "GPU buffer not assigned"
-        assert not is_pin_memory_available() or cpu_storage.is_pinned(), (
+        assert not should_pin_memory() or cpu_storage.is_pinned(), (
             f"CPU storage for {name} is not pinned! "
             "non_blocking=True H2D copy from non-pinned memory "
             "causes stream synchronization that breaks "
@@ -629,7 +628,7 @@ def _offload_to_cpu_internal(self):
         original GPU tensor is garbage collected.
         """
         param = self._param
-        pin_memory = is_pin_memory_available()
+        pin_memory = should_pin_memory()
 
         # Create pinned CPU storage and copy current GPU data
         self._cpu_storage = torch.empty_strided(
@@ -666,7 +665,7 @@ def _update_cpu_storage_from_param(self) -> None:
         param = self._param
 
         if param.data.device.type == "cpu":
-            if is_pin_memory_available() and not param.data.is_pinned():
+            if should_pin_memory() and not param.data.is_pinned():
                 pinned = torch.empty_strided(
                     size=param.data.size(),
                     stride=param.data.stride(),
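Reviewer note: the updated assert guards the invariant that makes prefetching work: a `non_blocking=True` host-to-device copy only overlaps with compute when the source tensor is pinned; from pageable memory, CUDA stages the copy and synchronizes the stream. A minimal sketch of the pattern under that assumption (the function, stream handling, and names are illustrative, not from this PR):

```python
import torch

from vllm.model_executor.offloader import should_pin_memory


def onload(cpu_storage: torch.Tensor, gpu_buffer: torch.Tensor,
           stream: torch.cuda.Stream) -> None:
    # When pinning is enabled, the source must actually be pinned, or the
    # "async" copy below silently degrades into a synchronizing one.
    assert not should_pin_memory() or cpu_storage.is_pinned()
    with torch.cuda.stream(stream):
        # Queued on `stream` without blocking the host when cpu_storage
        # is pinned.
        gpu_buffer.copy_(cpu_storage, non_blocking=True)
```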
9 changes: 3 additions & 6 deletions vllm/model_executor/offloader/uva.py
@@ -10,9 +10,9 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.model_executor.offloader.base import BaseOffloader
+from vllm.model_executor.offloader.base import BaseOffloader, should_pin_memory
 from vllm.utils.mem_utils import format_gib
-from vllm.utils.platform_utils import is_pin_memory_available, is_uva_available
+from vllm.utils.platform_utils import is_uva_available
 from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
 
 logger = init_logger(__name__)
@@ -43,10 +43,7 @@ def __init__(
         self.cpu_offload_bytes = 0
         self.cpu_offload_params = cpu_offload_params or set()
 
-        self.pin_memory = (
-            is_pin_memory_available()
-            and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
-        )
+        self.pin_memory = should_pin_memory()
         self.uva_offloading = (
             is_uva_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_UVA
        )
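Reviewer note: with both offloaders now routed through `should_pin_memory()`, the GH200-style override behaves identically in the prefetch and UVA paths. A hypothetical sanity check, assuming `vllm.envs` reads the variable at access time (setting it before the import keeps it safe either way):

```python
import os

# Force the override before importing vllm so the helper must report False
# even on platforms where pinned memory is available.
os.environ["VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY"] = "1"

from vllm.model_executor.offloader import should_pin_memory

assert not should_pin_memory()
```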