From f9d82ea54433e9b0320296df4c2c16d154c39475 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 27 Mar 2025 16:36:42 -0700 Subject: [PATCH 01/32] Track TPU usages in vLLM's data dashboards --- vllm/usage/usage_lib.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 2ee3f9104d1..ff9ff444797 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -137,6 +137,9 @@ def __init__(self) -> None: self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None self.env_var_json: Optional[str] = None + self.tpu_count: Optional[int] = None + self.tpu_type: Optional[str] = None + self.tpu_memory_per_device: Optional[int] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -174,6 +177,11 @@ def _report_usage_once(self, model_architecture: str, self.gpu_memory_per_device = device_property.total_memory if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda + if current_platform.is_tpu(): + from torch_xla.core import xla_model as xm + self.tpu_count = xm.xrt_world_size() + self.tpu_type = xm.get_device_attributes(xm.xla_device()).get("type") + self.tpu_memory_per_device = xm.get_device_attributes(xm.xla_device()).get("memory") self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From d2d9b9e6e1cc069c74421129784b7715e5a08a5f Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 27 Mar 2025 16:39:52 -0700 Subject: [PATCH 02/32] Make the code more robust --- vllm/usage/usage_lib.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index ff9ff444797..5f162ac5e01 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -178,10 +178,16 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): - from torch_xla.core import xla_model as xm - self.tpu_count = xm.xrt_world_size() - self.tpu_type = xm.get_device_attributes(xm.xla_device()).get("type") - self.tpu_memory_per_device = xm.get_device_attributes(xm.xla_device()).get("memory") + try: + from torch_xla.core import xla_model as xm + self.tpu_count = xm.xrt_world_size() + self.tpu_type = xm.get_device_attributes(xm.xla_device()).get("type") + self.tpu_memory_per_device = xm.get_device_attributes(xm.xla_device()).get("memory") + except ImportError: + logging.warning("torch_xla not found, skipping TPU usage statistics.") + self.tpu_count = None + self.tpu_type = None + self.tpu_memory_per_device = None self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From 39d610f4de19d5b5cfcb63d11c1c1d5d626c3d38 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 7 Apr 2025 13:16:08 -0700 Subject: [PATCH 03/32] Your descriptive message about the changes you made --- vllm/usage/usage_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 5f162ac5e01..39806e62e35 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -181,8 +181,8 @@ def _report_usage_once(self, model_architecture: str, try: from torch_xla.core import xla_model as xm self.tpu_count = xm.xrt_world_size() - self.tpu_type = xm.get_device_attributes(xm.xla_device()).get("type") - self.tpu_memory_per_device = xm.get_device_attributes(xm.xla_device()).get("memory") + 
self.tpu_type = xm.xla_device_hw(xm.xla_device()) + self.tpu_memory_per_device = xm.get_memory_info().bytes_limit except ImportError: logging.warning("torch_xla not found, skipping TPU usage statistics.") self.tpu_count = None From 558c60fdcd727f800899b7585d79b0e5b8c882db Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 7 Apr 2025 13:36:36 -0700 Subject: [PATCH 04/32] format --- vllm/usage/usage_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 39806e62e35..75b1704cd4b 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -184,7 +184,8 @@ def _report_usage_once(self, model_architecture: str, self.tpu_type = xm.xla_device_hw(xm.xla_device()) self.tpu_memory_per_device = xm.get_memory_info().bytes_limit except ImportError: - logging.warning("torch_xla not found, skipping TPU usage statistics.") + logging.warning( + "torch_xla not found, skipping TPU usage statistics.") self.tpu_count = None self.tpu_type = None self.tpu_memory_per_device = None From 639f77bf376eeddd695bfd81c164a95d1f3e1591 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 7 Apr 2025 14:12:21 -0700 Subject: [PATCH 05/32] use new API --- vllm/usage/usage_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 75b1704cd4b..9291399c4f4 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -179,8 +179,9 @@ def _report_usage_once(self, model_architecture: str, self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): try: + import torch_xla.runtime as xr from torch_xla.core import xla_model as xm - self.tpu_count = xm.xrt_world_size() + self.tpu_count = xr.world_size() self.tpu_type = xm.xla_device_hw(xm.xla_device()) self.tpu_memory_per_device = xm.get_memory_info().bytes_limit except ImportError: From 8f055c9f0a9f3a4fb45259271c4909d11a88ed0c Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 7 Apr 2025 16:43:52 -0700 Subject: [PATCH 06/32] address Simon's comments --- vllm/usage/usage_lib.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 9291399c4f4..1820d3f882a 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -137,9 +137,6 @@ def __init__(self) -> None: self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None self.env_var_json: Optional[str] = None - self.tpu_count: Optional[int] = None - self.tpu_type: Optional[str] = None - self.tpu_memory_per_device: Optional[int] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -178,18 +175,11 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): - try: - import torch_xla.runtime as xr - from torch_xla.core import xla_model as xm - self.tpu_count = xr.world_size() - self.tpu_type = xm.xla_device_hw(xm.xla_device()) - self.tpu_memory_per_device = xm.get_memory_info().bytes_limit - except ImportError: - logging.warning( - "torch_xla not found, skipping TPU usage statistics.") - self.tpu_count = None - self.tpu_type = None - self.tpu_memory_per_device = None + import torch_xla.runtime as xr + from torch_xla.core import xla_model as xm + self.gpu_count = xr.world_size() + self.gpu_type = xm.xla_device_hw(xm.xla_device()) + self.gpu_memory_per_device = xm.get_memory_info().bytes_limit self.provider = _detect_cloud_provider() 
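The TPU branch above can be sanity-checked outside vLLM. Below is a minimal standalone sketch mirroring the torch_xla calls this series converges on (world size, TPU type, per-device memory limit); it assumes a TPU VM with torch_xla installed, and the exact module paths and return types vary between torch_xla releases, which is why later patches in this series keep adjusting them.

    # Hedged sketch: probe the TPU properties that usage_lib reports, using the
    # same torch_xla entry points as the patches. Assumes torch_xla is installed
    # on a TPU VM; names and signatures differ across torch_xla releases.
    import torch_xla
    import torch_xla.runtime as xr
    from torch_xla.core import xla_model as xm

    num_devices = xr.world_size()            # addressable TPU devices in this runtime
    tpu_type = torch_xla.tpu.get_tpu_type()  # e.g. a "v4"/"v5e"-style identifier (see PATCH 08)
    mem_info = xm.get_memory_info()          # per-device memory info; recent releases return a dict
    print(num_devices, tpu_type, mem_info)
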
self.architecture = platform.machine() self.platform = platform.platform() From 63bea3692ec7fafa1244e23404eee95846803b74 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 7 Apr 2025 16:49:34 -0700 Subject: [PATCH 07/32] Silence ImportError --- vllm/usage/usage_lib.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 1820d3f882a..c26de7f239c 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -175,11 +175,14 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): - import torch_xla.runtime as xr - from torch_xla.core import xla_model as xm - self.gpu_count = xr.world_size() - self.gpu_type = xm.xla_device_hw(xm.xla_device()) - self.gpu_memory_per_device = xm.get_memory_info().bytes_limit + try: + import torch_xla.runtime as xr + from torch_xla.core import xla_model as xm + self.gpu_count = xr.world_size() + self.gpu_type = xm.xla_device_hw(xm.xla_device()) + self.gpu_memory_per_device = xm.get_memory_info().bytes_limit + except ImportError: + pass self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From 6a4eea4ffd339b4e3022ef620c48c85e4ac451d1 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 9 Apr 2025 14:55:30 -0700 Subject: [PATCH 08/32] Use torch_xla.tpu.get_tpu_type() to get TPU version --- vllm/usage/usage_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index c26de7f239c..dc086e85047 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -178,8 +178,9 @@ def _report_usage_once(self, model_architecture: str, try: import torch_xla.runtime as xr from torch_xla.core import xla_model as xm + from torch_xla.tpu import tpu_type self.gpu_count = xr.world_size() - self.gpu_type = xm.xla_device_hw(xm.xla_device()) + self.gpu_type = tpu_type() self.gpu_memory_per_device = xm.get_memory_info().bytes_limit except ImportError: pass From 727bed55a33bdbe45e0e6a95346c90878827a95f Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 22 Apr 2025 07:28:36 +0000 Subject: [PATCH 09/32] Add usage to more engines --- vllm/engine/async_llm_engine.py | 40 +++++++++++++++++++++++++++++++- vllm/v1/engine/async_llm.py | 40 +++++++++++++++++++++++++++++++- vllm/v1/engine/llm_engine.py | 41 ++++++++++++++++++++++++++++++++- 3 files changed, 118 insertions(+), 3 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 67c7e109c9f..e185de9c9d3 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -34,7 +34,8 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import Device, deprecate_kwargs, weak_bind logger = init_logger(__name__) @@ -628,6 +629,43 @@ def __init__(self, # Lazy initialized fields self._request_tracker: RequestTracker + # If usage stat is enabled, collect relevant info. 
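(The block added here, like the existing one in llm_engine.py, is gated by is_usage_stats_enabled(). As a hedged reminder taken from vLLM's usage-stats documentation, collection can be disabled before the engine is constructed; the exact switch names may change between releases.)

    # Hedged sketch: opting out of usage-stats collection, per vLLM's docs.
    import os
    os.environ["VLLM_NO_USAGE_STATS"] = "1"   # or set DO_NOT_TRACK=1
    # Alternatively, create the marker file ~/.config/vllm/do_not_track
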
+ if is_usage_stats_enabled(): + from vllm.model_executor.model_loader import ( + get_architecture_class_name) + usage_message.report_usage( + get_architecture_class_name(self.engine.model_config), + usage_context, + extra_kvs={ + # Common configuration + "dtype": + str(self.engine.model_config.dtype), + "tensor_parallel_size": + self.engine.parallel_config.tensor_parallel_size, + "block_size": + self.engine.cache_config.block_size, + "gpu_memory_utilization": + self.engine.cache_config.gpu_memory_utilization, + + # Quantization + "quantization": + self.engine.model_config.quantization, + "kv_cache_dtype": + str(self.engine.cache_config.cache_dtype), + + # Feature flags + "enable_lora": + bool(self.engine.lora_config), + "enable_prompt_adapter": + bool(self.engine.prompt_adapter_config), + "enable_prefix_caching": + self.engine.cache_config.enable_prefix_caching, + "enforce_eager": + self.engine.model_config.enforce_eager, + "disable_custom_all_reduce": + self.engine.parallel_config.disable_custom_all_reduce, + }) + def __del__(self): if rt := getattr(self, "request_tracker", None): # Wake up engine loop so that it will exit cleanly diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bc49a0d3bb5..63443421709 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -23,7 +23,8 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.usage.usage_lib import UsageContext +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient @@ -113,6 +114,43 @@ def __init__( self._run_output_handler() except RuntimeError: pass + + # If usage stat is enabled, collect relevant info. 
+ if is_usage_stats_enabled(): + from vllm.model_executor.model_loader import ( + get_architecture_class_name) + usage_message.report_usage( + get_architecture_class_name(self.model_config), + usage_context, + extra_kvs={ + # Common configuration + "dtype": + str(self.model_config.dtype), + "tensor_parallel_size": + vllm_config.parallel_config.tensor_parallel_size, + "block_size": + vllm_config.cache_config.block_size, + "gpu_memory_utilization": + vllm_config.cache_config.gpu_memory_utilization, + + # Quantization + "quantization": + self.model_config.quantization, + "kv_cache_dtype": + str(vllm_config.cache_config.cache_dtype), + + # Feature flags + "enable_lora": + bool(vllm_config.lora_config), + "enable_prompt_adapter": + bool(vllm_config.prompt_adapter_config), + "enable_prefix_caching": + vllm_config.cache_config.enable_prefix_caching, + "enforce_eager": + vllm_config.model_config.enforce_eager, + "disable_custom_all_reduce": + vllm_config.parallel_config.disable_custom_all_reduce, + }) @classmethod def from_vllm_config( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c05319f3d80..99f473540bb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -21,7 +21,8 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( BaseTokenizerGroup, init_tokenizer_from_configs) -from vllm.usage.usage_lib import UsageContext +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import Device from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor @@ -98,6 +99,44 @@ def __init__( if not multiprocess_mode: # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore + + # If usage stat is enabled, collect relevant info. 
+ if is_usage_stats_enabled(): + from vllm.model_executor.model_loader import ( + get_architecture_class_name) + usage_message.report_usage( + get_architecture_class_name(self.model_config), + usage_context, + extra_kvs={ + # Common configuration + "dtype": + str(self.model_config.dtype), + "tensor_parallel_size": + parallel_config.tensor_parallel_size, + "block_size": + self.cache_config.block_size, + "gpu_memory_utilization": + self.cache_config.gpu_memory_utilization, + + # Quantization + "quantization": + self.model_config.quantization, + "kv_cache_dtype": + str(self.cache_config.cache_dtype), + + # Feature flags + "enable_lora": + bool(vllm_config.lora_config), + "enable_prompt_adapter": + bool(vllm_config.prompt_adapter_config), + "enable_prefix_caching": + self.cache_config.enable_prefix_caching, + "enforce_eager": + self.model_config.enforce_eager, + "disable_custom_all_reduce": + parallel_config.disable_custom_all_reduce, + }) + @classmethod def from_vllm_config( From 619e496c2af7b8153ffed5a9402f635238807749 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 22 Apr 2025 22:56:54 +0000 Subject: [PATCH 10/32] fix error --- vllm/engine/async_llm_engine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index e185de9c9d3..acb9b3a93c9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -597,6 +597,7 @@ def __init__(self, *args, log_requests: bool = True, start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, **kwargs) -> None: if envs.VLLM_USE_V1: raise ValueError( @@ -627,6 +628,7 @@ def __init__(self, self._errored_with: Optional[BaseException] = None # Lazy initialized fields + self.usage_context = usage_context self._request_tracker: RequestTracker # If usage stat is enabled, collect relevant info. 
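(For reviewers who want to confirm what these blocks actually emit, the collector also keeps a local copy of each report. A hedged sketch for inspecting it follows, assuming the default config root described in vLLM's usage-stats docs; the path moves if VLLM_CONFIG_ROOT is set.)

    # Hedged sketch: read back the locally logged usage records (one JSON object
    # per line under the default config root, per vLLM's usage-stats docs).
    import json
    from pathlib import Path

    record_path = Path.home() / ".config" / "vllm" / "usage_stats.json"
    for line in record_path.read_text().splitlines():
        entry = json.loads(line)
        # After this series, TPU hosts populate the existing gpu_* fields.
        print(entry.get("gpu_count"), entry.get("gpu_type"),
              entry.get("gpu_memory_per_device"))
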
@@ -635,7 +637,7 @@ def __init__(self, get_architecture_class_name) usage_message.report_usage( get_architecture_class_name(self.engine.model_config), - usage_context, + self.usage_context, extra_kvs={ # Common configuration "dtype": From a1ae7ffa1309195f534c5b030edae8d70fcdb161 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 06:45:25 +0000 Subject: [PATCH 11/32] format --- vllm/v1/engine/async_llm.py | 144 +++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 63443421709..4033331835e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -23,19 +23,20 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) +from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError -from vllm.v1.engine.output_processor import (OutputProcessor, - RequestOutputCollector) +from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, - StatLoggerBase) +from vllm.v1.metrics.loggers import ( + LoggingStatLogger, + PrometheusStatLogger, + StatLoggerBase, +) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -59,7 +60,8 @@ def __init__( "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " "This should not happen. As a workaround, try using " "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") + "VLLM_USE_V1=0 or 1 and report this issue on Github." + ) self.model_config = vllm_config.model_config self.vllm_config = vllm_config @@ -73,8 +75,7 @@ def __init__( loggers: list[StatLoggerBase] = [] if logger.isEnabledFor(logging.INFO): loggers.append(LoggingStatLogger(engine_index=i)) - loggers.append( - PrometheusStatLogger(vllm_config, engine_index=i)) + loggers.append(PrometheusStatLogger(vllm_config, engine_index=i)) self.stat_loggers.append(loggers) # Tokenizer (+ ensure liveness if running in another process). @@ -82,7 +83,8 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config) + lora_config=vllm_config.lora_config, + ) self.tokenizer.ping() # Processor (converts Inputs --> EngineCoreRequests). @@ -93,13 +95,16 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). - self.output_processor = OutputProcessor(self.tokenizer, - log_stats=self.log_stats) + self.output_processor = OutputProcessor( + self.tokenizer, log_stats=self.log_stats + ) # EngineCore (starts the engine in background process). 
- core_client_class = AsyncMPClient if ( - vllm_config.parallel_config.data_parallel_size - == 1) else DPAsyncMPClient + core_client_class = ( + AsyncMPClient + if (vllm_config.parallel_config.data_parallel_size == 1) + else DPAsyncMPClient + ) self.engine_core = core_client_class( vllm_config=vllm_config, @@ -114,43 +119,31 @@ def __init__( self._run_output_handler() except RuntimeError: pass - - # If usage stat is enabled, collect relevant info. + + # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) + from vllm.model_executor.model_loader import get_architecture_class_name + usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, extra_kvs={ # Common configuration - "dtype": - str(self.model_config.dtype), - "tensor_parallel_size": - vllm_config.parallel_config.tensor_parallel_size, - "block_size": - vllm_config.cache_config.block_size, - "gpu_memory_utilization": - vllm_config.cache_config.gpu_memory_utilization, - + "dtype": str(self.model_config.dtype), + "tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size, + "block_size": vllm_config.cache_config.block_size, + "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, # Quantization - "quantization": - self.model_config.quantization, - "kv_cache_dtype": - str(vllm_config.cache_config.cache_dtype), - + "quantization": self.model_config.quantization, + "kv_cache_dtype": str(vllm_config.cache_config.cache_dtype), # Feature flags - "enable_lora": - bool(vllm_config.lora_config), - "enable_prompt_adapter": - bool(vllm_config.prompt_adapter_config), - "enable_prefix_caching": - vllm_config.cache_config.enable_prefix_caching, - "enforce_eager": - vllm_config.model_config.enforce_eager, - "disable_custom_all_reduce": - vllm_config.parallel_config.disable_custom_all_reduce, - }) + "enable_lora": bool(vllm_config.lora_config), + "enable_prompt_adapter": bool(vllm_config.prompt_adapter_config), + "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, + "enforce_eager": vllm_config.model_config.enforce_eager, + "disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce, + }, + ) @classmethod def from_vllm_config( @@ -167,13 +160,16 @@ def from_vllm_config( "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " "This should not happen. As a workaround, try using " "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") + "VLLM_USE_V1=0 or 1 and report this issue on Github." + ) # FIXME(rob): refactor VllmConfig to include the StatLoggers # include StatLogger in the Oracle decision. if stat_loggers is not None: - raise ValueError("Custom StatLoggers are not yet supported on V1. " - "Explicitly set VLLM_USE_V1=0 to disable V1.") + raise ValueError( + "Custom StatLoggers are not yet supported on V1. " + "Explicitly set VLLM_USE_V1=0 to disable V1." + ) # Create the LLMEngine. return cls( @@ -236,18 +232,22 @@ async def add_request( if self.errored: raise EngineDeadError() - assert isinstance(params, SamplingParams), \ - "Pooling is not supported in V1" + assert isinstance(params, SamplingParams), "Pooling is not supported in V1" # Create a new output collector for the request. queue = RequestOutputCollector(output_kind=params.output_kind) # Convert Input --> Request. 
- request = self.processor.process_inputs(request_id, prompt, params, - arrival_time, lora_request, - trace_headers, - prompt_adapter_request, - priority) + request = self.processor.process_inputs( + request_id, + prompt, + params, + arrival_time, + lora_request, + trace_headers, + prompt_adapter_request, + priority, + ) if params.n == 1: await self._add_request(request, None, 0, queue) @@ -263,9 +263,13 @@ async def add_request( await self._add_request(child_request, parent_request, idx, queue) return queue - async def _add_request(self, request: EngineCoreRequest, - parent_req: Optional[ParentRequest], index: int, - queue: RequestOutputCollector): + async def _add_request( + self, + request: EngineCoreRequest, + parent_req: Optional[ParentRequest], + index: int, + queue: RequestOutputCollector, + ): # Add the request to OutputProcessor (this process). self.output_processor.add_request(request, parent_req, index, queue) @@ -382,23 +386,26 @@ async def output_handler(): outputs = await engine_core.get_output_async() num_outputs = len(outputs.outputs) - iteration_stats = IterationStats() if ( - log_stats and num_outputs) else None + iteration_stats = ( + IterationStats() if (log_stats and num_outputs) else None + ) # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: - slices = (outputs.outputs, ) + slices = (outputs.outputs,) else: slices = np.array_split( outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) + cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), + ) for i, outputs_slice in enumerate(slices): # 2) Process EngineCoreOutputs. processed_outputs = output_processor.process_outputs( - outputs_slice, outputs.timestamp, iteration_stats) + outputs_slice, outputs.timestamp, iteration_stats + ) # NOTE: RequestOutputs are pushed to their queues. assert not processed_outputs.request_outputs @@ -408,7 +415,8 @@ async def output_handler(): # 3) Abort any reqs that finished due to stop strings. await engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + processed_outputs.reqs_to_abort + ) # 4) Logging. 
# TODO(rob): make into a coroutine and launch it in @@ -429,7 +437,7 @@ async def output_handler(): async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" - request_ids = self.output_processor.abort_requests((request_id, )) + request_ids = self.output_processor.abort_requests((request_id,)) await self.engine_core.abort_requests_async(request_ids) if self.log_requests: @@ -444,8 +452,9 @@ def _record_stats( """static so that it can be used from the output_handler task without a circular ref to AsyncLLM.""" for stat_logger in stat_loggers: - stat_logger.record(scheduler_stats=scheduler_stats, - iteration_stats=iteration_stats) + stat_logger.record( + scheduler_stats=scheduler_stats, iteration_stats=iteration_stats + ) def encode( self, @@ -497,8 +506,7 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: await self.engine_core.profile_async(False) - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: + async def reset_prefix_cache(self, device: Optional[Device] = None) -> None: if device == Device.CPU: raise ValueError("Not supported on CPU.") await self.engine_core.reset_prefix_cache_async() From 9f725f6b35405d97ee7927272444a7d820e54cd8 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 06:59:11 +0000 Subject: [PATCH 12/32] Revert "format" This reverts commit a1ae7ffa1309195f534c5b030edae8d70fcdb161. --- vllm/v1/engine/async_llm.py | 144 +++++++++++++++++------------------- 1 file changed, 68 insertions(+), 76 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4033331835e..63443421709 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -23,20 +23,19 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError -from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector +from vllm.v1.engine.output_processor import (OutputProcessor, + RequestOutputCollector) from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import ( - LoggingStatLogger, - PrometheusStatLogger, - StatLoggerBase, -) +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -60,8 +59,7 @@ def __init__( "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " "This should not happen. As a workaround, try using " "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github." 
- ) + "VLLM_USE_V1=0 or 1 and report this issue on Github.") self.model_config = vllm_config.model_config self.vllm_config = vllm_config @@ -75,7 +73,8 @@ def __init__( loggers: list[StatLoggerBase] = [] if logger.isEnabledFor(logging.INFO): loggers.append(LoggingStatLogger(engine_index=i)) - loggers.append(PrometheusStatLogger(vllm_config, engine_index=i)) + loggers.append( + PrometheusStatLogger(vllm_config, engine_index=i)) self.stat_loggers.append(loggers) # Tokenizer (+ ensure liveness if running in another process). @@ -83,8 +82,7 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - lora_config=vllm_config.lora_config, - ) + lora_config=vllm_config.lora_config) self.tokenizer.ping() # Processor (converts Inputs --> EngineCoreRequests). @@ -95,16 +93,13 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). - self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats - ) + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=self.log_stats) # EngineCore (starts the engine in background process). - core_client_class = ( - AsyncMPClient - if (vllm_config.parallel_config.data_parallel_size == 1) - else DPAsyncMPClient - ) + core_client_class = AsyncMPClient if ( + vllm_config.parallel_config.data_parallel_size + == 1) else DPAsyncMPClient self.engine_core = core_client_class( vllm_config=vllm_config, @@ -119,31 +114,43 @@ def __init__( self._run_output_handler() except RuntimeError: pass - - # If usage stat is enabled, collect relevant info. + + # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import get_architecture_class_name - + from vllm.model_executor.model_loader import ( + get_architecture_class_name) usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, extra_kvs={ # Common configuration - "dtype": str(self.model_config.dtype), - "tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size, - "block_size": vllm_config.cache_config.block_size, - "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, + "dtype": + str(self.model_config.dtype), + "tensor_parallel_size": + vllm_config.parallel_config.tensor_parallel_size, + "block_size": + vllm_config.cache_config.block_size, + "gpu_memory_utilization": + vllm_config.cache_config.gpu_memory_utilization, + # Quantization - "quantization": self.model_config.quantization, - "kv_cache_dtype": str(vllm_config.cache_config.cache_dtype), + "quantization": + self.model_config.quantization, + "kv_cache_dtype": + str(vllm_config.cache_config.cache_dtype), + # Feature flags - "enable_lora": bool(vllm_config.lora_config), - "enable_prompt_adapter": bool(vllm_config.prompt_adapter_config), - "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, - "enforce_eager": vllm_config.model_config.enforce_eager, - "disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce, - }, - ) + "enable_lora": + bool(vllm_config.lora_config), + "enable_prompt_adapter": + bool(vllm_config.prompt_adapter_config), + "enable_prefix_caching": + vllm_config.cache_config.enable_prefix_caching, + "enforce_eager": + vllm_config.model_config.enforce_eager, + "disable_custom_all_reduce": + vllm_config.parallel_config.disable_custom_all_reduce, + }) @classmethod def from_vllm_config( @@ -160,16 +167,13 @@ def from_vllm_config( "Using V1 
AsyncLLMEngine, but envs.VLLM_USE_V1=False. " "This should not happen. As a workaround, try using " "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github." - ) + "VLLM_USE_V1=0 or 1 and report this issue on Github.") # FIXME(rob): refactor VllmConfig to include the StatLoggers # include StatLogger in the Oracle decision. if stat_loggers is not None: - raise ValueError( - "Custom StatLoggers are not yet supported on V1. " - "Explicitly set VLLM_USE_V1=0 to disable V1." - ) + raise ValueError("Custom StatLoggers are not yet supported on V1. " + "Explicitly set VLLM_USE_V1=0 to disable V1.") # Create the LLMEngine. return cls( @@ -232,22 +236,18 @@ async def add_request( if self.errored: raise EngineDeadError() - assert isinstance(params, SamplingParams), "Pooling is not supported in V1" + assert isinstance(params, SamplingParams), \ + "Pooling is not supported in V1" # Create a new output collector for the request. queue = RequestOutputCollector(output_kind=params.output_kind) # Convert Input --> Request. - request = self.processor.process_inputs( - request_id, - prompt, - params, - arrival_time, - lora_request, - trace_headers, - prompt_adapter_request, - priority, - ) + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) if params.n == 1: await self._add_request(request, None, 0, queue) @@ -263,13 +263,9 @@ async def add_request( await self._add_request(child_request, parent_request, idx, queue) return queue - async def _add_request( - self, - request: EngineCoreRequest, - parent_req: Optional[ParentRequest], - index: int, - queue: RequestOutputCollector, - ): + async def _add_request(self, request: EngineCoreRequest, + parent_req: Optional[ParentRequest], index: int, + queue: RequestOutputCollector): # Add the request to OutputProcessor (this process). self.output_processor.add_request(request, parent_req, index, queue) @@ -386,26 +382,23 @@ async def output_handler(): outputs = await engine_core.get_output_async() num_outputs = len(outputs.outputs) - iteration_stats = ( - IterationStats() if (log_stats and num_outputs) else None - ) + iteration_stats = IterationStats() if ( + log_stats and num_outputs) else None # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: - slices = (outputs.outputs,) + slices = (outputs.outputs, ) else: slices = np.array_split( outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), - ) + cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) for i, outputs_slice in enumerate(slices): # 2) Process EngineCoreOutputs. processed_outputs = output_processor.process_outputs( - outputs_slice, outputs.timestamp, iteration_stats - ) + outputs_slice, outputs.timestamp, iteration_stats) # NOTE: RequestOutputs are pushed to their queues. assert not processed_outputs.request_outputs @@ -415,8 +408,7 @@ async def output_handler(): # 3) Abort any reqs that finished due to stop strings. await engine_core.abort_requests_async( - processed_outputs.reqs_to_abort - ) + processed_outputs.reqs_to_abort) # 4) Logging. 
# TODO(rob): make into a coroutine and launch it in @@ -437,7 +429,7 @@ async def output_handler(): async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" - request_ids = self.output_processor.abort_requests((request_id,)) + request_ids = self.output_processor.abort_requests((request_id, )) await self.engine_core.abort_requests_async(request_ids) if self.log_requests: @@ -452,9 +444,8 @@ def _record_stats( """static so that it can be used from the output_handler task without a circular ref to AsyncLLM.""" for stat_logger in stat_loggers: - stat_logger.record( - scheduler_stats=scheduler_stats, iteration_stats=iteration_stats - ) + stat_logger.record(scheduler_stats=scheduler_stats, + iteration_stats=iteration_stats) def encode( self, @@ -506,7 +497,8 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: await self.engine_core.profile_async(False) - async def reset_prefix_cache(self, device: Optional[Device] = None) -> None: + async def reset_prefix_cache(self, + device: Optional[Device] = None) -> None: if device == Device.CPU: raise ValueError("Not supported on CPU.") await self.engine_core.reset_prefix_cache_async() From b17dbc9ff8a6c4941841f151e089f30faae2c38f Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 07:10:35 +0000 Subject: [PATCH 13/32] format --- vllm/usage/usage_lib.py | 17 ++++++++--------- vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/llm_engine.py | 3 +-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index dc086e85047..44c6908e3c8 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -175,15 +175,14 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): - try: - import torch_xla.runtime as xr - from torch_xla.core import xla_model as xm - from torch_xla.tpu import tpu_type - self.gpu_count = xr.world_size() - self.gpu_type = tpu_type() - self.gpu_memory_per_device = xm.get_memory_info().bytes_limit - except ImportError: - pass + # try: + import torch_xla.runtime as xr + from torch_xla.core import xla_model as xm + self.gpu_count = xr.world_size() + self.gpu_type = torch_xla.tpu.tpu_type() + self.gpu_memory_per_device = xm.get_memory_info().bytes_limit + # except ImportError: + # pass self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 63443421709..1c94288d43d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -114,8 +114,8 @@ def __init__( self._run_output_handler() except RuntimeError: pass - - # If usage stat is enabled, collect relevant info. + + # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): from vllm.model_executor.model_loader import ( get_architecture_class_name) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 99f473540bb..bbd6510cb6b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -99,7 +99,7 @@ def __init__( if not multiprocess_mode: # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore - + # If usage stat is enabled, collect relevant info. 
if is_usage_stats_enabled(): from vllm.model_executor.model_loader import ( @@ -137,7 +137,6 @@ def __init__( parallel_config.disable_custom_all_reduce, }) - @classmethod def from_vllm_config( cls, From 3bd0c9ba75ad0ef2374f7ce14c5f7e8df5edcd6e Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 07:40:26 +0000 Subject: [PATCH 14/32] Use import torch_xla --- vllm/usage/usage_lib.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 44c6908e3c8..022b3ad62ed 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -176,13 +176,12 @@ def _report_usage_once(self, model_architecture: str, self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): # try: - import torch_xla.runtime as xr - from torch_xla.core import xla_model as xm - self.gpu_count = xr.world_size() + import torch_xla + self.gpu_count = torch_xla.runtime.world_size() self.gpu_type = torch_xla.tpu.tpu_type() - self.gpu_memory_per_device = xm.get_memory_info().bytes_limit + self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info().bytes_limit # except ImportError: - # pass + # pass self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From 718729a6481964c22965481075968e99793c2e47 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 07:48:48 +0000 Subject: [PATCH 15/32] format --- vllm/usage/usage_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 022b3ad62ed..0be29b7673f 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -179,7 +179,8 @@ def _report_usage_once(self, model_architecture: str, import torch_xla self.gpu_count = torch_xla.runtime.world_size() self.gpu_type = torch_xla.tpu.tpu_type() - self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info().bytes_limit + self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info( + ).bytes_limit # except ImportError: # pass self.provider = _detect_cloud_provider() From 6e61fba03ed63b25621381743f2e39ad0c4e26e3 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 08:07:49 +0000 Subject: [PATCH 16/32] format --- vllm/usage/usage_lib.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 0be29b7673f..70e6a17989c 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -179,8 +179,9 @@ def _report_usage_once(self, model_architecture: str, import torch_xla self.gpu_count = torch_xla.runtime.world_size() self.gpu_type = torch_xla.tpu.tpu_type() - self.gpu_memory_per_device = torch_xla.core.xla_model.get_memory_info( - ).bytes_limit + self.gpu_memory_per_device = ( + torch_xla.core.xla_model.get_memory_info().bytes_limit + ) # except ImportError: # pass self.provider = _detect_cloud_provider() From 737646d757bc5b5316b2c67d0e4ba6202327dbdd Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 08:41:12 +0000 Subject: [PATCH 17/32] format --- vllm/usage/usage_lib.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 70e6a17989c..fd7e7a6ea40 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -180,8 +180,7 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = torch_xla.runtime.world_size() self.gpu_type = torch_xla.tpu.tpu_type() self.gpu_memory_per_device = ( - 
torch_xla.core.xla_model.get_memory_info().bytes_limit - ) + torch_xla.core.xla_model.get_memory_info().bytes_limit) # except ImportError: # pass self.provider = _detect_cloud_provider() From f825349abfa3c313eff64bb21af8d75791eb0d4e Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 23 Apr 2025 22:34:27 +0000 Subject: [PATCH 18/32] Try Qiliang's idea --- vllm/engine/async_llm_engine.py | 1 + vllm/engine/llm_engine.py | 1 + vllm/usage/usage_lib.py | 12 ++++++++---- vllm/v1/engine/async_llm.py | 1 + vllm/v1/engine/llm_engine.py | 1 + 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index acb9b3a93c9..54c5d224c9d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -638,6 +638,7 @@ def __init__(self, usage_message.report_usage( get_architecture_class_name(self.engine.model_config), self.usage_context, + self.engine.vllm_config, extra_kvs={ # Common configuration "dtype": diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4644053785f..362c4c58838 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -291,6 +291,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, + self.vllm_config, extra_kvs={ # Common configuration "dtype": diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index fd7e7a6ea40..68a0cfd00f2 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -17,6 +17,7 @@ import requests import torch +from vllm.config import VllmConfig import vllm.envs as envs from vllm.connections import global_http_connection from vllm.version import __version__ as VLLM_VERSION @@ -150,20 +151,23 @@ def __init__(self) -> None: def report_usage(self, model_architecture: str, usage_context: UsageContext, + vllm_config: VllmConfig, extra_kvs: Optional[dict[str, Any]] = None) -> None: t = Thread(target=self._report_usage_worker, - args=(model_architecture, usage_context, extra_kvs or {}), + args=(model_architecture, usage_context, vllm_config, extra_kvs or {}), daemon=True) t.start() def _report_usage_worker(self, model_architecture: str, usage_context: UsageContext, + vllm_config: VllmConfig, extra_kvs: dict[str, Any]) -> None: - self._report_usage_once(model_architecture, usage_context, extra_kvs) + self._report_usage_once(model_architecture, usage_context, vllm_config, extra_kvs) self._report_continous_usage() def _report_usage_once(self, model_architecture: str, usage_context: UsageContext, + vllm_config: VllmConfig, extra_kvs: dict[str, Any]) -> None: # Platform information from vllm.platforms import current_platform @@ -177,8 +181,8 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_tpu(): # try: import torch_xla - self.gpu_count = torch_xla.runtime.world_size() - self.gpu_type = torch_xla.tpu.tpu_type() + self.gpu_count = vllm_config.parallel_config.world_size + self.gpu_type = torch_xla.tpu.get_tpu_type() self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info().bytes_limit) # except ImportError: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1c94288d43d..5812b490be5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -122,6 +122,7 @@ def __init__( usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, + self.vllm_config, extra_kvs={ # Common configuration "dtype": diff --git 
a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bbd6510cb6b..119146cac04 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -107,6 +107,7 @@ def __init__( usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, + self.vllm_config, extra_kvs={ # Common configuration "dtype": From bbd7f5aba8cd95710c3cb314af8a781c8375ff50 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 05:50:27 +0000 Subject: [PATCH 19/32] Use Yarong's 2nd idea --- vllm/usage/usage_lib.py | 10 +++++---- vllm/v1/worker/tpu_worker.py | 42 +++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 68a0cfd00f2..4602112a571 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -17,8 +17,8 @@ import requests import torch -from vllm.config import VllmConfig import vllm.envs as envs +from vllm.config import VllmConfig from vllm.connections import global_http_connection from vllm.version import __version__ as VLLM_VERSION @@ -154,7 +154,8 @@ def report_usage(self, vllm_config: VllmConfig, extra_kvs: Optional[dict[str, Any]] = None) -> None: t = Thread(target=self._report_usage_worker, - args=(model_architecture, usage_context, vllm_config, extra_kvs or {}), + args=(model_architecture, usage_context, vllm_config, + extra_kvs or {}), daemon=True) t.start() @@ -162,7 +163,8 @@ def _report_usage_worker(self, model_architecture: str, usage_context: UsageContext, vllm_config: VllmConfig, extra_kvs: dict[str, Any]) -> None: - self._report_usage_once(model_architecture, usage_context, vllm_config, extra_kvs) + self._report_usage_once(model_architecture, usage_context, vllm_config, + extra_kvs) self._report_continous_usage() def _report_usage_once(self, model_architecture: str, @@ -184,7 +186,7 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = vllm_config.parallel_config.world_size self.gpu_type = torch_xla.tpu.get_tpu_type() self.gpu_memory_per_device = ( - torch_xla.core.xla_model.get_memory_info().bytes_limit) + torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) # except ImportError: # pass self.provider = _detect_cloud_provider() diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 2204f037a6d..f1cced080f9 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -16,6 +16,8 @@ init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig, @@ -82,7 +84,7 @@ def __init__( if self.model_config.seed is None: self.model_config.seed = 0 - def init_device(self): + def init_device(self, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D # ring, the xla tpu compiler flag @@ -133,6 +135,44 @@ def init_device(self): # Init ModelRunner here, so that we have access to self.device. self.model_runner = TPUModelRunner(self.vllm_config, self.device) + # If usage stat is enabled, collect relevant info. 
+ if is_usage_stats_enabled(): + from vllm.model_executor.model_loader import ( + get_architecture_class_name) + usage_message.report_usage( + get_architecture_class_name(self.model_config), + usage_context, + self.vllm_config, + extra_kvs={ + # Common configuration + "dtype": + str(self.model_config.dtype), + "tensor_parallel_size": + self.parallel_config.tensor_parallel_size, + "block_size": + self.cache_config.block_size, + "gpu_memory_utilization": + self.cache_config.gpu_memory_utilization, + + # Quantization + "quantization": + self.model_config.quantization, + "kv_cache_dtype": + str(self.cache_config.cache_dtype), + + # Feature flags + "enable_lora": + bool(self.lora_config), + "enable_prompt_adapter": + bool(self.prompt_adapter_config), + "enable_prefix_caching": + self.cache_config.enable_prefix_caching, + "enforce_eager": + self.model_config.enforce_eager, + "disable_custom_all_reduce": + self.parallel_config.disable_custom_all_reduce, + }) + def determine_available_memory(self) -> int: kv_caches: dict[str, torch.Tensor] = {} kv_cache_spec = self.model_runner.get_kv_cache_spec() From 4e38e67dfe9e8064b3b29fe63fccc9aaa822ab20 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:02:17 +0000 Subject: [PATCH 20/32] revert vllm/engine/async_llm_engine.py --- vllm/engine/async_llm_engine.py | 43 +-------------------------------- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 54c5d224c9d..67c7e109c9f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -34,8 +34,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) +from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, deprecate_kwargs, weak_bind logger = init_logger(__name__) @@ -597,7 +596,6 @@ def __init__(self, *args, log_requests: bool = True, start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, **kwargs) -> None: if envs.VLLM_USE_V1: raise ValueError( @@ -628,47 +626,8 @@ def __init__(self, self._errored_with: Optional[BaseException] = None # Lazy initialized fields - self.usage_context = usage_context self._request_tracker: RequestTracker - # If usage stat is enabled, collect relevant info. 
- if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) - usage_message.report_usage( - get_architecture_class_name(self.engine.model_config), - self.usage_context, - self.engine.vllm_config, - extra_kvs={ - # Common configuration - "dtype": - str(self.engine.model_config.dtype), - "tensor_parallel_size": - self.engine.parallel_config.tensor_parallel_size, - "block_size": - self.engine.cache_config.block_size, - "gpu_memory_utilization": - self.engine.cache_config.gpu_memory_utilization, - - # Quantization - "quantization": - self.engine.model_config.quantization, - "kv_cache_dtype": - str(self.engine.cache_config.cache_dtype), - - # Feature flags - "enable_lora": - bool(self.engine.lora_config), - "enable_prompt_adapter": - bool(self.engine.prompt_adapter_config), - "enable_prefix_caching": - self.engine.cache_config.enable_prefix_caching, - "enforce_eager": - self.engine.model_config.enforce_eager, - "disable_custom_all_reduce": - self.engine.parallel_config.disable_custom_all_reduce, - }) - def __del__(self): if rt := getattr(self, "request_tracker", None): # Wake up engine loop so that it will exit cleanly From fc18a7a5133e8e1b1be3a31b7c31774e0a00757d Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:14:03 +0000 Subject: [PATCH 21/32] simplify code --- vllm/engine/llm_engine.py | 1 - vllm/usage/usage_lib.py | 12 +++-------- vllm/v1/engine/async_llm.py | 3 +-- vllm/v1/engine/llm_engine.py | 3 +-- vllm/v1/worker/tpu_worker.py | 39 ++---------------------------------- 5 files changed, 7 insertions(+), 51 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 362c4c58838..4644053785f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -291,7 +291,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: usage_message.report_usage( get_architecture_class_name(self.model_config), usage_context, - self.vllm_config, extra_kvs={ # Common configuration "dtype": diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 83770c9515a..37f209787b8 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -18,7 +18,6 @@ import torch import vllm.envs as envs -from vllm.config import VllmConfig from vllm.connections import global_http_connection from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties from vllm.version import __version__ as VLLM_VERSION @@ -152,25 +151,20 @@ def __init__(self) -> None: def report_usage(self, model_architecture: str, usage_context: UsageContext, - vllm_config: VllmConfig, extra_kvs: Optional[dict[str, Any]] = None) -> None: t = Thread(target=self._report_usage_worker, - args=(model_architecture, usage_context, vllm_config, - extra_kvs or {}), + args=(model_architecture, usage_context, extra_kvs or {}), daemon=True) t.start() def _report_usage_worker(self, model_architecture: str, usage_context: UsageContext, - vllm_config: VllmConfig, extra_kvs: dict[str, Any]) -> None: - self._report_usage_once(model_architecture, usage_context, vllm_config, - extra_kvs) + self._report_usage_once(model_architecture, usage_context, extra_kvs) self._report_continous_usage() def _report_usage_once(self, model_architecture: str, usage_context: UsageContext, - vllm_config: VllmConfig, extra_kvs: dict[str, Any]) -> None: # Platform information from vllm.platforms import current_platform @@ -183,7 +177,7 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_tpu(): # try: import torch_xla - 
self.gpu_count = vllm_config.parallel_config.world_size + self.gpu_count = torch_xla.runtime.world_size self.gpu_type = torch_xla.tpu.get_tpu_type() self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7b4df77dcd1..1149dfa9ce5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -23,8 +23,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) +from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index d920606c8c9..6fa90b26982 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -21,8 +21,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( BaseTokenizerGroup, init_tokenizer_from_configs) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) +from vllm.usage.usage_lib import UsageContext from vllm.utils import Device from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index f1cced080f9..a5925a1a066 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -23,7 +23,7 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.utils import bind_kv_cache +from vllm.v1.utils import bind_kv_cache, report_usage_stats from vllm.v1.worker.tpu_model_runner import TPUModelRunner logger = init_logger(__name__) @@ -136,42 +136,7 @@ def init_device(self, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) self.model_runner = TPUModelRunner(self.vllm_config, self.device) # If usage stat is enabled, collect relevant info. 
- if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) - usage_message.report_usage( - get_architecture_class_name(self.model_config), - usage_context, - self.vllm_config, - extra_kvs={ - # Common configuration - "dtype": - str(self.model_config.dtype), - "tensor_parallel_size": - self.parallel_config.tensor_parallel_size, - "block_size": - self.cache_config.block_size, - "gpu_memory_utilization": - self.cache_config.gpu_memory_utilization, - - # Quantization - "quantization": - self.model_config.quantization, - "kv_cache_dtype": - str(self.cache_config.cache_dtype), - - # Feature flags - "enable_lora": - bool(self.lora_config), - "enable_prompt_adapter": - bool(self.prompt_adapter_config), - "enable_prefix_caching": - self.cache_config.enable_prefix_caching, - "enforce_eager": - self.model_config.enforce_eager, - "disable_custom_all_reduce": - self.parallel_config.disable_custom_all_reduce, - }) + report_usage_stats(self.vllm_config, usage_context) def determine_available_memory(self) -> int: kv_caches: dict[str, torch.Tensor] = {} From cf7997ae4dcad23ea33668f9ec1565bbcbc93f3b Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:20:01 +0000 Subject: [PATCH 22/32] simplify --- vllm/v1/worker/tpu_worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index a5925a1a066..bf50928a6ea 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -16,8 +16,7 @@ init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) +from vllm.usage.usage_lib import UsageContext from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig, From 3bd5730fdc671ff07870dbf52b98d643b3a76dd8 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:23:16 +0000 Subject: [PATCH 23/32] fix typo --- vllm/usage/usage_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 37f209787b8..fb2643efd11 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -177,7 +177,7 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_tpu(): # try: import torch_xla - self.gpu_count = torch_xla.runtime.world_size + self.gpu_count = torch_xla.runtime.world_size() self.gpu_type = torch_xla.tpu.get_tpu_type() self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) From 4374c3c232366fabb66e8ff216fc68d21a95b937 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:27:40 +0000 Subject: [PATCH 24/32] format --- vllm/v1/worker/tpu_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index bf50928a6ea..7fa9c0322fb 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -83,7 +83,8 @@ def __init__( if self.model_config.seed is None: self.model_config.seed = 0 - def init_device(self, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT): + def init_device(self, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D # ring, the xla tpu compiler flag From 
68293717ceaea83d4e0ac6adf41d802b0106ae3b Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:32:22 +0000 Subject: [PATCH 25/32] simplify --- vllm/v1/utils.py | 3 ++- vllm/v1/worker/tpu_worker.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index dc6457bf903..be915fc0d3c 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -205,7 +205,8 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True) -def report_usage_stats(vllm_config, usage_context: UsageContext) -> None: +def report_usage_stats(vllm_config, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None: """Report usage statistics if enabled.""" if not is_usage_stats_enabled(): diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 7fa9c0322fb..89443d26967 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -16,7 +16,6 @@ init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.usage.usage_lib import UsageContext from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig, @@ -83,8 +82,7 @@ def __init__( if self.model_config.seed is None: self.model_config.seed = 0 - def init_device(self, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT): + def init_device(self): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D # ring, the xla tpu compiler flag @@ -136,7 +134,7 @@ def init_device(self, self.model_runner = TPUModelRunner(self.vllm_config, self.device) # If usage stat is enabled, collect relevant info. 
- report_usage_stats(self.vllm_config, usage_context) + report_usage_stats(self.vllm_config) def determine_available_memory(self) -> int: kv_caches: dict[str, torch.Tensor] = {} From 3c55fc7b2f902fa569d766133374baa801ac7a01 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:33:22 +0000 Subject: [PATCH 26/32] silence error --- vllm/usage/usage_lib.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index fb2643efd11..0c555a743e4 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -175,14 +175,14 @@ def _report_usage_once(self, model_architecture: str, if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda if current_platform.is_tpu(): - # try: - import torch_xla - self.gpu_count = torch_xla.runtime.world_size() - self.gpu_type = torch_xla.tpu.get_tpu_type() - self.gpu_memory_per_device = ( - torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) - # except ImportError: - # pass + try: + import torch_xla + self.gpu_count = torch_xla.runtime.world_size() + self.gpu_type = torch_xla.tpu.get_tpu_type() + self.gpu_memory_per_device = ( + torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) + except ImportError: + pass self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() From bbee546505e26d77cae752c4b05459582449cf95 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:40:38 +0000 Subject: [PATCH 27/32] Suppress all exceptions --- vllm/usage/usage_lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 0c555a743e4..df598d11e86 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -181,7 +181,8 @@ def _report_usage_once(self, model_architecture: str, self.gpu_type = torch_xla.tpu.get_tpu_type() self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) - except ImportError: + # To suppress errors like "TPU initialization failed: open(/dev/vfio/0): Device or resource busy" + except Exception: pass self.provider = _detect_cloud_provider() self.architecture = platform.machine() From 429b6aabfd4a153f9d9249526fc124099add72ec Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:44:10 +0000 Subject: [PATCH 28/32] format --- vllm/v1/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index be915fc0d3c..9c238c3aad8 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -205,8 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True) -def report_usage_stats(vllm_config, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None: +def report_usage_stats( + vllm_config, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None: """Report usage statistics if enabled.""" if not is_usage_stats_enabled(): From 89392353450df46adcb90526872d4eee830bf02e Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 06:50:33 +0000 Subject: [PATCH 29/32] remove comment --- vllm/usage/usage_lib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index df598d11e86..67b834533b7 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -181,7 +181,6 @@ def _report_usage_once(self, model_architecture: str, self.gpu_type = torch_xla.tpu.get_tpu_type() 
self.gpu_memory_per_device = ( torch_xla.core.xla_model.get_memory_info()["bytes_limit"]) - # To suppress errors like "TPU initialization failed: open(/dev/vfio/0): Device or resource busy" except Exception: pass self.provider = _detect_cloud_provider() From bac067a6797fd763f402c10612dad12344fca928 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 22:37:51 +0000 Subject: [PATCH 30/32] report usage of TPU and GPU during worker init time --- vllm/v1/engine/async_llm.py | 4 ---- vllm/v1/engine/llm_engine.py | 3 --- vllm/v1/worker/gpu_worker.py | 5 +++++ vllm/v1/worker/tpu_worker.py | 5 +++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 54f0232da2b..c33535b3d36 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,7 +36,6 @@ from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats -from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) @@ -113,9 +112,6 @@ def __init__( except RuntimeError: pass - # If usage stat is enabled, collect relevant info. - report_usage_stats(vllm_config, usage_context) - @classmethod def from_vllm_config( cls, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index af67408097a..b1329a6138b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -97,9 +97,6 @@ def __init__( # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore - # If usage stat is enabled, collect relevant info. - report_usage_stats(vllm_config, usage_context) - @classmethod def from_vllm_config( cls, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 424c73e3ab7..e6a535c951e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -25,6 +25,7 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase +from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) @@ -140,6 +141,10 @@ def init_device(self): # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( self.vllm_config, self.device) + + if self.rank == 0: + # If usage stat is enabled, collect relevant info. + report_usage_stats(self.vllm_config) # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool # to hijack tensor allocation. diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 89443d26967..de676541eff 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -133,8 +133,9 @@ def init_device(self): # Init ModelRunner here, so that we have access to self.device. self.model_runner = TPUModelRunner(self.vllm_config, self.device) - # If usage stat is enabled, collect relevant info. - report_usage_stats(self.vllm_config) + if rank == 0: + # If usage stat is enabled, collect relevant info. 
+ report_usage_stats(self.vllm_config) def determine_available_memory(self) -> int: kv_caches: dict[str, torch.Tensor] = {} From 3ad33a226ba38bb36329ce64af40b50b33fe0aa9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 22:45:27 +0000 Subject: [PATCH 31/32] remove useless import --- vllm/v1/engine/llm_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b1329a6138b..a07595a552a 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -28,7 +28,6 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) From 5b0ab6dbd8e5ac72450652023f09b6b2bc2f1f7e Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 24 Apr 2025 22:47:10 +0000 Subject: [PATCH 32/32] format --- vllm/v1/worker/gpu_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index e6a535c951e..68c4e94fcd7 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -23,9 +23,9 @@ from vllm.utils import GiB_bytes from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase -from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) @@ -141,7 +141,7 @@ def init_device(self): # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( self.vllm_config, self.device) - + if self.rank == 0: # If usage stat is enabled, collect relevant info. report_usage_stats(self.vllm_config)
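
Note on the final shape of the series: patches 26-30 settle on collecting the
TPU numbers into the existing gpu_* fields, guarding every torch_xla call, and
reporting once from rank 0 of the worker rather than from the engine. The
snippet below is a minimal standalone sketch of that collection step, not vLLM
code: UsageStats and collect_tpu_stats are illustrative names, and the
torch_xla calls simply mirror the ones used in the diffs above
(torch_xla.runtime.world_size(), torch_xla.tpu.get_tpu_type(),
torch_xla.core.xla_model.get_memory_info()["bytes_limit"]).

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class UsageStats:
        # Mirrors the gpu_* fields that usage_lib.py reuses for TPUs.
        gpu_count: Optional[int] = None
        gpu_type: Optional[str] = None
        gpu_memory_per_device: Optional[int] = None


    def collect_tpu_stats() -> UsageStats:
        """Best-effort TPU stat collection; never raises."""
        stats = UsageStats()
        try:
            import torch_xla

            # world_size() must be called; the bare attribute access was the
            # typo fixed in PATCH 23.
            stats.gpu_count = torch_xla.runtime.world_size()
            stats.gpu_type = torch_xla.tpu.get_tpu_type()
            stats.gpu_memory_per_device = (
                torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
        except Exception:
            # Swallow ImportError as well as runtime failures such as
            # "TPU initialization failed: open(/dev/vfio/0): Device or
            # resource busy", so usage reporting can never break engine
            # startup (PATCH 26/27).
            pass
        return stats

Moving the report_usage_stats(self.vllm_config) call into the rank == 0 branch
of init_device (PATCH 30) keeps the telemetry to one record per engine instead
of one per worker, and lets the GPU and TPU workers share the same
vllm.v1.utils.report_usage_stats helper.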