diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index afe0b53077a..912470fada8 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -553,9 +553,8 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
-        if not hasattr(config, "hidden_size"):
-            # Support for llama4
-            config = config.text_config
+        # Support for llama4
+        config = config.get_text_config()
         # Default: Mixtral.
         E = config.num_local_experts
         topk = config.num_experts_per_tok
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index cd2b8f00d52..446c4efbf6a 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -24,10 +24,7 @@ def test_can_initialize(model_arch):
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)

-        if hasattr(hf_config, "text_config"):
-            text_config: PretrainedConfig = hf_config.text_config
-        else:
-            text_config = hf_config
+        text_config = hf_config.get_text_config()

         text_config.update({
             "num_layers": 1,
diff --git a/vllm/config.py b/vllm/config.py
index 0eb15825a3b..8b48d93187e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2841,12 +2841,10 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    config_dtype = getattr(config.get_text_config(), "torch_dtype", None)

-    # Fallbacks for multi-modal models if the root config
+    # Fallback for multi-modal models if the root config
     # does not define torch_dtype
-    if config_dtype is None and hasattr(config, "text_config"):
-        config_dtype = getattr(config.text_config, "torch_dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
         config_dtype = getattr(config.vision_config, "torch_dtype", None)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ee991eaeb34..e062afd6820 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -760,19 +760,22 @@ def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
     No op for pure text models.
     """
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    elif hasattr(config, "thinker_config"):
+    # This block should be unnecessary after https://github.com/huggingface/transformers/pull/37517
+    if hasattr(config, "thinker_config"):
         # TODO(suyang.fy): Refactor code.
         # For Qwen2.5-Omni, change hf_text_config to
         # thinker_config.text_config.
         return config.thinker_config.text_config
-    else:
-        return config
+
+    text_config = config.get_text_config()
+
+    if text_config is not config:
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(text_config, "num_attention_heads")
+
+    return text_config


 def try_get_generation_config(
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 29fbfbf0d37..87b7f02ab6d 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -508,13 +508,8 @@ def load_model(self) -> None:
                 logger.warning("Regarding multimodal models, vLLM currently "
                                "only supports adding LoRA to language model.")

-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()

             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -524,7 +519,7 @@
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 1bcef841b06..2a495634367 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -724,14 +724,9 @@ def load_model(self) -> None:
                 "Bias support in LoRA is not enabled in HPU yet."
             assert not self.lora_config.fully_sharded_loras, \
                 "Fully sharded LoRAs is not enabled in HPU yet."
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()

             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -741,7 +736,8 @@
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 3b09c92ae15..66b12d5be1a 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1130,14 +1130,9 @@ def load_model(self) -> None:
                 logger.warning(
                     "Regarding multimodal models, vLLM currently "
                     "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the
-            # max_position_embeddings of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = (
-                    self.model.config.max_position_embeddings)
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
+
+            # Use get_text_config() in case of multimodal models
+            text_config = self.model_config.hf_config.get_text_config()

             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -1147,7 +1142,8 @@
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
+                max_position_embeddings=text_config.
+                max_position_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
             time_after_load = time.perf_counter()
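For context on the pattern the patch converges on: transformers' PretrainedConfig.get_text_config() returns the nested text config for multimodal models and the config object itself for text-only models, so call sites no longer need hasattr(config, "text_config") branches. The sketch below is illustrative only and not part of the patch; it assumes a transformers release that provides get_text_config(), and the choice of LlamaConfig/LlavaConfig is an arbitrary example.

# Illustrative sketch (not part of this PR): how get_text_config() behaves for
# a text-only config versus a multimodal config that wraps a text_config.
from transformers import LlamaConfig, LlavaConfig

text_only = LlamaConfig()   # plain language-model config
multimodal = LlavaConfig()  # multimodal config with a nested text_config

# Text-only configs return themselves; multimodal configs return the sub-config.
assert text_only.get_text_config() is text_only
assert multimodal.get_text_config() is multimodal.text_config

# Attributes such as max_position_embeddings can then be read uniformly,
# which is what the worker model runners above rely on.
for cfg in (text_only, multimodal):
    print(cfg.get_text_config().max_position_embeddings)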