From a478715c8974bc75b06ccac2b308eaa59e74ab80 Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:21:47 +0000 Subject: [PATCH 01/11] This is to resolve the conflict in git rebase Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 266 +++++++++++++++++++++++++- 1 file changed, 261 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a0bc2125935..af99e2dead7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -58,6 +58,14 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange + +from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) + +from vllm.multimodal.utils import group_mm_inputs_by_modality + +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm_ascend.attention.attention import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState @@ -422,7 +430,44 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: output_token_ids=[], lora_request=new_req_data.lora_request, ) - + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + audio_feature_lengths = [] + use_audio_in_video = False + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) + if mm_input.get("audio_feature_lengths") is not None: + audio_feature_lengths.extend( + mm_input["audio_feature_lengths"]) + if mm_input.get("use_audio_in_video") is True: + use_audio_in_video = True + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + req_ids_to_add.append(req_id) # Update the states of the running/resumed requests. 
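Note on the hunk above: for M-RoPE models the prompt positions are precomputed as three rows (temporal, height, width). Text tokens share one index across all three rows, image tokens index into their (t, h, w) grid, and the request also stores a position delta used later for decode steps. The toy helper below is only a simplified illustration of that layout for a single image placeholder; it is not the MRotaryEmbedding implementation and it ignores spatial merging, videos, second_per_grid_ts and audio.

import torch

def toy_mrope_prompt_positions(num_prompt_tokens: int, img_start: int,
                               grid_t: int, grid_h: int, grid_w: int):
    # Hypothetical helper, illustration only.
    rows: list[list[int]] = [[], [], []]
    # Text before the image: all three rows advance together.
    for p in range(img_start):
        for r in rows:
            r.append(p)
    # Image tokens: temporal / height / width indices, offset by the start position.
    for t in range(grid_t):
        for h in range(grid_h):
            for w in range(grid_w):
                rows[0].append(img_start + t)
                rows[1].append(img_start + h)
                rows[2].append(img_start + w)
    # Text after the image resumes from the largest index used so far + 1.
    next_pos = max(max(r) for r in rows) + 1
    tail = num_prompt_tokens - img_start - grid_t * grid_h * grid_w
    for i in range(tail):
        for r in rows:
            r.append(next_pos + i)
    positions = torch.tensor(rows)  # shape (3, num_prompt_tokens)
    mrope_position_delta = int(positions.max()) + 1 - num_prompt_tokens
    return positions, mrope_position_delta

# e.g. a 6-token prompt whose middle 4 tokens are a 1x2x2 image grid:
# toy_mrope_prompt_positions(6, 1, 1, 2, 2) -> positions.max() == 3, delta == -2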
@@ -533,6 +578,166 @@ def _make_attention_mask(self, seq_lens, query_lens, position, else: return None + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + for index, req_id in enumerate(self.input_batch.req_ids): + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + + def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs = list[MultiModalKwargs]() + req_ids_pos = list[tuple[str, int, PlaceholderRange]]() + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + + for mm_input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[mm_input_id]) + req_ids_pos.append( + (req_id, mm_input_id, req_state.mm_positions[mm_input_id])) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. + grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. 
+ curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + sanity_check_mm_encoder_outputs( + curr_group_outputs, + expected_num_items=len(grouped_mm_inputs), + ) + + for output in curr_group_outputs: + encoder_outputs.append(output) + + # Cache the encoder outputs. + for (req_id, input_id, pos_info), output in zip( + req_ids_pos, + encoder_outputs, + ): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + + self.encoder_cache[req_id][input_id] = scatter_mm_placeholders( + output, + is_embed=pos_info.is_embed, + ) + + def _gather_mm_embeddings( + self, + scheduler_output: "SchedulerOutput", + ) -> list[torch.Tensor]: + mm_embeds: list[torch.Tensor] = [] + for req_id in self.input_batch.req_ids: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info.offset + num_encoder_tokens = pos_info.length + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + + if (is_embed := pos_info.is_embed) is not None: + is_embed = is_embed[start_idx:end_idx] + + mm_embeds_item = gather_mm_placeholders( + encoder_output[start_idx:end_idx], + is_embed=is_embed, + ) + mm_embeds.append(mm_embeds_item) + return mm_embeds + def _process_reqs( self, scheduler_output: "SchedulerOutput", @@ -591,7 +796,18 @@ def _process_reqs( np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) + + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + self.positions[:total_num_scheduled_tokens].copy_( self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) positions = self.positions[:num_input_tokens] @@ -690,6 +906,43 @@ def _process_reqs( input_ids = self.input_ids[:padded_batch_size] positions = self.positions[:padded_batch_size] + # prepare the MRoPE for mllm if using multimodal + num_input_tokens = total_num_scheduled_tokens + # _prepare_inputs may reorder the batch, so we must gather multi + # modal outputs after that to ensure the correct order + if self.is_multimodal_model: + # Run the multimodal encoder if any. 
+ self._execute_mm_encoder(scheduler_output) + mm_embeds = self._gather_mm_embeddings(scheduler_output) + else: + mm_embeds = [] + + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_input_tokens] + if mm_embeds: + inputs_embeds = self.model.get_input_embeddings( + input_ids, mm_embeds) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None + else: + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None + if self.uses_mrope: + positions = self.mrope_positions[:, :num_input_tokens] + else: + positions = self.positions[:num_input_tokens] + # Run forward pass with set_forward_context(attn_metadata, self.vllm_config, @@ -703,7 +956,7 @@ def _process_reqs( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=None, + inputs_embeds=inputs_embeds, **model_kwargs, ) else: @@ -712,7 +965,7 @@ def _process_reqs( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=None, + inputs_embeds=inputs_embeds, **model_kwargs, ) @@ -1173,8 +1426,11 @@ def _dummy_run( return hidden_states def profile_run(self) -> None: - # Profile with multimodal encoder & encoder cache. - self._profile_multimodal() + # FIXME Profile with multimodal encoder & encoder cache. + # current _profile_multimodal() using PyTorch SDPA backend method not + # support for window/full attn to reduce Memcpy operations, so will cause + # Out Of Memory problem, so we currently don't use self._profile_multimodal() + # self._profile_multimodal() # For profile, have maximum num_reqs and that collectively have # maximum num_tokens. 
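The completion branch of _calc_mrope_positions earlier in this patch relies on the fact that, once the prompt has been laid out, every generated token gets a single shared index on all three rows, shifted by the stored mrope_position_delta. Below is a minimal sketch of what MRotaryEmbedding.get_next_input_positions_tensor is expected to return, assuming vLLM's usual behaviour (worth verifying against the installed vLLM version).

import torch

def next_mrope_positions(mrope_position_delta: int, context_len: int,
                         seq_len: int) -> torch.Tensor:
    # Decode-phase M-RoPE: one shared index per new token on all three rows,
    # shifted by the delta accumulated while laying out the multimodal prompt.
    row = torch.arange(context_len, seq_len) + mrope_position_delta
    return row.unsqueeze(0).expand(3, -1)  # shape (3, seq_len - context_len)

# With the toy prompt sketched after PATCH 01's first hunk (delta == -2), the
# first token decoded after the 6-token prompt sits at index 4 on every row.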
From 695c3b8d4d08311a4d7a5477beca8924c8efceeb Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:13:39 +0000 Subject: [PATCH 02/11] Use git rebase to fit upstream/main Signed-off-by: cty --- ...aclrtlaunch_rope_custom_false_bfloat16_t.h | 10 ++++ .../aclrtlaunch_rope_custom_false_half.h | 10 ++++ .../aclrtlaunch_rope_custom_true_bfloat16_t.h | 10 ++++ .../aclrtlaunch_rope_custom_true_half.h | 10 ++++ .../aclrtlaunch_triple_chevrons_func.h | 60 +++++++++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h new file mode 100644 index 00000000000..e1a136ed09c --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h new file mode 100644 index 00000000000..58738467ef6 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h new file mode 100644 index 00000000000..122ee862ab6 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H 
+#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h new file mode 100644 index 00000000000..02e01a10fcb --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h new file mode 100644 index 00000000000..7f1e2cbe562 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h @@ -0,0 +1,60 @@ + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_false_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_false_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int 
rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_false_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_false_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_true_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_true_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_true_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_true_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif From 931505515dbef50bd2cabb31d809c37e6b5e0efe Mon Sep 17 
00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:42:41 +0000 Subject: [PATCH 03/11] git rebase for fitting the main branch Signed-off-by: cty --- ...aclrtlaunch_rope_custom_false_bfloat16_t.h | 10 ---- .../aclrtlaunch_rope_custom_false_half.h | 10 ---- .../aclrtlaunch_rope_custom_true_bfloat16_t.h | 10 ---- .../aclrtlaunch_rope_custom_true_half.h | 10 ---- .../aclrtlaunch_triple_chevrons_func.h | 60 ------------------- 5 files changed, 100 deletions(-) delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h deleted file mode 100644 index e1a136ed09c..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h deleted file mode 100644 index 58738467ef6..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h deleted file mode 100644 index 122ee862ab6..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H -#include "acl/acl_base.h" - -#ifndef 
ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h deleted file mode 100644 index 02e01a10fcb..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h deleted file mode 100644 index 7f1e2cbe562..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h +++ /dev/null @@ -1,60 +0,0 @@ - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_false_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_false_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t 
queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_false_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_false_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_true_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_true_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_true_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_true_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif From d3d3eb618edd1b9f647cc9cd99d023d67ef61794 Mon Sep 17 00:00:00 2001 From: cty 
Date: Wed, 30 Apr 2025 10:45:00 +0000 Subject: [PATCH 04/11] This commit is to fit the ruff and yapf Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index af99e2dead7..95478735246 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -40,7 +40,7 @@ from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, @@ -60,8 +60,9 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange -from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, - scatter_mm_placeholders) +from vllm.v1.worker.utils import (gather_mm_placeholders, + sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -430,7 +431,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: output_token_ids=[], lora_request=new_req_data.lora_request, ) - + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: image_grid_thw = [] @@ -467,7 +468,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - + req_ids_to_add.append(req_id) # Update the states of the running/resumed requests. @@ -796,7 +797,7 @@ def _process_reqs( np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) - + # Calculate M-RoPE positions. # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -807,7 +808,7 @@ def _process_reqs( self.mrope_positions[:, :total_num_scheduled_tokens].copy_( self.mrope_positions_cpu[:, :total_num_scheduled_tokens], non_blocking=True) - + self.positions[:total_num_scheduled_tokens].copy_( self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) positions = self.positions[:num_input_tokens] @@ -1427,7 +1428,7 @@ def _dummy_run( def profile_run(self) -> None: # FIXME Profile with multimodal encoder & encoder cache. 
- # current _profile_multimodal() using PyTorch SDPA backend method not + # current _profile_multimodal() using PyTorch SDPA backend method not # support for window/full attn to reduce Memcpy operations, so will cause # Out Of Memory problem, so we currently don't use self._profile_multimodal() # self._profile_multimodal() From 1e23068568d01733537d0d4a4f37714815f2c57b Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 11:15:34 +0000 Subject: [PATCH 05/11] This commit is to resolve rebase conflict Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 95478735246..43d7c9ff63b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -39,8 +39,11 @@ from vllm.inputs import INPUT_REGISTRY from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, @@ -64,10 +67,6 @@ sanity_check_mm_encoder_outputs, scatter_mm_placeholders) -from vllm.multimodal.utils import group_mm_inputs_by_modality - -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding - from vllm_ascend.attention.attention import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata From 8006613accd0fd7272ac574fd3d6e6a0fd329891 Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 11:39:17 +0000 Subject: [PATCH 06/11] This commit is to enable the V1 Qwen2.5-vl test Signed-off-by: cty --- tests/singlecard/test_offline_inference.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 5d0e16e5fdf..5e60f05298d 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -58,8 +58,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None: @pytest.mark.parametrize("model", MULTIMODALITY_MODELS) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1", - reason="qwen2.5_vl is not supported on v1") def test_multimodal(model, prompt_template, vllm_runner): image = ImageAsset("cherry_blossom") \ .pil_image.convert("RGB") From 4a292a22e0cb40a7463fbadeefba02a90eb94120 Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 09:25:18 +0000 Subject: [PATCH 07/11] resolve redefine problem Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 43d7c9ff63b..8c9783cb9b3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -61,7 +61,6 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from 
vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, From 850b6692f21eef971c5e402f536600ec9605e77d Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 10:38:06 +0000 Subject: [PATCH 08/11] resolve isort problem Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 8c9783cb9b3..bc123f94f6b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -61,7 +61,6 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin - from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) From c9d291abf5574fa25dcdcd524f26f862988f932a Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 10:52:00 +0000 Subject: [PATCH 09/11] Trigger CI Signed-off-by: cty From 23c5078b420a478fa9255129c3182bc2cdec91a8 Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 11:00:55 +0000 Subject: [PATCH 10/11] Trigger CI Signed-off-by: cty From 46a0776ddc62b39c9b28edf44016ba75826675eb Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 3 Jun 2025 15:41:49 +0800 Subject: [PATCH 11/11] enable test Signed-off-by: Yikun Jiang --- .github/workflows/accuracy_test.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 3bf6b746b0e..10b7b320e5c 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -109,10 +109,6 @@ jobs: contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && '["Qwen/Qwen2.5-VL-7B-Instruct"]' ) }} - # Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved - exclude: - - model_name: Qwen/Qwen2.5-VL-7B-Instruct - vllm_use_version: 1 fail-fast: false name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
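With the V1 skip removed from tests/singlecard/test_offline_inference.py (PATCH 06) and the accuracy-test exclusion dropped above, Qwen2.5-VL can be exercised end to end on the V1 engine. A minimal offline sketch of that path follows; the chat template, engine arguments and image path are illustrative and may need adjusting for a given vllm / vllm-ascend build.

import os
os.environ["VLLM_USE_V1"] = "1"  # exercise the V1 model runner touched by this series

from PIL import Image
from vllm import LLM, SamplingParams

# Illustrative Qwen2.5-VL prompt with a single image placeholder.
prompt = ("<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
          "What is in this image?<|im_end|>\n<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2.5-VL-7B-Instruct", max_model_len=8192)
outputs = llm.generate(
    {
        "prompt": prompt,
        # Any RGB image works; "cherry_blossom.jpg" is a stand-in for the
        # ImageAsset("cherry_blossom") used by the test.
        "multi_modal_data": {"image": Image.open("cherry_blossom.jpg").convert("RGB")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)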