From a478715c8974bc75b06ccac2b308eaa59e74ab80 Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:21:47 +0000 Subject: [PATCH 01/11] This is to resolve the conflict in git rebase Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 266 +++++++++++++++++++++++++- 1 file changed, 261 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a0bc2125935..af99e2dead7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -58,6 +58,14 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange + +from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) + +from vllm.multimodal.utils import group_mm_inputs_by_modality + +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm_ascend.attention.attention import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState @@ -422,7 +430,44 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: output_token_ids=[], lora_request=new_req_data.lora_request, ) - + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + audio_feature_lengths = [] + use_audio_in_video = False + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) + if mm_input.get("audio_feature_lengths") is not None: + audio_feature_lengths.extend( + mm_input["audio_feature_lengths"]) + if mm_input.get("use_audio_in_video") is True: + use_audio_in_video = True + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + req_ids_to_add.append(req_id) # Update the states of the running/resumed requests. 
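Note on the hunk above: for M-RoPE models the prompt positions are precomputed as three rows (temporal, height, width). Text tokens share one index across all three rows, image tokens index into their (t, h, w) grid, and the request also stores a position delta used later for decode steps. The toy helper below is only a simplified illustration of that layout for a single image placeholder; it is not the MRotaryEmbedding implementation and it ignores spatial merging, videos, second_per_grid_ts and audio.

import torch

def toy_mrope_prompt_positions(num_prompt_tokens: int, img_start: int,
                               grid_t: int, grid_h: int, grid_w: int):
    # Hypothetical helper, illustration only.
    rows: list[list[int]] = [[], [], []]
    # Text before the image: all three rows advance together.
    for p in range(img_start):
        for r in rows:
            r.append(p)
    # Image tokens: temporal / height / width indices, offset by the start position.
    for t in range(grid_t):
        for h in range(grid_h):
            for w in range(grid_w):
                rows[0].append(img_start + t)
                rows[1].append(img_start + h)
                rows[2].append(img_start + w)
    # Text after the image resumes from the largest index used so far + 1.
    next_pos = max(max(r) for r in rows) + 1
    tail = num_prompt_tokens - img_start - grid_t * grid_h * grid_w
    for i in range(tail):
        for r in rows:
            r.append(next_pos + i)
    positions = torch.tensor(rows)  # shape (3, num_prompt_tokens)
    mrope_position_delta = int(positions.max()) + 1 - num_prompt_tokens
    return positions, mrope_position_delta

# e.g. a 6-token prompt whose middle 4 tokens are a 1x2x2 image grid:
# toy_mrope_prompt_positions(6, 1, 1, 2, 2) -> positions.max() == 3, delta == -2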
@@ -533,6 +578,166 @@ def _make_attention_mask(self, seq_lens, query_lens, position, else: return None + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + for index, req_id in enumerate(self.input_batch.req_ids): + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + + def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs = list[MultiModalKwargs]() + req_ids_pos = list[tuple[str, int, PlaceholderRange]]() + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + + for mm_input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[mm_input_id]) + req_ids_pos.append( + (req_id, mm_input_id, req_state.mm_positions[mm_input_id])) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. + grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. 
+ curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + sanity_check_mm_encoder_outputs( + curr_group_outputs, + expected_num_items=len(grouped_mm_inputs), + ) + + for output in curr_group_outputs: + encoder_outputs.append(output) + + # Cache the encoder outputs. + for (req_id, input_id, pos_info), output in zip( + req_ids_pos, + encoder_outputs, + ): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + + self.encoder_cache[req_id][input_id] = scatter_mm_placeholders( + output, + is_embed=pos_info.is_embed, + ) + + def _gather_mm_embeddings( + self, + scheduler_output: "SchedulerOutput", + ) -> list[torch.Tensor]: + mm_embeds: list[torch.Tensor] = [] + for req_id in self.input_batch.req_ids: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info.offset + num_encoder_tokens = pos_info.length + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + + if (is_embed := pos_info.is_embed) is not None: + is_embed = is_embed[start_idx:end_idx] + + mm_embeds_item = gather_mm_placeholders( + encoder_output[start_idx:end_idx], + is_embed=is_embed, + ) + mm_embeds.append(mm_embeds_item) + return mm_embeds + def _process_reqs( self, scheduler_output: "SchedulerOutput", @@ -591,7 +796,18 @@ def _process_reqs( np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) + + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + self.positions[:total_num_scheduled_tokens].copy_( self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) positions = self.positions[:num_input_tokens] @@ -690,6 +906,43 @@ def _process_reqs( input_ids = self.input_ids[:padded_batch_size] positions = self.positions[:padded_batch_size] + # prepare the MRoPE for mllm if using multimodal + num_input_tokens = total_num_scheduled_tokens + # _prepare_inputs may reorder the batch, so we must gather multi + # modal outputs after that to ensure the correct order + if self.is_multimodal_model: + # Run the multimodal encoder if any. 
+ self._execute_mm_encoder(scheduler_output) + mm_embeds = self._gather_mm_embeddings(scheduler_output) + else: + mm_embeds = [] + + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_input_tokens] + if mm_embeds: + inputs_embeds = self.model.get_input_embeddings( + input_ids, mm_embeds) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None + else: + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None + if self.uses_mrope: + positions = self.mrope_positions[:, :num_input_tokens] + else: + positions = self.positions[:num_input_tokens] + # Run forward pass with set_forward_context(attn_metadata, self.vllm_config, @@ -703,7 +956,7 @@ def _process_reqs( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=None, + inputs_embeds=inputs_embeds, **model_kwargs, ) else: @@ -712,7 +965,7 @@ def _process_reqs( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=None, + inputs_embeds=inputs_embeds, **model_kwargs, ) @@ -1173,8 +1426,11 @@ def _dummy_run( return hidden_states def profile_run(self) -> None: - # Profile with multimodal encoder & encoder cache. - self._profile_multimodal() + # FIXME Profile with multimodal encoder & encoder cache. + # current _profile_multimodal() using PyTorch SDPA backend method not + # support for window/full attn to reduce Memcpy operations, so will cause + # Out Of Memory problem, so we currently don't use self._profile_multimodal() + # self._profile_multimodal() # For profile, have maximum num_reqs and that collectively have # maximum num_tokens. 
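The completion branch of _calc_mrope_positions earlier in this patch relies on the fact that, once the prompt has been laid out, every generated token gets a single shared index on all three rows, shifted by the stored mrope_position_delta. Below is a minimal sketch of what MRotaryEmbedding.get_next_input_positions_tensor is expected to return, assuming vLLM's usual behaviour (worth verifying against the installed vLLM version).

import torch

def next_mrope_positions(mrope_position_delta: int, context_len: int,
                         seq_len: int) -> torch.Tensor:
    # Decode-phase M-RoPE: one shared index per new token on all three rows,
    # shifted by the delta accumulated while laying out the multimodal prompt.
    row = torch.arange(context_len, seq_len) + mrope_position_delta
    return row.unsqueeze(0).expand(3, -1)  # shape (3, seq_len - context_len)

# With the toy prompt sketched after PATCH 01's first hunk (delta == -2), the
# first token decoded after the 6-token prompt sits at index 4 on every row.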
From 695c3b8d4d08311a4d7a5477beca8924c8efceeb Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:13:39 +0000 Subject: [PATCH 02/11] Use git rebase to fit upstream/main Signed-off-by: cty --- ...aclrtlaunch_rope_custom_false_bfloat16_t.h | 10 ++++ .../aclrtlaunch_rope_custom_false_half.h | 10 ++++ .../aclrtlaunch_rope_custom_true_bfloat16_t.h | 10 ++++ .../aclrtlaunch_rope_custom_true_half.h | 10 ++++ .../aclrtlaunch_triple_chevrons_func.h | 60 +++++++++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h create mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h new file mode 100644 index 00000000000..e1a136ed09c --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h new file mode 100644 index 00000000000..58738467ef6 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h new file mode 100644 index 00000000000..122ee862ab6 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H 
+#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h new file mode 100644 index 00000000000..02e01a10fcb --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h @@ -0,0 +1,10 @@ +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H +#include "acl/acl_base.h" + +#ifndef ACLRT_LAUNCH_KERNEL +#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func +#endif + +extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); +#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h new file mode 100644 index 00000000000..7f1e2cbe562 --- /dev/null +++ b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h @@ -0,0 +1,60 @@ + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_false_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_false_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int 
rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_false_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_false_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_true_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_true_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif + +#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ +#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ + + + +extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); + +inline uint32_t rope_custom_true_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) +{ + (void)hold; + return aclrtlaunch_rope_custom_true_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); +} + +#endif From 931505515dbef50bd2cabb31d809c37e6b5e0efe Mon Sep 17 
00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 08:42:41 +0000 Subject: [PATCH 03/11] git rebase for fitting the main branch Signed-off-by: cty --- ...aclrtlaunch_rope_custom_false_bfloat16_t.h | 10 ---- .../aclrtlaunch_rope_custom_false_half.h | 10 ---- .../aclrtlaunch_rope_custom_true_bfloat16_t.h | 10 ---- .../aclrtlaunch_rope_custom_true_half.h | 10 ---- .../aclrtlaunch_triple_chevrons_func.h | 60 ------------------- 5 files changed, 100 deletions(-) delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h delete mode 100644 vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h deleted file mode 100644 index e1a136ed09c..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_bfloat16_t.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h deleted file mode 100644 index 58738467ef6..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_false_half.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h deleted file mode 100644 index 122ee862ab6..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_bfloat16_t.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_H -#include "acl/acl_base.h" - -#ifndef 
ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h deleted file mode 100644 index 02e01a10fcb..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_rope_custom_true_half.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_H -#include "acl/acl_base.h" - -#ifndef ACLRT_LAUNCH_KERNEL -#define ACLRT_LAUNCH_KERNEL(kernel_func) aclrtlaunch_##kernel_func -#endif - -extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, aclrtStream stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); -#endif diff --git a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h b/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h deleted file mode 100644 index 7f1e2cbe562..00000000000 --- a/vllm_ascend/include/vllm_ascend_kernels/aclrtlaunch_triple_chevrons_func.h +++ /dev/null @@ -1,60 +0,0 @@ - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_BFLOAT16_T_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_false_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_false_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_false_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_FALSE_HALF_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_false_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t 
queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_false_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_false_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_BFLOAT16_T_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_true_bfloat16_t(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_true_bfloat16_t(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_true_bfloat16_t(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif - -#ifndef HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ -#define HEADER_ACLRTLAUNCH_ROPE_CUSTOM_TRUE_HALF_HKERNEL_H_ - - - -extern "C" uint32_t aclrtlaunch_rope_custom_true_half(uint32_t blockDim, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum); - -inline uint32_t rope_custom_true_half(uint32_t blockDim, void* hold, void* stream, void* positions, void* queryDst, void* keyDst, void* query, void* key, void* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) -{ - (void)hold; - return aclrtlaunch_rope_custom_true_half(blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads, headSize, numTokens, loopNum, coreNum); -} - -#endif From d3d3eb618edd1b9f647cc9cd99d023d67ef61794 Mon Sep 17 00:00:00 2001 From: cty 
Date: Wed, 30 Apr 2025 10:45:00 +0000 Subject: [PATCH 04/11] This commit is to fit the ruff and yapf Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index af99e2dead7..95478735246 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -40,7 +40,7 @@ from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, @@ -60,8 +60,9 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange -from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, - scatter_mm_placeholders) +from vllm.v1.worker.utils import (gather_mm_placeholders, + sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -430,7 +431,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: output_token_ids=[], lora_request=new_req_data.lora_request, ) - + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: image_grid_thw = [] @@ -467,7 +468,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - + req_ids_to_add.append(req_id) # Update the states of the running/resumed requests. @@ -796,7 +797,7 @@ def _process_reqs( np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) - + # Calculate M-RoPE positions. # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -807,7 +808,7 @@ def _process_reqs( self.mrope_positions[:, :total_num_scheduled_tokens].copy_( self.mrope_positions_cpu[:, :total_num_scheduled_tokens], non_blocking=True) - + self.positions[:total_num_scheduled_tokens].copy_( self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) positions = self.positions[:num_input_tokens] @@ -1427,7 +1428,7 @@ def _dummy_run( def profile_run(self) -> None: # FIXME Profile with multimodal encoder & encoder cache. 
- # current _profile_multimodal() using PyTorch SDPA backend method not + # current _profile_multimodal() using PyTorch SDPA backend method not # support for window/full attn to reduce Memcpy operations, so will cause # Out Of Memory problem, so we currently don't use self._profile_multimodal() # self._profile_multimodal() From 1e23068568d01733537d0d4a4f37714815f2c57b Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 11:15:34 +0000 Subject: [PATCH 05/11] This commit is to resolve rebase conflict Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 95478735246..43d7c9ff63b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -39,8 +39,11 @@ from vllm.inputs import INPUT_REGISTRY from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, @@ -64,10 +67,6 @@ sanity_check_mm_encoder_outputs, scatter_mm_placeholders) -from vllm.multimodal.utils import group_mm_inputs_by_modality - -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding - from vllm_ascend.attention.attention import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata From 8006613accd0fd7272ac574fd3d6e6a0fd329891 Mon Sep 17 00:00:00 2001 From: cty Date: Wed, 30 Apr 2025 11:39:17 +0000 Subject: [PATCH 06/11] This commit is to enable the V1 Qwen2.5-vl test Signed-off-by: cty --- tests/singlecard/test_offline_inference.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 5d0e16e5fdf..5e60f05298d 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -58,8 +58,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None: @pytest.mark.parametrize("model", MULTIMODALITY_MODELS) -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1", - reason="qwen2.5_vl is not supported on v1") def test_multimodal(model, prompt_template, vllm_runner): image = ImageAsset("cherry_blossom") \ .pil_image.convert("RGB") From 4a292a22e0cb40a7463fbadeefba02a90eb94120 Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 09:25:18 +0000 Subject: [PATCH 07/11] resolve redefine problem Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 43d7c9ff63b..8c9783cb9b3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -61,7 +61,6 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from 
vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, From 850b6692f21eef971c5e402f536600ec9605e77d Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 10:38:06 +0000 Subject: [PATCH 08/11] resolve isort problem Signed-off-by: cty --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 8c9783cb9b3..bc123f94f6b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -61,7 +61,6 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin - from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) From c9d291abf5574fa25dcdcd524f26f862988f932a Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 10:52:00 +0000 Subject: [PATCH 09/11] Trigger CI Signed-off-by: cty From 23c5078b420a478fa9255129c3182bc2cdec91a8 Mon Sep 17 00:00:00 2001 From: cty Date: Tue, 27 May 2025 11:00:55 +0000 Subject: [PATCH 10/11] Trigger CI Signed-off-by: cty From 46a0776ddc62b39c9b28edf44016ba75826675eb Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 3 Jun 2025 15:41:49 +0800 Subject: [PATCH 11/11] enable test Signed-off-by: Yikun Jiang --- .github/workflows/accuracy_test.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 3bf6b746b0e..10b7b320e5c 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -109,10 +109,6 @@ jobs: contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && '["Qwen/Qwen2.5-VL-7B-Instruct"]' ) }} - # Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved - exclude: - - model_name: Qwen/Qwen2.5-VL-7B-Instruct - vllm_use_version: 1 fail-fast: false name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
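With the V1 skip removed from tests/singlecard/test_offline_inference.py (PATCH 06) and the accuracy-test exclusion dropped above, Qwen2.5-VL can be exercised end to end on the V1 engine. A minimal offline sketch of that path follows; the chat template, engine arguments and image path are illustrative and may need adjusting for a given vllm / vllm-ascend build.

import os
os.environ["VLLM_USE_V1"] = "1"  # exercise the V1 model runner touched by this series

from PIL import Image
from vllm import LLM, SamplingParams

# Illustrative Qwen2.5-VL prompt with a single image placeholder.
prompt = ("<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
          "What is in this image?<|im_end|>\n<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2.5-VL-7B-Instruct", max_model_len=8192)
outputs = llm.generate(
    {
        "prompt": prompt,
        # Any RGB image works; "cherry_blossom.jpg" is a stand-in for the
        # ImageAsset("cherry_blossom") used by the test.
        "multi_modal_data": {"image": Image.open("cherry_blossom.jpg").convert("RGB")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)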