Merged
14 changes: 9 additions & 5 deletions vllm/attention/backends/differential_flash_attn.py
@@ -805,14 +805,18 @@ def forward(
"""Forward pass with FlashAttention.

Args:
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
layer: Attention layer instance.
q: Query tensor with shape = [num_tokens, num_heads, head_size]
k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
output: Output tensor with shape [num_tokens, num_heads, head_size]
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
NOTE: It in-place updates the output tensor.
NOTE: FP8 quantization, flash-attn expect the size of
{q,k,v}_descale to be (num_sequences, num_kv_heads).
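To make the documented layouts concrete, here is a minimal shape sketch of the tensors described above; all sizes are arbitrary assumptions for illustration, not values from the PR.

```python
import torch

# Assumed example sizes -- purely illustrative.
num_tokens, num_heads, num_kv_heads, head_size = 8, 32, 8, 128
num_blocks, block_size, num_sequences = 16, 16, 2

q = torch.empty(num_tokens, num_heads, head_size)      # query
k = torch.empty(num_tokens, num_kv_heads, head_size)   # key
v = torch.empty(num_tokens, num_kv_heads, head_size)   # value
out = torch.empty(num_tokens, num_heads, head_size)    # updated in place

# KV cache layout from the docstring; an empty tensor (shape [0]) during the
# profiling run.
kv_cache = torch.empty(2, num_blocks, block_size, num_kv_heads, head_size)

# Per the FP8 note, each of the {q,k,v}_descale tensors would be sized
# (num_sequences, num_kv_heads).
q_descale = torch.empty(num_sequences, num_kv_heads)
```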
5 changes: 3 additions & 2 deletions vllm/attention/backends/flash_attn.py
@@ -605,7 +605,8 @@ def forward(
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
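As an aside on the "empty tensor with shape [0] for profiling run" note: a simple way for calling code to distinguish the profiling case is to check the element count. This is only a sketch of the idea under that assumption, not the logic in this file.

```python
import torch

def has_real_kv_cache(kv_cache: torch.Tensor) -> bool:
    # The docstring says kv_cache has shape [0] during the profiling run,
    # so a zero element count identifies that case.
    return kv_cache.numel() > 0

print(has_real_kv_cache(torch.empty(0)))                 # False (profiling run)
print(has_real_kv_cache(torch.empty(2, 4, 16, 8, 128)))  # True
```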
@@ -850,7 +851,7 @@ def forward(


def _get_query_key_seq_metadata(
attn_metadata,
attn_metadata: FlashAttentionMetadata,
is_prompt: bool,
attn_type: str,
) -> tuple:
11 changes: 6 additions & 5 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -584,17 +584,18 @@ def forward(
use prefill sequence attributes

Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
2 changes: 1 addition & 1 deletion vllm/attention/backends/utils.py
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(

Raises:
AssertionError: If the number of encoder tokens in `attn_metadata`
is `None` when required for the calculations.
is `None` when required for the calculations.
"""
num_prefill_query_tokens = 0
num_decode_query_tokens = 0
12 changes: 6 additions & 6 deletions vllm/attention/backends/xformers.py
@@ -471,17 +471,18 @@ def forward(
max_encoder_seq_len)

Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
@@ -644,7 +645,6 @@ def _run_memory_efficient_xformers_forward(
for API spec.

Args:
output: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
8 changes: 3 additions & 5 deletions vllm/core/block_manager.py
@@ -352,7 +352,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
with num_lookahead_slots.

Args:
sequence_group (SequenceGroup): The sequence group to swap in.
seq_group (SequenceGroup): The sequence group to swap in.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.

@@ -405,8 +405,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:

Args:
seq_group (SequenceGroup): The sequence group to swap out.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.

Returns:
bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
swapping out the given sequence_group with num_lookahead_slots.

Args:
sequence_group (SequenceGroup): The sequence group to swap out.
seq_group (SequenceGroup): The sequence group to swap out.

Returns:
List[Tuple[int, int]]: The mapping of swapping block from
@@ -473,7 +471,7 @@ def _can_swap(self,
on to the 'device'.

Args:
sequence_group (SequenceGroup): The sequence group to swap in/out.
seq_group (SequenceGroup): The sequence group to swap in/out.
device (Device): device to swap the 'seq_group' on.
status (SequenceStatus): The status of sequence which is needed
for action. RUNNING for swap out and SWAPPED for swap in
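As a rough illustration of the swap-in check described above (enough free GPU blocks for the sequence group plus the speculative-decoding lookahead slots), here is a toy back-of-envelope version; the real BlockSpaceManager accounting is more involved, and all numbers are made up.

```python
import math

def can_swap_in_toy(num_seq_blocks: int, num_lookahead_slots: int,
                    block_size: int, num_free_gpu_blocks: int) -> bool:
    # Blocks already used by the sequence group plus blocks needed to hold
    # the lookahead slots must fit in the free GPU blocks.
    needed = num_seq_blocks + math.ceil(num_lookahead_slots / block_size)
    return needed <= num_free_gpu_blocks

print(can_swap_in_toy(num_seq_blocks=10, num_lookahead_slots=4,
                      block_size=16, num_free_gpu_blocks=12))  # True
```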
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

def __init__(self,
*args,
*args: Any,
log_requests: bool = True,
start_engine_loop: bool = True,
**kwargs) -> None:
**kwargs: Any) -> None:
if envs.VLLM_USE_V1:
raise ValueError(
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
8 changes: 4 additions & 4 deletions vllm/engine/llm_engine.py
@@ -644,10 +644,10 @@ def add_request(
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of [Sequence][vllm.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
- Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
- Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
from the list of [Sequence][vllm.sequence.Sequence].
- Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
scheduler.

Example:
10 changes: 5 additions & 5 deletions vllm/entrypoints/llm.py
@@ -186,7 +186,7 @@ def __init__(
CompilationConfig]] = None,
logits_processors: Optional[list[Union[str,
type[LogitsProcessor]]]] = None,
**kwargs,
**kwargs: Any,
@Zerohertz (Contributor, Author) commented on Aug 26, 2025:
I initially used object for the **kwargs type hint, as I had seen it used in most of the other code. However, that caused issues with mypy, so I changed it to Any.

A project member replied:
It looks like you did the right thing. According to https://peps.python.org/pep-0484/#arbitrary-argument-lists-and-default-argument-values, you should annotate **kwargs with the type expected for its values. Since we use **kwargs to pass arbitrary keyword arguments, Any is an appropriate choice.
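For reference, a small self-contained example of the convention discussed here (generic code, not vLLM's): the annotation on **kwargs describes the type of each value, so Any means every keyword argument may hold any type. The same reasoning applies to the AsyncLLMEngine.__init__ change above.

```python
from typing import Any

def configure(**kwargs: Any) -> dict[str, Any]:
    # Each value in kwargs is typed as Any, per PEP 484.
    return dict(kwargs)

def configure_strict(**options: str) -> dict[str, str]:
    # Here every keyword value must be a str.
    return dict(options)

print(configure(model="opt-125m", gpu_memory_utilization=0.9))
print(configure_strict(task="generate"))
```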

) -> None:
"""LLM constructor."""

@@ -697,8 +697,8 @@ def chat(
Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the
tokenizer and calls the [generate][] method to generate the
responses.
tokenizer and calls the [generate][vllm.LLM.generate] method to generate
the responses.

Multi-modal inputs can be passed in the same way you would pass them
to the OpenAI API.
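A rough usage sketch of this method; the model name, message content, and sampling settings below are placeholders, so check the current signature before copying.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder chat model
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what vLLM does in one sentence."},
]
outputs = llm.chat(conversation, SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```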
@@ -1334,8 +1334,8 @@ def sleep(self, level: int = 1):

def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the [sleep][] method
for more details.
Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
method for more details.

Args:
tags: An optional list of tags to reallocate the engine memory
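A rough usage sketch for sleep/wake_up, assuming sleep mode is enabled at construction; the model name is a placeholder and the tags value follows the docstring above, so verify against the current API.

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)  # placeholder model

llm.sleep(level=1)   # release most engine memory while idle
llm.wake_up()        # reallocate everything before serving again
# llm.wake_up(tags=["weights"])  # or restore only a subset, per the tags argument
```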
@@ -461,7 +461,8 @@ def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
i += 1
return boundaries

def _extract_tool_args(self, tool_content: str, args_match) -> str:
def _extract_tool_args(self, tool_content: str,
args_match: re.Match[str]) -> str:
"""
Extract tool arguments from tool content.

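A generic illustration of the re.Match[str] annotation added here; the pattern and helper are toys, not the parser's actual logic.

```python
import re

def extract_args(args_match: re.Match[str]) -> str:
    # group(1) holds the captured argument text in this toy pattern.
    return args_match.group(1).strip()

match = re.search(r"args=(\w+)", "tool: lookup args=weather")
if match is not None:
    print(extract_args(match))  # weather
```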
11 changes: 10 additions & 1 deletion vllm/model_executor/layers/lightning_attn.py
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import torch
from einops import rearrange

@@ -453,7 +455,14 @@ def forward(ctx, q, k, v, s, kv_history):
lightning_attention_ = _attention.apply


def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
def lightning_attention(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
ed: torch.Tensor,
block_size: int = 256,
kv_history: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Apply lightning attention algorithm
to compute attention efficiently.
5 changes: 3 additions & 2 deletions vllm/model_executor/layers/linear.py
@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: Prefix for parameter names.
return_bias: If true, return bias together with outputs in forward pass.
"""

@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):

Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
output_sizes: list of output dimensions of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
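A toy stand-in illustrating the skip_bias_add / return_bias behavior documented above; the real vLLM layers add quantization and tensor parallelism on top of this, and the sketch only shows the return convention.

```python
import torch
import torch.nn as nn

class ToyLinear(nn.Module):
    def __init__(self, input_size: int, output_size: int,
                 skip_bias_add: bool = False) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.randn(output_size, input_size) * 0.02)
        self.bias = nn.Parameter(torch.zeros(output_size))
        self.skip_bias_add = skip_bias_add

    def forward(self, x: torch.Tensor):
        out = x @ self.weight.t()
        if self.skip_bias_add:
            # Return the bias separately so the caller can add or fuse it later.
            return out, self.bias
        return out + self.bias, None

out, bias = ToyLinear(16, 32, skip_bias_add=True)(torch.randn(4, 16))
print(out.shape, bias.shape)  # torch.Size([4, 32]) torch.Size([32])
```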
4 changes: 2 additions & 2 deletions vllm/outputs.py
@@ -409,7 +409,7 @@ class EmbeddingOutput:

Args:
embedding: The embedding vector, which is a list of floats.
Its length depends on the hidden dimension of the model.
Its length depends on the hidden dimension of the model.
"""
embedding: list[float]

@@ -447,7 +447,7 @@ class ClassificationOutput:

Args:
probs: The probability vector, which is a list of floats.
Its length depends on the number of classes.
Its length depends on the number of classes.
"""
probs: list[float]

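For readers unfamiliar with these dataclasses, here is a hypothetical mirror of the two outputs documented above; field names follow the diff, but the classes are illustrations, not imports from vllm.outputs.

```python
from dataclasses import dataclass

@dataclass
class EmbeddingOutputExample:
    embedding: list[float]   # length == model hidden size

@dataclass
class ClassificationOutputExample:
    probs: list[float]       # length == number of classes

print(len(EmbeddingOutputExample(embedding=[0.0] * 768).embedding))  # 768
```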
27 changes: 10 additions & 17 deletions vllm/sequence.py
@@ -147,18 +147,7 @@ class SequenceDataDelta(

class SequenceData(msgspec.Struct,
omit_defaults=True): # type: ignore[call-arg]
"""Data associated with a sequence.

Args:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output. Set to an empty list if
None.

Attributes:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output.
cumulative_logprob: The cumulative log probability of the output.
A project member commented on lines -158 to -160:
I moved these to their respective @property methods because this class seems to have no public attributes
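A minimal example of the pattern the comment describes, documenting read-only state on the property itself rather than in an Args/Attributes section (generic sketch, not vLLM code):

```python
class Counter:
    def __init__(self) -> None:
        self._count = 0

    @property
    def count(self) -> int:
        """The current count."""
        return self._count

    def increment(self) -> None:
        self._count += 1

print(Counter.count.__doc__)  # The current count.
```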

"""
"""Data associated with a sequence."""
# NOTE: we cannot use Union[list, array] because msgspec cannot support
# union of 2 list types.
_prompt_token_ids: array
@@ -256,10 +245,12 @@ def _update_cached_all_token_embeds(self):

@property
def cumulative_logprob(self) -> float:
"""The cumulative log probability of the output."""
return self._cumulative_logprob

@property
def prompt_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the prompt."""
return self._prompt_token_ids_tuple

@prompt_token_ids.setter
@@ -277,6 +268,7 @@ def prompt_token_ids_array(self) -> array:

@property
def output_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the output."""
return tuple(self._output_token_ids)

@output_token_ids.setter
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
omit_defaults=True): # type: ignore[call-arg]
"""Metadata for a sequence group. Used to create `AttentionMetadata`.

Args:
Attributes:
request_id: The ID of the request.
is_prompt: Whether the request is at prompt stage.
seq_data: The sequence data. (Seq id -> sequence data)
@@ -950,14 +942,14 @@
do_sample: True if sampling is required. Sampling is not required when
e.g., prefill is chunked, and the current iteration only computes
query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence).
None if chunking is not required.
pooling_params: Pooling parameters.
lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed,
used in prefix caching.
state: Internal state tied to this sequence group.
token_type_ids: Token type IDs.
multi_modal_data: Multi modal data.
mm_processor_kwargs: Multimodal input processor / mapper overrides.
multi_modal_placeholders: Multi modal placeholders.
encoder_seq_data: Optional sequence data for encoder prompt
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
@@ -1043,12 +1035,13 @@ class SequenceOutput(
array_like=True): # type: ignore[call-arg]
"""The model output associated with a sequence.

Args:
Attributes:
parent_seq_id: The ID of the parent sequence (for forking in beam
search).
output_token: The output token ID.
logprobs: The logprobs of the output token.
(Token id -> logP(x_i+1 | x_0, ..., x_i))
output_embed: Optional output embedding tensor.
"""
parent_seq_id: int
output_token: int