Merged
14 changes: 9 additions & 5 deletions vllm/attention/backends/differential_flash_attn.py
@@ -805,14 +805,18 @@ def forward(
"""Forward pass with FlashAttention.

Args:
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
layer: Attention layer instance.
q: Query tensor with shape = [num_tokens, num_heads, head_size]
k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
output: Output tensor with shape [num_tokens, num_heads, head_size]
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
NOTE: It in-place updates the output tensor.
NOTE: FP8 quantization, flash-attn expect the size of
{q,k,v}_descale to be (num_sequences, num_kv_heads).
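To make the documented layouts concrete, here is a minimal shape sketch of the tensors described above; all sizes are arbitrary assumptions for illustration, not values from the PR.

```python
import torch

# Assumed example sizes -- purely illustrative.
num_tokens, num_heads, num_kv_heads, head_size = 8, 32, 8, 128
num_blocks, block_size, num_sequences = 16, 16, 2

q = torch.empty(num_tokens, num_heads, head_size)      # query
k = torch.empty(num_tokens, num_kv_heads, head_size)   # key
v = torch.empty(num_tokens, num_kv_heads, head_size)   # value
out = torch.empty(num_tokens, num_heads, head_size)    # updated in place

# KV cache layout from the docstring; an empty tensor (shape [0]) during the
# profiling run.
kv_cache = torch.empty(2, num_blocks, block_size, num_kv_heads, head_size)

# Per the FP8 note, each of the {q,k,v}_descale tensors would be sized
# (num_sequences, num_kv_heads).
q_descale = torch.empty(num_sequences, num_kv_heads)
```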
5 changes: 3 additions & 2 deletions vllm/attention/backends/flash_attn.py
@@ -605,7 +605,8 @@ def forward(
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
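As an aside on the "empty tensor with shape [0] for profiling run" note: a simple way for calling code to distinguish the profiling case is to check the element count. This is only a sketch of the idea under that assumption, not the logic in this file.

```python
import torch

def has_real_kv_cache(kv_cache: torch.Tensor) -> bool:
    # The docstring says kv_cache has shape [0] during the profiling run,
    # so a zero element count identifies that case.
    return kv_cache.numel() > 0

print(has_real_kv_cache(torch.empty(0)))                 # False (profiling run)
print(has_real_kv_cache(torch.empty(2, 4, 16, 8, 128)))  # True
```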
@@ -850,7 +851,7 @@ def forward(


def _get_query_key_seq_metadata(
attn_metadata,
attn_metadata: FlashAttentionMetadata,
is_prompt: bool,
attn_type: str,
) -> tuple:
11 changes: 6 additions & 5 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -584,17 +584,18 @@ def forward(
use prefill sequence attributes

Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
2 changes: 1 addition & 1 deletion vllm/attention/backends/utils.py
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(

Raises:
AssertionError: If the number of encoder tokens in `attn_metadata`
is `None` when required for the calculations.
is `None` when required for the calculations.
"""
num_prefill_query_tokens = 0
num_decode_query_tokens = 0
12 changes: 6 additions & 6 deletions vllm/attention/backends/xformers.py
@@ -471,17 +471,18 @@ def forward(
max_encoder_seq_len)

Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
@@ -644,7 +645,6 @@ def _run_memory_efficient_xformers_forward(
for API spec.

Args:
output: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
8 changes: 3 additions & 5 deletions vllm/core/block_manager.py
@@ -352,7 +352,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
with num_lookahead_slots.

Args:
sequence_group (SequenceGroup): The sequence group to swap in.
seq_group (SequenceGroup): The sequence group to swap in.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.

@@ -405,8 +405,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:

Args:
seq_group (SequenceGroup): The sequence group to swap out.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.

Returns:
bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
swapping out the given sequence_group with num_lookahead_slots.

Args:
sequence_group (SequenceGroup): The sequence group to swap out.
seq_group (SequenceGroup): The sequence group to swap out.

Returns:
List[Tuple[int, int]]: The mapping of swapping block from
@@ -473,7 +471,7 @@ def _can_swap(self,
on to the 'device'.

Args:
sequence_group (SequenceGroup): The sequence group to swap in/out.
seq_group (SequenceGroup): The sequence group to swap in/out.
device (Device): device to swap the 'seq_group' on.
status (SequenceStatus): The status of sequence which is needed
for action. RUNNING for swap out and SWAPPED for swap in
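As a rough illustration of the swap-in check described above (enough free GPU blocks for the sequence group plus the speculative-decoding lookahead slots), here is a toy back-of-envelope version; the real BlockSpaceManager accounting is more involved, and all numbers are made up.

```python
import math

def can_swap_in_toy(num_seq_blocks: int, num_lookahead_slots: int,
                    block_size: int, num_free_gpu_blocks: int) -> bool:
    # Blocks already used by the sequence group plus blocks needed to hold
    # the lookahead slots must fit in the free GPU blocks.
    needed = num_seq_blocks + math.ceil(num_lookahead_slots / block_size)
    return needed <= num_free_gpu_blocks

print(can_swap_in_toy(num_seq_blocks=10, num_lookahead_slots=4,
                      block_size=16, num_free_gpu_blocks=12))  # True
```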
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

def __init__(self,
*args,
*args: Any,
log_requests: bool = True,
start_engine_loop: bool = True,
**kwargs) -> None:
**kwargs: Any) -> None:
if envs.VLLM_USE_V1:
raise ValueError(
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
8 changes: 4 additions & 4 deletions vllm/engine/llm_engine.py
@@ -644,10 +644,10 @@ def add_request(
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of [Sequence][vllm.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
- Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
- Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
from the list of [Sequence][vllm.sequence.Sequence].
- Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
scheduler.

Example:
10 changes: 5 additions & 5 deletions vllm/entrypoints/llm.py
@@ -186,7 +186,7 @@ def __init__(
CompilationConfig]] = None,
logits_processors: Optional[list[Union[str,
type[LogitsProcessor]]]] = None,
**kwargs,
**kwargs: Any,
@Zerohertz (Contributor, Author) commented on Aug 26, 2025:
I initially used object for the **kwargs type hint, as I had seen it used in most of the other code. However, that caused issues with mypy, so I changed it to Any.

A project member replied:
It looks like you did the right thing. According to https://peps.python.org/pep-0484/#arbitrary-argument-lists-and-default-argument-values, you should annotate **kwargs with the type expected for its values. Since we use **kwargs to pass arbitrary keyword arguments, Any is an appropriate choice.
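For reference, a small self-contained example of the convention discussed here (generic code, not vLLM's): the annotation on **kwargs describes the type of each value, so Any means every keyword argument may hold any type. The same reasoning applies to the AsyncLLMEngine.__init__ change above.

```python
from typing import Any

def configure(**kwargs: Any) -> dict[str, Any]:
    # Each value in kwargs is typed as Any, per PEP 484.
    return dict(kwargs)

def configure_strict(**options: str) -> dict[str, str]:
    # Here every keyword value must be a str.
    return dict(options)

print(configure(model="opt-125m", gpu_memory_utilization=0.9))
print(configure_strict(task="generate"))
```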

) -> None:
"""LLM constructor."""

@@ -697,8 +697,8 @@ def chat(
Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the
tokenizer and calls the [generate][] method to generate the
responses.
tokenizer and calls the [generate][vllm.LLM.generate] method to generate
the responses.

Multi-modal inputs can be passed in the same way you would pass them
to the OpenAI API.
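A rough usage sketch of this method; the model name, message content, and sampling settings below are placeholders, so check the current signature before copying.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder chat model
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what vLLM does in one sentence."},
]
outputs = llm.chat(conversation, SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```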
@@ -1334,8 +1334,8 @@ def sleep(self, level: int = 1):

def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the [sleep][] method
for more details.
Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
method for more details.

Args:
tags: An optional list of tags to reallocate the engine memory
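A rough usage sketch for sleep/wake_up, assuming sleep mode is enabled at construction; the model name is a placeholder and the tags value follows the docstring above, so verify against the current API.

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)  # placeholder model

llm.sleep(level=1)   # release most engine memory while idle
llm.wake_up()        # reallocate everything before serving again
# llm.wake_up(tags=["weights"])  # or restore only a subset, per the tags argument
```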
@@ -461,7 +461,8 @@ def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
i += 1
return boundaries

def _extract_tool_args(self, tool_content: str, args_match) -> str:
def _extract_tool_args(self, tool_content: str,
args_match: re.Match[str]) -> str:
"""
Extract tool arguments from tool content.

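A generic illustration of the re.Match[str] annotation added here; the pattern and helper are toys, not the parser's actual logic.

```python
import re

def extract_args(args_match: re.Match[str]) -> str:
    # group(1) holds the captured argument text in this toy pattern.
    return args_match.group(1).strip()

match = re.search(r"args=(\w+)", "tool: lookup args=weather")
if match is not None:
    print(extract_args(match))  # weather
```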
11 changes: 10 additions & 1 deletion vllm/model_executor/layers/lightning_attn.py
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional

import torch
from einops import rearrange

@@ -453,7 +455,14 @@ def forward(ctx, q, k, v, s, kv_history):
lightning_attention_ = _attention.apply


def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
def lightning_attention(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
ed: torch.Tensor,
block_size: int = 256,
kv_history: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Apply lightning attention algorithm
to compute attention efficiently.
5 changes: 3 additions & 2 deletions vllm/model_executor/layers/linear.py
@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: Prefix for parameter names.
return_bias: If true, return bias together with outputs in forward pass.
"""

@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):

Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
output_sizes: list of output dimensions of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
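A toy stand-in illustrating the skip_bias_add / return_bias behavior documented above; the real vLLM layers add quantization and tensor parallelism on top of this, and the sketch only shows the return convention.

```python
import torch
import torch.nn as nn

class ToyLinear(nn.Module):
    def __init__(self, input_size: int, output_size: int,
                 skip_bias_add: bool = False) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.randn(output_size, input_size) * 0.02)
        self.bias = nn.Parameter(torch.zeros(output_size))
        self.skip_bias_add = skip_bias_add

    def forward(self, x: torch.Tensor):
        out = x @ self.weight.t()
        if self.skip_bias_add:
            # Return the bias separately so the caller can add or fuse it later.
            return out, self.bias
        return out + self.bias, None

out, bias = ToyLinear(16, 32, skip_bias_add=True)(torch.randn(4, 16))
print(out.shape, bias.shape)  # torch.Size([4, 32]) torch.Size([32])
```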
4 changes: 2 additions & 2 deletions vllm/outputs.py
@@ -409,7 +409,7 @@ class EmbeddingOutput:

Args:
embedding: The embedding vector, which is a list of floats.
Its length depends on the hidden dimension of the model.
Its length depends on the hidden dimension of the model.
"""
embedding: list[float]

@@ -447,7 +447,7 @@ class ClassificationOutput:

Args:
probs: The probability vector, which is a list of floats.
Its length depends on the number of classes.
Its length depends on the number of classes.
"""
probs: list[float]

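For readers unfamiliar with these dataclasses, here is a hypothetical mirror of the two outputs documented above; field names follow the diff, but the classes are illustrations, not imports from vllm.outputs.

```python
from dataclasses import dataclass

@dataclass
class EmbeddingOutputExample:
    embedding: list[float]   # length == model hidden size

@dataclass
class ClassificationOutputExample:
    probs: list[float]       # length == number of classes

print(len(EmbeddingOutputExample(embedding=[0.0] * 768).embedding))  # 768
```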
27 changes: 10 additions & 17 deletions vllm/sequence.py
@@ -147,18 +147,7 @@ class SequenceDataDelta(

class SequenceData(msgspec.Struct,
omit_defaults=True): # type: ignore[call-arg]
"""Data associated with a sequence.

Args:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output. Set to an empty list if
None.

Attributes:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output.
cumulative_logprob: The cumulative log probability of the output.
A project member commented on lines -158 to -160:
I moved these to their respective @property methods because this class seems to have no public attributes
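A minimal example of the pattern the comment describes, documenting read-only state on the property itself rather than in an Args/Attributes section (generic sketch, not vLLM code):

```python
class Counter:
    def __init__(self) -> None:
        self._count = 0

    @property
    def count(self) -> int:
        """The current count."""
        return self._count

    def increment(self) -> None:
        self._count += 1

print(Counter.count.__doc__)  # The current count.
```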

"""
"""Data associated with a sequence."""
# NOTE: we cannot use Union[list, array] because msgspec cannot support
# union of 2 list types.
_prompt_token_ids: array
@@ -256,10 +245,12 @@ def _update_cached_all_token_embeds(self):

@property
def cumulative_logprob(self) -> float:
"""The cumulative log probability of the output."""
return self._cumulative_logprob

@property
def prompt_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the prompt."""
return self._prompt_token_ids_tuple

@prompt_token_ids.setter
@@ -277,6 +268,7 @@ def prompt_token_ids_array(self) -> array:

@property
def output_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the output."""
return tuple(self._output_token_ids)

@output_token_ids.setter
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
omit_defaults=True): # type: ignore[call-arg]
"""Metadata for a sequence group. Used to create `AttentionMetadata`.

Args:
Attributes:
request_id: The ID of the request.
is_prompt: Whether the request is at prompt stage.
seq_data: The sequence data. (Seq id -> sequence data)
@@ -950,14 +942,14 @@
do_sample: True if sampling is required. Sampling is not required when
e.g., prefill is chunked, and the current iteration only computes
query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence).
None if chunking is not required.
pooling_params: Pooling parameters.
lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed,
used in prefix caching.
state: Internal state tied to this sequence group.
token_type_ids: Token type IDs.
multi_modal_data: Multi modal data.
mm_processor_kwargs: Multimodal input processor / mapper overrides.
multi_modal_placeholders: Multi modal placeholders.
encoder_seq_data: Optional sequence data for encoder prompt
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
@@ -1043,12 +1035,13 @@ class SequenceOutput(
array_like=True): # type: ignore[call-arg]
"""The model output associated with a sequence.

Args:
Attributes:
parent_seq_id: The ID of the parent sequence (for forking in beam
search).
output_token: The output token ID.
logprobs: The logprobs of the output token.
(Token id -> logP(x_i+1 | x_0, ..., x_i))
output_embed: Optional output embedding tensor.
"""
parent_seq_id: int
output_token: int