diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py
index ce9467efd23c..caa02530d2fd 100644
--- a/vllm/attention/backends/differential_flash_attn.py
+++ b/vllm/attention/backends/differential_flash_attn.py
@@ -805,14 +805,18 @@ def forward(
         """Forward pass with FlashAttention.

         Args:
-            query: shape = [num_tokens, num_heads, head_size]
-            key: shape = [num_tokens, num_kv_heads, head_size]
-            value: shape = [num_tokens, num_kv_heads, head_size]
-            output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            layer: Attention layer instance.
+            q: Query tensor with shape = [num_tokens, num_heads, head_size]
+            k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
+            v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
+            output: Output tensor with shape [num_tokens, num_heads, head_size]
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         NOTE: It in-place updates the output tensor.
         NOTE: FP8 quantization, flash-attn expect the size of
               {q,k,v}_descale to be (num_sequences, num_kv_heads).
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index ba7a9afe8678..d8cb208c4f2e 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -605,7 +605,8 @@ def forward(
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
             output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
@@ -850,7 +851,7 @@ def forward(


 def _get_query_key_seq_metadata(
-    attn_metadata,
+    attn_metadata: FlashAttentionMetadata,
     is_prompt: bool,
     attn_type: str,
 ) -> tuple:
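Note (commentary, not part of the patch): the updated docstrings above standardize the shape conventions of the attention backends. The following minimal sketch only allocates dummy tensors with the documented shapes; all concrete sizes are arbitrary example values.

```python
import torch

# Arbitrary example sizes, for illustration only.
num_tokens, num_heads, num_kv_heads, head_size = 8, 32, 8, 128
num_blocks, block_size = 16, 16

# Shapes as documented in the updated forward() docstrings.
q = torch.empty(num_tokens, num_heads, head_size)
k = torch.empty(num_tokens, num_kv_heads, head_size)
v = torch.empty(num_tokens, num_kv_heads, head_size)
output = torch.empty(num_tokens, num_heads, head_size)

# The key and value caches are stacked along dim 0; during the profiling
# run kv_cache is instead an empty tensor of shape [0].
kv_cache = torch.empty(2, num_blocks, block_size, num_kv_heads, head_size)
```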
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index e4c27a0ef36e..9262144e37b5 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -584,17 +584,18 @@ def forward(
         use prefill sequence attributes

         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
-                       decoder self-attention, or encoder/decoder cross-
-                       attention. Defaults to decoder self-attention,
-                       which is the vLLM default generally
+            output: Optional output tensor.
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 34e059067d84..7b6c426b0f85 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(

     Raises:
         AssertionError: If the number of encoder tokens in `attn_metadata`
-        is `None` when required for the calculations.
+            is `None` when required for the calculations.
     """
     num_prefill_query_tokens = 0
     num_decode_query_tokens = 0
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index c1213f7620a7..302d3d7ea903 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -471,17 +471,18 @@ def forward(
                                    max_encoder_seq_len)

         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
-                       decoder self-attention, or encoder/decoder cross-
-                       attention. Defaults to decoder self-attention,
-                       which is the vLLM default generally
+            output: Optional output tensor.
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
@@ -644,7 +645,6 @@ def _run_memory_efficient_xformers_forward(
         for API spec.

         Args:
-            output: shape = [num_prefill_tokens, num_heads, head_size]
             query: shape = [num_prefill_tokens, num_heads, head_size]
             key: shape = [num_prefill_tokens, num_kv_heads, head_size]
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 4ec5a775f465..cbfa4d7ff3c4 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -352,7 +352,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
         with num_lookahead_slots.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in.
+            seq_group (SequenceGroup): The sequence group to swap in.
             num_lookahead_slots (int): Number of lookahead slots used in
                 speculative decoding, default to 0.

@@ -405,8 +405,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:

         Args:
             seq_group (SequenceGroup): The sequence group to swap out.
-            num_lookahead_slots (int): Number of lookahead slots used in
-                speculative decoding, default to 0.

         Returns:
             bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         swapping out the given sequence_group with num_lookahead_slots.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap out.
+            seq_group (SequenceGroup): The sequence group to swap out.

         Returns:
             List[Tuple[int, int]]: The mapping of swapping block from
@@ -473,7 +471,7 @@ def _can_swap(self,
         on to the 'device'.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in/out.
+            seq_group (SequenceGroup): The sequence group to swap in/out.
             device (Device): device to swap the 'seq_group' on.
             status (SequenceStatus): The status of sequence which is needed
                 for action. RUNNING for swap out and SWAPPED for swap in
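Note (commentary, not part of the patch): the block-manager docstrings above describe swap results as a `List[Tuple[int, int]]` mapping of block numbers. The toy sketch below is not vLLM code; it only illustrates how a caller could apply such a mapping to a sequence's block table. The function name and data are made up.

```python
# Toy illustration: each pair in the mapping is assumed to be
# (source block number, destination block number).
def apply_swap_mapping(block_table: list[int],
                       mapping: list[tuple[int, int]]) -> list[int]:
    """Rewrite block numbers in a block table after a swap."""
    remap = dict(mapping)
    return [remap.get(block, block) for block in block_table]


print(apply_swap_mapping([3, 7, 9], [(3, 0), (9, 1)]))  # [0, 7, 1]
```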
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 84ad2299b065..4fb028627a8c 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

     def __init__(self,
-                 *args,
+                 *args: Any,
                  log_requests: bool = True,
                  start_engine_loop: bool = True,
-                 **kwargs) -> None:
+                 **kwargs: Any) -> None:
         if envs.VLLM_USE_V1:
             raise ValueError(
                 "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dbf8d3ba5014..cbd714c159eb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -644,10 +644,10 @@ def add_request(
         Details:
             - Set arrival_time to the current time if it is None.
             - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of [Sequence][vllm.Sequence] objects.
-            - Create a [SequenceGroup][vllm.SequenceGroup] object
-              from the list of [Sequence][vllm.Sequence].
-            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
+            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
+              from the list of [Sequence][vllm.sequence.Sequence].
+            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
              scheduler.

        Example:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 728ed8328d36..8816ff56d684 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -186,7 +186,7 @@ def __init__(
                                             CompilationConfig]] = None,
         logits_processors: Optional[list[Union[str,
                                                type[LogitsProcessor]]]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         """LLM constructor."""

@@ -697,8 +697,8 @@ def chat(
         Generate responses for a chat conversation.

         The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][] method to generate the
-        responses.
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
+        the responses.

         Multi-modal inputs can be passed in the same way you would pass them
         to the OpenAI API.
@@ -1334,8 +1334,8 @@ def sleep(self, level: int = 1):

     def wake_up(self, tags: Optional[list[str]] = None):
         """
-        Wake up the engine from sleep mode. See the [sleep][] method
-        for more details.
+        Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
+        method for more details.

         Args:
             tags: An optional list of tags to reallocate the engine memory
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
index 283e6095013d..0fd62f0b6a7f 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -461,7 +461,8 @@ def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
             i += 1
         return boundaries

-    def _extract_tool_args(self, tool_content: str, args_match) -> str:
+    def _extract_tool_args(self, tool_content: str,
+                           args_match: re.Match[str]) -> str:
         """
         Extract tool arguments from tool content.

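Note (commentary, not part of the patch): the `LLM.chat` and `LLM.sleep`/`LLM.wake_up` cross-references fixed above point at the public `LLM` API. A minimal usage sketch of `chat`, assuming the signatures shown in the diff; the model name and sampling settings are arbitrary example choices.

```python
from vllm import LLM, SamplingParams

# Arbitrary chat-capable model, chosen only for illustration.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

outputs = llm.chat(
    [{"role": "user", "content": "Write a haiku about GPUs."}],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```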
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py
index 8ffc700ca5cd..0b87acc85120 100644
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from typing import Optional
+
 import torch
 from einops import rearrange
@@ -453,7 +455,14 @@ def forward(ctx, q, k, v, s, kv_history):
 lightning_attention_ = _attention.apply


-def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
+def lightning_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    ed: torch.Tensor,
+    block_size: int = 256,
+    kv_history: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Apply lightning attention algorithm to compute attention efficiently.

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index dd54aebeb011..c0fcacd1e6ee 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
     Args:
         input_size: input dimension of the linear layer.
         output_size: output dimension of the linear layer.
-        bias: If true, add bias.
         skip_bias_add: If true, skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         quant_config: Quantization configure.
+        prefix: Prefix for parameter names.
         return_bias: If true, return bias together with outputs in forward
             pass.
     """
@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):

     Args:
         input_size: input dimension of the linear layer.
-        output_size: output dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
         bias: If true, add bias.
         skip_bias_add: If true, skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         quant_config: Quantization configure.
         prefix: The name of the layer in the state dict, including all
             parents (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
     """

     def __init__(
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 9784a8894472..acdb2f89ce73 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -409,7 +409,7 @@ class EmbeddingOutput:

     Args:
         embedding: The embedding vector, which is a list of floats.
-        Its length depends on the hidden dimension of the model.
+            Its length depends on the hidden dimension of the model.
     """

     embedding: list[float]
@@ -447,7 +447,7 @@ class ClassificationOutput:

     Args:
         probs: The probability vector, which is a list of floats.
-        Its length depends on the number of classes.
+            Its length depends on the number of classes.
     """

     probs: list[float]
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 3c4c77aea5ed..36b1b198bd5a 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -147,18 +147,7 @@ class SequenceDataDelta(

 class SequenceData(msgspec.Struct,
                    omit_defaults=True):  # type: ignore[call-arg]
-    """Data associated with a sequence.
-
-    Args:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output. Set to an empty list if
-            None.
-
-    Attributes:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output.
-        cumulative_logprob: The cumulative log probability of the output.
-    """
+    """Data associated with a sequence."""

     # NOTE: we cannot use Union[list, array] because msgspec cannot support
     # union of 2 list types.
     _prompt_token_ids: array
@@ -256,10 +245,12 @@ def _update_cached_all_token_embeds(self):

     @property
     def cumulative_logprob(self) -> float:
+        """The cumulative log probability of the output."""
         return self._cumulative_logprob

     @property
     def prompt_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the prompt."""
         return self._prompt_token_ids_tuple

     @prompt_token_ids.setter
@@ -277,6 +268,7 @@ def prompt_token_ids_array(self) -> array:

     @property
     def output_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the output."""
         return tuple(self._output_token_ids)

     @output_token_ids.setter
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
         omit_defaults=True):  # type: ignore[call-arg]
     """Metadata for a sequence group. Used to create `AttentionMetadata`.

-    Args:
+    Attributes:
         request_id: The ID of the request.
         is_prompt: Whether the request is at prompt stage.
         seq_data: The sequence data. (Seq id -> sequence data)
@@ -950,14 +942,14 @@ class SequenceGroupMetadata(
         do_sample: True if sampling is required. Sampling is not required when
             e.g., prefill is chunked, and the current iteration only computes
             query tokens for prefill, we don't need sampling.
-        token_chunk_size: The number of tokens to be processed (per sequence).
-            None if chunking is not required.
+        pooling_params: Pooling parameters.
         lora_request: LoRA request.
         computed_block_nums: The block numbers that are already computed,
             used in prefix caching.
         state: Internal state tied to this sequence group.
+        token_type_ids: Token type IDs.
         multi_modal_data: Multi modal data.
-        mm_processor_kwargs: Multimodal input processor / mapper overrides.
+        multi_modal_placeholders: Multi modal placeholders.
         encoder_seq_data: Optional sequence data for encoder prompt
             (SequenceGroup.encoder_seq). Should be None unless you are
             working with an encoder/decoder
@@ -1043,12 +1035,13 @@ class SequenceOutput(
         array_like=True):  # type: ignore[call-arg]
     """The model output associated with a sequence.

-    Args:
+    Attributes:
         parent_seq_id: The ID of the parent sequence (for forking in
             beam search).
         output_token: The output token ID.
         logprobs: The logprobs of the output token.
             (Token id -> logP(x_i+1 | x_0, ..., x_i))
+        output_embed: Optional output embedding tensor.
     """
     parent_seq_id: int
     output_token: int
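Note (commentary, not part of the patch): the `vllm/sequence.py` hunks above replace constructor-style `Args:` sections with `Attributes:` sections and move per-field descriptions onto property docstrings. The sketch below is not vLLM code; it only illustrates that documentation convention on a made-up class.

```python
class ExampleRecord:
    """A toy record illustrating the Attributes-style docstring convention.

    Attributes:
        request_id: The ID of the request.
        token_ids: The token IDs produced so far.
    """

    def __init__(self, request_id: str, token_ids: list[int]) -> None:
        self.request_id = request_id
        self.token_ids = token_ids

    @property
    def num_tokens(self) -> int:
        """The number of tokens produced so far."""
        return len(self.token_ids)
```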