[Core] Dynamic image size support for VLMs #5276
Merged: youkaichao merged 242 commits into vllm-project:main from DarkLight1337:mm-image-tokenizer-2 on Jul 3, 2024.

Changes from 147 commits

Commits (242)
34bfa79
Introduce a higher level `INPUT_REGISTRY`
DarkLight1337 df2aa19
Move dummy data generation to input registry
DarkLight1337 c72d2b3
Update docs
DarkLight1337 d8c6488
Rename `process_input` to `map_input`
DarkLight1337 f18de48
Reorder arguments
DarkLight1337 653537d
Apply input processor
DarkLight1337 a2f5a3c
Remove `VisionLanguageConfig` from input mapper
DarkLight1337 378ad80
Fix bad use of `functools.partial`
DarkLight1337 7aa3778
Use default input processor
DarkLight1337 c774168
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 532f863
Fix wrong arguments
DarkLight1337 080d40c
Use pillow image instead of tensor to avoid bypassing the processor b…
DarkLight1337 662693a
Update interface of dummy data factory and input processor
DarkLight1337 9bc5fcc
Use `InputContext` to handle checked type cast of config types
DarkLight1337 911cac7
Add input processor for injecting image tokens; fix docs
DarkLight1337 a38b347
Add new documentation pages
DarkLight1337 29c3bb3
Fix LLaVA-NeXT input processor and cleanup code
DarkLight1337 9cfbcce
Fix LLaVA-NeXT input processor and cleanup code
DarkLight1337 7bb6cbf
Add sanity check
DarkLight1337 ccf49c4
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 3482d32
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 8ea8468
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 be3d64f
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 2ff5be6
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 8e2ff86
Update LLaVA-NeXT
DarkLight1337 553f684
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 b134dfc
Update name
DarkLight1337 1efa480
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 1a08444
Update LLaVA-NeXT
DarkLight1337 7e33706
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 cfc31fd
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 3fb622c
Remove `MULTIMODAL` convenience property as it was causing some (impo…
DarkLight1337 da85ab2
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 383bea1
Update docs
DarkLight1337 80a09f2
Remove double processing of image tokens
DarkLight1337 6a70e4f
Add docs
DarkLight1337 8322ecb
Add docs
DarkLight1337 52a0116
Add docs
DarkLight1337 c1733dd
Add docs
DarkLight1337 b7a8683
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 9fb5e72
Remove more instances of double processing; update docs
DarkLight1337 25f9949
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 03c7e65
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 3932b3f
Remove xfail
DarkLight1337 7fa877a
Fix missing image token in OpenAI API serving
DarkLight1337 092e550
Fix LLaVA-NeXT test
DarkLight1337 7a19862
Remove duplicate processing in async engine
DarkLight1337 fd7d954
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 49dac3e
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 b2c6832
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 0104218
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 18cc7e0
Set up dummy data factory for phi3v
DarkLight1337 2291617
Move dummy data factories to model files
DarkLight1337 adf5503
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 e5a94e4
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 9b0386d
Move input processors to model files
DarkLight1337 4e656e7
Set up input processor for phi3v
DarkLight1337 fecf1f0
Fix wrong feature size
DarkLight1337 086e0fe
Fix wrong feature size
DarkLight1337 8c26a18
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 81522fe
Fix wrong feature size
DarkLight1337 c036b86
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 f75e1ab
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 b24e8d9
Update validation
DarkLight1337 8569d35
Fix image feature calculation for phi3v
DarkLight1337 bfa5aa9
Remove redundant code
DarkLight1337 dc34121
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 07e695d
Apply isort
DarkLight1337 8a43a77
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 825401d
Apply yapf
DarkLight1337 4a0d4d1
Reduce `max_tokens` so that test still passes
DarkLight1337 8d22fe0
Fix vllm to hf output (+ rename)
DarkLight1337 2e1ee2f
Fix wrong arguments
DarkLight1337 7229b07
Move `DummyImageDataFactories` into CLIP model file
DarkLight1337 17800fd
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 50f994b
Move `input_processor_for_clip` into CLIP
DarkLight1337 838aa9b
Remove some magic numbers
DarkLight1337 e7a5564
Test multiscale inputs for LLaVA-NeXT
DarkLight1337 36e8001
Handle multiscale inputs (different number of patches per batch) in L…
DarkLight1337 39e6d42
Fix wrong feature size
DarkLight1337 0d7f18f
Apply formatter
DarkLight1337 8e5dc7c
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 d9a4150
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 6849236
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 6d02491
Revert max_tokens
DarkLight1337 76ddea4
Add more tests for input mapper
DarkLight1337 4b20e66
Sanity check: Also test multiscale inputs for LLaVA-1.5
DarkLight1337 784af1a
Do not auto-convert image dtype to model's dtype
DarkLight1337 8e5fb12
Update prompts
DarkLight1337 4b947ad
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 e7397ee
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 865be7a
Fix mapper tests w.r.t. dtype change
DarkLight1337 9e82a26
Clarify docs and add todo
DarkLight1337 46391de
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 a4733f9
Remove TODO since vision config will be removed soon
DarkLight1337 6b19e6c
Expand docs
DarkLight1337 be326f2
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 f451668
Add ref
DarkLight1337 5c0c8cf
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 3d7b795
Update docs
DarkLight1337 1abb8a7
Add docs
DarkLight1337 428d420
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 698830f
Fix name
DarkLight1337 ac9ea9a
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 334b1a9
Add `MultiModalInputs` to docs
DarkLight1337 36ab12d
Fix and add links
DarkLight1337 af01e97
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 c303421
Fix `is_multiscale` not provided anymore
DarkLight1337 0a0c0e3
Also test multiscale input for phi3v
DarkLight1337 60517a7
Revert max_tokens for phi3v as numerical error still persists
DarkLight1337 57df434
Improve error message
DarkLight1337 ffe0675
Log the full output for easier reference
DarkLight1337 4f7b210
[VLM] Remove support for pixel_values and image_features.
xwjiang2010 c7a2a66
Update xfail to be more efficient
DarkLight1337 598e0e3
Also xfail llava test
DarkLight1337 174ca90
address comments
xwjiang2010 5b3e9aa
remove image_input_type altogether.
xwjiang2010 b7acf3a
types
xwjiang2010 f22b219
format
xwjiang2010 f84d87a
Update comment
DarkLight1337 5dfb6fc
Update docs
DarkLight1337 bbeff03
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 bf3281c
modify llava_next
ywang96 56e2d3b
Update comment
DarkLight1337 d2f8c6d
Update docs
DarkLight1337 7c197d2
Use dynamic image feature size calculation
DarkLight1337 f5ffd3e
Fix phi3v not handling `image_sizes` correctly
DarkLight1337 66aad21
Apply formatter
DarkLight1337 d1c68c0
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 5f32d53
Add see also
DarkLight1337 15df4ef
Update examples prompt format
DarkLight1337 f2e4633
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 095e008
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 a6e3162
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 28922af
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 ce06541
Fix config
DarkLight1337 cdcc2d4
Fix config
DarkLight1337 4212abf
Update docs
DarkLight1337 07c08e3
Update docs
DarkLight1337 f3f5854
Fix `MultiModalInputs` not working in Python 3.8
DarkLight1337 bebf9e7
Fix `_ImageAssets` not working in Python 3.8
DarkLight1337 7e80ecc
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 487d742
Merge branch 'upstream' into mm-image-tokenizer
DarkLight1337 36f72b6
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 43350b8
update example
ywang96 57791de
update doc
ywang96 b2b1e11
Merge branch 'mm-image-tokenizer' into mm-image-tokenizer-2
DarkLight1337 fbc5f70
Update docs
DarkLight1337 4292ccb
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 5d23a96
Apply formatter
DarkLight1337 78064e0
Fix OpenAI server not working for phi3v
DarkLight1337 4cb809c
Preemptively handle upcoming models
DarkLight1337 754e238
Add more models
DarkLight1337 9edb53c
Update feature size for dummy data
DarkLight1337 91d6c1e
Merge branch 'main' of https://github.com/vllm-project/vllm into remo…
xwjiang2010 f84b793
format
xwjiang2010 a934663
ExternalMultiModalDataDict
xwjiang2010 2144d3a
mention schema
xwjiang2010 2795b16
Use a less strict check
DarkLight1337 86ffd60
Fix phi3v test
DarkLight1337 f339dd1
Update default length as the dummy image feature size is increased
DarkLight1337 59a7a4c
Raise full error if output is completely different
DarkLight1337 62952e1
Fix phi3v not using input processor
DarkLight1337 0ce3ecb
Move size factors outside
DarkLight1337 b43e8c3
Apply formatter
DarkLight1337 9023794
Fix some outputs not being checked
DarkLight1337 fc5549c
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 f6c8061
Also test no image
DarkLight1337 15cc847
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 235c8a9
Batch by size factors
DarkLight1337 b98d924
Factor out xfail code
DarkLight1337 2c2558b
Fix unused args
DarkLight1337 ec28eca
Check logprobs instead of xfailing
DarkLight1337 5a337f5
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 2eb3490
Fix different scales not being in the same batch
DarkLight1337 6301a52
Apply suggestions from code review
DarkLight1337 14f10fc
Add link
DarkLight1337 7c335c3
Use `self.multi_modal_projector` directly
DarkLight1337 33c860e
Allow users to send image token formatted prompt directly
DarkLight1337 e03bc57
Factor out the code for placeholder token IDs
DarkLight1337 b270ac3
Remove `-rx` flag
DarkLight1337 3161221
Fix distributed tests
DarkLight1337 85d108a
Fix string mismatch warning
DarkLight1337 d648e32
Relax phi3v test; add TODO for llava tests
DarkLight1337 fde5f26
Fix distributed tests
DarkLight1337 d432934
address comments
xwjiang2010 83cfada
Merge branch 'main' of https://github.com/vllm-project/vllm into remo…
xwjiang2010 ab347bc
format
xwjiang2010 404700f
rm ctx
xwjiang2010 6a4014e
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 95a1fc5
Fix distributed test
DarkLight1337 1e87823
Update docs about prompt formatting
DarkLight1337 55ab3e4
Remove unused parameter
DarkLight1337 21da5b8
Remove unused import
DarkLight1337 525fe8f
Fix distributed test
DarkLight1337 04ebb68
rm ImageData and MultiModalData
xwjiang2010 31b8b09
rm external
xwjiang2010 a4b5617
comments
xwjiang2010 045674d
fix dist gpu test.
xwjiang2010 c8fa150
address comments
xwjiang2010 58ab8e9
Further avoid cuda init
DarkLight1337 6975caa
Add warnings for repeated image tokens
DarkLight1337 b1f1813
docs
xwjiang2010 b8b636d
Update vllm/multimodal/base.py
xwjiang2010 2c1d291
format
xwjiang2010 b6401d3
Reword
DarkLight1337 0f6f64c
Merge branch 'remove_image_features_2' of https://github.com/xwjiang2…
DarkLight1337 89f1103
Remove useless test
DarkLight1337 47fbdba
Unify test API between HfRunner and VllmRunner
DarkLight1337 c1c5a4d
Fix import error
DarkLight1337 fde4b25
Fix attribute error
DarkLight1337 4278fed
fix import error
ywang96 d9a2908
update llava next example
ywang96 d61e8af
Merge branch 'remove_image_features_2' of https://github.com/xwjiang2…
DarkLight1337 abd56fc
Update comments
DarkLight1337 ce2516e
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 38042ab
Remove some unnecessary deferred imports
DarkLight1337 7a6d895
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 9a49d2c
Use more precise type annotation
DarkLight1337 ac6f4fa
Fix wrong feature size
DarkLight1337 3f95778
Fix wrong image
DarkLight1337 90e80c4
Remove unnecessary lazy import
DarkLight1337 ea622c7
Check for conflicting kwargs in `map_input`
DarkLight1337 18740c2
Avoid unnecessary processing
DarkLight1337 a0db2c7
Update doc
DarkLight1337 526a871
Avoid cuda init
DarkLight1337 a5174da
Remove unused logger
DarkLight1337 6cf34e4
Remove unnecessary deferred imports
DarkLight1337 feff395
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 aacb5d0
Fix typo
DarkLight1337 13f43bd
Address comments
DarkLight1337 00e9e39
Add comment
DarkLight1337 288bfb9
Merge branch 'main' into mm-image-tokenizer-2
ywang96 284fca8
Merge branch 'upstream' into mm-image-tokenizer-2
DarkLight1337 a231eaf
Update XPU runner's multimodal logic
DarkLight1337 ec74121
Fix unused import
DarkLight1337 d16d3c8
Fix feature size calculation
DarkLight1337 aaa0f1f
Add extra image to test
DarkLight1337 cc540c3
Support multimodal data for neuron and tpu
DarkLight1337 48489ef
Fix broadcasting
DarkLight1337 2adc41f
Fix OpenVINO model runner for multimodal data
DarkLight1337 0e6845f
Cleanup
DarkLight1337
New documentation file (123 lines added):
.. _adding_a_new_multimodal_model:

Adding a New Multimodal Model
=============================

This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.

.. note::
    The complexity of adding a new model depends heavily on the model's architecture.
    The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

.. tip::
    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
    We will be happy to help you out!

1. Set up the base vLLM model
-----------------------------

As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:

- You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.

  .. code-block:: diff

      + from vllm.model_executor.models.interfaces import SupportsVision

      - class YourModelForImage2Seq(nn.Module):
      + class YourModelForImage2Seq(nn.Module, SupportsVision):

  .. note::
      The model class does not have to be named :code:`*ForCausalLM`.
      Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.

- While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
  for each input tensor that corresponds to a multi-modal input, as shown in the following example:

  .. code-block:: diff

        def forward(
            self,
            input_ids: torch.Tensor,
            positions: torch.Tensor,
            kv_caches: List[torch.Tensor],
            attn_metadata: AttentionMetadata,
      +     pixel_values: torch.Tensor,
        ) -> SamplerOutput:
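
As a rough illustration, the reserved keyword parameter is typically consumed inside :meth:`forward` by encoding the image and merging the resulting embeddings into the text embeddings at the placeholder-token positions. The sketch below is a hedged example rather than code from this PR: the vision tower, projector, ``image_token_id`` attribute, and the language model's ``inputs_embeds`` pathway are assumptions modelled on the LLaVA-style models referenced later in this guide.

.. code-block:: python

    from typing import List, Optional

    import torch

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: "AttentionMetadata",
        pixel_values: Optional[torch.Tensor] = None,
    ) -> "SamplerOutput":
        # Embed the text tokens first.
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)

        if pixel_values is not None:
            # Encode the image and project it into the LLM's embedding space.
            image_features = self.vision_tower(pixel_values)
            image_embeds = self.multi_modal_projector(image_features)

            # Overwrite the embeddings at the image placeholder positions.
            mask = input_ids == self.image_token_id
            inputs_embeds[mask] = image_embeds.view(-1, image_embeds.shape[-1])

        # Run the language model on the merged embeddings.
        return self.language_model(input_ids=None,
                                   positions=positions,
                                   kv_caches=kv_caches,
                                   attn_metadata=attn_metadata,
                                   inputs_embeds=inputs_embeds)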

2. Register input mappers
-------------------------

For each modality type to support, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.

.. code-block:: diff

      from vllm.model_executor.models.interfaces import SupportsVision
    + from vllm.multimodal import MULTIMODAL_REGISTRY

    + @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
    + @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
      class YourModelForImage2Seq(nn.Module, SupportsVision):

A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
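
If the default mapper does not fit your model, you can pass a custom function to the decorator instead. The sketch below is a hedged example: it assumes the mapper receives an ``InputContext`` together with the raw image (here a PIL image) and returns a ``MultiModalInputs`` dict whose keys match the keyword arguments of :meth:`forward`. The fixed 336x336 resolution and the normalization are placeholder preprocessing, and exact import paths may differ between vLLM versions.

.. code-block:: python

    import numpy as np
    import torch
    from PIL import Image
    from torch import nn

    from vllm.inputs.registry import InputContext
    from vllm.model_executor.models.interfaces import SupportsVision
    from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs

    def map_image_pixels(ctx: InputContext, data: object) -> MultiModalInputs:
        # Assumption: the user passed a PIL image as the multi-modal data.
        assert isinstance(data, Image.Image)

        # Placeholder preprocessing: resize to a fixed resolution and scale
        # to [0, 1]. A real mapper would follow the model's HF processor,
        # which can be looked up through the model config held by `ctx`.
        image = data.convert("RGB").resize((336, 336))
        pixel_values = torch.from_numpy(np.asarray(image)).permute(2, 0, 1)
        pixel_values = pixel_values.float() / 255.0

        # Keys must match the keyword arguments of the model's forward().
        return MultiModalInputs({"pixel_values": pixel_values})

    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(map_image_pixels)
    class YourModelForImage2Seq(nn.Module, SupportsVision):
        ...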

.. seealso::
    :ref:`input_processing_pipeline`

3. (Optional) Register dummy data
---------------------------------

During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.

.. code-block:: diff

      from vllm.inputs import INPUT_REGISTRY
      from vllm.model_executor.models.interfaces import SupportsVision
      from vllm.multimodal import MULTIMODAL_REGISTRY

      @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
      @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
    + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
      class YourModelForImage2Seq(nn.Module, SupportsVision):
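
For concreteness, a dummy data factory might look roughly like the following hedged sketch. It assumes the factory receives an ``InputContext`` and the sequence length used for memory profiling, and returns a ``SequenceData`` together with the dummy multi-modal data (here a blank PIL image). The placeholder token ID, feature size, and image resolution are hypothetical and model-specific, and the exact return type may differ between vLLM versions.

.. code-block:: python

    from PIL import Image
    from torch import nn

    from vllm.inputs import INPUT_REGISTRY
    from vllm.inputs.registry import InputContext
    from vllm.model_executor.models.interfaces import SupportsVision
    from vllm.sequence import SequenceData

    # Hypothetical, model-specific constants.
    IMAGE_TOKEN_ID = 32000
    IMAGE_FEATURE_SIZE = 576  # e.g. (336 // 14) ** 2 patches for a ViT-L/14-336 tower
    IMAGE_SIZE = 336

    def dummy_data_for_your_model(ctx: InputContext, seq_len: int):
        # Reserve one position per image feature, then pad with token 0
        # up to the profiled sequence length.
        token_ids = [IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE
        token_ids += [0] * (seq_len - IMAGE_FEATURE_SIZE)
        seq_data = SequenceData(token_ids)

        # Dummy image matching the largest input the model is expected to see.
        dummy_image = Image.new("RGB", (IMAGE_SIZE, IMAGE_SIZE), color=0)
        return seq_data, dummy_image

    @INPUT_REGISTRY.register_dummy_data(dummy_data_for_your_model)
    class YourModelForImage2Seq(nn.Module, SupportsVision):
        ...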

Here are some examples:

- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__

.. seealso::
    :ref:`input_processing_pipeline`

4. (Optional) Register input processor
--------------------------------------

Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor.
You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.

.. code-block:: diff

      from vllm.inputs import INPUT_REGISTRY
      from vllm.model_executor.models.interfaces import SupportsVision
      from vllm.multimodal import MULTIMODAL_REGISTRY

      @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
      @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
      @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
    + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
      class YourModelForImage2Seq(nn.Module, SupportsVision):

A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
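
As a hedged illustration, an input processor that expands a single image placeholder token into one token per image feature position could look roughly as follows. It assumes ``LLMInputs`` is a dict-like structure carrying ``prompt_token_ids``, ``prompt``, and ``multi_modal_data``; the placeholder token ID and feature size are hypothetical, and a dynamic-resolution model such as LLaVA-NeXT would compute the feature size from the actual image size instead of using a constant.

.. code-block:: python

    from typing import List

    from vllm.inputs import INPUT_REGISTRY, LLMInputs
    from vllm.inputs.registry import InputContext

    # Hypothetical, model-specific constants (see the dummy-data sketch above).
    IMAGE_TOKEN_ID = 32000
    IMAGE_FEATURE_SIZE = 576

    def input_processor_for_your_model(ctx: InputContext,
                                       llm_inputs: LLMInputs) -> LLMInputs:
        multi_modal_data = llm_inputs.get("multi_modal_data")
        if multi_modal_data is None:
            # Text-only prompt: nothing to insert.
            return llm_inputs

        # Repeat the image placeholder so that one token is reserved for
        # every image feature position; vLLM then builds the attention mask
        # from the expanded token sequence.
        new_token_ids: List[int] = []
        for token_id in llm_inputs["prompt_token_ids"]:
            if token_id == IMAGE_TOKEN_ID:
                new_token_ids.extend([IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE)
            else:
                new_token_ids.append(token_id)

        return LLMInputs(prompt_token_ids=new_token_ids,
                         prompt=llm_inputs.get("prompt"),
                         multi_modal_data=multi_modal_data)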

Here are some examples:

- Insert static number of image tokens: `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
- Insert dynamic number of image tokens: `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__

.. seealso::
    :ref:`input_processing_pipeline`