30 changes: 11 additions & 19 deletions docs/source/models/supported_models.md
@@ -434,8 +434,8 @@ See [this page](#generative-models) for more information on how to use generative models.
* ✅︎
* ✅︎
- * `Qwen2ForCausalLM`
* QwQ, Qwen2
* `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
* Qwen2
* `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
* ✅︎
* ✅︎
- * `Qwen2MoeForCausalLM`
@@ -665,6 +665,13 @@ On the other hand, modalities separated by `/` are mutually exclusive.

- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.

### ColQwen2VL

- **Model Name**: ColQwen2VL
- **Description**: Implements the ColQwen2 model for efficient document retrieval with vision-language capabilities. It is compatible with the `ColQwen2` class in Hugging Face Transformers and targets multimodal retrieval over combined text and image inputs.
- **Supported Modalities**: Text + Image
- **Example Use Cases**: Document retrieval (text-to-image retrieval) using embedding outputs; see the usage sketch after this list.
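
A minimal offline usage sketch, mirroring the example script added in this PR (the checkpoint name `vidore/colqwen2-v1.0-merged`, the chat-template strings, the illustrative query, and the embedding-output handling are taken from that example and are assumptions about the final API):

```python
from vllm import LLM

# Load the merged ColQwen2 checkpoint with vLLM's embedding task.
llm = LLM(model="vidore/colqwen2-v1.0-merged",
          task="embed",
          trust_remote_code=True)

# Text-only retrieval query; image documents would be passed via multi_modal_data.
prompt = ("<|im_start|>user\nWhat does the revenue chart show?<|im_end|>\n"
          "<|im_start|>assistant\n")
(output,) = llm.encode({"prompt": prompt})

# The pooled embedding is exposed on the pooling output.
print(len(output.outputs.embedding))
```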

See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.

:::{important}
Expand Down Expand Up @@ -692,23 +699,8 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
:::

### Generative Models
### Other Models

See [this page](#generative-models) for more information on how to use generative models.

#### Text Generation (`--task generate`)

:::{list-table}
:widths: 25 25 15 20 5 5 5
:header-rows: 1

- * Architecture
* Models
* Inputs
* Example HF Models
* [LoRA](#lora-adapter)
* [PP](#distributed-serving)
* [V1](gh-issue:8779)
- * `AriaForConditionalGeneration`
* Aria
* T + I<sup>+</sup>
@@ -1011,7 +1003,7 @@ _________________

## Model Support Policy

At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here's how we manage third-party model support:

1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!

33 changes: 33 additions & 0 deletions examples/offline_inference/vision_language_embedding.py
@@ -71,6 +71,38 @@ def run_e5_v(query: Query):
)


def run_colqwen2vlm(query: Query):
    # Build a Qwen2-VL-style chat prompt for either a text or an image query.
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
        image = None
    elif query["modality"] == "image":
        text = "Describe the image."
        prompt = (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
            f"{text}<|im_end|>\n"
            "<|im_start|>assistant\n")
        image = query["image"]
    else:
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Run the merged ColQwen2 checkpoint with vLLM's embedding task.
    llm = LLM(
        model="vidore/colqwen2-v1.0-merged",
        # model="vidore/colqwen2-1.0-hf-internal",
        task="embed",
        trust_remote_code=True,
        # dtype=torch.bfloat16,
    )

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        image=image,
    )


def run_vlm2vec(query: Query):
if query["modality"] == "text":
text = query["text"]
@@ -150,6 +182,7 @@ def main(args: Namespace):
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
"colqwen2vlm": run_colqwen2vlm,
}

if __name__ == "__main__":
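For context, here is a hedged sketch of how the `ModelRequestData` returned by `run_colqwen2vlm` might be consumed. The example's `main()` driver is not shown in this diff, so the invocation below is an assumption modeled on the other `run_*` entries:

```python
# Hypothetical driver code; mirrors how the other run_* helpers are consumed.
req = run_colqwen2vlm({"modality": "text",
                       "text": "quarterly revenue table"})

inputs = {"prompt": req.prompt}
if req.image is not None:
    # Image queries attach the image as multimodal data.
    inputs["multi_modal_data"] = {"image": req.image}

(output,) = req.llm.encode(inputs)
print(f"Embedding length: {len(output.outputs.embedding)}")
```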
10 changes: 10 additions & 0 deletions tests/models/embedding/vision_language/test_colqwen2vl.py
@@ -0,0 +1,10 @@
# tests/models/embedding/vision_language/test_colqwen2vl.py

import torch
from vllm.model_executor.models.colqwen2_vl import ColQwen2VL

def test_colqwen2vl_embeddings():
    model = ColQwen2VL()
    dummy_input = torch.rand((1, 3, 224, 224))  # Example input
    embeddings = model(dummy_input)
    assert embeddings.shape == (1, 128), "Embedding size should be 128."
9 changes: 8 additions & 1 deletion tests/models/test_registry.py
@@ -20,7 +20,14 @@
from ..utils import fork_new_process_for_each_test
from .registry import HF_EXAMPLE_MODELS


from vllm.model_executor.models.colqwen2_vl import ColQwen2VL
from vllm.multimodal import MULTIMODAL_REGISTRY

def test_colqwen2vl_registration():
    assert 'ColQwen2VL' in MULTIMODAL_REGISTRY, "ColQwen2VL should be registered."
    model = MULTIMODAL_REGISTRY['ColQwen2VL']()
    assert isinstance(model, ColQwen2VL), "Failed to instantiate ColQwen2VL."

[Check failure — GitHub Actions / pre-commit, Ruff (E501): tests/models/test_registry.py:28:81: Line too long (82 > 80)]

@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
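For reference, a registration check can also be written against `ModelRegistry.get_supported_archs()`, which the surrounding test file already uses; this is a sketch, and the architecture string `ColQwen2VL` is taken from this PR:

```python
from vllm import ModelRegistry


def test_colqwen2vl_arch_is_registered():
    # The architecture name should appear in vLLM's central model registry.
    assert "ColQwen2VL" in ModelRegistry.get_supported_archs()
```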