Skip to content

Commit 7e302b6

Browse files
princeprideminpeter
authored and committed
Add tarsier model support (vllm-project#18985)
Signed-off-by: 汪志鹏 <[email protected]> Signed-off-by: minpeter <[email protected]>
1 parent a9e4280 commit 7e302b6

File tree

7 files changed

+689
-0
lines changed

7 files changed

+689
-0
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ Specified using `--task generate`.
550550
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
551551
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
552552
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
553+
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`,`omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
553554

554555
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
555556
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:

examples/offline_inference/vision_language.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
333333
)
334334

335335

336+
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine configuration and chat prompts for Tarsier-7b.

    Tarsier is an image-only model in this example, so any other
    modality is rejected up front.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        # Cap each prompt to a single item of the requested modality.
        limit_mm_per_prompt={modality: 1},
    )

    # One single-image chat turn per question.
    prompts = [f"USER: <image>\n{q} ASSISTANT:" for q in questions]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
336355
# InternVL
337356
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
338357
model_name = "OpenGVLab/InternVL3-2B"
@@ -1091,6 +1110,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
10911110
"qwen2_5_omni": run_qwen2_5_omni,
10921111
"skywork_chat": run_skyworkr1v,
10931112
"smolvlm": run_smolvlm,
1113+
"tarsier": run_tarsier,
10941114
}
10951115

10961116

examples/offline_inference/vision_language_multi_image.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,26 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
691691
)
692692

693693

694+
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Tarsier-7b.

    Emits one ``<image>`` placeholder per URL and fetches the images
    eagerly so they can be passed along as raw image data.
    """
    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        # Allow as many images per prompt as URLs were supplied.
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "<image>" * len(image_urls)
    prompt = f"USER: {placeholders}\n{question}\n ASSISTANT:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
694714
model_example_map = {
695715
"aria": load_aria,
696716
"aya_vision": load_aya_vision,
@@ -712,6 +732,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
712732
"qwen2_vl": load_qwen2_vl,
713733
"qwen2_5_vl": load_qwen2_5_vl,
714734
"smolvlm": load_smolvlm,
735+
"tarsier": load_tarsier,
715736
}
716737

717738

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def _test_processing_correctness_one(
282282
"Skywork/Skywork-R1V-38B",
283283
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
284284
"openai/whisper-large-v3",
285+
"omni-research/Tarsier-7b",
285286
])
286287
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
287288
@pytest.mark.parametrize("num_batches", [32])

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,8 @@ def check_available_online(
406406
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501
407407
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
408408
trust_remote_code=True),
409+
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
410+
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
409411
# [Encoder-decoder]
410412
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
411413
# Therefore, we borrow the BartTokenizer from the original Bart model

vllm/model_executor/models/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@
211211
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
212212
"UltravoxModel": ("ultravox", "UltravoxModel"),
213213
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
214+
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
214215
# [Encoder-decoder]
215216
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
216217
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501

0 commit comments

Comments
 (0)