
[MODEL ADDITION] Ovis2 Model Addition #15826


Merged: 38 commits, merged on Apr 30, 2025

Commits (38):
332f191  Start ovis model addition (mlinmg, Mar 31, 2025)
751ad19  Added configurations and processors to the model files (mlinmg, Mar 31, 2025)
172a2eb  Changed ovis to ovis2 for better model versioning (mlinmg, Apr 1, 2025)
64985c1  corrected processor implementation (mlinmg, Apr 2, 2025)
90bf158  Merge branch 'main' into Ovis-model-addition (mlinmg, Apr 2, 2025)
8b00b1d  Merge branch 'main' into Ovis-model-addition (mlinmg, Apr 7, 2025)
c8de860  put porting files to correct places (Isotr0py, Apr 8, 2025)
f57cc5f  update replacement (Isotr0py, Apr 8, 2025)
32c5f72  make inference work on CPU with bs1 (Isotr0py, Apr 9, 2025)
87c5265  fix gpu single image inference (Isotr0py, Apr 9, 2025)
e0e6a74  Merge branch 'main' into Ovis-model-addition (Isotr0py, Apr 9, 2025)
5df33cf  init multi-images examples and re-enable mm cache (Isotr0py, Apr 9, 2025)
5c97f2c  fix multi-images inference (Isotr0py, Apr 9, 2025)
d1b76a1  Merge remote-tracking branch 'upstream/main' into Ovis-model-addition (Isotr0py, Apr 15, 2025)
9790bd7  revert changes by mistake (Isotr0py, Apr 15, 2025)
ac0069c  update examples (Isotr0py, Apr 15, 2025)
789db0b  support v1 (Isotr0py, Apr 15, 2025)
c51665e  clean up (Isotr0py, Apr 15, 2025)
dc9811d  clean up and update doc (Isotr0py, Apr 15, 2025)
7bc203b  further clean up (Isotr0py, Apr 15, 2025)
5bc739e  disable TP on ViT (Isotr0py, Apr 20, 2025)
f23b952  init tests (Isotr0py, Apr 20, 2025)
b5f4319  add single image tests (Isotr0py, Apr 20, 2025)
2613694  add multi image tests (Isotr0py, Apr 20, 2025)
3bf618e  Merge remote-tracking branch 'upstream/main' into Ovis-model-addition (Isotr0py, Apr 20, 2025)
5005161  update (Isotr0py, Apr 20, 2025)
6c913d2  code format (Isotr0py, Apr 20, 2025)
a1b0634  code format (Isotr0py, Apr 21, 2025)
27e8870  make mypy happy (Isotr0py, Apr 21, 2025)
569b174  use tokenizer repo (Isotr0py, Apr 21, 2025)
b44dc75  add processor test (Isotr0py, Apr 21, 2025)
19cf4e1  clean up aimv2 ViT (Isotr0py, Apr 23, 2025)
dacad3a  clean up ovis2 (Isotr0py, Apr 24, 2025)
aa8e815  Merge branch 'main' into Ovis-model-addition (Isotr0py, Apr 24, 2025)
dd4e856  make isort happy (Isotr0py, Apr 24, 2025)
411c2a2  remove sampler and unused config (Isotr0py, Apr 25, 2025)
5174be6  Merge branch 'main' into Ovis-model-addition (Isotr0py, Apr 29, 2025)
d2cbdd5  fix config import (Isotr0py, Apr 29, 2025)
7 changes: 7 additions & 0 deletions docs/source/models/supported_models.md
@@ -1014,6 +1014,13 @@ See [this page](#generative-models) for more information on how to use generative models.
  *
  * ✅︎
  * ✅︎
- * `Ovis2ForConditionalGeneration`<sup>^</sup>
  * Ovis2
  * T + I<sup>+</sup>
  * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc.
  *
  *
  * ✅︎
- * `PaliGemmaForConditionalGeneration`
  * PaliGemma, PaliGemma 2
  * T + I<sup>E</sup>
31 changes: 31 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -725,6 +725,36 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
)


# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "AIDC-AI/Ovis2-1B"
    tokenizer = "Isotr0py/Ovis2-tokenizer"

    engine_args = EngineArgs(
        model=model_name,
        tokenizer=tokenizer,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    placeholder = "<image>\n"
    prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
                f"<|im_start|>user\n{placeholder}"
                f"{question}<|im_end|>\n"
                "<|im_start|>assistant\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1041,6 +1071,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"ovis2": run_ovis2,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
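For context beyond the diff, a minimal standalone sketch of how the engine arguments above translate into a direct `LLM` call; the image path and question are hypothetical, everything else mirrors `run_ovis2`:

from PIL import Image

from vllm import LLM, SamplingParams

# Hypothetical local image; any RGB image works.
image = Image.open("example.jpg").convert("RGB")

llm = LLM(
    model="AIDC-AI/Ovis2-1B",
    tokenizer="Isotr0py/Ovis2-tokenizer",
    max_model_len=4096,
    max_num_seqs=2,
    trust_remote_code=True,
    dtype="half",
    hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
    limit_mm_per_prompt={"image": 1},
)

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n"
          "<|im_start|>assistant\n")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)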
31 changes: 31 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
)


# Ovis2
def load_ovis2(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2-1B"
    tokenizer = "Isotr0py/Ovis2-tokenizer"

    engine_args = EngineArgs(
        model=model_name,
        tokenizer=tokenizer,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
    )

    placeholder = '\n'.join(
        [f'Image {i+1}: <image>' for i in range(len(image_urls))]) + '\n'
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              f"<|im_start|>user\n{placeholder}"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"

@@ -685,6 +715,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
"mistral3": load_mistral3,
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
"ovis2": load_ovis2,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"pixtral_hf": load_pixtral_hf,
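A similar sketch for the multi-image path; the image URLs are placeholders, the engine arguments mirror `load_ovis2` above, and `fetch_image` is the same helper the example file imports:

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

# Placeholder URLs; substitute any two reachable images.
image_urls = [
    "https://example.com/first.jpg",
    "https://example.com/second.jpg",
]

llm = LLM(
    model="AIDC-AI/Ovis2-1B",
    tokenizer="Isotr0py/Ovis2-tokenizer",
    max_model_len=8192,
    max_num_seqs=2,
    trust_remote_code=True,
    dtype="half",
    limit_mm_per_prompt={"image": len(image_urls)},
    hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
)

placeholder = "".join(f"Image {i + 1}: <image>\n" for i in range(len(image_urls)))
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          f"<|im_start|>user\n{placeholder}"
          "What is the difference between these images?<|im_end|>\n"
          "<|im_start|>assistant\n")

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]},
    },
    SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)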
12 changes: 12 additions & 0 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -467,6 +467,18 @@
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
    ),
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis2_patch_hf_runner,
    ),
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
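As a rough illustration of how the test harness combines the two callables in this entry (the question text is hypothetical and the real wiring lives in the vlm_utils helpers):

# Simplified sketch, not the actual test harness.
img_idx_to_prompt = lambda idx: "<image>\n"
prompt_formatter = lambda img_prompt: (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    f"<|im_start|>user\n{img_prompt}<|im_end|>\n"
    "<|im_start|>assistant\n")

question = "What is in this image?"  # hypothetical question
full_prompt = prompt_formatter(img_idx_to_prompt(0) + question)
print(full_prompt)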
@@ -67,7 +67,7 @@ def run_test(
"disable_mm_preprocessor_cache": True,
}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
@@ -676,3 +676,33 @@ def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
    hf_model.model.generate = types.MethodType(_generate, hf_model.model)

    return hf_model


def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
    hf_model.model.visual_tokenizer.to(hf_model.dtype)
    hf_model.model.vte.to(hf_model.dtype)
    hf_model.model.llm.to(hf_model.dtype)

    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.llm.get_output_embeddings()

    def processor(*args, text="", images=None, **kwargs):
        text_tokenizer = hf_model.model.get_text_tokenizer()
        images = [images] if isinstance(images, Image) else images

        text = text.split("<|im_start|>user\n")[1].split("<|im_end|>\n")[0]

        prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
            text_or_conversations=text, images=images)
        attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

        inputs = {
            "inputs": input_ids.unsqueeze(0),
            "pixel_values": pixel_values.unsqueeze(0),
            "attention_mask": attention_mask.unsqueeze(0),
        }
        return BatchFeature(data=inputs, tensor_type="pt")

    hf_model.processor = processor
    return hf_model
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B",
"AIDC-AI/Ovis2-1B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-4-multimodal-instruct",
4 changes: 4 additions & 0 deletions tests/models/registry.py
@@ -347,6 +347,10 @@ def check_available_online(
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501
"Ovis2ForConditionalGeneration": _HfExamplesInfo("AIDC-AI/Ovis2-1B",
tokenizer="Isotr0py/Ovis2-tokenizer",
trust_remote_code=True,
hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}), # noqa: E501
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
5 changes: 3 additions & 2 deletions vllm/entrypoints/chat_utils.py
@@ -496,9 +496,10 @@ def _placeholder_str(self, modality: ModalityStr,
        if model_type.startswith("llava"):
            return self._cached_token_str(self._tokenizer,
                                          hf_config.image_token_index)

        if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
-                         "internvl_chat", "skywork_chat", "NVLM_D",
-                         "h2ovl_chat", "idefics3", "smolvlm"):
+                         "internvl_chat", "ovis2", "skywork_chat",
+                         "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
            return "<image>"
        if model_type in ("mllama", "llama4"):
            return "<|image|>"
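Once `<image>` is registered as the placeholder for the `ovis2` model type, an OpenAI-compatible chat request does not need to spell the token out; a rough sketch, assuming a local `vllm serve` instance for this model (the server setup and image URL are hypothetical, not part of this diff):

from openai import OpenAI

# Assumes something like `vllm serve AIDC-AI/Ovis2-1B --trust-remote-code ...`
# is already running locally; this setup is not part of the PR.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="AIDC-AI/Ovis2-1B",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/sample.jpg"}},  # placeholder URL
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)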