Commit d9fc8cd

[V1] Enable multi-input by default (#15799)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent f069f3e commit d9fc8cd

File tree

21 files changed (+214, -105 lines)

docs/source/models/supported_models.md

Lines changed: 3 additions & 1 deletion
@@ -759,7 +759,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
 See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
 
 :::{important}
-To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
+**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
 or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
 
 Offline inference:
@@ -777,6 +777,8 @@ Online serving:
 vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4
 ```
 
+**This is no longer required if you are using vLLM V1.**
+
 :::
 
 :::{note}
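
A minimal offline-inference sketch of the setting this doc change describes (the offline snippet itself falls between the two hunks and is not shown here; the model name is reused from the CLI example, so treat this as illustrative rather than the docs' exact code):

```python
from vllm import LLM

# Allow up to 4 images per text prompt. Per the doc change above, this is
# required on vLLM V0 to pass more than one image, and optional on V1.
llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",
    limit_mm_per_prompt={"image": 4},
)
```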

docs/source/serving/offline_inference.md

Lines changed: 24 additions & 0 deletions
@@ -110,6 +110,30 @@ If you run out of CPU RAM, try the following options:
 - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
 - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
 
+#### Disable unused modalities
+
+You can disable unused modalities (except for text) by setting its limit to zero.
+
+For example, if your application only accepts image input, there is no need to allocate any memory for videos.
+
+```python
+from vllm import LLM
+
+# Accept images but not videos
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          limit_mm_per_prompt={"video": 0})
+```
+
+You can even run a multi-modal model for text-only inference:
+
+```python
+from vllm import LLM
+
+# Don't accept images. Just text.
+llm = LLM(model="google/gemma-3-27b-it",
+          limit_mm_per_prompt={"image": 0})
+```
+
 ### Performance optimization and tuning
 
 You can potentially improve the performance of vLLM by finetuning various options.
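
The two settings compose in a single dict, so an image-only deployment can raise its image budget while still rejecting video. A minimal sketch in the same spirit as the examples above (the model name and limits are illustrative):

```python
from vllm import LLM

# Accept up to 2 images per prompt and reject video inputs entirely
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          limit_mm_per_prompt={"image": 2, "video": 0})
```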

examples/offline_inference/audio_language.py

Lines changed: 5 additions & 0 deletions
@@ -196,6 +196,11 @@ def main(args):
     req_data = model_example_map[model](question_per_audio_count[audio_count],
                                         audio_count)
 
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
 
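The merge added here relies on Python 3.9+ dict union, where keys from the right-hand operand win, so any limit an example already sets overrides the zero defaults. A small self-contained illustration (the `example_limits` value is hypothetical):

```python
# Defaults turn every modality off; the example's own setting wins on conflict.
default_limits = {"image": 0, "video": 0, "audio": 0}
example_limits = {"audio": 2}  # stand-in for what an example might configure

merged = default_limits | dict(example_limits or {})
print(merged)  # {'image': 0, 'video': 0, 'audio': 2}
```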

examples/offline_inference/encoder_decoder_multimodal.py

Lines changed: 5 additions & 0 deletions
@@ -133,6 +133,11 @@ def main(args):
 
     req_data = model_example_map[model]()
 
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
 

examples/offline_inference/vision_language.py

Lines changed: 45 additions & 34 deletions
@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -129,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 3 * 364
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [(
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 384
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -375,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -392,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -414,7 +414,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -442,7 +442,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
     stop_token_ids = [128009]
 
@@ -506,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -561,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -587,7 +587,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -611,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-def run_llama4(questions: list[str], modality: str):
+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -621,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
         max_model_len=8192,
         max_num_seqs=4,
         tensor_parallel_size=8,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -657,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [
@@ -683,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -710,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -726,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -762,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -793,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         enable_lora=True,
         max_lora_rank=320,
+        limit_mm_per_prompt={"image": 1},
     )
 
     return ModelRequestData(
@@ -813,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -834,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -859,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     if modality == "image":
@@ -894,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
             "max_pixels": 1280 * 28 * 28,
             "fps": 1,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     if modality == "image":
@@ -925,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -1082,7 +1085,15 @@ def main(args):
 
     req_data = model_example_map[model](questions, modality)
 
-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
+    engine_args = asdict(req_data.engine_args) | {
+        "seed": args.seed,
+        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+    }
     llm = LLM(**engine_args)
 
     # To maintain code compatibility in this script, we add LoRA here.
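
In `main()`, the kwargs for `LLM` are now built by converting the per-model `EngineArgs` dataclass to a dict and overlaying the script-level CLI flags, with the right-hand operand taking precedence. A self-contained sketch of that idiom using a stand-in dataclass (`DemoArgs` is hypothetical, not vLLM's `EngineArgs`):

```python
from dataclasses import asdict, dataclass


@dataclass
class DemoArgs:
    model: str
    seed: int = 0
    disable_mm_preprocessor_cache: bool = False


req_args = DemoArgs(model="llava-hf/llava-1.5-7b-hf")
overrides = {"seed": 42, "disable_mm_preprocessor_cache": True}

# Dataclass fields first, CLI overrides last (and winning on conflict)
engine_kwargs = asdict(req_args) | overrides
print(engine_kwargs["seed"])  # 42
```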
