@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -129,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 3 * 364
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [(
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 384
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -375,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -392,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -414,7 +414,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -442,7 +442,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     stop_token_ids = [128009]

@@ -506,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -561,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -587,7 +587,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -611,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )


-def run_llama4(questions: list[str], modality: str):
+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -621,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
         max_model_len=8192,
         max_num_seqs=4,
         tensor_parallel_size=8,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -657,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -683,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -710,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

     return ModelRequestData(
         engine_args=engine_args,
@@ -726,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

     return ModelRequestData(
         engine_args=engine_args,
@@ -762,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -793,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         enable_lora=True,
         max_lora_rank=320,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -813,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -834,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -859,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     if modality == "image":
@@ -894,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
             "max_pixels": 1280 * 28 * 28,
             "fps": 1,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     if modality == "image":
@@ -925,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -1082,7 +1085,15 @@ def main(args):

     req_data = model_example_map[model](questions, modality)

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
+    engine_args = asdict(req_data.engine_args) | {
+        "seed": args.seed,
+        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+    }
     llm = LLM(**engine_args)

     # To maintain code compatibility in this script, we add LoRA here.
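For reference, a minimal standalone sketch of how the limit_mm_per_prompt defaulting added to main() behaves. This is not part of the diff above; the helper name is illustrative, and Python 3.9+ dict-union semantics are assumed (the right-hand operand wins on key conflicts):

# Hypothetical helper mirroring the merge performed in main() above.
def merge_mm_limits(per_model_limits):
    # Unused modalities default to 0 so memory is not reserved for them;
    # limits declared by a run_* function (e.g. {"image": 1}) take precedence.
    default_limits = {"image": 0, "video": 0, "audio": 0}
    return default_limits | dict(per_model_limits or {})

assert merge_mm_limits({"image": 1}) == {"image": 1, "video": 0, "audio": 0}
assert merge_mm_limits(None) == {"image": 0, "video": 0, "audio": 0}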