@@ -358,7 +358,7 @@ def load_chat_template(
358
358
# TODO: Let user specify how to insert multimodal tokens into prompt
359
359
# (similar to chat template)
360
360
def _get_full_multimodal_text_prompt (placeholder_counts : Dict [str , int ],
361
- text_prompt : str ) -> str :
361
+ text_prompt : str , model_type : str ) -> str :
362
362
"""Combine multimodal prompts for a multimodal language model."""
363
363
364
364
# Look through the text prompt to check for missing placeholders
@@ -378,7 +378,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
378
378
379
379
# NOTE: For now we always add missing placeholders at the front of
380
380
# the prompt. This may change to be customizable in the future.
381
- return "\n " .join (missing_placeholders + [text_prompt ])
381
+ if model_type == "qwen2_vl" :
382
+ # TODO: multiple images are not yet well supported
383
+ multimodal_prompt = "" .join (missing_placeholders + [text_prompt ])
384
+ else :
385
+ multimodal_prompt = "\n " .join (missing_placeholders + [text_prompt ])
386
+
387
+ return multimodal_prompt
382
388
383
389
384
390
# No need to validate using Pydantic again
@@ -442,7 +448,9 @@ def _parse_chat_message_content_parts(
442
448
mm_placeholder_counts = mm_parser .mm_placeholder_counts ()
443
449
if mm_placeholder_counts :
444
450
text_prompt = _get_full_multimodal_text_prompt (
445
- mm_placeholder_counts , text_prompt )
451
+ mm_placeholder_counts ,
452
+ text_prompt ,
453
+ mm_tracker ._model_config .hf_config .model_type )
446
454
return [ConversationMessage (role = role , content = text_prompt )]
447
455
448
456
0 commit comments