Install flash-attn in Dockerfile.meipian and make multimodal placeholder joining model-aware (qwen2_vl) in chat_utils.py

whyiug · whyiug · commit 3495e80ac8d1 · 2024-10-15T14:27:36.000+08:00
diff --git a/Dockerfile.meipian b/Dockerfile.meipian
@@ -4,16 +4,9 @@ WORKDIR /workspace
 
 RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
     pip install redis && \
+    pip install flash-attn --no-build-isolation && \
     pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 
-
-# Attention: image &&& wheel && branch are the same version
-# sync main
-# git clone https://github.com/whyiug/vllm 
-# cd vllm
-# git checkout feature_redis_image_embeds
-# git merge origin main
-
 COPY . /workspace/vllm
 
 WORKDIR /workspace/vllm
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
@@ -358,7 +358,7 @@ def load_chat_template(
 # TODO: Let user specify how to insert multimodal tokens into prompt
 # (similar to chat template)
 def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
-                                     text_prompt: str) -> str:
+                                     text_prompt: str, model_type: str) -> str:
     """Combine multimodal prompts for a multimodal language model."""
 
     # Look through the text prompt to check for missing placeholders
@@ -378,7 +378,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 
     # NOTE: For now we always add missing placeholders at the front of
     # the prompt. This may change to be customizable in the future.
-    return "\n".join(missing_placeholders + [text_prompt])
+    if model_type == "qwen2_vl":
+        # TODO: multiple images are not yet well supported
+        multimodal_prompt = "".join(missing_placeholders + [text_prompt])
+    else:
+        multimodal_prompt = "\n".join(missing_placeholders + [text_prompt])
+
+    return multimodal_prompt
 
 
 # No need to validate using Pydantic again
@@ -442,7 +448,9 @@ def _parse_chat_message_content_parts(
         mm_placeholder_counts = mm_parser.mm_placeholder_counts()
         if mm_placeholder_counts:
             text_prompt = _get_full_multimodal_text_prompt(
-                mm_placeholder_counts, text_prompt)
+                mm_placeholder_counts,
+                text_prompt,
+                mm_tracker._model_config.hf_config.model_type)
         return [ConversationMessage(role=role, content=text_prompt)]