Skip to content

Commit 3604d3e

Browse files
Qualcomm AI Engine Direct - [Multimodal] Multi-turn VLM conversation (#17308)
1 parent 1f0e737 commit 3604d3e

35 files changed

+1569
-858
lines changed

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6705,7 +6705,7 @@ class MLLMSpecs:
67056705
sm8650_token_rate: float
67066706
sm8750_token_rate: float
67076707
encoder_pte_size: float
6708-
text_embedding_pte_size: float
6708+
tok_embedding_pte_size: float
67096709
decoder_pte_size: float
67106710

67116711
@dataclass(frozen=True)
@@ -6721,7 +6721,7 @@ def setUp(self):
67216721
sm8650_token_rate=50,
67226722
sm8750_token_rate=55,
67236723
encoder_pte_size=110_000_000, # 110MB
6724-
text_embedding_pte_size=100_000_000, # 100MB
6724+
tok_embedding_pte_size=100_000_000, # 100MB
67256725
decoder_pte_size=400_000_000, # 400MB
67266726
image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
67276727
golden_image_feature="city",
@@ -6731,7 +6731,7 @@ def setUp(self):
67316731
sm8650_token_rate=11,
67326732
sm8750_token_rate=13,
67336733
encoder_pte_size=425_000_000, # 425MB
6734-
text_embedding_pte_size=300_000_000, # 300MB
6734+
tok_embedding_pte_size=300_000_000, # 300MB
67356735
decoder_pte_size=550_000_000, # 550 MB
67366736
image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
67376737
golden_image_feature="cats",
@@ -6803,16 +6803,16 @@ def test_static_vlm(self):
68036803
print(f"Answer: {model_out}")
68046804
if not self.enable_x86_64:
68056805
encoder_pte_size = msg["encoder_pte_size"]
6806-
text_embedding_pte_size = msg["text_embedding_pte_size"]
6806+
tok_embedding_pte_size = msg["tok_embedding_pte_size"]
68076807
decoder_pte_size = msg["pte_size"]
68086808
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
68096809
self.assertLessEqual(
6810-
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
6810+
tok_embedding_pte_size, vlm_specs.tok_embedding_pte_size
68116811
)
68126812
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
68136813
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
6814-
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
6815-
print(f"Decoder PTE Size: {decoder_pte_size} bytes")
6814+
print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes")
6815+
print(f"Text Decoder PTE Size: {decoder_pte_size} bytes")
68166816

68176817
attr_name = f"{self.model.lower()}_token_rate"
68186818
if (

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,15 @@ list(
9494
${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
9595
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
9696
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
97+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.cpp
98+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.h
99+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/utils.h
97100
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
98101
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
99-
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
100-
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
101-
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
102-
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
102+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.cpp
103+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.h
104+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.cpp
105+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.h
103106
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
104107
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
105108
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,37 @@ If you have already compiled a VLM model, you can run inference with pre-generat
308308
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
309309
```
310310

311+
### Multi-Turn Conversation with VLM
312+
313+
The framework supports multi-turn conversations with VLMs, allowing you to conduct dialogues that can involve multiple images.
314+
315+
- **Multi-Turn Prompts**: To engage in a conversation, provide multiple prompts sequentially using the `--prompt` argument. Each string will be treated as a separate turn.
316+
- **Multiple Images**: You can supply multiple images (from URLs or local paths) using the `--image_path` argument.
317+
- **Flexible Image Placement**: Use the `<image>` token within your prompt to specify exactly where each image's embeddings should be placed. The images provided via `--image_path` will replace the `<image>` tokens in the order they appear.
318+
319+
**Example**:
320+
321+
In this example, the first turn compares two images, the second turn asks a follow-up question about the first image, and the third turn asks for a caption for a third image.
322+
323+
```bash
324+
# Define image URLs and prompts for a 3-turn conversation
325+
IMAGE1_URL="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
326+
IMAGE2_URL="http://images.cocodataset.org/val2017/000000039769.jpg"
327+
IMAGE3_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
328+
329+
PROMPT1="<image><image>Compare these images above and list the differences."
330+
PROMPT2="Answer the question: What's the main object in first image?"
331+
PROMPT3="<image>Caption this image."
332+
333+
# Execute the multi-turn conversation
334+
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
335+
```
336+
337+
**How it works:**
338+
- **Turn 1**: The prompt `"<image><image>Compare these images above and list the differences."` uses the first two images (`$IMAGE1_URL`, `$IMAGE2_URL`).
339+
- **Turn 2**: The prompt `"Answer the question: What's the main object in first image?"` is a text-only follow-up. The conversation context is maintained from the previous turn.
340+
- **Turn 3**: The prompt `"<image>Caption this image."` uses the third image (`$IMAGE3_URL`).
341+
311342
### VLM Processing Details
312343

313344
The VLM inference pipeline consists of:

examples/qualcomm/oss_scripts/llama/dataset.py

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,22 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import argparse
8-
import warnings
9-
from typing import Callable, List, Optional
8+
from typing import Callable, Dict, List, Optional
109

1110
from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig
1211
from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
1312
AUDIO_ENCODER,
1413
TEXT_DECODER,
15-
TEXT_EMBEDDING,
1614
TEXT_ENCODER,
15+
TOK_EMBEDDING,
1716
VISION_ENCODER,
18-
VISION_ENCODER_INPUT_FILENAME,
1917
)
2018

2119
from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import (
2220
MultiModalityConfig,
2321
VisionModalityConfig,
2422
)
2523
from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper
26-
2724
from transformers import AutoProcessor
2825
from transformers.image_utils import load_image
2926

@@ -43,35 +40,30 @@ def __init__(
4340
self.artifact = control_args.artifact
4441
self.repo_id = config.repo_id
4542

46-
def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
43+
def _build_vision_dataset(
44+
self, config: VisionModalityConfig, prompt: str, files_path: List[str]
45+
):
4746
"""
4847
This processes images using the HuggingFace processor and saves
4948
the processed pixel values for runtime evaluation.
5049
5150
Args:
5251
config (VisionModalityConfig): containing image URL and resize parameters
53-
prompt (str): Text prompt to be processed alongside the image
52+
prompt (str): Text prompt
53+
files_path (List[str]): List of file paths for images. Each path can be either a URL or a local file path.
5454
5555
Returns:
5656
tuple of pixel values tensors
5757
"""
58-
# Load image from user-specified path (URL or local file)
59-
# fall back to the default image URL if no image is provided.
60-
image_path = self.control_args.image_path or config.img_url
61-
if not self.control_args.image_path:
62-
warnings.warn(
63-
f"No image path/URL provided, using default image URL: {config.img_url}",
64-
UserWarning,
65-
stacklevel=1,
66-
)
67-
image = load_image(image_path)
58+
59+
images = [load_image(image_path) for image_path in files_path]
6860

6961
# Process image with text prompt using HuggingFace processor
7062
# Some HF processors (e.g. InternVL3) need to pass text arg or it will cause error and process failed
7163
processor = AutoProcessor.from_pretrained(self.repo_id)
7264
pixel_values = processor(
7365
text=prompt,
74-
images=[image],
66+
images=images,
7567
return_tensors="pt",
7668
crop_to_patches=False,
7769
size={
@@ -80,19 +72,26 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
8072
},
8173
).pixel_values
8274

83-
# save image file for runtime evaluation
84-
pixel_values.detach().numpy().tofile(
85-
f"{self.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw"
75+
assert pixel_values.dim() in (4, 5), (
76+
f"Unsupported pixel_values dim={pixel_values.dim()}); "
77+
f"expected 5D (1,N,C,H,W) or 4D (N,C,H,W)."
8678
)
87-
return (pixel_values,)
79+
80+
# HTP Prepare failed when pixel_values has 5D dimension, so we squeeze the batch dimension here.
81+
if pixel_values.dim() == 5:
82+
pixel_values = pixel_values.squeeze(0) # (N, C, H, W)
83+
84+
# save image file for runtime evaluation
85+
return [(pixel_values[i][None, ...],) for i in range(len(pixel_values))]
8886

8987
def _build_dataset_for_encoder(
9088
self,
9189
config: MultiModalityConfig,
9290
prompt: str,
91+
files_path: List[str],
9392
) -> Optional[tuple]:
9493
if issubclass(config, VisionModalityConfig):
95-
return self._build_vision_dataset(config, prompt)
94+
return self._build_vision_dataset(config, prompt, files_path)
9695
else:
9796
# Audio and text encoder dataset building are not yet implemented
9897
# TODO: Add support for AudioModalityConfig and TextModalityConfig
@@ -106,22 +105,33 @@ def prepare_calibration_dataset(
106105
prompts: List[str],
107106
chat_template: Callable,
108107
):
109-
calibration_data = {
110-
AUDIO_ENCODER: [],
111-
TEXT_ENCODER: [],
112-
VISION_ENCODER: [],
113-
TEXT_EMBEDDING: [],
114-
TEXT_DECODER: [],
108+
# 1. Initialize data
109+
# Shape convention: (num_samples, num_turns).
110+
# Currently, user prompt calibration is one-shot per prompt (num_samples = 1).
111+
calibration_data: Dict[str, List[List]] = {
112+
# Encoders / embeddings: initialize an empty turn list for each prompt.
113+
AUDIO_ENCODER: [[] for _ in range(len(prompts))],
114+
TEXT_ENCODER: [[] for _ in range(len(prompts))],
115+
VISION_ENCODER: [[] for _ in range(len(prompts))],
116+
TOK_EMBEDDING: [[] for _ in range(len(prompts))],
117+
# Decoder targets: one string per prompt.
118+
TEXT_DECODER: ["" for _ in range(len(prompts))],
115119
}
116120

121+
# 2. Prepare messages for multi-turn conversation
122+
messages = self.tokenizer_wrapper.prepare_messages(prompts)
123+
124+
# 3. build dataset by modality
117125
is_multimodal = any(
118126
[
119127
hasattr(self.config, AUDIO_ENCODER),
120128
hasattr(self.config, VISION_ENCODER),
121129
]
122130
)
123-
for prompt in prompts:
124-
# Apply chat template formatting if available (for instruction-tuned/reasoning models)
131+
for turn_idx, message in enumerate(messages):
132+
prompt = message["text"]
133+
134+
# 3.1. Apply chat template formatting if available (for instruction-tuned/reasoning models)
125135
prompt = (
126136
self.tokenizer_wrapper.apply_prompt_template(
127137
chat_template, prompt, self.control_args.system_prompt
@@ -130,23 +140,26 @@ def prepare_calibration_dataset(
130140
else prompt
131141
)
132142

133-
# Build calibration datasets for each available encoder modality
143+
# 3.2 Build calibration datasets for each available encoder modality
134144
for modality in [AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER]:
135-
if hasattr(self.config, modality):
136-
data = self._build_dataset_for_encoder(
137-
getattr(self.config, modality),
138-
prompt,
139-
)
140-
calibration_data[modality].append(data)
141-
142-
# Expand multimodal tokens in prompt for decoder
145+
if not hasattr(self.config, modality) or not message["files_path"]:
146+
continue
147+
148+
data = self._build_dataset_for_encoder(
149+
getattr(self.config, modality),
150+
prompt,
151+
message["files_path"],
152+
)
153+
calibration_data[modality][turn_idx] = data
154+
155+
# 3.3. Expand multimodal tokens in prompt for decoder
143156
prompt = (
144157
self.tokenizer_wrapper.prepare_multimodal_prompt(prompt)
145158
if is_multimodal
146159
else prompt
147160
)
148161

149162
# Add prompt to decoder calibration data
150-
calibration_data[TEXT_DECODER].append(prompt)
163+
calibration_data[TEXT_DECODER][turn_idx] = prompt
151164

152165
return calibration_data

examples/qualcomm/oss_scripts/llama/decoder_constants.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,21 @@
1212
TASKS_EVAL = "tasks_eval"
1313
SQNR_EVAL = "sqnr_eval"
1414

15-
# filenames for vision model
16-
VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input"
17-
18-
1915
# Component identifiers
2016
AUDIO_ENCODER = "audio_encoder"
2117
VISION_ENCODER = "vision_encoder"
2218
TEXT_ENCODER = "text_encoder"
23-
TEXT_EMBEDDING = "text_embedding"
19+
TOK_EMBEDDING = "tok_embedding"
2420
TEXT_DECODER = "text_decoder"
2521
ATTENTION_SINK_EVICTOR = "attention_sink_evictor"
2622

23+
# Mapping of input flags for the runner
24+
MODALITY_INPUT_FLAG_MAP = {
25+
VISION_ENCODER: "image_path",
26+
}
27+
2728
# Text embedding graph names
28-
TEXT_EMBEDDING_GRAPH_NAMES = [
29+
TOK_EMBEDDING_GRAPH_NAMES = [
2930
"tok_embedding_kv_forward",
3031
"tok_embedding_prefill_forward",
3132
]

0 commit comments

Comments
 (0)