4 changes: 0 additions & 4 deletions .buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

6 changes: 2 additions & 4 deletions docs/source/models/vlm.rst
@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``

llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
@@ -49,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:

* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`ExternalMultiModalDataDict`.

.. code-block:: python

@@ -61,7 +60,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {"image": image},
})

for o in outputs:
@@ -93,7 +92,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with

python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--image-input-type pixel_values \
--image-token-id 32000 \
--image-input-shape 1,3,336,336 \
--image-feature-size 576 \
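Taken together, the documentation changes above reduce to the offline-inference pattern below. This is a minimal sketch assembled from the updated snippets in this PR (engine arguments and the 576-token prompt come from the LLaVA-1.5 example), not an authoritative reference:

```python
from PIL import Image

from vllm import LLM

# image_input_type is gone; only the remaining vision arguments are passed.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=576,
)

# The prompt carries one "<image>" placeholder per image feature (576 here).
prompt = "<image>" * 576 + "\nUSER: What is the content of this image?\nASSISTANT:"

# multi_modal_data is now a plain dict keyed by modality; a PIL image is passed
# directly instead of being wrapped in ImagePixelData.
image = Image.open("images/stop_sign.jpg")

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})

for o in outputs:
    print(o.outputs[0].text)
```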
56 changes: 8 additions & 48 deletions examples/llava_example.py
@@ -1,84 +1,44 @@
import argparse
import os
import subprocess

import torch
from PIL import Image

from vllm import LLM
from vllm.multimodal.image import ImageFeatureData, ImagePixelData

# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them


def run_llava_pixel_values(*, disable_image_processor: bool = False):
def run_llava():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
disable_image_processor=disable_image_processor,
)

prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")

if disable_image_processor:
image = torch.load("images/stop_sign_pixel_values.pt")
else:
image = Image.open("images/stop_sign.jpg")
image = Image.open("images/stop_sign.jpg")

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {
"image": image
},
})

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)


def run_llava_image_features():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="image_features",
image_token_id=32000,
image_input_shape="1,576,1024",
image_feature_size=576,
)

prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")

image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImageFeatureData(image),
})

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)


def main(args):
if args.type == "pixel_values":
run_llava_pixel_values()
else:
run_llava_image_features()
def main():
run_llava()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Demo on Llava")
parser.add_argument("--type",
type=str,
choices=["pixel_values", "image_features"],
default="pixel_values",
help="image input type")
args = parser.parse_args()
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
@@ -95,4 +55,4 @@ def main(args):
local_directory,
"--no-sign-request",
])
main(args)
main()
6 changes: 3 additions & 3 deletions examples/phi3v_example.py
@@ -4,7 +4,6 @@
from PIL import Image

from vllm import LLM, SamplingParams
from vllm.multimodal.image import ImagePixelData


def run_phi3v():
@@ -15,7 +14,6 @@ def run_phi3v():
llm = LLM(
model=model_path,
trust_remote_code=True,
image_input_type="pixel_values",
image_token_id=32044,
image_input_shape="1,3,1008,1344",
image_feature_size=1921,
@@ -33,7 +31,9 @@ def run_phi3v():
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {
"image": image
},
},
sampling_params=sampling_params)
for o in outputs:
15 changes: 3 additions & 12 deletions tests/conftest.py
@@ -17,13 +17,12 @@
AutoProcessor, AutoTokenizer, BatchEncoding)

from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.config import TokenizerPoolConfig
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
from vllm.multimodal import MultiModalData
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.sequence import SampleLogprobs
from vllm.utils import cuda_device_count_stateless, is_cpu

@@ -62,16 +61,8 @@ def pil_image(self) -> Image.Image:
def for_hf(self) -> Image.Image:
return self.pil_image

def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
image_input_type = vision_config.image_input_type
ImageInputType = VisionLanguageConfig.ImageInputType

if image_input_type == ImageInputType.IMAGE_FEATURES:
return ImageFeatureData(self.image_features)
if image_input_type == ImageInputType.PIXEL_VALUES:
return ImagePixelData(self.pil_image)

raise NotImplementedError
def for_vllm(self) -> Dict[str, Any]:
return {"image": self.pil_image}


class _ImageAssetPrompts(TypedDict):
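Since ``for_vllm()`` no longer needs a ``VisionLanguageConfig``, test code can build vLLM inputs without threading the config through. The sketch below illustrates the intended shape of the helper and its use; ``_DemoImageAsset`` is a hypothetical stand-in for the conftest asset class, shown only to make the snippet self-contained:

```python
from typing import Any, Dict

from PIL import Image


class _DemoImageAsset:
    """Hypothetical stand-in for the conftest image-asset helper."""

    def __init__(self, path: str) -> None:
        self._path = path

    @property
    def pil_image(self) -> Image.Image:
        return Image.open(self._path)

    def for_hf(self) -> Image.Image:
        # The HF runner keeps consuming the raw PIL image.
        return self.pil_image

    def for_vllm(self) -> Dict[str, Any]:
        # The vLLM runner now receives the modality dict directly; there is no
        # image_input_type dispatch between pixel values and image features.
        return {"image": self.pil_image}


# Usage mirrors the test changes below:
assets = [_DemoImageAsset("images/stop_sign.jpg")]
hf_images = [a.for_hf() for a in assets]
vllm_images = [a.for_vllm() for a in assets]
```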
2 changes: 0 additions & 2 deletions tests/entrypoints/test_openai_vision.py
@@ -42,8 +42,6 @@ def server():
"--max-model-len",
"4096",
"--enforce-eager",
"--image-input-type",
"pixel_values",
"--image-token-id",
"32000",
"--image-input-shape",
18 changes: 6 additions & 12 deletions tests/models/test_llava.py
@@ -24,17 +24,11 @@ def iter_llava_configs(model_name: str):
}

for (h, w), f in image_hw_to_feature_size.items():
for input_type, input_shape in [
(VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
(VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
]:
yield (model_name,
VisionLanguageConfig(image_input_type=input_type,
image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape,
image_processor=model_name,
image_processor_revision=None))
input_shape = (1, 3, h, w)
yield (model_name,
VisionLanguageConfig(image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape))


model_and_vl_config = [
@@ -82,7 +76,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
"""
model_id, vlm_config = model_and_config
hf_images = [asset.for_hf() for asset in image_assets]
vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
vllm_images = [asset.for_vllm() for asset in image_assets]

with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
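The config iterators in these model tests now construct ``VisionLanguageConfig`` with the same reduced field set. A minimal sketch of that construction, using the LLaVA-1.5 336x336 values from the test above (any other config fields are intentionally omitted):

```python
from vllm.config import VisionLanguageConfig

# image_input_type, image_processor, and image_processor_revision are removed;
# only the token/shape/feature-size fields remain.
vlm_config = VisionLanguageConfig(
    image_feature_size=576,
    image_token_id=32000,
    image_input_shape=(1, 3, 336, 336),
)
```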
19 changes: 8 additions & 11 deletions tests/models/test_llava_next.py
@@ -32,16 +32,13 @@ def iter_llava_next_configs(model_name: str):
}

for (h, w), f in image_hw_to_feature_size.items():
for input_type, input_shape in [
(VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
]:
yield (model_name,
VisionLanguageConfig(image_input_type=input_type,
image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape,
image_processor=model_name,
image_processor_revision=None))
input_shape = (1, 3, h, w)
yield (model_name,
VisionLanguageConfig(
image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape,
))


model_and_vl_config = [
@@ -91,7 +88,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
"""
model_id, vlm_config = model_and_config
hf_images = [asset.for_hf() for asset in image_assets]
vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
vllm_images = [asset.for_vllm() for asset in image_assets]

with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
17 changes: 6 additions & 11 deletions tests/models/test_phi3v.py
@@ -26,16 +26,11 @@ def iter_phi3v_configs(model_name: str):
}

for (h, w), f in image_hw_to_feature_size.items():
for input_type, input_shape in [
(VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
]:
yield (model_name,
VisionLanguageConfig(image_input_type=input_type,
image_feature_size=f,
image_token_id=32044,
image_input_shape=input_shape,
image_processor=model_name,
image_processor_revision=None))
input_shape = (1, 3, h, w)
yield (model_name,
VisionLanguageConfig(image_feature_size=f,
image_token_id=32044,
image_input_shape=input_shape))


model_and_vl_config = [
@@ -95,7 +90,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
"""
model_id, vlm_config = model_and_config
hf_images = [asset.for_hf() for asset in image_assets]
vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
vllm_images = [asset.for_vllm() for asset in image_assets]

# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}