Commit 831c5a3

vLLM LVM integration (opea-project#1362)

* vLLM LVM integration: integrate vLLM LVMs and set vLLM as the default LVM backend; use the OpenAI chat completions API and cover the single-image and text-only cases.

1 parent: 558a2f6

8 files changed: +489 additions, -6 deletions

comps/lvms/deployment/docker_compose/compose.yaml (70 additions & 1 deletion)

```diff
@@ -18,6 +18,58 @@ services:
       interval: 30s
       timeout: 6s
       retries: 20
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm:latest
+    container_name: vllm-service
+    ports:
+      - ${VLLM_PORT:-9699}:80
+    volumes:
+      - "./data:/data"
+    shm_size: 128g
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 100
+    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
+  vllm-gaudi-service:
+    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+    container_name: vllm-gaudi-service
+    ports:
+      - ${VLLM_PORT:-9699}:80
+    volumes:
+      - "./data:/data"
+    shm_size: 128g
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      HABANA_VISIBLE_DEVICES: all
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
+      MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096}
+      MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096}
+      PT_HPUGRAPH_DISABLE_TENSOR_CACHE: false # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704
+    runtime: habana
+    cap_add:
+      - SYS_NICE
+    ipc: host
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 150
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
   llava-tgi-service:
     image: ghcr.io/huggingface/tgi-gaudi:2.3.1
     container_name: llava-tgi-service
@@ -99,7 +151,8 @@ services:
     ipc: host
     environment:
       LVM_ENDPOINT: ${LVM_ENDPOINT}
-      LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_LLAVA_LVM}
+      LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
   lvm-llava:
     extends: lvm
     container_name: lvm-llava-service
@@ -140,6 +193,22 @@ services:
     depends_on:
       video-llama-service:
         condition: service_healthy
+  lvm-vllm:
+    extends: lvm
+    container_name: lvm-vllm-service
+    environment:
+      LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}
+    depends_on:
+      vllm-service:
+        condition: service_healthy
+  lvm-vllm-gaudi:
+    extends: lvm
+    container_name: lvm-vllm-gaudi-service
+    environment:
+      LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}
+    depends_on:
+      vllm-gaudi-service:
+        condition: service_healthy
 
 networks:
   default:
```
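
The healthchecks above are what the new `lvm-vllm` and `lvm-vllm-gaudi` wrappers wait on via `depends_on: condition: service_healthy`. To probe readiness from the host yourself, a minimal sketch (assuming the default `VLLM_PORT=9699` host mapping and the `/health` route the compose healthcheck curls inside the container) could look like this:

```python
# Minimal host-side readiness probe; assumes the default VLLM_PORT=9699 host mapping
# and the /health route used by the compose healthcheck above.
import time

import requests


def wait_for_vllm(base_url: str = "http://localhost:9699", timeout_s: int = 600) -> bool:
    """Poll vLLM's /health endpoint until it returns HTTP 200 or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(f"{base_url}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not accepting connections yet; keep polling
        time.sleep(10)
    return False


if __name__ == "__main__":
    print("vllm-service healthy:", wait_for_vllm())
```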

comps/lvms/src/README.md (22 additions & 2 deletions)

````diff
@@ -66,16 +66,36 @@ export LVM_ENDPOINT=http://$ip_address:$VIDEO_LLAMA_PORT
 docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up video-llama-service lvm-video-llama -d
 ```
 
+- vLLM
+
+```bash
+# currently you have to build the opea/vllm-gaudi with the habana_main branch locally
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd ./vllm-fork/
+git checkout habana_main
+docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+cd ..
+rm -rf vllm-fork
+
+
+export ip_address=$(hostname -I | awk '{print $1}')
+export LVM_PORT=9399
+export VLLM_PORT=11507
+export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT
+export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf
+docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d
+```
+
 ## Test
 
-- LLaVA & llama-vision & PredictionGuard & TGI LLaVA
+- vLLM & LLaVA native & llama-vision & PredictionGuard & TGI LLaVA
 
 ```bash
 # curl with an image and a prompt
 http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'
 
 # curl with only the prompt
-http_proxy="" curl http://localhost:9399/v1/lvm --silent --write-out "HTTPSTATUS:%{http_code}" -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
+http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
 ```
 
 - video-llama
````
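
For reference, the same smoke tests can be driven from Python instead of curl. This is a small sketch assuming the `LVM_PORT=9399` setting above and a running `lvm-vllm` wrapper (or any of the other LVM backends); it reuses the tiny base64 PNG from the README example:

```python
# Python version of the curl smoke tests above; assumes LVM_PORT=9399 and that the
# lvm-vllm wrapper (or another LVM backend) is already up.
import requests

LVM_URL = "http://localhost:9399/v1/lvm"
# Same 10x10 PNG used in the README curl example.
TINY_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"

# Image + prompt
resp = requests.post(LVM_URL, json={"image": TINY_PNG_B64, "prompt": "What is this?"}, timeout=120)
print(resp.status_code, resp.json())

# Text-only prompt (empty image string)
resp = requests.post(LVM_URL, json={"image": "", "prompt": "What is deep learning?"}, timeout=120)
print(resp.status_code, resp.json())
```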

comps/lvms/src/integrations/dependency/video-llama/requirements.txt (1 addition & 1 deletion)

```diff
@@ -12,7 +12,7 @@ iopath
 langchain
 langchain-community
 langchain-core
-numpy
+numpy==1.26.4
 omegaconf
 opencv-python-headless
 opentelemetry-api
```

comps/lvms/src/integrations/vllm.py (new file: 222 additions & 0 deletions)

```python
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import time
from typing import Union

import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from langchain_core.prompts import PromptTemplate
from openai import OpenAI

from comps import (
    CustomLogger,
    LVMDoc,
    LVMSearchedMultimodalDoc,
    MetadataTextDoc,
    OpeaComponent,
    OpeaComponentRegistry,
    ServiceType,
    TextDoc,
    statistics_dict,
)

logger = CustomLogger("opea_vllm")
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
# max_images = int(os.getenv("MAX_IMAGES", 1))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "llava-hf/llava-1.5-7b-hf")


class ChatTemplate:

    @staticmethod
    def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):

        if has_image:
            template = """The transcript associated with the image is '{context}'. {question}"""
        else:
            template = (
                """Refer to the following results obtained from the local knowledge base: '{context}'. {question}"""
            )

        return template.format(context=context, question=question)


@OpeaComponentRegistry.register("OPEA_VLLM_LVM")
class OpeaVllmLvm(OpeaComponent):
    """A specialized vLLM LVM component derived from OpeaComponent for vLLM LVM services."""

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.LVM.name.lower(), description, config)
        self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
        # https://github.com/huggingface/huggingface_hub/blob/v0.29.1/src/huggingface_hub/inference/_providers/hf_inference.py#L87
        # The latest AsyncInferenceClient hardcodes the model name to "tgi",
        # so we use the OpenAI client instead.
        self.lvm_client = OpenAI(api_key="EMPTY", base_url=f"{self.base_url}/v1")
        health_status = self.check_health()
        # if logflag:
        #     logger.info(f"MAX_IMAGES: {max_images}")
        if not health_status:
            logger.error("OpeaVllmLvm health check failed.")

    async def invoke(
        self,
        request: Union[LVMDoc, LVMSearchedMultimodalDoc],
    ) -> Union[TextDoc, MetadataTextDoc]:
        """Invoke the LVM service to generate an answer for the provided input."""
        if logflag:
            logger.info(request)
        if isinstance(request, LVMSearchedMultimodalDoc):
            # TODO there may be bugs here
            if logflag:
                logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
            retrieved_metadatas = request.metadata
            if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
                raise HTTPException(status_code=500, detail="No video segments were retrieved for the given query!")

            img_b64_str = retrieved_metadatas[0]["b64_img_str"]
            has_image = img_b64_str != ""
            initial_query = request.initial_query
            context = retrieved_metadatas[0]["transcript_for_inference"]
            prompt = initial_query
            if request.chat_template is None:
                prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
            else:
                prompt_template = PromptTemplate.from_template(request.chat_template)
                input_variables = prompt_template.input_variables
                if sorted(input_variables) == ["context", "question"]:
                    prompt = prompt_template.format(question=initial_query, context=context)
                else:
                    logger.info(
                        f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
                    )
            max_new_tokens = request.max_new_tokens
            stream = request.stream
            repetition_penalty = request.repetition_penalty
            temperature = request.temperature
            top_k = request.top_k
            top_p = request.top_p
            if logflag:
                logger.info(
                    f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}"
                )

        else:
            # TODO align legacy LVMDoc with chat completions parameters for vLLM
            img_b64_str = request.image
            prompt = request.prompt
            max_new_tokens = request.max_new_tokens
            stream = request.stream
            # repetition_penalty = request.repetition_penalty
            temperature = request.temperature
            # top_k = request.top_k
            top_p = request.top_p

        if not img_b64_str:
            # If img_b64_str is an empty string, we only have a text prompt.
            # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
            # Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
            # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
            img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"

        if stream:
            t_start = time.time()

            def stream_generator(time_start):
                first_token_latency = None
                chat_response = ""

                # https://docs.vllm.ai/en/v0.5.1/getting_started/examples/openai_vision_api_client.html
                # vLLM chat completions api
                # TODO align legacy LVMDoc with chat completions parameters for vLLM
                # Now we simply keep the intersection of them
                # TODO check vLLM multi-image inputs https://platform.openai.com/docs/guides/vision#multiple-image-inputs
                text_generation = self.lvm_client.chat.completions.create(
                    model=LLM_MODEL_ID,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64_str}"}},
                            ],
                        }
                    ],
                    max_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    stream=True,
                )

                for output in text_generation:
                    if first_token_latency is None:
                        first_token_latency = time.time() - time_start
                    text = output.choices[0].delta.content
                    chat_response += text
                    chunk_repr = repr(text.encode("utf-8"))
                    if logflag:
                        logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                    yield f"data: {chunk_repr}\n\n"
                if logflag:
                    logger.info(f"[llm - chat_stream] stream response: {chat_response}")
                statistics_dict["opea_service@lvm"].append_latency(time.time() - time_start, first_token_latency)
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(t_start), media_type="text/event-stream")
        else:
            # https://docs.vllm.ai/en/v0.5.1/getting_started/examples/openai_vision_api_client.html
            # vLLM chat completions api
            # TODO align legacy LVMDoc with chat completions parameters for vLLM
            # Now we simply keep the intersection of them
            # TODO check vLLM multi-image inputs https://platform.openai.com/docs/guides/vision#multiple-image-inputs
            generated_output = self.lvm_client.chat.completions.create(
                model=LLM_MODEL_ID,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64_str}"}},
                        ],
                    }
                ],
                max_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            generated_str = generated_output.choices[0].message.content

            if logflag:
                logger.info(generated_str)
            if isinstance(request, LVMSearchedMultimodalDoc):
                # TODO check for bugs here
                retrieved_metadata = request.metadata[0]
                return_metadata = {}  # this metadata will be used to construct proof for generated text
                return_metadata["video_id"] = retrieved_metadata["video_id"]
                return_metadata["source_video"] = retrieved_metadata["source_video"]
                return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
                return_metadata["transcript_for_inference"] = retrieved_metadata["transcript_for_inference"]
                return MetadataTextDoc(text=generated_str, metadata=return_metadata)
            else:
                return TextDoc(text=generated_str)

    def check_health(self) -> bool:
        """Checks the health of the vLLM LVM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """
        try:
            response = requests.get(f"{self.base_url}/health")
            if response.status_code == 200:
                return True
            else:
                return False
        except Exception as e:
            # Handle connection errors, timeouts, etc.
            logger.error(f"Health check failed: {e}")
            return False
```
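
Under the hood, `OpeaVllmLvm` simply issues an OpenAI-compatible chat completion against the vLLM server. A standalone sketch of that call, bypassing the LVM microservice (the port and model name assume the compose defaults `VLLM_PORT=9699` and `LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf`, and the tiny PNG is the one from the README test):

```python
# Sketch of the chat-completions call OpeaVllmLvm issues, sent to vLLM directly.
# Assumed: VLLM_PORT=9699 host mapping and LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:9699/v1")

img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"

completion = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is this?"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64_str}"}},
            ],
        }
    ],
    max_tokens=128,
)
print(completion.choices[0].message.content)
```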

comps/lvms/src/opea_lvm_microservice.py (3 additions & 2 deletions)

```diff
@@ -10,6 +10,7 @@
 from integrations.predictionguard import OpeaPredictionguardLvm
 from integrations.tgi_llava import OpeaTgiLlavaLvm
 from integrations.video_llama import OpeaVideoLlamaLvm
+from integrations.vllm import OpeaVllmLvm
 
 from comps import (
     CustomLogger,
@@ -29,7 +30,7 @@
 logger = CustomLogger("opea_lvm_microservice")
 logflag = os.getenv("LOGFLAG", False)
 
-lvm_component_name = os.getenv("LVM_COMPONENT_NAME", "OPEA_LLAVA_LVM")
+lvm_component_name = os.getenv("LVM_COMPONENT_NAME", "OPEA_VLLM_LVM")
 # Initialize OpeaComponentController
 loader = OpeaComponentLoader(lvm_component_name, description=f"OPEA LVM Component: {lvm_component_name}")
 
@@ -54,7 +55,7 @@ async def lvm(
         logger.info(lvm_response)
 
     if loader.component.name in ["OpeaVideoLlamaLvm"] or (
-        loader.component.name in ["OpeaTgiLlavaLvm"] and request.streaming
+        loader.component.name in ["OpeaTgiLlavaLvm", "OpeaVllmLvm"] and request.streaming
     ):
         # statistics for StreamingResponse are handled inside the integrations
         # here directly return the response
```
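
When the loaded component is `OpeaVllmLvm` (or `OpeaTgiLlavaLvm`) and streaming is requested, the microservice now returns the integration's SSE stream unchanged: `data: ...` chunks terminated by `data: [DONE]`. A hypothetical client sketch follows; the `streaming` field name in the payload is an assumption about the request schema (the vLLM integration itself reads `request.stream`), and the port assumes `LVM_PORT=9399`:

```python
# Hypothetical streaming client for the LVM microservice's SSE path.
# Assumed: LVM_PORT=9399 and a "streaming" flag in the request body (not confirmed by the diff).
import requests

with requests.post(
    "http://localhost:9399/v1/lvm",
    json={"image": "", "prompt": "What is deep learning?", "streaming": True},
    stream=True,
    timeout=300,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue  # SSE events are separated by blank lines
        if line == "data: [DONE]":
            break
        print(line.removeprefix("data: "))
```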

comps/lvms/src/requirements.txt (1 addition & 0 deletions)

```diff
@@ -4,6 +4,7 @@ docarray[full]
 fastapi
 huggingface_hub
 langchain-core
+openai
 opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
```
