
Commit 7664578

Authored by: tileintel, pre-commit-ci[bot], kevinintel, Spycsh, chensuyue
Support for UI of MultimodalRAGWithVideos in GenAIExamples (#656)
* Use png instead of jpg due to tgi-gaudi; the return message now includes a video_id map
* Update lvm and lvm_tgi to handle empty retrieval results
* Handle no retrieval results
* Add metadata to the TextDoc returned by LVM
* Add MetadataTextDoc
* Add metadata to ChatCompletionResponseChoice
* Add test
* Fix bugs
* Minor fix for conciseness
* Update port for test
* Update test
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)

Signed-off-by: Tiep Le <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: kevinintel <[email protected]>
Co-authored-by: Sihan Chen <[email protected]>
Co-authored-by: chen, suyue <[email protected]>
1 parent: 8e3f553 · commit: 7664578

File tree: 10 files changed (+95, -22 lines)

comps/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
     LVMSearchedMultimodalDoc,
     RerankedDoc,
     TextDoc,
+    MetadataTextDoc,
     RAGASParams,
     RAGASScores,
     GraphDoc,

comps/cores/mega/gateway.py

Lines changed: 17 additions & 1 deletion
@@ -777,14 +777,30 @@ async def handle_request(self, request: Request):
         ):
             return response
         last_node = runtime_graph.all_leaves()[-1]
-        response = result_dict[last_node]["text"]
+
+        if "text" in result_dict[last_node].keys():
+            response = result_dict[last_node]["text"]
+        else:
+            # "text" is not in the response message;
+            # something went wrong, for example due to empty retrieval results
+            if "detail" in result_dict[last_node].keys():
+                response = result_dict[last_node]["detail"]
+            else:
+                response = "The server failed to generate an answer to your query!"
+        if "metadata" in result_dict[last_node].keys():
+            # from retrieval results
+            metadata = result_dict[last_node]["metadata"]
+        else:
+            # follow-up question, no retrieval
+            metadata = None
         choices = []
         usage = UsageInfo()
         choices.append(
             ChatCompletionResponseChoice(
                 index=0,
                 message=ChatMessage(role="assistant", content=response),
                 finish_reason="stop",
+                metadata=metadata,
             )
         )
         return ChatCompletionResponse(model="multimodalragwithvideos", choices=choices, usage=usage)
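
Note: with this change, an answer grounded in retrieval carries its provenance in the choice's metadata field, follow-up turns that skip retrieval report metadata as None, and error details from downstream services (e.g., empty retrieval) surface as the message content. A minimal client-side sketch in Python; the gateway host, port, route, and request payload here are assumptions for illustration, not part of this diff:

import requests

# Hypothetical gateway endpoint for the MultimodalRAGWithVideos megaservice.
resp = requests.post(
    "http://localhost:8888/v1/mmragvideoqna",
    json={"messages": "What is happening in the video?"},
).json()

choice = resp["choices"][0]
print(choice["message"]["content"])  # generated answer, or the downstream error detail
# Provenance of the retrieved frame (video_id, source_video, time_of_frame_ms),
# or None when the turn did not involve retrieval.
print(choice.get("metadata"))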

comps/cores/proto/api_protocol.py

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
+    metadata: Optional[Dict[str, Any]] = None


 class ChatCompletionResponse(BaseModel):
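
A quick sanity check of the extended schema; the import path follows this file's location and is an assumption:

from comps.cores.proto.api_protocol import ChatCompletionResponseChoice, ChatMessage

# metadata is optional with a None default, so existing callers that omit it keep working.
choice = ChatCompletionResponseChoice(
    index=0,
    message=ChatMessage(role="assistant", content="A yellow image."),
    finish_reason="stop",
    metadata={"video_id": "abc123", "time_of_frame_ms": "37000000"},  # hypothetical values
)
print(choice.metadata)
print(ChatCompletionResponseChoice(index=0, message=choice.message).metadata)  # None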

comps/cores/proto/docarray.py

Lines changed: 7 additions & 0 deletions
@@ -20,6 +20,13 @@ class TextDoc(BaseDoc, TopologyInfo):
     text: str = None


+class MetadataTextDoc(TextDoc):
+    metadata: Optional[Dict[str, Any]] = Field(
+        description="This encloses all metadata associated with the textdoc.",
+        default=None,
+    )
+
+
 class ImageDoc(BaseDoc):
     url: Optional[ImageUrl] = Field(
         description="The path to the image. It can be remote (Web) URL, or a local file path",

comps/dataprep/multimodal/redis/langchain/multimodal_utils.py

Lines changed: 5 additions & 5 deletions
@@ -123,13 +123,13 @@ def str2time(strtime: str):

 def convert_img_to_base64(image):
     "Convert image to base64 string"
-    _, buffer = cv2.imencode(".jpg", image)
+    _, buffer = cv2.imencode(".png", image)
     encoded_string = base64.b64encode(buffer)
     return encoded_string.decode()


 def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: str, vtt_path: str, output_dir: str):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -157,7 +157,7 @@ def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: s
         if success:
             # Save frame for further processing
             img_fname = f"frame_{idx}"
-            img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+            img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
             cv2.imwrite(img_fpath, frame)

             # Convert image to base64 encoded string
@@ -195,7 +195,7 @@ def use_lvm(endpoint: str, img_b64_string: str, prompt: str = "Provide a short d
 def extract_frames_and_generate_captions(
     video_id: str, video_path: str, lvm_endpoint: str, output_dir: str, key_frame_per_second: int = 1
 ):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice."""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice."""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -225,7 +225,7 @@ def extract_frames_and_generate_captions(

         # Save frame for further processing
         img_fname = f"frame_{idx}"
-        img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+        img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
         cv2.imwrite(img_fpath, frame)

         # Convert image to base64 encoded string
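
The switch from .jpg to .png gives tgi-gaudi lossless frames, per the commit message. A self-contained check of the encode-to-base64 round trip using the same OpenCV calls as this module (NumPy supplies a dummy frame):

import base64

import cv2
import numpy as np

frame = np.zeros((10, 10, 3), dtype=np.uint8)  # dummy black frame standing in for a video frame
ok, buffer = cv2.imencode(".png", frame)  # lossless PNG instead of lossy JPEG
assert ok
b64_str = base64.b64encode(buffer).decode()
print(b64_str[:8])  # "iVBORw0K", the base64-encoded PNG signature seen in the test payloads below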

comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py

Lines changed: 23 additions & 4 deletions
@@ -199,7 +199,7 @@ def prepare_data_and_metadata_from_annotation(
     metadatas = []
     for i, frame in enumerate(annotation):
         frame_index = frame["sub_video_id"]
-        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.jpg")
+        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.png")
         # augment this frame's transcript with a reasonable number of neighboring frames' transcripts helps semantic retrieval
         lb_ingesting = max(0, i - num_transcript_concat_for_ingesting)
         ub_ingesting = min(len(annotation), i + num_transcript_concat_for_ingesting + 1)
@@ -275,6 +275,7 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -299,6 +300,8 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)

+            uploaded_videos_saved_videos_map[video_name] = video_file_name
+
             # Extract temporary audio wav file from video mp4
             audio_file = video_dir_name + ".wav"
             print(f"Extracting {audio_file}")
@@ -345,7 +348,11 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None
         end = time.time()
         print(str(end - st))

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -358,6 +365,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -380,6 +388,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Store frames and caption annotations in a new directory
             extract_frames_and_generate_captions(
@@ -397,7 +406,11 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -413,6 +426,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
     if files:
         video_files, video_file_names = [], []
         captions_files, captions_file_names = [], []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -451,6 +465,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Save captions file in upload directory
             vtt_file_name = os.path.splitext(video_file.filename)[0] + ".vtt"
@@ -482,7 +497,11 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(
         status_code=400, detail="Must provide at least one pair consisting of video (.mp4) and captions (.vtt)"
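
Each ingestion route now also returns video_id_maps, mapping every uploaded file name to the name it was saved under, so the UI can reference videos by their stored IDs. A hedged sketch of consuming it; the host, port, and route are assumptions about the dataprep deployment, and the exact key format depends on how video_name is derived:

import requests

url = "http://localhost:6007/v1/videos_with_transcripts"  # hypothetical dataprep route

with open("WeAreGoingOnBullrun.mp4", "rb") as v, open("WeAreGoingOnBullrun.vtt", "rb") as c:
    resp = requests.post(url, files=[("files", v), ("files", c)]).json()

print(resp["message"])  # "Data preparation succeeded"
# e.g. {"WeAreGoingOnBullrun": "WeAreGoingOnBullrun_<uuid>.mp4"}
print(resp["video_id_maps"])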

comps/dataprep/multimodal/redis/langchain/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 docarray[full]
 fastapi
 langchain==0.1.12
-langchain_benchmarks
 moviepy
 openai-whisper
 opencv-python

comps/lvms/llava/lvm.py

Lines changed: 18 additions & 2 deletions
@@ -8,13 +8,15 @@
 from typing import Union

 import requests
+from fastapi import HTTPException
 from langchain_core.prompts import PromptTemplate
 from template import ChatTemplate

 from comps import (
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -35,14 +37,20 @@
     port=9399,
 )
 @register_statistics(names=["opea_service@lvm"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
     if isinstance(request, LVMSearchedMultimodalDoc):
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved for the query.
+            # Raise an HTTPException with status_code 500, because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the given query!")
+
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -75,7 +83,15 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
     result = response.json()["text"]
     if logflag:
         logger.info(result)
-    return TextDoc(text=result)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata is used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=result, metadata=return_metadata)
+    else:
+        return TextDoc(text=result)


 if __name__ == "__main__":
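
The service now fails fast when retrieval returns no segments instead of sending LLaVA a prompt without an image. A sketch of both paths against a running instance; port 9399 and the /v1/lvm route match the registration above, and the field values are dummies:

import requests

payload = {
    "retrieved_docs": [],
    "initial_query": "What is this?",
    "top_n": 1,
    "metadata": [
        {
            "b64_img_str": "<base64-encoded PNG>",  # dummy placeholder
            "transcript_for_inference": "yellow image",
            "video_id": "abc123",
            "source_video": "demo.mp4",
            "time_of_frame_ms": "37000000",
        }
    ],
}
# With retrieved metadata: a MetadataTextDoc, i.e. text plus provenance fields.
print(requests.post("http://localhost:9399/v1/lvm", json=payload).json())

# With no retrieved segments: the new guard raises, surfacing HTTP 500 plus a detail message.
payload["metadata"] = []
print(requests.post("http://localhost:9399/v1/lvm", json=payload).status_code)  # 500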

comps/lvms/tgi-llava/lvm_tgi.py

Lines changed: 17 additions & 2 deletions
@@ -5,6 +5,7 @@
 import time
 from typing import Union

+from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
 from huggingface_hub import AsyncInferenceClient
 from langchain_core.prompts import PromptTemplate
@@ -14,6 +15,7 @@
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -36,7 +38,7 @@
     output_datatype=TextDoc,
 )
 @register_statistics(names=["opea_service@lvm_tgi"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
@@ -46,6 +48,11 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if not retrieved_metadatas or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved for the query.
+            # Raise an HTTPException with status_code 500, because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the given query!")
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -121,7 +128,15 @@ async def stream_generator():
     statistics_dict["opea_service@lvm_tgi"].append_latency(time.time() - start, None)
     if logflag:
         logger.info(generated_str)
-    return TextDoc(text=generated_str)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata is used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=generated_str, metadata=return_metadata)
+    else:
+        return TextDoc(text=generated_str)


 if __name__ == "__main__":
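
lvm_tgi gains the same guard and the same MetadataTextDoc return as lvm.py above; the streaming path still yields plain text chunks. A hedged sketch of a streaming call, assuming LVMDoc exposes the streaming flag used by this file's StreamingResponse path and the service keeps port 9399:

import requests

payload = {"image": "<base64-encoded PNG>", "prompt": "What is this?", "streaming": True}
with requests.post("http://localhost:9399/v1/lvm", json=payload, stream=True) as r:
    for line in r.iter_lines():
        if line:
            print(line.decode())  # server-sent chunks produced by stream_generator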

tests/lvms/test_lvms_llava.sh

Lines changed: 6 additions & 7 deletions
@@ -29,16 +29,15 @@ function build_docker_images() {

 function start_service() {
     unset http_proxy
-    llava_port=5071
-    lvm_port=5072
-    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:8399 --ipc=host opea/llava:comps
-    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:$llava_port -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
+    lvm_port=5051
+    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5028:8399 --ipc=host opea/llava:comps
+    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:5028 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
     sleep 8m
 }

 function validate_microservice() {

-    lvm_port=5072
+    lvm_port=5051
     result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
@@ -49,7 +48,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}]}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}]}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
@@ -59,7 +58,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
