
Commit 7664578

Authored by: tileintel, pre-commit-ci[bot], kevinintel, Spycsh, chensuyue
Support for UI of MultimodalRAGWithVideos in GenAIExamples (#656)
* Use png instead of jpg due to tgi-gaudi; the return message now includes a video_id map
* Update lvm and lvm_tgi to handle empty retrieval results
* Handle no retrieval results
* Add metadata to the TextDoc returned by LVM
* Add MetadataTextDoc
* Add metadata to ChatCompletionResponseChoice
* Add test
* Fix bugs
* Minor fix for conciseness
* Update port for test
* Update test
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)

Signed-off-by: Tiep Le <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: kevinintel <[email protected]>
Co-authored-by: Sihan Chen <[email protected]>
Co-authored-by: chen, suyue <[email protected]>
1 parent: 8e3f553 · commit: 7664578

File tree: 10 files changed (+95, -22 lines)

comps/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
     LVMSearchedMultimodalDoc,
     RerankedDoc,
     TextDoc,
+    MetadataTextDoc,
     RAGASParams,
     RAGASScores,
     GraphDoc,

comps/cores/mega/gateway.py

Lines changed: 17 additions & 1 deletion
@@ -777,14 +777,30 @@ async def handle_request(self, request: Request):
         ):
             return response
         last_node = runtime_graph.all_leaves()[-1]
-        response = result_dict[last_node]["text"]
+
+        if "text" in result_dict[last_node].keys():
+            response = result_dict[last_node]["text"]
+        else:
+            # "text" is not in the response message;
+            # something went wrong, for example due to empty retrieval results
+            if "detail" in result_dict[last_node].keys():
+                response = result_dict[last_node]["detail"]
+            else:
+                response = "The server failed to generate an answer to your query!"
+        if "metadata" in result_dict[last_node].keys():
+            # from retrieval results
+            metadata = result_dict[last_node]["metadata"]
+        else:
+            # follow-up question, no retrieval
+            metadata = None
         choices = []
         usage = UsageInfo()
         choices.append(
             ChatCompletionResponseChoice(
                 index=0,
                 message=ChatMessage(role="assistant", content=response),
                 finish_reason="stop",
+                metadata=metadata,
             )
         )
         return ChatCompletionResponse(model="multimodalragwithvideos", choices=choices, usage=usage)
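
Note: with this change, an answer grounded in retrieval carries its provenance in the choice's metadata field, follow-up turns that skip retrieval report metadata as None, and error details from downstream services (e.g., empty retrieval) surface as the message content. A minimal client-side sketch in Python; the gateway host, port, route, and request payload here are assumptions for illustration, not part of this diff:

import requests

# Hypothetical gateway endpoint for the MultimodalRAGWithVideos megaservice.
resp = requests.post(
    "http://localhost:8888/v1/mmragvideoqna",
    json={"messages": "What is happening in the video?"},
).json()

choice = resp["choices"][0]
print(choice["message"]["content"])  # generated answer, or the downstream error detail
# Provenance of the retrieved frame (video_id, source_video, time_of_frame_ms),
# or None when the turn did not involve retrieval.
print(choice.get("metadata"))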

comps/cores/proto/api_protocol.py

Lines changed: 1 addition & 0 deletions
@@ -299,6 +299,7 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
+    metadata: Optional[Dict[str, Any]] = None


 class ChatCompletionResponse(BaseModel):
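
A quick sanity check of the extended schema; the import path follows this file's location and is an assumption:

from comps.cores.proto.api_protocol import ChatCompletionResponseChoice, ChatMessage

# metadata is optional with a None default, so existing callers that omit it keep working.
choice = ChatCompletionResponseChoice(
    index=0,
    message=ChatMessage(role="assistant", content="A yellow image."),
    finish_reason="stop",
    metadata={"video_id": "abc123", "time_of_frame_ms": "37000000"},  # hypothetical values
)
print(choice.metadata)
print(ChatCompletionResponseChoice(index=0, message=choice.message).metadata)  # None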

comps/cores/proto/docarray.py

Lines changed: 7 additions & 0 deletions
@@ -20,6 +20,13 @@ class TextDoc(BaseDoc, TopologyInfo):
     text: str = None


+class MetadataTextDoc(TextDoc):
+    metadata: Optional[Dict[str, Any]] = Field(
+        description="This encloses all metadata associated with the textdoc.",
+        default=None,
+    )
+
+
 class ImageDoc(BaseDoc):
     url: Optional[ImageUrl] = Field(
         description="The path to the image. It can be remote (Web) URL, or a local file path",

comps/dataprep/multimodal/redis/langchain/multimodal_utils.py

Lines changed: 5 additions & 5 deletions
@@ -123,13 +123,13 @@ def str2time(strtime: str):

 def convert_img_to_base64(image):
     "Convert image to base64 string"
-    _, buffer = cv2.imencode(".jpg", image)
+    _, buffer = cv2.imencode(".png", image)
     encoded_string = base64.b64encode(buffer)
     return encoded_string.decode()


 def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: str, vtt_path: str, output_dir: str):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) and captions file (.vtt)"""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -157,7 +157,7 @@ def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: s
         if success:
             # Save frame for further processing
             img_fname = f"frame_{idx}"
-            img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+            img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
             cv2.imwrite(img_fpath, frame)

             # Convert image to base64 encoded string
@@ -195,7 +195,7 @@ def use_lvm(endpoint: str, img_b64_string: str, prompt: str = "Provide a short d
 def extract_frames_and_generate_captions(
     video_id: str, video_path: str, lvm_endpoint: str, output_dir: str, key_frame_per_second: int = 1
 ):
-    """Extract frames (.jpg) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice."""
+    """Extract frames (.png) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice."""
     # Set up location to store frames and annotations
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -225,7 +225,7 @@ def extract_frames_and_generate_captions(

         # Save frame for further processing
         img_fname = f"frame_{idx}"
-        img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg")
+        img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
         cv2.imwrite(img_fpath, frame)

         # Convert image to base64 encoded string
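
The switch from .jpg to .png gives tgi-gaudi lossless frames, per the commit message. A self-contained check of the encode-to-base64 round trip using the same OpenCV calls as this module (NumPy supplies a dummy frame):

import base64

import cv2
import numpy as np

frame = np.zeros((10, 10, 3), dtype=np.uint8)  # dummy black frame standing in for a video frame
ok, buffer = cv2.imencode(".png", frame)  # lossless PNG instead of lossy JPEG
assert ok
b64_str = base64.b64encode(buffer).decode()
print(b64_str[:8])  # "iVBORw0K", the base64-encoded PNG signature seen in the test payloads below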

comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py

Lines changed: 23 additions & 4 deletions
@@ -199,7 +199,7 @@ def prepare_data_and_metadata_from_annotation(
     metadatas = []
     for i, frame in enumerate(annotation):
         frame_index = frame["sub_video_id"]
-        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.jpg")
+        path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.png")
         # augment this frame's transcript with a reasonable number of neighboring frames' transcripts helps semantic retrieval
         lb_ingesting = max(0, i - num_transcript_concat_for_ingesting)
         ub_ingesting = min(len(annotation), i + num_transcript_concat_for_ingesting + 1)
@@ -275,6 +275,7 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -299,6 +300,8 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)

+            uploaded_videos_saved_videos_map[video_name] = video_file_name
+
             # Extract temporary audio wav file from video mp4
             audio_file = video_dir_name + ".wav"
             print(f"Extracting {audio_file}")
@@ -345,7 +348,11 @@ async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None
         end = time.time()
         print(str(end - st))

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -358,6 +365,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

     if files:
         video_files = []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -380,6 +388,7 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Store frames and caption annotations in a new directory
             extract_frames_and_generate_captions(
@@ -397,7 +406,11 @@ async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.")

@@ -413,6 +426,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
     if files:
         video_files, video_file_names = [], []
         captions_files, captions_file_names = [], []
+        uploaded_videos_saved_videos_map = {}
         for file in files:
             if os.path.splitext(file.filename)[1] == ".mp4":
                 video_files.append(file)
@@ -451,6 +465,7 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):
             # Save video file in upload_directory
             with open(os.path.join(upload_folder, video_file_name), "wb") as f:
                 shutil.copyfileobj(video_file.file, f)
+            uploaded_videos_saved_videos_map[video_name] = video_file_name

             # Save captions file in upload directory
             vtt_file_name = os.path.splitext(video_file.filename)[0] + ".vtt"
@@ -482,7 +497,11 @@ async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)):

             print(f"Processed video {video_file.filename}")

-        return {"status": 200, "message": "Data preparation succeeded"}
+        return {
+            "status": 200,
+            "message": "Data preparation succeeded",
+            "video_id_maps": uploaded_videos_saved_videos_map,
+        }

     raise HTTPException(
         status_code=400, detail="Must provide at least one pair consisting of video (.mp4) and captions (.vtt)"
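
Each ingestion route now also returns video_id_maps, mapping every uploaded file name to the name it was saved under, so the UI can reference videos by their stored IDs. A hedged sketch of consuming it; the host, port, and route are assumptions about the dataprep deployment, and the exact key format depends on how video_name is derived:

import requests

url = "http://localhost:6007/v1/videos_with_transcripts"  # hypothetical dataprep route

with open("WeAreGoingOnBullrun.mp4", "rb") as v, open("WeAreGoingOnBullrun.vtt", "rb") as c:
    resp = requests.post(url, files=[("files", v), ("files", c)]).json()

print(resp["message"])  # "Data preparation succeeded"
# e.g. {"WeAreGoingOnBullrun": "WeAreGoingOnBullrun_<uuid>.mp4"}
print(resp["video_id_maps"])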

comps/dataprep/multimodal/redis/langchain/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 docarray[full]
 fastapi
 langchain==0.1.12
-langchain_benchmarks
 moviepy
 openai-whisper
 opencv-python

comps/lvms/llava/lvm.py

Lines changed: 18 additions & 2 deletions
@@ -8,13 +8,15 @@
 from typing import Union

 import requests
+from fastapi import HTTPException
 from langchain_core.prompts import PromptTemplate
 from template import ChatTemplate

 from comps import (
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -35,14 +37,20 @@
     port=9399,
 )
 @register_statistics(names=["opea_service@lvm"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
     if isinstance(request, LVMSearchedMultimodalDoc):
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved for the query.
+            # Raise an HTTPException with status_code 500, because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the given query!")
+
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -75,7 +83,15 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
     result = response.json()["text"]
     if logflag:
         logger.info(result)
-    return TextDoc(text=result)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata is used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=result, metadata=return_metadata)
+    else:
+        return TextDoc(text=result)


 if __name__ == "__main__":
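
The service now fails fast when retrieval returns no segments instead of sending LLaVA a prompt without an image. A sketch of both paths against a running instance; port 9399 and the /v1/lvm route match the registration above, and the field values are dummies:

import requests

payload = {
    "retrieved_docs": [],
    "initial_query": "What is this?",
    "top_n": 1,
    "metadata": [
        {
            "b64_img_str": "<base64-encoded PNG>",  # dummy placeholder
            "transcript_for_inference": "yellow image",
            "video_id": "abc123",
            "source_video": "demo.mp4",
            "time_of_frame_ms": "37000000",
        }
    ],
}
# With retrieved metadata: a MetadataTextDoc, i.e. text plus provenance fields.
print(requests.post("http://localhost:9399/v1/lvm", json=payload).json())

# With no retrieved segments: the new guard raises, surfacing HTTP 500 plus a detail message.
payload["metadata"] = []
print(requests.post("http://localhost:9399/v1/lvm", json=payload).status_code)  # 500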

comps/lvms/tgi-llava/lvm_tgi.py

Lines changed: 17 additions & 2 deletions
@@ -5,6 +5,7 @@
 import time
 from typing import Union

+from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
 from huggingface_hub import AsyncInferenceClient
 from langchain_core.prompts import PromptTemplate
@@ -14,6 +15,7 @@
     CustomLogger,
     LVMDoc,
     LVMSearchedMultimodalDoc,
+    MetadataTextDoc,
     ServiceType,
     TextDoc,
     opea_microservices,
@@ -36,7 +38,7 @@
     output_datatype=TextDoc,
 )
 @register_statistics(names=["opea_service@lvm_tgi"])
-async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
+async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
     if logflag:
         logger.info(request)
     start = time.time()
@@ -46,6 +48,11 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> TextDoc:
         if logflag:
             logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
         retrieved_metadatas = request.metadata
+        if not retrieved_metadatas or len(retrieved_metadatas) == 0:
+            # No video segments were retrieved for the query.
+            # Raise an HTTPException with status_code 500, because llava-tgi-gaudi
+            # must receive an image as input; otherwise, the generated text is poor.
+            raise HTTPException(status_code=500, detail="No video segments were retrieved for the given query!")
         img_b64_str = retrieved_metadatas[0]["b64_img_str"]
         initial_query = request.initial_query
         context = retrieved_metadatas[0]["transcript_for_inference"]
@@ -121,7 +128,15 @@ async def stream_generator():
     statistics_dict["opea_service@lvm_tgi"].append_latency(time.time() - start, None)
     if logflag:
         logger.info(generated_str)
-    return TextDoc(text=generated_str)
+    if isinstance(request, LVMSearchedMultimodalDoc):
+        retrieved_metadata = request.metadata[0]
+        return_metadata = {}  # this metadata is used to construct proof for the generated text
+        return_metadata["video_id"] = retrieved_metadata["video_id"]
+        return_metadata["source_video"] = retrieved_metadata["source_video"]
+        return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
+        return MetadataTextDoc(text=generated_str, metadata=return_metadata)
+    else:
+        return TextDoc(text=generated_str)


 if __name__ == "__main__":
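
lvm_tgi gains the same guard and the same MetadataTextDoc return as lvm.py above; the streaming path still yields plain text chunks. A hedged sketch of a streaming call, assuming LVMDoc exposes the streaming flag used by this file's StreamingResponse path and the service keeps port 9399:

import requests

payload = {"image": "<base64-encoded PNG>", "prompt": "What is this?", "streaming": True}
with requests.post("http://localhost:9399/v1/lvm", json=payload, stream=True) as r:
    for line in r.iter_lines():
        if line:
            print(line.decode())  # server-sent chunks produced by stream_generator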

tests/lvms/test_lvms_llava.sh

Lines changed: 6 additions & 7 deletions
@@ -29,16 +29,15 @@ function build_docker_images() {

 function start_service() {
     unset http_proxy
-    llava_port=5071
-    lvm_port=5072
-    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:8399 --ipc=host opea/llava:comps
-    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:$llava_port -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
+    lvm_port=5051
+    docker run -d --name="test-comps-lvm-llava-dependency" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5028:8399 --ipc=host opea/llava:comps
+    docker run -d --name="test-comps-lvm-llava-server" -e LVM_ENDPOINT=http://$ip_address:5028 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm:comps
     sleep 8m
 }

 function validate_microservice() {

-    lvm_port=5072
+    lvm_port=5051
     result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
@@ -49,7 +48,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}]}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}]}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
@@ -59,7 +58,7 @@ function validate_microservice() {
         exit 1
     fi

-    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
+    result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' -H 'Content-Type: application/json')
     if [[ $result == *"yellow"* ]]; then
         echo "Result correct."
     else
