Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions tests/multimodal/media/test_video.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pybase64
import pytest
from PIL import Image

Expand Down Expand Up @@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
assert metadata_missing["video_backend"] == "test_video_backend_override_2"


def test_load_base64_jpeg_returns_metadata():
    """Regression test: load_base64 with video/jpeg must return metadata.

    Previously, base64 JPEG frame sequences returned an empty dict for
    metadata, which broke downstream consumers that rely on fields like
    total_num_frames and fps. See PR #37301.
    """
    num_test_frames = 3
    frame_width, frame_height = 8, 8

    # Build a few tiny JPEG frames and base64-encode them. Each frame gets a
    # different red level so the frames are distinguishable.
    b64_frames = []
    for i in range(num_test_frames):
        # PIL takes the size as (width, height).
        img = Image.new("RGB", (frame_width, frame_height), color=(i * 80, 0, 0))
        buf = io.BytesIO()
        img.save(buf, format="JPEG")
        b64_frames.append(pybase64.b64encode(buf.getvalue()).decode("ascii"))

    # The video/jpeg payload is the base64 frames joined by commas.
    data = ",".join(b64_frames)

    imageio = ImageMediaIO()
    videoio = VideoMediaIO(imageio, num_frames=num_test_frames)
    frames, metadata = videoio.load_base64("video/jpeg", data)

    # Frames array shape: (num_frames, H, W, 3). Check every axis, not just
    # the frame count, so a change in frame layout is caught as well.
    assert frames.shape == (num_test_frames, frame_height, frame_width, 3)

    # All required metadata keys must be present
    required_keys = {
        "total_num_frames",
        "fps",
        "duration",
        "video_backend",
        "frames_indices",
        "do_sample_frames",
    }
    assert required_keys.issubset(metadata.keys()), (
        f"Missing metadata keys: {required_keys - metadata.keys()}"
    )

    assert metadata["total_num_frames"] == num_test_frames
    assert metadata["video_backend"] == "jpeg_sequence"
    assert metadata["frames_indices"] == list(range(num_test_frames))
    assert metadata["do_sample_frames"] is False
    # Default fps=1 → duration == num_frames
    assert metadata["fps"] == 1.0
    assert metadata["duration"] == float(num_test_frames)
16 changes: 14 additions & 2 deletions vllm/multimodal/media/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,21 @@ def load_base64(
"image/jpeg",
)

return np.stack(
frames = np.stack(
[np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
), {}
)
total = int(frames.shape[0])
fps = float(self.kwargs.get("fps", 1))
duration = total / fps if fps > 0 else 0.0
metadata = {
"total_num_frames": total,
"fps": fps,
"duration": duration,
"video_backend": "jpeg_sequence",
"frames_indices": list(range(total)),
"do_sample_frames": False,
}
return frames, metadata
Comment on lines +89 to +97
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The previous implementation returned an empty dictionary for metadata, which caused a runtime error when transformers.video_utils.VideoMetadata was initialized. This change populates the metadata with all required fields, which resolves the missing total_num_frames argument error and ensures the video metadata is propagated correctly. This is a critical fix for base64 JPEG video frame processing.


return self.load_bytes(pybase64.b64decode(data))

Expand Down
Loading