docs/source/en/api/pipelines/ltx2.md (4 changes: 0 additions & 4 deletions)
@@ -106,8 +106,6 @@ video, audio = pipe(
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
@@ -185,8 +183,6 @@ video, audio = pipe(
output_type="np",
return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
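With the manual conversion dropped, the documented flow presumably reduces to the following sketch (the prompt, `fps`, and audio indexing are illustrative placeholders, not values copied from the docs page):

```python
# Sketch of the simplified docs example; encode_video now converts the
# pipeline's float [0, 1] numpy output to uint8 internally.
video, audio = pipe(
    prompt="A calico cat playing a piano on stage",  # placeholder prompt
    output_type="np",
    return_dict=False,
)

encode_video(
    video[0],
    fps=24,  # illustrative; use the rate matching your generation settings
    audio=audio[0],  # assumes a batched audio tensor
    audio_sample_rate=24000,  # LTX 2 typically uses 24 kHz audio
    output_path="output.mp4",
)
```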
src/diffusers/pipelines/ltx2/export_utils.py (68 changes: 59 additions & 9 deletions)
@@ -13,10 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from collections.abc import Iterator
 from fractions import Fraction
-from typing import Optional
+from itertools import chain
+from typing import List, Optional, Union

+import numpy as np
+import PIL.Image
 import torch
+from tqdm import tqdm

from ...utils import is_av_available

@@ -101,11 +106,54 @@ def _write_audio(


 def encode_video(
-    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
+    video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
+    fps: int,
+    audio: Optional[torch.Tensor],
+    audio_sample_rate: Optional[int],
+    output_path: str,
+    video_chunks_number: int = 1,
 ) -> None:
-    video_np = video.cpu().numpy()
-
-    _, height, width, _ = video_np.shape
"""
Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182

Args:
video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
A video tensor of shape [frames, height, width, channels] with integer pixel values in [0, 255]. If the
input is a `np.ndarray`, it is expected to be a float array with values in [0, 1] (which is what pipelines
usually return with `output_type="np"`).
fps (`int`)
The frames per second (FPS) of the encoded video.
audio (`torch.Tensor`, *optional*):
An audio waveform of shape [audio_channels, samples].
audio_sample_rate: (`int`, *optional*):
The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
output_path (`str`):
The path to save the encoded video to.
video_chunks_number (`int`, *optional*, defaults to `1`):
> **Member:** When is this option helpful?
>
> **Collaborator (author):** The original LTX-2 code will use a `video_chunks_number` calculated from the video VAE tiling config, for example in two-stage inference:
>
> https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py#L257
>
> For the default `num_frames` value of 121 and the default tiling config `TilingConfig.default()`, I believe this works out to 3 chunks. The idea seems to be that the chunks correspond to each tiled stride when decoding.
>
> In practice, I haven't had any issues with the current code, which is equivalent to just using one chunk. I don't fully understand the reasoning behind why the original code supports it; my guess is that it is useful for very long videos or if there are compute constraints.
>
> **Collaborator (author):** See #13057 (comment) for discussion about some complications for supporting `video_chunks_number`.
+    if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
+        # Pipeline output_type="pil"; assumes each image is in "RGB" mode
+        video_frames = [np.array(frame) for frame in video]
+        video = np.stack(video_frames, axis=0)
+        video = torch.from_numpy(video)
+    elif isinstance(video, np.ndarray):
+        # Pipeline output_type="np"; float values in [0, 1] are converted to uint8 values in [0, 255]
+        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
+        if np.all(is_denormalized):
+            video = (video * 255).round().astype("uint8")
+        video = torch.from_numpy(video)
+
+    if isinstance(video, torch.Tensor):
+        # Split into video_chunks_number chunks along the frame dimension
+        video = torch.tensor_split(video, video_chunks_number, dim=0)
+        video = iter(video)
+
+    first_chunk = next(video)
+
+    _, height, width, _ = first_chunk.shape

    container = av.open(output_path, mode="w")
    stream = container.add_stream("libx264", rate=int(fps))
@@ -119,10 +167,12 @@ def encode_video(

    audio_stream = _prepare_audio_stream(container, audio_sample_rate)

-    for frame_array in video_np:
-        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-        for packet in stream.encode(frame):
-            container.mux(packet)
+    for video_chunk in tqdm(chain([first_chunk], video), total=video_chunks_number, desc="Encoding video chunks"):
+        video_chunk_cpu = video_chunk.to("cpu").numpy()
+        for frame_array in video_chunk_cpu:
+            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
> **Member:** Should we let the users control this format? 👀
>
> **Collaborator (author, @dg845, Jan 30, 2026):** I think we could allow the users to specify the format, but this would be in tension with value checking as suggested in #13057 (comment): for example, if we always convert denormalized inputs with values in $[0, 1]$ to uint8 values in $\{0, 1, \ldots, 255\}$, that would probably make it difficult to support a variety of formats.
>
> We could conditionally convert based on the supplied `video_format`, but my understanding is that there are a lot of video formats, and I don't think we can anticipate all of the use cases that users may have. So I think we could support a `video_format` argument with a "use at your own risk" caveat:
>
> ```python
> elif isinstance(video, np.ndarray):
>     # Pipeline output_type="np"
>     is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
>     if np.all(is_denormalized) and video_format == "rgb24":
>         video = (video * 255).round().astype("uint8")
>     else:
>         logger.warning(
>             f"The video will be encoded using the input `video` values as-is with format {video_format}. Make sure"
>             " the values are in the proper range for the supplied format."
>         )
>     video = torch.from_numpy(video)
> ```
>
> An alternative would be to only support "rgb24" as the original LTX-2 code does, with the idea that power users can use their own video encoding code if they have a different use case.
>
> EDIT: the right terminology here might be "pixel format" rather than "video format".
>
> **Member:**
>
> > An alternative would be to only support "rgb24" as the original LTX-2 code does, with the idea that power users can use their own video encoding code if they have a different use case.
>
> Okay, let's go with this.
+            for packet in stream.encode(frame):
+                container.mux(packet)

    # Flush encoder
    for packet in stream.encode():
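Taken together, the new input handling means each pipeline `output_type` can be passed through directly. A self-contained sketch with synthetic data (the import path is assumed from the file location and may differ; `fps` and output paths are illustrative):

```python
import numpy as np
import PIL.Image
import torch

# Assumed import path, based on src/diffusers/pipelines/ltx2/export_utils.py
from diffusers.pipelines.ltx2.export_utils import encode_video

frames, height, width = 12, 64, 64

# 1. List of PIL images (output_type="pil")
pil_video = [PIL.Image.new("RGB", (width, height), (i * 20, 0, 0)) for i in range(frames)]
encode_video(pil_video, fps=12, audio=None, audio_sample_rate=None, output_path="pil.mp4")

# 2. Float numpy array in [0, 1] (output_type="np"); converted to uint8 internally
np_video = np.random.rand(frames, height, width, 3).astype(np.float32)
encode_video(np_video, fps=12, audio=None, audio_sample_rate=None, output_path="np.mp4")

# 3. uint8 torch tensor, encoded in 3 chunks along the frame dimension
pt_video = torch.randint(0, 256, (frames, height, width, 3), dtype=torch.uint8)
encode_video(pt_video, fps=12, audio=None, audio_sample_rate=None, output_path="pt.mp4", video_chunks_number=3)
```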
src/diffusers/pipelines/ltx2/pipeline_ltx2.py (2 changes: 0 additions & 2 deletions)
@@ -69,8 +69,6 @@
... output_type="np",
... return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
... video[0],
src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py (2 changes: 0 additions & 2 deletions)
@@ -75,8 +75,6 @@
... output_type="np",
... return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
... video[0],
src/diffusers/pipelines/ltx2/pipeline_ltx2_latent_upsample.py (2 changes: 0 additions & 2 deletions)
@@ -76,8 +76,6 @@
... output_type="np",
... return_dict=False,
... )[0]
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
... video[0],