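Set default FPS to official LTX 2 checkpoint default of 24.0

Change the default `fps` / `frame_rate` from 25.0 to 24.0 in the LTX 2 transformer and in both LTX 2 pipelines. Also remove the `fps` parameter from `prepare_audio_coords` and drop the `fps=` argument at its call sites.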

dg845 · dg845 · commit 99ff722e0804 · 2026-01-08T01:48:31.000+01:00
diff --git a/src/diffusers/models/transformers/transformer_ltx2.py b/src/diffusers/models/transformers/transformer_ltx2.py
@@ -667,7 +667,7 @@ def prepare_video_coords(
         height: int,
         width: int,
         device: torch.device,
-        fps: float = 25.0,
+        fps: float = 24.0,
     ) -> torch.Tensor:
         """
         Create per-dimension bounds [inclusive start, exclusive end) for each patch with respect to the original pixel
@@ -735,7 +735,6 @@ def prepare_audio_coords(
         batch_size: int,
         num_frames: int,
         device: torch.device,
-        fps: float = 25.0,
         shift: int = 0,
     ) -> torch.Tensor:
         """
@@ -1115,7 +1114,7 @@ def forward(
         num_frames: Optional[int] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
-        fps: float = 25.0,
+        fps: float = 24.0,
         audio_num_frames: Optional[int] = None,
         video_coords: Optional[torch.Tensor] = None,
         audio_coords: Optional[torch.Tensor] = None,
@@ -1176,7 +1175,7 @@ def forward(
             )
         if audio_coords is None:
             audio_coords = self.audio_rope.prepare_audio_coords(
-                batch_size, audio_num_frames, audio_hidden_states.device, fps=fps
+                batch_size, audio_num_frames, audio_hidden_states.device
             )
 
         video_rotary_emb = self.rope(video_coords, device=hidden_states.device)
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -746,7 +746,7 @@ def __call__(
         height: int = 512,
         width: int = 768,
         num_frames: int = 121,
-        frame_rate: float = 25.0,
+        frame_rate: float = 24.0,
         num_inference_steps: int = 40,
         timesteps: List[int] = None,
         guidance_scale: float = 3.0,
@@ -781,7 +781,7 @@ def __call__(
                 The width in pixels of the generated image. This is set to 848 by default for the best results.
             num_frames (`int`, *optional*, defaults to `121`):
                 The number of video frames to generate
-            frame_rate (`float`, *optional*, defaults to `25.0`):
+            frame_rate (`float`, *optional*, defaults to `24.0`):
                 The frames per second (FPS) of the generated video.
             num_inference_steps (`int`, *optional*, defaults to 40):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -996,7 +996,7 @@ def __call__(
             latents.shape[0], latent_num_frames, latent_height, latent_width, latents.device, fps=frame_rate
         )
         audio_coords = self.transformer.audio_rope.prepare_audio_coords(
-            audio_latents.shape[0], audio_num_frames, audio_latents.device, fps=frame_rate
+            audio_latents.shape[0], audio_num_frames, audio_latents.device
         )
 
         # 7. Denoising loop
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
@@ -807,7 +807,7 @@ def __call__(
         height: int = 512,
         width: int = 768,
         num_frames: int = 121,
-        frame_rate: float = 25.0,
+        frame_rate: float = 24.0,
         num_inference_steps: int = 40,
         timesteps: List[int] = None,
         guidance_scale: float = 3.0,
@@ -844,7 +844,7 @@ def __call__(
                 The width in pixels of the generated image. This is set to 848 by default for the best results.
             num_frames (`int`, *optional*, defaults to `121`):
                 The number of video frames to generate
-            frame_rate (`float`, *optional*, defaults to `25.0`):
+            frame_rate (`float`, *optional*, defaults to `24.0`):
                 The frames per second (FPS) of the generated video.
             num_inference_steps (`int`, *optional*, defaults to 40):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -1067,7 +1067,7 @@ def __call__(
             latents.shape[0], latent_num_frames, latent_height, latent_width, latents.device, fps=frame_rate
         )
         audio_coords = self.transformer.audio_rope.prepare_audio_coords(
-            audio_latents.shape[0], audio_num_frames, audio_latents.device, fps=frame_rate
+            audio_latents.shape[0], audio_num_frames, audio_latents.device
         )
 
         # 7. Denoising loop