
Commit d9c6bc6

make style and make quality
1 parent 80d9f8b commit d9c6bc6

4 files changed, +45 -34 lines changed

scripts/convert_wan_to_diffusers.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,4 @@
 import argparse
-import math
 import pathlib
 from typing import Any, Dict, Tuple
 

@@ -582,7 +581,6 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
                 "ffn_dim": 13824,
                 "freq_dim": 256,
                 "in_channels": 36,
-                "motion_encoder_dim": 512,
                 "num_attention_heads": 40,
                 "num_layers": 40,
                 "out_channels": 16,

src/diffusers/models/transformers/transformer_wan_animate.py

Lines changed: 24 additions & 12 deletions
@@ -40,7 +40,15 @@
 
 
 WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES = {
-    "4": 512, "8": 512, "16": 512, "32": 512, "64": 256, "128": 128, "256": 64, "512": 32, "1024": 16
+    "4": 512,
+    "8": 512,
+    "16": 512,
+    "32": 512,
+    "64": 256,
+    "128": 128,
+    "256": 64,
+    "512": 32,
+    "1024": 16,
 }
 
 
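Note: the table above presumably maps string feature-map resolutions to motion-encoder channel widths. A minimal, hypothetical lookup sketch (the helper name is not from the module):

WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES = {
    "4": 512, "8": 512, "16": 512, "32": 512, "64": 256,
    "128": 128, "256": 64, "512": 32, "1024": 16,
}


def channels_for_resolution(resolution: int) -> int:
    # Keys are strings, so convert the integer resolution before the lookup.
    return WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES[str(resolution)]


assert channels_for_resolution(64) == 256
assert channels_for_resolution(1024) == 16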
@@ -77,7 +85,11 @@ def __init__(self, negative_slope: float = 0.2, scale: float = 2**0.5, bias_chan
         self.channels = bias_channels
 
         if self.channels is not None:
-            self.bias = nn.Parameter(torch.zeros(self.channels,))
+            self.bias = nn.Parameter(
+                torch.zeros(
+                    self.channels,
+                )
+            )
         else:
             self.bias = None
 
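The `negative_slope=0.2`, `scale=2**0.5` signature with an optional per-channel bias looks like a StyleGAN-style fused bias + leaky ReLU. A hedged sketch of how such a bias is typically consumed (an assumption about the pattern, not this module's actual forward):

from typing import Optional

import torch
import torch.nn.functional as F


def fused_bias_leaky_relu_sketch(
    x: torch.Tensor,
    bias: Optional[torch.Tensor],
    negative_slope: float = 0.2,
    scale: float = 2**0.5,
) -> torch.Tensor:
    # Assumed pattern: add the learnable per-channel bias (broadcast over the
    # spatial dims), apply leaky ReLU, then rescale to preserve activation variance.
    if bias is not None:
        x = x + bias.view(1, -1, 1, 1)
    return F.leaky_relu(x, negative_slope=negative_slope) * scale


out = fused_bias_leaky_relu_sketch(torch.randn(2, 8, 16, 16), torch.zeros(8))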
@@ -121,13 +133,13 @@ def __init__(
         # Normalize kernel
         kernel = kernel / kernel.sum()
         if blur_upsample_factor > 1:
-            kernel = kernel * (blur_upsample_factor ** 2)
+            kernel = kernel * (blur_upsample_factor**2)
         self.register_buffer("blur_kernel", kernel, persistent=False)
         self.blur = True
 
         # Main Conv2d parameters (with scale factor)
         self.weight = nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))
-        self.scale = 1 / math.sqrt(in_channels * kernel_size ** 2)
+        self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
 
         self.stride = stride
         self.padding = padding
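The `scale = 1 / math.sqrt(in_channels * kernel_size**2)` factor (and `1 / math.sqrt(in_dim)` for the linear layer below) is the equalized learning-rate trick from StyleGAN: parameters stay unit-normal and the fan-in scaling is applied at runtime. A minimal sketch of how such a scale is usually applied in the forward pass (an assumption, not this class's code):

import math

import torch
import torch.nn.functional as F

in_channels, out_channels, kernel_size = 16, 32, 3
weight = torch.randn(out_channels, in_channels, kernel_size, kernel_size)
scale = 1 / math.sqrt(in_channels * kernel_size**2)  # fan-in based rescaling

x = torch.randn(1, in_channels, 8, 8)
# The effective weight is weight * scale, so the stored parameter keeps unit variance.
y = F.conv2d(x, weight * scale, stride=1, padding=1)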
@@ -161,8 +173,8 @@ def forward(self, x: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
 
     def __repr__(self):
         return (
-            f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},'
-            f' kernel_size={self.weight.shape[2]}, stride={self.stride}, padding={self.padding})'
+            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},"
+            f" kernel_size={self.weight.shape[2]}, stride={self.stride}, padding={self.padding})"
         )
 
 
@@ -179,7 +191,7 @@ def __init__(
 
         # Linear weight with scale factor
         self.weight = nn.Parameter(torch.randn(out_dim, in_dim))
-        self.scale = (1 / math.sqrt(in_dim))
+        self.scale = 1 / math.sqrt(in_dim)
 
         # If an activation is present, the bias will be fused to it
         if bias and not self.use_activation:
@@ -200,8 +212,8 @@ def forward(self, input: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
 
     def __repr__(self):
         return (
-            f'{self.__class__.__name__}(in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]},'
-            f' bias={self.bias is not None})'
+            f"{self.__class__.__name__}(in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]},"
+            f" bias={self.bias is not None})"
         )
 
 
@@ -616,7 +628,7 @@ def __init__(
             # TODO: should this always be true?
             assert in_channels == 2 * latent_channels + 4, "in_channels should be 2 * latent_channels + 4"
         else:
-            raise ValueError(f"At least one of `in_channels` and `latent_channels` must be supplied.")
+            raise ValueError("At least one of `in_channels` and `latent_channels` must be supplied.")
         out_channels = out_channels or latent_channels
 
         # 1. Patch & position embedding
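The assertion above connects to the converter change at the top of this commit: with the Wan VAE's 16 latent channels (see the `hidden_states` docstring in the next hunk), `2 * latent_channels + 4` gives the 36 input channels listed in `convert_wan_to_diffusers.py`. A quick check:

latent_channels = 16  # Wan VAE latent channels, per the docstring below
in_channels = 2 * latent_channels + 4
assert in_channels == 36  # matches "in_channels": 36 in the converter config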
@@ -722,8 +734,8 @@ def forward(
         Args:
             hidden_states (`torch.Tensor` of shape `(B, 2C + 4, T + 1, H, W)`):
                 Input noisy video latents of shape `(B, 2C + 4, T + 1, H, W)`, where B is the batch size, C is the
-                number of latent channels (16 for Wan VAE), T is the number of latent frames in an inference segment,
-                H is the latent height, and W is the latent width.
+                number of latent channels (16 for Wan VAE), T is the number of latent frames in an inference segment, H
+                is the latent height, and W is the latent width.
             timestep: (`torch.LongTensor`):
                 The current timestep in the denoising loop.
             encoder_hidden_states (`torch.Tensor`):

src/diffusers/pipelines/wan/pipeline_wan_animate.py

Lines changed: 18 additions & 17 deletions
@@ -421,9 +421,7 @@ def check_inputs(
                 " undefined when mode is `replace`."
             )
         if mode == "replace" and (not isinstance(background_video, list) or not isinstance(mask_video, list)):
-            raise ValueError(
-                "`background_video` and `mask_video` must be lists of PIL images when mode is `replace`."
-            )
+            raise ValueError("`background_video` and `mask_video` must be lists of PIL images when mode is `replace`.")
 
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
@@ -609,7 +607,7 @@ def prepare_prev_segment_cond_latents(
         )
         prev_segment_cond_video = prev_segment_cond_video.unflatten(0, (batch_size, -1)).transpose(1, 2)
 
-        # Fill the remaining part of the cond video segment with zeros (if animating) or the background video (if 
+        # Fill the remaining part of the cond video segment with zeros (if animating) or the background video (if
         # replacing).
         if task == "replace":
             remaining_segment = background_video[:, :, prev_segment_cond_frames:].to(dtype)
@@ -626,7 +624,8 @@ def prepare_prev_segment_cond_latents(
         if isinstance(generator, list):
             if data_batch_size == len(generator):
                 prev_segment_cond_latents = [
-                    retrieve_latents(self.vae.encode(full_segment_cond_video[i].unsqueeze(0)), g, sample_mode) for i, g in enumerate(generator)
+                    retrieve_latents(self.vae.encode(full_segment_cond_video[i].unsqueeze(0)), g, sample_mode)
+                    for i, g in enumerate(generator)
                 ]
             elif data_batch_size == 1:
                 # Like prepare_latents, assume len(generator) == batch_size
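The rewrapped comprehension pairs each batch item with its own `torch.Generator`, keeping per-sample VAE sampling reproducible. A self-contained sketch of the same pattern, with a hypothetical `encode_one` standing in for `retrieve_latents(self.vae.encode(...), ...)`:

import torch


def encode_one(video: torch.Tensor, generator: torch.Generator) -> torch.Tensor:
    # Stand-in for the VAE encode + latent sampling; draws from the per-item generator.
    return torch.randn(4, 8, 8, generator=generator)


batch = torch.randn(2, 3, 16, 64, 64)  # (B, C, T, H, W)
generators = [torch.Generator().manual_seed(i) for i in range(batch.shape[0])]
latents = torch.stack([encode_one(batch[i].unsqueeze(0), g) for i, g in enumerate(generators)])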
@@ -813,11 +812,11 @@ def __call__(
             face_video (`List[PIL.Image.Image]`):
                 The input face video to condition the generation on. Must be a list of PIL images.
             background_video (`List[PIL.Image.Image]`, *optional*):
-                When mode is `"replace"`, the input background video to condition the generation on. Must be a list
-                of PIL images.
-            mask_video (`List[PIL.Image.Image]`, *optional*):
-                When mode is `"replace"`, the input mask video to condition the generation on. Must be a list of
+                When mode is `"replace"`, the input background video to condition the generation on. Must be a list of
                 PIL images.
+            mask_video (`List[PIL.Image.Image]`, *optional*):
+                When mode is `"replace"`, the input mask video to condition the generation on. Must be a list of PIL
+                images.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
@@ -828,16 +827,16 @@ def __call__(
             mode (`str`, defaults to `"animation"`):
                 The mode of the generation. Choose between `"animate"` and `"replace"`.
             prev_segment_conditioning_frames (`int`, defaults to `1`):
-                The number of frames from the previous video segment to be used for temporal guidance. Recommended
-                to be 1 or 5. In general, should be 4N + 1, where N is a non-negative integer.
+                The number of frames from the previous video segment to be used for temporal guidance. Recommended to
+                be 1 or 5. In general, should be 4N + 1, where N is a non-negative integer.
             height (`int`, defaults to `720`):
                 The height of the generated video.
             width (`int`, defaults to `1280`):
                 The width of the generated video.
             segment_frame_length (`int`, defaults to `77`):
-                The number of frames in each generated video segment. The total frames of video generated will be
-                equal to the number of frames in `pose_video`; we will generate the video in segments until we have
-                hit this length. In general, should be 4N + 1, where N is a non-negative integer.
+                The number of frames in each generated video segment. The total frames of video generated will be equal
+                to the number of frames in `pose_video`; we will generate the video in segments until we have hit this
+                length. In general, should be 4N + 1, where N is a non-negative integer.
             num_inference_steps (`int`, defaults to `20`):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -846,8 +845,8 @@ def __call__(
                 Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
-                the text `prompt`, usually at the expense of lower image quality. By default, CFG is not used in
-                Wan Animate inference.
+                the text `prompt`, usually at the expense of lower image quality. By default, CFG is not used in Wan
+                Animate inference.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
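For reference, `guidance_scale` is the weight `w` in the standard classifier-free guidance combination; a one-line sketch with illustrative tensors:

import torch

guidance_scale = 5.0
noise_pred_uncond = torch.randn(1, 16, 8, 8)
noise_pred_text = torch.randn(1, 16, 8, 8)
# Classifier-free guidance: push the prediction toward the text-conditioned branch.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)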
@@ -923,7 +922,9 @@ def __call__(
                 f"`segment_frame_length - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the"
                 f" nearest number."
             )
-            segment_frame_length = segment_frame_length // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            segment_frame_length = (
+                segment_frame_length // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            )
         segment_frame_length = max(segment_frame_length, 1)
 
         self._guidance_scale = guidance_scale
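The parenthesized expression snaps `segment_frame_length` onto the `4N + 1` grid expected by the docstring, assuming `vae_scale_factor_temporal == 4` for the Wan VAE (substitute the pipeline's actual value). A quick check of the behavior:

vae_scale_factor_temporal = 4  # assumed value for the Wan VAE


def round_segment_length(segment_frame_length: int) -> int:
    # Same arithmetic as the rewrapped expression above, followed by the floor at 1.
    segment_frame_length = (
        segment_frame_length // vae_scale_factor_temporal * vae_scale_factor_temporal + 1
    )
    return max(segment_frame_length, 1)


assert round_segment_length(77) == 77  # already 4N + 1
assert round_segment_length(80) == 81
assert round_segment_length(79) == 77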

tests/models/transformers/test_models_transformer_wan_animate.py

Lines changed: 3 additions & 3 deletions
@@ -55,9 +55,9 @@ def dummy_input(self):
         encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
         clip_ref_features = torch.randn((batch_size, clip_seq_len, clip_dim)).to(torch_device)
         pose_latents = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        face_pixel_values = torch.randn(
-            (batch_size, 3, inference_segment_length, face_height, face_width)
-        ).to(torch_device)
+        face_pixel_values = torch.randn((batch_size, 3, inference_segment_length, face_height, face_width)).to(
+            torch_device
+        )
 
         return {
             "hidden_states": hidden_states,
