
Commit 2261510

sayakpaul and yiyixuxu authored
[Core] Add AuraFlow (#8796)
* add lavender flow transformer --------- Co-authored-by: YiYi Xu <[email protected]>
1 parent 87b9db6 commit 2261510

18 files changed: 1459 additions, 27 deletions
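For orientation, the public entry points added by this commit are `AuraFlowPipeline` and `AuraFlowTransformer2DModel`, registered in the diffs below. A minimal usage sketch; the checkpoint id `fal/AuraFlow`, the prompt, and the sampler settings are illustrative assumptions, not part of this diff:

import torch
from diffusers import AuraFlowPipeline

# Illustrative checkpoint id; substitute whichever repository hosts converted AuraFlow weights.
pipe = AuraFlowPipeline.from_pretrained("fal/AuraFlow", torch_dtype=torch.float16).to("cuda")

image = pipe(
    prompt="a lavender flower in a glass vase, studio lighting",  # example prompt
    num_inference_steps=50,
    guidance_scale=3.5,
).images[0]
image.save("auraflow.png")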
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import argparse

import torch
from huggingface_hub import hf_hub_download

from diffusers.models.transformers.auraflow_transformer_2d import AuraFlowTransformer2DModel


def load_original_state_dict(args):
    model_pt = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename="aura_diffusion_pytorch_model.bin")
    state_dict = torch.load(model_pt, map_location="cpu")
    return state_dict


def calculate_layers(state_dict_keys, key_prefix):
    dit_layers = set()
    for k in state_dict_keys:
        if key_prefix in k:
            dit_layers.add(int(k.split(".")[2]))
    print(f"{key_prefix}: {len(dit_layers)}")
    return len(dit_layers)


# similar to SD3 but only for the last norm layer
def swap_scale_shift(weight, dim):
    shift, scale = weight.chunk(2, dim=0)
    new_weight = torch.cat([scale, shift], dim=0)
    return new_weight


def convert_transformer(state_dict):
    converted_state_dict = {}
    state_dict_keys = list(state_dict.keys())

    converted_state_dict["register_tokens"] = state_dict.pop("model.register_tokens")
    converted_state_dict["pos_embed.pos_embed"] = state_dict.pop("model.positional_encoding")
    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("model.init_x_linear.weight")
    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("model.init_x_linear.bias")

    converted_state_dict["time_step_proj.linear_1.weight"] = state_dict.pop("model.t_embedder.mlp.0.weight")
    converted_state_dict["time_step_proj.linear_1.bias"] = state_dict.pop("model.t_embedder.mlp.0.bias")
    converted_state_dict["time_step_proj.linear_2.weight"] = state_dict.pop("model.t_embedder.mlp.2.weight")
    converted_state_dict["time_step_proj.linear_2.bias"] = state_dict.pop("model.t_embedder.mlp.2.bias")

    converted_state_dict["context_embedder.weight"] = state_dict.pop("model.cond_seq_linear.weight")

    mmdit_layers = calculate_layers(state_dict_keys, key_prefix="double_layers")
    single_dit_layers = calculate_layers(state_dict_keys, key_prefix="single_layers")

    # MMDiT blocks 🎸.
    for i in range(mmdit_layers):
        # feed-forward
        path_mapping = {"mlpX": "ff", "mlpC": "ff_context"}
        weight_mapping = {"c_fc1": "linear_1", "c_fc2": "linear_2", "c_proj": "out_projection"}
        for orig_k, diffuser_k in path_mapping.items():
            for k, v in weight_mapping.items():
                converted_state_dict[f"joint_transformer_blocks.{i}.{diffuser_k}.{v}.weight"] = state_dict.pop(
                    f"model.double_layers.{i}.{orig_k}.{k}.weight"
                )

        # norms
        path_mapping = {"modX": "norm1", "modC": "norm1_context"}
        for orig_k, diffuser_k in path_mapping.items():
            converted_state_dict[f"joint_transformer_blocks.{i}.{diffuser_k}.linear.weight"] = state_dict.pop(
                f"model.double_layers.{i}.{orig_k}.1.weight"
            )

        # attns
        x_attn_mapping = {"w2q": "to_q", "w2k": "to_k", "w2v": "to_v", "w2o": "to_out.0"}
        context_attn_mapping = {"w1q": "add_q_proj", "w1k": "add_k_proj", "w1v": "add_v_proj", "w1o": "to_add_out"}
        for attn_mapping in [x_attn_mapping, context_attn_mapping]:
            for k, v in attn_mapping.items():
                converted_state_dict[f"joint_transformer_blocks.{i}.attn.{v}.weight"] = state_dict.pop(
                    f"model.double_layers.{i}.attn.{k}.weight"
                )

    # Single-DiT blocks.
    for i in range(single_dit_layers):
        # feed-forward
        mapping = {"c_fc1": "linear_1", "c_fc2": "linear_2", "c_proj": "out_projection"}
        for k, v in mapping.items():
            converted_state_dict[f"single_transformer_blocks.{i}.ff.{v}.weight"] = state_dict.pop(
                f"model.single_layers.{i}.mlp.{k}.weight"
            )

        # norms
        converted_state_dict[f"single_transformer_blocks.{i}.norm1.linear.weight"] = state_dict.pop(
            f"model.single_layers.{i}.modCX.1.weight"
        )

        # attns
        x_attn_mapping = {"w1q": "to_q", "w1k": "to_k", "w1v": "to_v", "w1o": "to_out.0"}
        for k, v in x_attn_mapping.items():
            converted_state_dict[f"single_transformer_blocks.{i}.attn.{v}.weight"] = state_dict.pop(
                f"model.single_layers.{i}.attn.{k}.weight"
            )

    # Final blocks.
    converted_state_dict["proj_out.weight"] = state_dict.pop("model.final_linear.weight")
    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(state_dict.pop("model.modF.1.weight"), dim=None)

    return converted_state_dict


@torch.no_grad()
def populate_state_dict(args):
    original_state_dict = load_original_state_dict(args)
    state_dict_keys = list(original_state_dict.keys())
    mmdit_layers = calculate_layers(state_dict_keys, key_prefix="double_layers")
    single_dit_layers = calculate_layers(state_dict_keys, key_prefix="single_layers")

    converted_state_dict = convert_transformer(original_state_dict)
    model_diffusers = AuraFlowTransformer2DModel(
        num_mmdit_layers=mmdit_layers, num_single_dit_layers=single_dit_layers
    )
    model_diffusers.load_state_dict(converted_state_dict, strict=True)

    return model_diffusers


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--original_state_dict_repo_id", default="AuraDiffusion/auradiffusion-v0.1a0", type=str)
    parser.add_argument("--dump_path", default="aura-flow", type=str)
    parser.add_argument("--hub_id", default=None, type=str)
    args = parser.parse_args()

    model_diffusers = populate_state_dict(args)
    model_diffusers.save_pretrained(args.dump_path)
    if args.hub_id is not None:
        model_diffusers.push_to_hub(args.hub_id)
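As a quick sanity check after running the conversion script above (its filename is not shown in this view), the directory written by `save_pretrained()` can be reloaded with the standard diffusers API. A minimal sketch, assuming the default `--dump_path` of `aura-flow`:

import torch
from diffusers import AuraFlowTransformer2DModel

# Reload the transformer that populate_state_dict() converted and save_pretrained() wrote out.
transformer = AuraFlowTransformer2DModel.from_pretrained("aura-flow", torch_dtype=torch.float32)
print(f"{sum(p.numel() for p in transformer.parameters()) / 1e9:.2f}B parameters")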

src/diffusers/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -76,6 +76,7 @@
     _import_structure["models"].extend(
         [
             "AsymmetricAutoencoderKL",
+            "AuraFlowTransformer2DModel",
             "AutoencoderKL",
             "AutoencoderKLTemporalDecoder",
             "AutoencoderTiny",
@@ -235,6 +236,7 @@
             "AudioLDM2ProjectionModel",
             "AudioLDM2UNet2DConditionModel",
             "AudioLDMPipeline",
+            "AuraFlowPipeline",
             "BlipDiffusionControlNetPipeline",
             "BlipDiffusionPipeline",
             "ChatGLMModel",
@@ -507,6 +509,7 @@
 else:
     from .models import (
         AsymmetricAutoencoderKL,
+        AuraFlowTransformer2DModel,
         AutoencoderKL,
         AutoencoderKLTemporalDecoder,
         AutoencoderTiny,
@@ -646,6 +649,7 @@
         AudioLDM2ProjectionModel,
         AudioLDM2UNet2DConditionModel,
         AudioLDMPipeline,
+        AuraFlowPipeline,
         ChatGLMModel,
         ChatGLMTokenizer,
         CLIPImageProjection,

src/diffusers/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@
     _import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
     _import_structure["embeddings"] = ["ImageProjection"]
     _import_structure["modeling_utils"] = ["ModelMixin"]
+    _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"]
     _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"]
     _import_structure["transformers.dual_transformer_2d"] = ["DualTransformer2DModel"]
     _import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
@@ -84,6 +85,7 @@
     from .embeddings import ImageProjection
     from .modeling_utils import ModelMixin
     from .transformers import (
+        AuraFlowTransformer2DModel,
         DiTTransformer2DModel,
         DualTransformer2DModel,
         HunyuanDiT2DModel,
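The two `__init__.py` diffs above only wire the new classes into the package's lazy import machinery. A purely illustrative check of the resulting import surface:

# Both names resolve lazily through the _import_structure entries added in this commit.
from diffusers import AuraFlowPipeline, AuraFlowTransformer2DModel

print(AuraFlowPipeline.__name__, AuraFlowTransformer2DModel.__name__)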

src/diffusers/models/attention_processor.py

Lines changed: 114 additions & 4 deletions
@@ -22,7 +22,7 @@
 from ..image_processor import IPAdapterMaskProcessor
 from ..utils import deprecate, logging
 from ..utils.import_utils import is_torch_npu_available, is_xformers_available
-from ..utils.torch_utils import maybe_allow_in_graph
+from ..utils.torch_utils import is_torch_version, maybe_allow_in_graph


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -104,6 +104,7 @@ def __init__(
         cross_attention_norm_num_groups: int = 32,
         qk_norm: Optional[str] = None,
         added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
         norm_num_groups: Optional[int] = None,
         spatial_norm_dim: Optional[int] = None,
         out_bias: bool = True,
@@ -118,6 +119,10 @@ def __init__(
         context_pre_only=None,
     ):
         super().__init__()
+
+        # To prevent circular import.
+        from .normalization import FP32LayerNorm
+
         self.inner_dim = out_dim if out_dim is not None else dim_head * heads
         self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
         self.query_dim = query_dim
@@ -170,6 +175,9 @@ def __init__(
         elif qk_norm == "layer_norm":
             self.norm_q = nn.LayerNorm(dim_head, eps=eps)
             self.norm_k = nn.LayerNorm(dim_head, eps=eps)
+        elif qk_norm == "fp32_layer_norm":
+            self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+            self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
         elif qk_norm == "layer_norm_across_heads":
             # Lumina applys qk norm across all heads
             self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
@@ -211,10 +219,10 @@ def __init__(
             self.to_v = None

         if self.added_kv_proj_dim is not None:
-            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim)
-            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim)
+            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
+            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
             if self.context_pre_only is not None:
-                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim)
+                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)

         self.to_out = nn.ModuleList([])
         self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
@@ -223,6 +231,14 @@ def __init__(
         if self.context_pre_only is not None and not self.context_pre_only:
             self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)

+        if qk_norm is not None and added_kv_proj_dim is not None:
+            if qk_norm == "fp32_layer_norm":
+                self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+                self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
+        else:
+            self.norm_added_q = None
+            self.norm_added_k = None
+
         # set attention processor
         # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
         # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
@@ -1137,6 +1153,100 @@ def __call__(
         return hidden_states, encoder_hidden_states


+class AuraFlowAttnProcessor2_0:
+    """Attention processor used typically in processing Aura Flow."""
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention") and is_torch_version("<", "2.1"):
+            raise ImportError(
+                "AuraFlowAttnProcessor2_0 requires PyTorch 2.1 or above, as we use `scale` in `F.scaled_dot_product_attention()`. Please upgrade PyTorch."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        i=0,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        batch_size = hidden_states.shape[0]
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+        # Reshape.
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+        key = key.view(batch_size, -1, attn.heads, head_dim)
+        value = value.view(batch_size, -1, attn.heads, head_dim)
+
+        # Apply QK norm.
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Concatenate the projections.
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(batch_size, -1, attn.heads, head_dim)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            )
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=1)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # Attention.
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, dropout_p=0.0, scale=attn.scale, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        if encoder_hidden_states is not None:
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+            )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if encoder_hidden_states is not None:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if encoder_hidden_states is not None:
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
 class XFormersAttnAddedKVProcessor:
     r"""
     Processor for implementing memory efficient attention using xFormers.
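To show how the pieces added here fit together (the `fp32_layer_norm` option for `qk_norm`, the new `added_proj_bias` flag, and `AuraFlowAttnProcessor2_0`), here is a standalone sketch; the sizes (dim=3072, 12 heads) and flag values are illustrative assumptions rather than values taken from this diff:

import torch
from diffusers.models.attention_processor import Attention, AuraFlowAttnProcessor2_0

dim, heads = 3072, 12  # illustrative sizes
attn = Attention(
    query_dim=dim,
    cross_attention_dim=None,
    added_kv_proj_dim=dim,        # enables add_{q,k,v}_proj for the text/context stream
    added_proj_bias=False,        # new flag introduced in this commit
    dim_head=dim // heads,
    heads=heads,
    qk_norm="fp32_layer_norm",    # new option: FP32LayerNorm on q/k and on the added q/k
    out_dim=dim,
    bias=False,
    out_bias=False,
    context_pre_only=False,
    processor=AuraFlowAttnProcessor2_0(),
)

hidden_states = torch.randn(2, 256, dim)         # image tokens
encoder_hidden_states = torch.randn(2, 77, dim)  # text tokens
out, context_out = attn(hidden_states, encoder_hidden_states=encoder_hidden_states)
print(out.shape, context_out.shape)  # (2, 256, 3072), (2, 77, 3072)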

src/diffusers/models/embeddings.py

Lines changed: 3 additions & 1 deletion
@@ -473,18 +473,20 @@ def forward(self, sample, condition=None):


 class Timesteps(nn.Module):
-    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
+    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
         super().__init__()
         self.num_channels = num_channels
         self.flip_sin_to_cos = flip_sin_to_cos
         self.downscale_freq_shift = downscale_freq_shift
+        self.scale = scale

     def forward(self, timesteps):
         t_emb = get_timestep_embedding(
             timesteps,
             self.num_channels,
             flip_sin_to_cos=self.flip_sin_to_cos,
             downscale_freq_shift=self.downscale_freq_shift,
+            scale=self.scale,
         )
         return t_emb
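The new `scale` argument is simply forwarded to `get_timestep_embedding()`, as the diff shows. A short sketch; the channel count and scale value below are illustrative:

import torch
from diffusers.models.embeddings import Timesteps

# `scale` multiplies the timestep phases inside get_timestep_embedding() before the sin/cos projection.
time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
t_emb = time_proj(torch.tensor([0.25, 0.5]))  # e.g. continuous timesteps from a flow-matching schedule
print(t_emb.shape)  # torch.Size([2, 256])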
