Skip to content

Commit ad99a33

Browse files
committed
add only cross attention to simple attention blocks
1 parent ce144d6 commit ad99a33

File tree

4 files changed

+46
-15
lines changed

4 files changed

+46
-15
lines changed

src/diffusers/models/attention_processor.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def __init__(
6161
norm_num_groups: Optional[int] = None,
6262
out_bias: bool = True,
6363
scale_qk: bool = True,
64+
only_cross_attention: bool = False,
6465
processor: Optional["AttnProcessor"] = None,
6566
):
6667
super().__init__()
@@ -79,6 +80,12 @@ def __init__(
7980
self.sliceable_head_dim = heads
8081

8182
self.added_kv_proj_dim = added_kv_proj_dim
83+
self.only_cross_attention = only_cross_attention
84+
85+
if self.added_kv_proj_dim is None and self.only_cross_attention:
86+
raise ValueError(
87+
"`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
88+
)
8289

8390
if norm_num_groups is not None:
8491
self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
@@ -89,8 +96,11 @@ def __init__(
8996
self.norm_cross = nn.LayerNorm(cross_attention_dim)
9097

9198
self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
92-
self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
93-
self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
99+
100+
if not self.only_cross_attention:
101+
# only relevant for the `AddedKVProcessor` classes
102+
self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
103+
self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
94104

95105
if self.added_kv_proj_dim is not None:
96106
self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
@@ -409,18 +419,21 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
409419
query = attn.to_q(hidden_states)
410420
query = attn.head_to_batch_dim(query)
411421

412-
key = attn.to_k(hidden_states)
413-
value = attn.to_v(hidden_states)
414-
key = attn.head_to_batch_dim(key)
415-
value = attn.head_to_batch_dim(value)
416-
417422
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
418423
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
419424
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
420425
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
421426

422-
key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
423-
value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
427+
if not attn.only_cross_attention:
428+
key = attn.to_k(hidden_states)
429+
value = attn.to_v(hidden_states)
430+
key = attn.head_to_batch_dim(key)
431+
value = attn.head_to_batch_dim(value)
432+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
433+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
434+
else:
435+
key = encoder_hidden_states_key_proj
436+
value = encoder_hidden_states_value_proj
424437

425438
attention_probs = attn.get_attention_scores(query, key, attention_mask)
426439
hidden_states = torch.bmm(attention_probs, value)
@@ -639,18 +652,22 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None,
639652
dim = query.shape[-1]
640653
query = attn.head_to_batch_dim(query)
641654

642-
key = attn.to_k(hidden_states)
643-
value = attn.to_v(hidden_states)
644655
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
645656
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
646657

647-
key = attn.head_to_batch_dim(key)
648-
value = attn.head_to_batch_dim(value)
649658
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
650659
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
651660

652-
key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
653-
value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
661+
if not attn.only_cross_attention:
662+
key = attn.to_k(hidden_states)
663+
value = attn.to_v(hidden_states)
664+
key = attn.head_to_batch_dim(key)
665+
value = attn.head_to_batch_dim(value)
666+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
667+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
668+
else:
669+
key = encoder_hidden_states_key_proj
670+
value = encoder_hidden_states_value_proj
654671

655672
batch_size_attention, query_tokens, _ = query.shape
656673
hidden_states = torch.zeros(

src/diffusers/models/unet_2d_blocks.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def get_down_block(
119119
cross_attention_dim=cross_attention_dim,
120120
attn_num_head_channels=attn_num_head_channels,
121121
resnet_time_scale_shift=resnet_time_scale_shift,
122+
only_cross_attention=only_cross_attention,
122123
)
123124
elif down_block_type == "SkipDownBlock2D":
124125
return SkipDownBlock2D(
@@ -279,6 +280,7 @@ def get_up_block(
279280
cross_attention_dim=cross_attention_dim,
280281
attn_num_head_channels=attn_num_head_channels,
281282
resnet_time_scale_shift=resnet_time_scale_shift,
283+
only_cross_attention=only_cross_attention,
282284
)
283285
elif up_block_type == "AttnUpBlock2D":
284286
return AttnUpBlock2D(
@@ -562,6 +564,7 @@ def __init__(
562564
attn_num_head_channels=1,
563565
output_scale_factor=1.0,
564566
cross_attention_dim=1280,
567+
only_cross_attention=False,
565568
):
566569
super().__init__()
567570

@@ -600,6 +603,7 @@ def __init__(
600603
norm_num_groups=resnet_groups,
601604
bias=True,
602605
upcast_softmax=True,
606+
only_cross_attention=only_cross_attention,
603607
processor=AttnAddedKVProcessor(),
604608
)
605609
)
@@ -1337,6 +1341,7 @@ def __init__(
13371341
cross_attention_dim=1280,
13381342
output_scale_factor=1.0,
13391343
add_downsample=True,
1344+
only_cross_attention=False,
13401345
):
13411346
super().__init__()
13421347

@@ -1374,6 +1379,7 @@ def __init__(
13741379
norm_num_groups=resnet_groups,
13751380
bias=True,
13761381
upcast_softmax=True,
1382+
only_cross_attention=only_cross_attention,
13771383
processor=AttnAddedKVProcessor(),
13781384
)
13791385
)
@@ -2329,6 +2335,7 @@ def __init__(
23292335
cross_attention_dim=1280,
23302336
output_scale_factor=1.0,
23312337
add_upsample=True,
2338+
only_cross_attention=False,
23322339
):
23332340
super().__init__()
23342341
resnets = []
@@ -2367,6 +2374,7 @@ def __init__(
23672374
norm_num_groups=resnet_groups,
23682375
bias=True,
23692376
upcast_softmax=True,
2377+
only_cross_attention=only_cross_attention,
23702378
processor=AttnAddedKVProcessor(),
23712379
)
23722380
)

src/diffusers/models/unet_2d_condition.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def __init__(
153153
conv_out_kernel: int = 3,
154154
projection_class_embeddings_input_dim: Optional[int] = None,
155155
class_embeddings_concat: bool = False,
156+
mid_block_only_cross_attention: bool = False,
156157
):
157158
super().__init__()
158159

@@ -321,6 +322,7 @@ def __init__(
321322
attn_num_head_channels=attention_head_dim[-1],
322323
resnet_groups=norm_num_groups,
323324
resnet_time_scale_shift=resnet_time_scale_shift,
325+
only_cross_attention=mid_block_only_cross_attention,
324326
)
325327
elif mid_block_type is None:
326328
self.mid_block = None

src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def __init__(
239239
conv_out_kernel: int = 3,
240240
projection_class_embeddings_input_dim: Optional[int] = None,
241241
class_embeddings_concat: bool = False,
242+
mid_block_only_cross_attention: bool = False,
242243
):
243244
super().__init__()
244245

@@ -412,6 +413,7 @@ def __init__(
412413
attn_num_head_channels=attention_head_dim[-1],
413414
resnet_groups=norm_num_groups,
414415
resnet_time_scale_shift=resnet_time_scale_shift,
416+
only_cross_attention=mid_block_only_cross_attention,
415417
)
416418
elif mid_block_type is None:
417419
self.mid_block = None
@@ -1434,6 +1436,7 @@ def __init__(
14341436
attn_num_head_channels=1,
14351437
output_scale_factor=1.0,
14361438
cross_attention_dim=1280,
1439+
only_cross_attention=False,
14371440
):
14381441
super().__init__()
14391442

@@ -1472,6 +1475,7 @@ def __init__(
14721475
norm_num_groups=resnet_groups,
14731476
bias=True,
14741477
upcast_softmax=True,
1478+
only_cross_attention=only_cross_attention,
14751479
processor=AttnAddedKVProcessor(),
14761480
)
14771481
)

0 commit comments

Comments (0)