assert not (num_residual_streams <= 1 and neural_memory_qkv_receives_diff_views), 'allowing neural memory queries / keys / values to be derived from different combinations of the residual streams only works if hyper connections has more than 1 residual stream'
num_layer_choices = (layer - 1) * 4 + 1  # for each layer, have memory input select from attn inp, attn out, ff inp, and ff out - plus one for the current point in the residual stream (memory input)
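For concreteness, here is a rough sketch of how those candidate views could be gathered so the count matches `num_layer_choices`. This is purely illustrative and not the repo's actual bookkeeping; the helper name `collect_view_candidates` and the `hiddens_per_layer` structure are made up for this example.

```python
# Illustrative only: each earlier layer contributes 4 hiddens (attn input, attn output,
# ff input, ff output), plus 1 for the current point in the residual stream.

def collect_view_candidates(hiddens_per_layer, current_stream, layer):
    # hiddens_per_layer: list of dicts for layers 1 .. layer-1, e.g.
    #   {'attn_inp': t, 'attn_out': t, 'ff_inp': t, 'ff_out': t}
    candidates = [current_stream]

    for h in hiddens_per_layer[:layer - 1]:
        candidates += [h['attn_inp'], h['attn_out'], h['ff_inp'], h['ff_out']]

    num_layer_choices = (layer - 1) * 4 + 1
    assert len(candidates) == num_layer_choices
    return candidates
```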
train_mac.py (2 additions & 0 deletions)
@@ -48,6 +48,7 @@
 STORE_ATTN_POOL_CHUNKS = True              # whether to use attention pooling for chunk derived momentum, per-layer lr mod, decay
 MEMORY_MODEL_PER_LAYER_LEARNED_LR = True
 NEURAL_MEM_WEIGHT_RESIDUAL = True          # learning to accept contributions from the weights of the previous neural mem layer brings about significant improvements. this was improvised and not in the paper, but inspired by the value residual learning free lunch paper
+NEURAL_MEM_QKV_RECEIVES_DIFF_VIEW = True   # allows the neural memory to select which layers to derive queries / keys / values from, effectively letting it graft itself onto the transformer in whichever way is beneficial. this addresses an issue raised by a phd student who noted that the mem network is learning nothing more than wk @ wv. it also generalizes all possible ways to connect the neural memory to a transformer, a sort of NAS
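To make the "select which views feed queries / keys / values" idea concrete, here is a minimal sketch of one way such a selection could be learned: each of the q / k / v branches gets a softmax weighting over the candidate views, which is a soft, differentiable version of the NAS-like selection the comment alludes to. This `QKVViewSelector` module is not part of titans-pytorch, and the actual wiring in mac_transformer.py may differ; it is only an assumption-laden illustration.

```python
import torch
from torch import nn

class QKVViewSelector(nn.Module):
    # One learned softmax weighting over the candidate views per branch (q, k, v),
    # so the memory could, e.g., read queries from the current residual stream while
    # taking keys / values from an earlier layer's attention output.
    def __init__(self, num_choices):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(3, num_choices))  # 3 branches: q, k, v

    def forward(self, views):
        # views: (num_choices, batch, seq, dim) - stacked candidate hiddens
        weights = self.logits.softmax(dim = -1)                   # (3, num_choices)
        mixed = torch.einsum('c ..., g c -> g ...', views, weights)
        return mixed[0], mixed[1], mixed[2]                       # inputs for q, k, v

# usage sketch: a memory at layer 5 with 4 prior layers has (5 - 1) * 4 + 1 = 17 candidate views
# selector = QKVViewSelector(num_choices = 17)
# q_inp, k_inp, v_inp = selector(torch.stack(candidates))
```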