@@ -652,6 +652,7 @@ def store_memories(

         # surprises

+        adaptive_lr = rearrange(adaptive_lr, '(b h n) c -> b h (n c)', b = batch, h = heads)
         unweighted_mem_model_loss = rearrange(unweighted_mem_model_loss, '(b h n) c -> b h (n c)', b = batch, h = heads)

         # maybe softclamp grad norm
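
Note on the hunk above: the '(b h n) c -> b h (n c)' pattern unflattens the (batch, head, chunk) leading dimension and merges the chunk axis with the chunk length, so adaptive_lr ends up with one value per token for each batch and head, mirroring how unweighted_mem_model_loss is already reshaped. A minimal sketch of that rearrange with hypothetical sizes (the real batch, heads, and chunk sizes come from the surrounding method):

import torch
from einops import rearrange

batch, heads, chunks, chunk_size = 2, 4, 3, 5                      # hypothetical sizes

adaptive_lr = torch.rand(batch * heads * chunks, chunk_size)       # layout '(b h n) c'
adaptive_lr = rearrange(adaptive_lr, '(b h n) c -> b h (n c)', b = batch, h = heads)

assert adaptive_lr.shape == (batch, heads, chunks * chunk_size)    # layout 'b h (n c)'
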
@@ -695,7 +696,7 @@ def store_memories(
         if not return_surprises:
             return output

-        return (*output, unweighted_mem_model_loss)
+        return (*output, (unweighted_mem_model_loss, adaptive_lr))

         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

@@ -755,7 +756,7 @@ def store_memories(
         if not return_surprises:
             return updates, next_store_state

-        return updates, next_store_state, unweighted_mem_model_loss
+        return updates, next_store_state, (unweighted_mem_model_loss, adaptive_lr)

     def retrieve_memories(
         self,
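
Both return sites change the same way: when return_surprises is set, the trailing element of the return is now a pair (unweighted_mem_model_loss, adaptive_lr) rather than the loss tensor alone, so callers unpack one extra level. A toy sketch of the new return shape only (placeholders throughout, not the real method):

import torch

def fake_store_memories(return_surprises = False):
    # placeholders standing in for the real updates / state / surprise tensors
    updates, next_store_state = dict(), dict()
    unweighted_mem_model_loss = torch.zeros(2, 4, 16)
    adaptive_lr = torch.zeros(2, 4, 16)

    if not return_surprises:
        return updates, next_store_state

    return updates, next_store_state, (unweighted_mem_model_loss, adaptive_lr)

updates, next_store_state, surprises = fake_store_memories(return_surprises = True)
unweighted_mem_model_loss, adaptive_lr = surprises   # one extra unpack versus before
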
@@ -939,7 +940,7 @@ def accum_updates(past_updates, future_updates):

         # whether to allow network to slowly adjust from initial weight throughout (residual path) to fully updating weights every batch

-        surprises = None
+        surprises = (None, None)
         gate = None

         if exists(self.transition_gate):
@@ -966,7 +967,7 @@ def accum_updates(past_updates, future_updates):

             updates = accum_updates(updates, next_updates)

-            surprises = safe_cat((surprises, chunk_surprises), dim = -1)
+            surprises = tuple(safe_cat(args, dim = -1) for args in zip(surprises, chunk_surprises))

             if is_last and not update_after_final_store:
                 continue
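
The last two hunks keep the chunked store loop consistent with the new pair: surprises starts as (None, None), and each chunk's (loss, adaptive_lr) pair is concatenated element-wise along the last dimension. A minimal sketch of that accumulation, assuming a safe_cat that skips Nones (the repo's actual helper is not shown in this diff):

import torch

def safe_cat(tensors, dim = -1):
    # assumed behavior: drop Nones, concatenate whatever remains
    tensors = [t for t in tensors if t is not None]
    if len(tensors) == 0:
        return None
    return torch.cat(tensors, dim = dim)

surprises = (None, None)

for _ in range(3):  # stand-in for the loop over stored chunks
    chunk_surprises = (torch.rand(2, 4, 8), torch.rand(2, 4, 8))  # (loss, adaptive_lr) for this chunk
    surprises = tuple(safe_cat(args, dim = -1) for args in zip(surprises, chunk_surprises))

unweighted_mem_model_loss, adaptive_lr = surprises
assert unweighted_mem_model_loss.shape == (2, 4, 24) and adaptive_lr.shape == (2, 4, 24)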