# Non-Record: 11L Parallel Muon + LN Scale + LeakyReLU² MLP3x + Legal TTT

**3-seed mean val_bpb: 1.1215** (std=0.0002) | **~15.85 MB** | 8xH100 SXM

## 3-Seed Results (8xH100 80GB SXM, PyTorch 2.9.1+cu128)

| Seed | step_avg | steps | EMA bpb | Quantized bpb | **TTT bpb** |
|------|----------|-------|---------|---------------|-------------|
| 1337 | 88.8ms | 6,759 | 1.1161 | 1.1238 | **1.1217** |
| 42 | 88.8ms | 6,757 | 1.1158 | 1.1234 | **1.1213** |
| 2024 | 88.9ms | 6,752 | 1.1160 | 1.1234 | **1.1215** |
| **Mean** | **88.8ms** | **6,756** | **1.1160** | **1.1235** | **1.1215** |

## Architecture (26.8M parameters)

- 11 transformer layers, dim=512, 8 heads / 4 KV heads (GQA)
- **Parallel Muon** with parameter banking (4 contiguous 3D banks) + batched Newton-Schulz
- MLP 3x expansion (hidden=1536) with **LeakyReLU(0.5)²** activation
- **LN Scale** — depth-dependent normalization: 1/sqrt(layer_idx+1)
- **SmearGate** + **BigramHash(1536, dim=128)**
- **Value Residual (ResFormer)** — cache V from layer 0, blend via learned lambda
- **Gated Attention** — per-head sigmoid gate (nn.Linear, bias init 4.0)
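The activation and depth scaling can be sketched in a few lines. This is a minimal reading, not the PR implementation: it assumes LeakyReLU(0.5)² means the elementwise leaky-ReLU output squared (the actual PR may preserve the sign of the negative branch), and that LN Scale multiplies each layer's normalized output by 1/sqrt(layer_idx+1).

```python
import math

def leaky_relu_sq(x: float, slope: float = 0.5) -> float:
    # LeakyReLU(0.5) followed by squaring, elementwise.
    # Assumption: squaring makes the negative branch positive; the PR
    # may instead use a sign-preserving variant such as y * |y|.
    y = x if x > 0 else slope * x
    return y * y

def ln_scale(layer_idx: int) -> float:
    # Depth-dependent normalization scale: layer 0 -> 1.0, layer 3 -> 0.5.
    return 1.0 / math.sqrt(layer_idx + 1)
```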

## Training

- **Parallel Muon optimizer**: 3-phase async reduce-scatter -> Adam -> NS5+all-gather
  - lr=0.025, momentum 0.92->0.99/1500 steps, WD=0.04
  - No DDP -- manual gradient sync for non-bank params
- Adam for embeddings (lr=0.035) and scalars (lr=0.025)
- Batch 786,432 tokens, seq_len 2048
- EMA (decay=0.997) + SWA (every 50 steps when scale < 0.2)
- Warmdown 3500 iterations (wallclock-based)
- Late QAT via STE (final 15% of wallclock)
- Gradient clipping 0.3
- torch.compile(fullgraph=True)
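The NS5 step in the optimizer pipeline is the quintic Newton-Schulz orthogonalization from the public Muon implementation. A single-matrix sketch follows; the coefficients are the published Muon defaults, not values read from this PR, and the record's version additionally batches the iteration over the four 3D parameter banks:

```python
import numpy as np

def newton_schulz5(G: np.ndarray, steps: int = 5, eps: float = 1e-7) -> np.ndarray:
    # Quintic Newton-Schulz iteration: drives the singular values of G
    # toward 1 (approximate orthogonalization) using only matmuls.
    a, b, c = 3.4445, -4.7750, 2.0315  # published Muon coefficients
    X = G / (np.linalg.norm(G) + eps)  # Frobenius rescale => spectral norm <= 1
    transposed = X.shape[0] > X.shape[1]
    if transposed:  # keep X wide so A = X X^T is the small Gram matrix
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X
    return X.T if transposed else X
```

The batched variant simply runs the same polynomial on a stacked `(banks, m, n)` tensor, which is what makes banking the parameters into contiguous 3D tensors pay off.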

## Quantization

- Int6 uniform per-row with GPTQ-lite (5-percentile clip search per row)
- FP16 passthrough for tied embeddings
- zstd-22 compression
- Unbank -> quantize -> rebank for compatibility with parameter banking
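A baseline sketch of symmetric per-row int6 quantization (levels -31..31). This deliberately omits the GPTQ-lite 5-percentile clip search and just scales by the row maximum, so it is the naive starting point the clip search improves on:

```python
import numpy as np

def quantize_int6_per_row(W: np.ndarray):
    # Symmetric per-row quantization to the int6 range [-31, 31].
    # Baseline: scale = row max; GPTQ-lite would instead search clip
    # thresholds per row to trade clipping error against rounding error.
    scale = np.abs(W).max(axis=1, keepdims=True) / 31.0
    scale = np.where(scale == 0, 1.0, scale)  # guard all-zero rows
    q = np.clip(np.round(W / scale), -31, 31).astype(np.int8)
    return q, scale

def dequantize(q: np.ndarray, scale: np.ndarray) -> np.ndarray:
    return q.astype(np.float32) * scale
```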

## Legal Score-First TTT (PR #461 / #549 recipe)

Every token is scored BEFORE any weight update:

```
for each 32K-token chunk:
  Phase 1 -- SCORE: sliding window eval (inference_mode, stride=64)
  Phase 2 -- TRAIN: SGD(lr=0.002, momentum=0.9), 3 epochs, all blocks unfrozen, cosine LR
```

TTT improves quantized BPB by ~0.002 (1.1235 -> 1.1215).
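The legality constraint is purely an ordering one: a chunk's loss is recorded with the weights as they were before that chunk contributed any gradient. A minimal sketch of the loop, with hypothetical `score_fn`/`train_fn` callbacks standing in for the sliding-window eval and the SGD phase (the real recipe lives in PR #461):

```python
def ttt_eval(chunks, score_fn, train_fn):
    # Score-first test-time training: each chunk is scored with frozen
    # weights (Phase 1) before the model is adapted on it (Phase 2).
    total_loss, total_tokens = 0.0, 0
    for chunk in chunks:
        loss, n = score_fn(chunk)   # Phase 1: evaluate, no weight update
        total_loss += loss * n
        total_tokens += n
        train_fn(chunk)             # Phase 2: only now update on the chunk
    return total_loss / total_tokens
```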
58 | 59 |
|
59 | 60 | ## Credits |
60 | 61 |
|
61 | 62 | - Parallel Muon / Parameter Banking: PR #399 by @abaybektursun |
62 | 63 | - LeakyReLU²: PR #493 by @parinzee, PR #518 by @sofiabod |
| 64 | +- LN Scale: PR #315/374 by @jfprincz |
63 | 65 | - TTT recipe: PR #461 by @Christopher-Lee-McClendon (adapted: freeze=0) |
64 | 66 | - Base model stack: PR #414 by @signalrush |