@@ -0,0 +1,87 @@
# Non-record: Random MLP Up Adapter Ablations

This experiment starts from the current root `train_gpt.py` and adds only a narrow set of random MLP up ablations.

Selected MLP up projections are replaced with seeded frozen QR random feature maps plus:

1. a learned per-feature gain
2. a learned rank-16 low-rank correction
3. optional routed multi-basis expert gating

Attention and the MLP down projections remain fully learned. The goal is to isolate whether random feature expansion is useful when routing stays learned. The sketch below illustrates the frozen up projection construction.
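As a rough, minimal sketch of the construction (assuming a PyTorch trainer; the class and argument names here are hypothetical, not the actual `train_gpt.py` code):

```python
import torch
import torch.nn as nn

class RandomUpSketch(nn.Module):
    """Frozen seeded QR random up projection with a learned per-feature gain
    and an optional learned low-rank correction (hypothetical names)."""

    def __init__(self, dim: int, hidden: int, rank: int = 16, seed: int = 20260403):
        super().__init__()
        gen = torch.Generator().manual_seed(seed)
        # QR of a Gaussian draw gives a frozen basis with orthonormal columns
        # (requires hidden >= dim in reduced mode).
        q, _ = torch.linalg.qr(torch.randn(hidden, dim, generator=gen))
        self.register_buffer("w_up", q)                  # (hidden, dim), never trained
        self.gain = nn.Parameter(torch.ones(hidden))     # learned per-feature gain
        # Rank-`rank` learned correction; rank=0 disables the path entirely.
        self.corr_in = nn.Parameter(torch.randn(rank, dim) / dim**0.5) if rank else None
        self.corr_out = nn.Parameter(torch.zeros(hidden, rank)) if rank else None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.gain * (x @ self.w_up.T)                # frozen random expansion
        if self.corr_in is not None:                     # zero-init output side, so the
            y = y + (x @ self.corr_in.T) @ self.corr_out.T  # correction starts inert
        return y
```

With `rank=0` the correction path disappears and only the frozen basis and the learned gain remain, which is the knob the configs below toggle through `RANDOM_MLP_UP_RANK`.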

## Configs

`baseline_12l`

1. 12 layers
2. 512 model dim
3. 8 heads
4. 4 KV heads
5. MLP mult 2
6. no random MLP up layers

`random_up_12l_5layers_rank16`

1. same 12 layer stack
2. layers `0,1,2,3,4` use frozen random MLP up projections
3. each frozen up projection has a learned gain plus a rank-16 low-rank correction

`random_up_moe_12l_5layers_e2`

1. same 12 layer stack
2. layers `0,1,2,3,4` use frozen random MLP up projections
3. each selected layer is split into 2 routed random expert subspaces behind a tiny token router
4. the heavy up projection stays single-pass by concatenating the expert bases into one random weight
5. this config disables the low-rank correction path to isolate the routed multi-basis variant

`random_up_moe_12l_5layers_e2_rank8`

1. same 12 layer stack
2. same 2-expert routed random basis construction as `random_up_moe_12l_5layers_e2`
3. adds a small rank-8 learned correction on top of the routed expert path
4. intended as the next budget-conscious comparison against the pure routed experts; the shared routed construction is sketched below
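
A minimal sketch of the routed construction shared by the two `e2` configs (hypothetical names again; softmax gating is an assumption, and the real router may differ):

```python
import torch
import torch.nn as nn

class RoutedRandomUpSketch(nn.Module):
    """Expert bases concatenated into one frozen weight so the heavy up
    projection stays a single matmul; a tiny linear router then gates each
    expert's slice of the hidden features per token (hypothetical names)."""

    def __init__(self, dim: int, hidden: int, num_experts: int = 2, seed: int = 20260403):
        super().__init__()
        assert hidden % num_experts == 0
        self.num_experts = num_experts
        self.chunk = hidden // num_experts              # features per expert subspace
        gen = torch.Generator().manual_seed(seed)
        # One frozen QR random basis per expert (requires chunk >= dim).
        bases = [torch.linalg.qr(torch.randn(self.chunk, dim, generator=gen))[0]
                 for _ in range(num_experts)]
        self.register_buffer("w_up", torch.cat(bases, dim=0))  # (hidden, dim)
        self.gain = nn.Parameter(torch.ones(hidden))    # learned per-feature gain
        self.router = nn.Linear(dim, num_experts)       # tiny token router

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gates = self.router(x).softmax(dim=-1)          # (..., num_experts)
        y = self.gain * (x @ self.w_up.T)               # single heavy pass
        y = y.view(*x.shape[:-1], self.num_experts, self.chunk)
        return (gates.unsqueeze(-1) * y).flatten(-2)    # gate each expert slice
```

`random_up_moe_12l_5layers_e2_rank8` then stacks a rank-8 correction of the kind shown in the earlier sketch on top of this output.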

## Extra Eval

The trainer keeps the existing final roundtrip eval and adds a final sliding window eval controlled by:

1. `FINAL_SLIDING_EVAL`
2. `EVAL_STRIDE`
3. `EVAL_SEQ_LEN`

The metrics from both evals are logged separately. The sliding window pass is sketched below.
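
A rough sketch of what a strided sliding window eval typically computes (the real trainer logic may differ; the defaults here just mirror `EVAL_SEQ_LEN` and `EVAL_STRIDE`):

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def sliding_window_loss(model, tokens: torch.Tensor, seq_len: int = 1024, stride: int = 64):
    """Score a long 1D token stream in overlapping windows, counting each
    target exactly once: every window after the first scores only its tail."""
    n = tokens.numel()
    nll, scored, prev_end = 0.0, 0, 0
    for start in range(0, n - 1, stride):
        end = min(start + seq_len, n - 1)            # last input index for this window
        inputs = tokens[start:end].unsqueeze(0)
        targets = tokens[start + 1 : end + 1]
        logits = model(inputs).squeeze(0)            # (end - start, vocab)
        new = end - prev_end                         # targets no earlier window scored
        nll += F.cross_entropy(logits[-new:], targets[-new:], reduction="sum").item()
        scored += new
        prev_end = end
        if end == n - 1:
            break
    return nll / scored                              # mean nats per token
```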

## Early Smoke Results

Short single-GPU smoke runs were used to sanity check learning dynamics before a full-length comparison.

`random_up_moe_12l_5layers_e2`

1. run with `TRAIN_BATCH_TOKENS=65536` and `MAX_WALLCLOCK_SECONDS=180`
2. stopped at step `768`
3. reached `train_loss=2.5663` at step `750`
4. finished with `val_loss=2.6259` and `val_bpb=1.5552`
5. averaged about `234.6 ms` per optimizer step

This is materially stronger than the random-guess starting point for a `1024` token vocabulary, which is about `ln(1024) ~= 6.93` nats. Even without completing the full schedule, the model clearly learns useful token structure and exits the near-uniform regime quickly. Admittedly, it is still far behind the other competitors, and it is questionable whether the compute and latency costs are worth the gains; efficiency is the natural next thing to examine.
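
For reference, the baseline figure and the gap are easy to check; the bytes-per-token number at the end is an inference from the reported pair, not something the run logs state:

```python
import math

uniform = math.log(1024)              # uniform baseline: 10 * ln 2 ≈ 6.9315 nats/token
print(uniform - 2.6259)               # ≈ 4.31 nats below random guessing
# If val_bpb is val_loss converted to bits and divided by bytes per token, the
# reported (2.6259, 1.5552) pair implies roughly 2.44 bytes per token.
print(2.6259 / math.log(2) / 1.5552)  # ≈ 2.44
```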

`random_up_moe_12l_5layers_e2_rank8`

1. matched the same short wall-clock smoke setup
2. reached `train_loss=3.7947` at step `100`
3. reached `train_loss=2.5555` at step `750`
4. averaged about `235.8 ms` per optimizer step

The rank `8` correction path did not show a clear early training advantage over the pure routed `e2` variant in this short run. The train curves were very close, so the smoke result should be read as roughly neutral rather than a win or loss for the added correction path.

Overall, these partial runs suggest the random MLP up construction is viable enough to train a competent model under the existing recipe, but they do not yet show a decisive benefit from the small low rank correction. A longer run with scheduled intermediate validation is still needed for a confident ranking across variants.

## Reproduce

```bash
bash run.sh baseline_12l
bash run.sh random_up_12l_5layers_rank16
bash run.sh random_up_moe_12l_5layers_e2
bash run.sh random_up_moe_12l_5layers_e2_rank8
```
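
The smoke runs above used a reduced budget on one GPU. One hedged way to reproduce that setup is to bypass `run.sh` and launch the trainer directly, mirroring the `random_up_moe_12l_5layers_e2` block but with the smoke budget (the `RUN_ID` here is made up; this also assumes `train_gpt.py` reads `MAX_WALLCLOCK_SECONDS`, as the smoke runs above did):

```bash
# Single-GPU smoke variant of random_up_moe_12l_5layers_e2 (hypothetical RUN_ID).
RUN_ID="smoke_random_up_moe_12l_5layers_e2" \
NUM_LAYERS=12 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=2 \
TRAIN_SEQ_LEN=1024 TRAIN_BATCH_TOKENS=65536 MAX_WALLCLOCK_SECONDS=180 \
MATRIX_LR=0.02 SCALAR_LR=0.02 TIED_EMBED_LR=0.03 \
RANDOM_MLP_UP_LAYERS="0,1,2,3,4" RANDOM_MLP_UP_RANK=0 RANDOM_MLP_UP_GAIN=1 \
RANDOM_MLP_UP_BASE_SEED=20260403 RANDOM_MLP_UP_INIT=qr RANDOM_MLP_UP_NUM_EXPERTS=2 \
FINAL_SLIDING_EVAL=1 EVAL_STRIDE=64 EVAL_SEQ_LEN=1024 \
torchrun --standalone --nproc_per_node=1 train_gpt.py
```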
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -euo pipefail
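
# Usage: bash run.sh <config>
# Each case below exports one ablation's hyperparameters as environment
# variables consumed by train_gpt.py, then launches an 8-GPU torchrun job.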

CONFIG="${1:-random_up_12l_5layers_rank16}"

case "$CONFIG" in
baseline_12l)
RUN_ID="baseline_12l" \
NUM_LAYERS=12 \
MODEL_DIM=512 \
NUM_HEADS=8 \
NUM_KV_HEADS=4 \
MLP_MULT=2 \
TRAIN_SEQ_LEN=1024 \
TRAIN_BATCH_TOKENS=524288 \
MATRIX_LR=0.02 \
SCALAR_LR=0.02 \
TIED_EMBED_LR=0.03 \
RANDOM_MLP_UP_LAYERS="" \
RANDOM_MLP_UP_RANK=16 \
RANDOM_MLP_UP_GAIN=1 \
RANDOM_MLP_UP_BASE_SEED=20260403 \
RANDOM_MLP_UP_INIT=qr \
FINAL_SLIDING_EVAL=1 \
EVAL_STRIDE=64 \
EVAL_SEQ_LEN=1024 \
torchrun --standalone --nproc_per_node=8 train_gpt.py
;;
random_up_12l_5layers_rank16)
RUN_ID="random_up_12l_5layers_rank16" \
NUM_LAYERS=12 \
MODEL_DIM=512 \
NUM_HEADS=8 \
NUM_KV_HEADS=4 \
MLP_MULT=2 \
TRAIN_SEQ_LEN=1024 \
TRAIN_BATCH_TOKENS=524288 \
MATRIX_LR=0.02 \
SCALAR_LR=0.02 \
TIED_EMBED_LR=0.03 \
RANDOM_MLP_UP_LAYERS="0,1,2,3,4" \
RANDOM_MLP_UP_RANK=16 \
RANDOM_MLP_UP_GAIN=1 \
RANDOM_MLP_UP_BASE_SEED=20260403 \
RANDOM_MLP_UP_INIT=qr \
RANDOM_MLP_UP_NUM_EXPERTS=1 \
FINAL_SLIDING_EVAL=1 \
EVAL_STRIDE=64 \
EVAL_SEQ_LEN=1024 \
torchrun --standalone --nproc_per_node=8 train_gpt.py
;;
random_up_moe_12l_5layers_e2)
RUN_ID="random_up_moe_12l_5layers_e2" \
NUM_LAYERS=12 \
MODEL_DIM=512 \
NUM_HEADS=8 \
NUM_KV_HEADS=4 \
MLP_MULT=2 \
TRAIN_SEQ_LEN=1024 \
TRAIN_BATCH_TOKENS=524288 \
MATRIX_LR=0.02 \
SCALAR_LR=0.02 \
TIED_EMBED_LR=0.03 \
RANDOM_MLP_UP_LAYERS="0,1,2,3,4" \
RANDOM_MLP_UP_RANK=0 \
RANDOM_MLP_UP_GAIN=1 \
RANDOM_MLP_UP_BASE_SEED=20260403 \
RANDOM_MLP_UP_INIT=qr \
RANDOM_MLP_UP_NUM_EXPERTS=2 \
FINAL_SLIDING_EVAL=1 \
EVAL_STRIDE=64 \
EVAL_SEQ_LEN=1024 \
torchrun --standalone --nproc_per_node=8 train_gpt.py
;;
random_up_moe_12l_5layers_e2_rank8)
RUN_ID="random_up_moe_12l_5layers_e2_rank8" \
NUM_LAYERS=12 \
MODEL_DIM=512 \
NUM_HEADS=8 \
NUM_KV_HEADS=4 \
MLP_MULT=2 \
TRAIN_SEQ_LEN=1024 \
TRAIN_BATCH_TOKENS=524288 \
MATRIX_LR=0.02 \
SCALAR_LR=0.02 \
TIED_EMBED_LR=0.03 \
RANDOM_MLP_UP_LAYERS="0,1,2,3,4" \
RANDOM_MLP_UP_RANK=8 \
RANDOM_MLP_UP_GAIN=1 \
RANDOM_MLP_UP_BASE_SEED=20260403 \
RANDOM_MLP_UP_INIT=qr \
RANDOM_MLP_UP_NUM_EXPERTS=2 \
FINAL_SLIDING_EVAL=1 \
EVAL_STRIDE=64 \
EVAL_SEQ_LEN=1024 \
torchrun --standalone --nproc_per_node=8 train_gpt.py
;;
*)
echo "Unknown config: $CONFIG" >&2
echo "Available configs: baseline_12l, random_up_12l_5layers_rank16, random_up_moe_12l_5layers_e2, random_up_moe_12l_5layers_e2_rank8" >&2
exit 1
;;
esac
@@ -0,0 +1,17 @@
{
"author": "",
"github_id": "",
"name": "Random MLP Up Adapter Ablations",
"blurb": "Non record ablations built from the root trainer. Selected MLP up projections are frozen seeded QR random maps with learned gain, low rank correction, and a routed multi-basis variant, while attention and MLP down projections remain learned.",
"date": "2026-04-04T00:00:00Z",
"track": "non-record-16mb",
"val_loss": null,
"val_bpb": null,
"pre_quant_val_loss": null,
"pre_quant_val_bpb": null,
"step_stop": null,
"wallclock_seconds": null,
"bytes_total": null,
"bytes_model_int8_zlib": null,
"bytes_code": null
}