@@ -0,0 +1,10 @@
numpy
tqdm
torch==2.10
huggingface-hub
kernels
setuptools
typing-extensions==4.15.0
datasets
tiktoken
sentencepiece
@@ -0,0 +1,74 @@
# Quinary (5-level) Parameter Golf submission.
#
# Defaults below match the canonical 53M-param model + per-stream v2
# layout-aware compression that produced the artifact in this folder:
# - sp16384 vocab + tokenizer
# - EMBED_DIM=380, MODEL_DIM=576, NUM_LAYERS=10, NUM_HEADS=6, NUM_KV_HEADS=3
# - QK_GAIN_INIT=5.0, MATRIX_LR=0.035
# - TTT_STEPS=3, TTT_LR=0.005, TTT_TOKENS=32768
# - per-stream v2 archive (header byte 0x03):
# * splits each bulk tensor into its own compressed payload
# * for each quinary tensor, screens 4 layouts {base5, base5_T,
# bitmask, bitmask_T} by LZMA9 size, then runs LZMA9 vs lrzip-zpaq
# only on the winning layout (bounded heuristic, not exhaustive 4×2)
# * for c_qkv.weight, splits rows into Q/K/V sub-payloads independently
# * robust to the seed-dependent lrzip cliff (full-blob lrzip can OVER
# on ~33% of seeds; per-stream v2 consistently FITS at ~15.64 MB)
# - SCALE_QUANT_BITS=5 (per-group scale log-delta quant, saves ~141 KB
# at +2.1 mBPB TTT cost; net Pareto-positive)
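# (an illustrative, non-authoritative sketch of the base-5 packing and the
#  layout-screening heuristic appears just after this script)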
#
# To run with a different seed (e.g., for the 3-seed mean):
# SEED=1337 bash run.sh

RUN_ID=${RUN_ID:-quinary_seed42} \
DATA_PATH=${DATA_PATH:-./data/canonical/datasets/fineweb10B_sp16384} \
TOKENIZER_PATH=${TOKENIZER_PATH:-./data/canonical/tokenizers/fineweb_16384_bpe.model} \
VOCAB_SIZE=${VOCAB_SIZE:-16384} \
BITNET_GROUP_SIZE=${BITNET_GROUP_SIZE:-192} \
EMBED_DIM=${EMBED_DIM:-380} \
NUM_LAYERS=${NUM_LAYERS:-10} \
MODEL_DIM=${MODEL_DIM:-576} \
NUM_KV_HEADS=${NUM_KV_HEADS:-3} \
NUM_HEADS=${NUM_HEADS:-6} \
MLP_MULT=${MLP_MULT:-4} \
MATRIX_OPTIMIZER=${MATRIX_OPTIMIZER:-muon} \
ADAM_LR=${ADAM_LR:-0.05} \
ADAM_WD=${ADAM_WD:-0.05} \
MUON_BACKEND_STEPS=${MUON_BACKEND_STEPS:-3} \
MUON_MOMENTUM=${MUON_MOMENTUM:-0.95} \
MUON_MOMENTUM_WARMUP_START=${MUON_MOMENTUM_WARMUP_START:-0.85} \
MUON_MOMENTUM_WARMUP_STEPS=${MUON_MOMENTUM_WARMUP_STEPS:-500} \
MUON_WD=${MUON_WD:-0.0} \
MATRIX_LR=${MATRIX_LR:-0.035} \
SCALAR_LR=${SCALAR_LR:-0.02} \
TIED_EMBED_LR=${TIED_EMBED_LR:-0.02} \
WARMDOWN_FRACTION=${WARMDOWN_FRACTION:-0.2} \
LOGIT_SOFTCAP=${LOGIT_SOFTCAP:-10} \
QK_GAIN_INIT=${QK_GAIN_INIT:-5.0} \
ROPE_TYPE=${ROPE_TYPE:-yarn} \
YARN_MAX_LEN=${YARN_MAX_LEN:-2048} \
ROPE_BASE=${ROPE_BASE:-5000} \
BATCH_TOKENS_START=${BATCH_TOKENS_START:-0} \
BATCH_SCHEDULE_FRACTION=${BATCH_SCHEDULE_FRACTION:-0.33} \
TRAIN_BATCH_TOKENS=${TRAIN_BATCH_TOKENS:-524288} \
SEQ_LEN_START=${SEQ_LEN_START:-0} \
SEQ_SCHEDULE_FRACTION=${SEQ_SCHEDULE_FRACTION:-0.0} \
TRAIN_SEQ_LEN=${TRAIN_SEQ_LEN:-1024} \
ITERATIONS=${ITERATIONS:-10000} \
WARMUP_STEPS=${WARMUP_STEPS:-5} \
MAX_WALLCLOCK_SECONDS=${MAX_WALLCLOCK_SECONDS:-599} \
VAL_LOSS_EVERY=${VAL_LOSS_EVERY:-0} \
TRAIN_LOG_EVERY=${TRAIN_LOG_EVERY:-1000} \
CHURN_LOG_EVERY=${CHURN_LOG_EVERY:-0} \
VAL_MAX_TOKENS=${VAL_MAX_TOKENS:-0} \
TIE_EMBEDDINGS=${TIE_EMBEDDINGS:-1} \
HEAD_LR=${HEAD_LR:-0.02} \
ACTIVATION=${ACTIVATION:-relu2} \
SOFTCAP_TYPE=${SOFTCAP_TYPE:-poly} \
TTT_STEPS=${TTT_STEPS:-3} \
TTT_LR=${TTT_LR:-0.005} \
TTT_TOKENS=${TTT_TOKENS:-32768} \
SCALE_QUANT_BITS=${SCALE_QUANT_BITS:-5} \
SEED=${SEED:-42} \
COMPILE_MODE=${COMPILE_MODE:-default} \
OMP_NUM_THREADS=${OMP_NUM_THREADS:-1} torchrun --standalone --nproc_per_node=8 train_gpt.py
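For readers unfamiliar with the archive format described in the header comments above, the following is a minimal, hypothetical sketch of the base-5 packing and the bounded layout-screening heuristic (base5 / base5_T / bitmask / bitmask_T, screened by LZMA preset-9 size). It is not the submission's actual code: function names are illustrative, the bitmask layout is simplified, and the final LZMA-vs-lrzip(zpaq) comparison on the winning layout is only noted in a comment.

# Hypothetical sketch of the per-stream v2 layout screening; not the
# submission's train_gpt.py implementation.
import lzma
import numpy as np

def pack_base5(q: np.ndarray) -> bytes:
    # Pack quinary values {-2..+2} three per byte: shift to {0..4}, then
    # byte = d0 + 5*d1 + 25*d2 (max 124, so it fits in a uint8).
    d = q.astype(np.int64).ravel() + 2
    pad = (-d.size) % 3
    d = np.concatenate([d, np.zeros(pad, dtype=np.int64)]).reshape(-1, 3)
    return (d[:, 0] + 5 * d[:, 1] + 25 * d[:, 2]).astype(np.uint8).tobytes()

def pack_bitmask(q: np.ndarray) -> bytes:
    # Simplified bitmask layout: a packed zero/nonzero mask followed by one
    # byte per nonzero value (a real layout would pack nonzeros more tightly).
    flat = q.ravel()
    mask = np.packbits(flat != 0)
    nonzeros = (flat[flat != 0] + 2).astype(np.uint8)
    return mask.tobytes() + nonzeros.tobytes()

def screen_layouts(q: np.ndarray) -> tuple[str, bytes]:
    # Screen the four candidate layouts by LZMA preset-9 size and keep the
    # smallest. The submission then also tries lrzip (zpaq) on this single
    # winner and keeps whichever payload is smaller; that second stage is
    # omitted here.
    candidates = {
        "base5": pack_base5(q),
        "base5_T": pack_base5(q.T),
        "bitmask": pack_bitmask(q),
        "bitmask_T": pack_bitmask(q.T),
    }
    compressed = {name: lzma.compress(raw, preset=9) for name, raw in candidates.items()}
    best = min(compressed, key=lambda name: len(compressed[name]))
    return best, compressed[best]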
@@ -0,0 +1,130 @@
#!/bin/bash
# -------------------------------------------------------------------------------
# Parameter Golf -- Quinary submission environment setup
# Run from the submission/ directory on a fresh 8xH100 pod.
#
# After this finishes:
# - lrzip is installed (used by per-stream compression)
# - Python deps from requirements.txt are installed
# - FlashAttention-3 wheel is installed (Hopper-only)
# - sp16384 tokenizer + tokenized FineWeb shards are at ./data/
#
# Total time on a fresh pod: ~10-25 min (mostly the ~23 GB HF download).
# -------------------------------------------------------------------------------

set -e

echo "=============================================="
echo " Parameter Golf -- Quinary submission setup"
echo "=============================================="

# --------------------------------------------------------------------
# 1. System packages (lrzip; needed by per-stream artifact compression)
# --------------------------------------------------------------------
echo ""
echo "[1/4] System packages (lrzip)..."

if command -v lrzip >/dev/null 2>&1; then
    echo " lrzip already installed -- skipping."
else
    apt-get update -qq
    apt-get install -y -qq lrzip
    echo " Installed."
fi

# --------------------------------------------------------------------
# 2. Python requirements
# --------------------------------------------------------------------
echo ""
echo "[2/4] Python requirements..."

if python3 -c "import torch, sentencepiece, numpy, huggingface_hub" 2>/dev/null; then
    echo " Core packages already installed -- skipping."
else
    pip install --upgrade pip -q
    pip install -r requirements.txt -q
    echo " Installed."
fi

# --------------------------------------------------------------------
# 3. FlashAttention-3 (Hopper-specific wheel)
# --------------------------------------------------------------------
echo ""
echo "[3/4] FlashAttention-3..."

if python3 -c "import flash_attn_interface" 2>/dev/null; then
    echo " Already installed -- skipping."
else
    pip install --no-cache-dir \
        "https://download.pytorch.org/whl/cu128/flash_attn_3-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl"
    echo " Installed."
fi

# --------------------------------------------------------------------
# 4. FineWeb dataset + sp16384 tokenizer (canonical/ subset only)
# --------------------------------------------------------------------
echo ""
echo "[4/4] FineWeb sp16384 dataset + tokenizer..."

if ls ./data/canonical/datasets/fineweb10B_sp16384/fineweb_val_*.bin 1>/dev/null 2>&1; then
    echo " Already present at ./data/canonical/ -- skipping."
else
    echo " Downloading from deniskurlov/parameter-golf-fineweb-sp16384 (canonical/ only, ~23 GB)..."
    hf download deniskurlov/parameter-golf-fineweb-sp16384 \
        --include "canonical/**" \
        --local-dir ./data \
        --repo-type dataset
    echo " Downloaded."
fi

# --------------------------------------------------------------------
# Verification
# --------------------------------------------------------------------
echo ""
echo "=============================================="
echo " Verification"
echo "=============================================="

python3 - << 'EOF'
import sys, glob
import torch, numpy as np

print(f"Python : {sys.version.split()[0]}")
print(f"PyTorch : {torch.__version__}")
print(f"CUDA : {torch.cuda.is_available()}")
print(f"GPUs : {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f" GPU {i} : {props.name} ({props.total_memory // 1024**3} GB)")

try:
    import flash_attn_interface # noqa
    print("FlashAttn3 : installed")
except ImportError:
    print("FlashAttn3 : NOT found (required for training)")

import sentencepiece as spm
sp_path = "./data/canonical/tokenizers/fineweb_16384_bpe.model"
sp = spm.SentencePieceProcessor(model_file=sp_path)
print(f"Tokenizer : {sp.vocab_size()}-vocab SentencePiece BPE @ {sp_path}")

train = sorted(glob.glob("./data/canonical/datasets/fineweb10B_sp16384/fineweb_train_*.bin"))
val = sorted(glob.glob("./data/canonical/datasets/fineweb10B_sp16384/fineweb_val_*.bin"))
total_val = sum(int(np.fromfile(f, dtype="<i4", count=3)[2]) for f in val) if val else 0
print(f"Dataset : {len(train)} train shards, {len(val)} val shards, {total_val:,} val tokens")

import shutil
print(f"lrzip binary : {shutil.which('lrzip') or 'NOT FOUND (required for per-stream compression)'}")
EOF

echo ""
echo "=============================================="
echo " Done. To train + evaluate:"
echo ""
echo " bash run.sh"
echo ""
echo " Or with overrides (e.g., a different seed):"
echo ""
echo " SEED=1337 bash run.sh"
echo "=============================================="
@@ -0,0 +1,34 @@
{
"author": "Denis Kurlov",
"github_id": "deniskurlov",
"name": "52.8M Quinary U-Net + Per-Stream-v2 Layout-Aware Compression + Score-First TTT",
"blurb": "Direct quinary fork of the 2026-03-24 ternary record submission by Ciprian-Florin Ifrim (PR #640, 1.1570 sliding BPB). Hypothesis: quinary {-2,-1,0,+1,+2} (5 levels per parameter; raw 8/3 ≈ 2.667 bits/param via base-5 packing 3 quins per byte, log₂5 ≈ 2.322 bits/param entropy floor) buys per-parameter expressivity that outweighs the ~1 bit/param cost vs ternary's 3 levels. Architecture inherited unchanged: 10L (5 enc + 5 dec) U-Net, 4x relu² MLP, factored tied embedding, polynomial-5 softcap, YaRN 2048, Muon, fused QKV, FP8 QAT for non-quantized linears, FlashAttention-3. Adapted: 768d->576d, GQA 8:4->6:3, embed bottleneck 254->380, group_size 128->192, tokenizer SP8192->SP16384, single-blob LZMA->layout-aware per-stream v2 archive (header 0x03; per-quinary-tensor LZMA-screened layout selection over {base5, base5_T, bitmask, bitmask_T} then LZMA-vs-lrzip on the winner — bounded heuristic, not exhaustive 4×2; c_qkv split into Q/K/V sub-payloads; structurally based on parameter-golf PR #1855), and stride-16 sliding eval -> score-first TTT (3 epochs, lr=0.005, adapting only the 42,364 fp16 calibration parameters — per-layer scales, residual mix, Q-gain, skip weights, vocab bias). 52.8M params in 15.72MB max-seed total. Trained 7,800 steps in ~599s on 8xH100 SXM. 3-seed validation (42, 1337, 7): TTT BPB 1.1384 ± 0.0009 std (-22 mBPB from quinary architectural change vs ternary RT 1.1842, -24 mBPB additional from TTT), all FITS with margin ~275 KB under the 16 MB cap. BPB denominator audited end-to-end: verify_bpb.py exact-eval-slice lut_bytes=151,078,879 matches the runtime eval_bytes:151,078,879 printed by train_gpt.py for every seed (delta=+0). Per-stream v2 also solves the seed-dependent lrzip cliff that forced earlier full-blob lrzip artifacts to OVER on ~33% of seeds (with the v2 archive seed=7 is now actually the best-fitting seed at 1.1378 TTT BPB).",
"date": "2026-05-01T22:00:00Z",
"val_loss": 3.2093,
"val_bpb": 1.1384,
"bytes_total": 15724839,
"bytes_total_note": "Max across the 3 verified seeds (seed=7); per-seed values in seed_results. Range across seeds: 15,714,938 - 15,724,839. Margin under 16 MB cap = 275,161 bytes.",
"bytes_code": 79272,
"seed_results": {
"42": { "val_loss": 3.2083, "val_bpb": 1.1381, "val_bpb_roundtrip": 1.1626, "bytes_total": 15714938, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true },
"1337": { "val_loss": 3.2120, "val_bpb": 1.1394, "val_bpb_roundtrip": 1.1633, "bytes_total": 15721124, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true },
"7": { "val_loss": 3.2076, "val_bpb": 1.1378, "val_bpb_roundtrip": 1.1622, "bytes_total": 15724839, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true }
},
"seed_mean_val_bpb": 1.1384,
"seed_std_val_bpb": 0.00085,
"seed_stderr_val_bpb": 0.00049,
"seed_mean_val_bpb_roundtrip": 1.1627,
"seed_std_val_bpb_roundtrip": 0.00056,
"seed_stderr_val_bpb_roundtrip": 0.00032,
"n_params": 52828668,
"n_quinary_params": 36495360,
"n_fp_params": 6896124,
"training_seconds": 599,
"eval_seconds": 300,
"eval_seconds_breakdown": {
"load_artifact_and_decompress": 5,
"roundtrip_eval": 80,
"ttt_eval": 215
},
"track": "track_non_record_16mb"
}
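As a sanity check on the BPB figures above, here is a minimal sketch of the bits-per-byte bookkeeping the blurb refers to: per-token NLL in nats converted to bits, divided by the UTF-8 byte count of the evaluated text. The helper name is illustrative; this is not the submission's verify_bpb.py.

# Hedged sketch of the BPB bookkeeping; not the submission's verify_bpb.py.
import math

def bits_per_byte(total_nll_nats: float, eval_bytes: int) -> float:
    # BPB = (sum of per-token negative log-likelihoods, in bits) /
    #       (UTF-8 bytes of the evaluated text; the audited denominator above).
    return total_nll_nats / math.log(2) / eval_bytes

# Plugging in the seed-42 numbers reported above:
#   3.2083 nats/token * 37,146,624 tokens / ln(2) / 151,078,879 bytes ~= 1.138,
# consistent with the reported seed-42 TTT BPB of 1.1381.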