@@ -0,0 +1,10 @@
numpy
tqdm
torch==2.10
huggingface-hub
kernels
setuptools
typing-extensions==4.15.0
datasets
tiktoken
sentencepiece
@@ -0,0 +1,74 @@
# Quinary (5-level) Parameter Golf submission.
#
# Defaults below match the canonical 53M-param model + per-stream v2
# layout-aware compression that produced the artifact in this folder:
# - sp16384 vocab + tokenizer
# - EMBED_DIM=380, MODEL_DIM=576, NUM_LAYERS=10, NUM_HEADS=6, NUM_KV_HEADS=3
# - QK_GAIN_INIT=5.0, MATRIX_LR=0.035
# - TTT_STEPS=3, TTT_LR=0.005, TTT_TOKENS=32768
# - per-stream v2 archive (header byte 0x03):
# * splits each bulk tensor into its own compressed payload
# * for each quinary tensor, screens 4 layouts {base5, base5_T,
# bitmask, bitmask_T} by LZMA9 size, then runs LZMA9 vs lrzip-zpaq
# only on the winning layout (bounded heuristic, not exhaustive 4×2)
# * for c_qkv.weight, splits rows into Q/K/V sub-payloads independently
# * robust to the seed-dependent lrzip cliff (full-blob lrzip can OVER
# on ~33% of seeds; per-stream v2 consistently FITS at ~15.64 MB)
# - SCALE_QUANT_BITS=5 (per-group scale log-delta quant, saves ~141 KB
# at +2.1 mBPB TTT cost; net Pareto-positive)
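# (an illustrative, non-authoritative sketch of the base-5 packing and the
#  layout-screening heuristic appears just after this script)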
#
# To run with a different seed (e.g., for the 3-seed mean):
# SEED=1337 bash run.sh

RUN_ID=${RUN_ID:-quinary_seed42} \
DATA_PATH=${DATA_PATH:-./data/canonical/datasets/fineweb10B_sp16384} \
TOKENIZER_PATH=${TOKENIZER_PATH:-./data/canonical/tokenizers/fineweb_16384_bpe.model} \
VOCAB_SIZE=${VOCAB_SIZE:-16384} \
BITNET_GROUP_SIZE=${BITNET_GROUP_SIZE:-192} \
EMBED_DIM=${EMBED_DIM:-380} \
NUM_LAYERS=${NUM_LAYERS:-10} \
MODEL_DIM=${MODEL_DIM:-576} \
NUM_KV_HEADS=${NUM_KV_HEADS:-3} \
NUM_HEADS=${NUM_HEADS:-6} \
MLP_MULT=${MLP_MULT:-4} \
MATRIX_OPTIMIZER=${MATRIX_OPTIMIZER:-muon} \
ADAM_LR=${ADAM_LR:-0.05} \
ADAM_WD=${ADAM_WD:-0.05} \
MUON_BACKEND_STEPS=${MUON_BACKEND_STEPS:-3} \
MUON_MOMENTUM=${MUON_MOMENTUM:-0.95} \
MUON_MOMENTUM_WARMUP_START=${MUON_MOMENTUM_WARMUP_START:-0.85} \
MUON_MOMENTUM_WARMUP_STEPS=${MUON_MOMENTUM_WARMUP_STEPS:-500} \
MUON_WD=${MUON_WD:-0.0} \
MATRIX_LR=${MATRIX_LR:-0.035} \
SCALAR_LR=${SCALAR_LR:-0.02} \
TIED_EMBED_LR=${TIED_EMBED_LR:-0.02} \
WARMDOWN_FRACTION=${WARMDOWN_FRACTION:-0.2} \
LOGIT_SOFTCAP=${LOGIT_SOFTCAP:-10} \
QK_GAIN_INIT=${QK_GAIN_INIT:-5.0} \
ROPE_TYPE=${ROPE_TYPE:-yarn} \
YARN_MAX_LEN=${YARN_MAX_LEN:-2048} \
ROPE_BASE=${ROPE_BASE:-5000} \
BATCH_TOKENS_START=${BATCH_TOKENS_START:-0} \
BATCH_SCHEDULE_FRACTION=${BATCH_SCHEDULE_FRACTION:-0.33} \
TRAIN_BATCH_TOKENS=${TRAIN_BATCH_TOKENS:-524288} \
SEQ_LEN_START=${SEQ_LEN_START:-0} \
SEQ_SCHEDULE_FRACTION=${SEQ_SCHEDULE_FRACTION:-0.0} \
TRAIN_SEQ_LEN=${TRAIN_SEQ_LEN:-1024} \
ITERATIONS=${ITERATIONS:-10000} \
WARMUP_STEPS=${WARMUP_STEPS:-5} \
MAX_WALLCLOCK_SECONDS=${MAX_WALLCLOCK_SECONDS:-599} \
VAL_LOSS_EVERY=${VAL_LOSS_EVERY:-0} \
TRAIN_LOG_EVERY=${TRAIN_LOG_EVERY:-1000} \
CHURN_LOG_EVERY=${CHURN_LOG_EVERY:-0} \
VAL_MAX_TOKENS=${VAL_MAX_TOKENS:-0} \
TIE_EMBEDDINGS=${TIE_EMBEDDINGS:-1} \
HEAD_LR=${HEAD_LR:-0.02} \
ACTIVATION=${ACTIVATION:-relu2} \
SOFTCAP_TYPE=${SOFTCAP_TYPE:-poly} \
TTT_STEPS=${TTT_STEPS:-3} \
TTT_LR=${TTT_LR:-0.005} \
TTT_TOKENS=${TTT_TOKENS:-32768} \
SCALE_QUANT_BITS=${SCALE_QUANT_BITS:-5} \
SEED=${SEED:-42} \
COMPILE_MODE=${COMPILE_MODE:-default} \
OMP_NUM_THREADS=${OMP_NUM_THREADS:-1} torchrun --standalone --nproc_per_node=8 train_gpt.py
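For readers unfamiliar with the archive format described in the header comments above, the following is a minimal, hypothetical sketch of the base-5 packing and the bounded layout-screening heuristic (base5 / base5_T / bitmask / bitmask_T, screened by LZMA preset-9 size). It is not the submission's actual code: function names are illustrative, the bitmask layout is simplified, and the final LZMA-vs-lrzip(zpaq) comparison on the winning layout is only noted in a comment.

# Hypothetical sketch of the per-stream v2 layout screening; not the
# submission's train_gpt.py implementation.
import lzma
import numpy as np

def pack_base5(q: np.ndarray) -> bytes:
    # Pack quinary values {-2..+2} three per byte: shift to {0..4}, then
    # byte = d0 + 5*d1 + 25*d2 (max 124, so it fits in a uint8).
    d = q.astype(np.int64).ravel() + 2
    pad = (-d.size) % 3
    d = np.concatenate([d, np.zeros(pad, dtype=np.int64)]).reshape(-1, 3)
    return (d[:, 0] + 5 * d[:, 1] + 25 * d[:, 2]).astype(np.uint8).tobytes()

def pack_bitmask(q: np.ndarray) -> bytes:
    # Simplified bitmask layout: a packed zero/nonzero mask followed by one
    # byte per nonzero value (a real layout would pack nonzeros more tightly).
    flat = q.ravel()
    mask = np.packbits(flat != 0)
    nonzeros = (flat[flat != 0] + 2).astype(np.uint8)
    return mask.tobytes() + nonzeros.tobytes()

def screen_layouts(q: np.ndarray) -> tuple[str, bytes]:
    # Screen the four candidate layouts by LZMA preset-9 size and keep the
    # smallest. The submission then also tries lrzip (zpaq) on this single
    # winner and keeps whichever payload is smaller; that second stage is
    # omitted here.
    candidates = {
        "base5": pack_base5(q),
        "base5_T": pack_base5(q.T),
        "bitmask": pack_bitmask(q),
        "bitmask_T": pack_bitmask(q.T),
    }
    compressed = {name: lzma.compress(raw, preset=9) for name, raw in candidates.items()}
    best = min(compressed, key=lambda name: len(compressed[name]))
    return best, compressed[best]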
@@ -0,0 +1,130 @@
#!/bin/bash
# -------------------------------------------------------------------------------
# Parameter Golf -- Quinary submission environment setup
# Run from the submission/ directory on a fresh 8xH100 pod.
#
# After this finishes:
# - lrzip is installed (used by per-stream compression)
# - Python deps from requirements.txt are installed
# - FlashAttention-3 wheel is installed (Hopper-only)
# - sp16384 tokenizer + tokenized FineWeb shards are at ./data/
#
# Total time on a fresh pod: ~10-25 min (mostly the ~23 GB HF download).
# -------------------------------------------------------------------------------

set -e

echo "=============================================="
echo " Parameter Golf -- Quinary submission setup"
echo "=============================================="

# --------------------------------------------------------------------
# 1. System packages (lrzip; needed by per-stream artifact compression)
# --------------------------------------------------------------------
echo ""
echo "[1/4] System packages (lrzip)..."

if command -v lrzip >/dev/null 2>&1; then
    echo " lrzip already installed -- skipping."
else
    apt-get update -qq
    apt-get install -y -qq lrzip
    echo " Installed."
fi

# --------------------------------------------------------------------
# 2. Python requirements
# --------------------------------------------------------------------
echo ""
echo "[2/4] Python requirements..."

if python3 -c "import torch, sentencepiece, numpy, huggingface_hub" 2>/dev/null; then
    echo " Core packages already installed -- skipping."
else
    pip install --upgrade pip -q
    pip install -r requirements.txt -q
    echo " Installed."
fi

# --------------------------------------------------------------------
# 3. FlashAttention-3 (Hopper-specific wheel)
# --------------------------------------------------------------------
echo ""
echo "[3/4] FlashAttention-3..."

if python3 -c "import flash_attn_interface" 2>/dev/null; then
    echo " Already installed -- skipping."
else
    pip install --no-cache-dir \
        "https://download.pytorch.org/whl/cu128/flash_attn_3-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl"
    echo " Installed."
fi

# --------------------------------------------------------------------
# 4. FineWeb dataset + sp16384 tokenizer (canonical/ subset only)
# --------------------------------------------------------------------
echo ""
echo "[4/4] FineWeb sp16384 dataset + tokenizer..."

if ls ./data/canonical/datasets/fineweb10B_sp16384/fineweb_val_*.bin 1>/dev/null 2>&1; then
    echo " Already present at ./data/canonical/ -- skipping."
else
    echo " Downloading from deniskurlov/parameter-golf-fineweb-sp16384 (canonical/ only, ~23 GB)..."
    hf download deniskurlov/parameter-golf-fineweb-sp16384 \
        --include "canonical/**" \
        --local-dir ./data \
        --repo-type dataset
    echo " Downloaded."
fi

# --------------------------------------------------------------------
# Verification
# --------------------------------------------------------------------
echo ""
echo "=============================================="
echo " Verification"
echo "=============================================="

python3 - << 'EOF'
import sys, glob
import torch, numpy as np

print(f"Python : {sys.version.split()[0]}")
print(f"PyTorch : {torch.__version__}")
print(f"CUDA : {torch.cuda.is_available()}")
print(f"GPUs : {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f" GPU {i} : {props.name} ({props.total_memory // 1024**3} GB)")

try:
    import flash_attn_interface # noqa
    print("FlashAttn3 : installed")
except ImportError:
    print("FlashAttn3 : NOT found (required for training)")

import sentencepiece as spm
sp_path = "./data/canonical/tokenizers/fineweb_16384_bpe.model"
sp = spm.SentencePieceProcessor(model_file=sp_path)
print(f"Tokenizer : {sp.vocab_size()}-vocab SentencePiece BPE @ {sp_path}")

train = sorted(glob.glob("./data/canonical/datasets/fineweb10B_sp16384/fineweb_train_*.bin"))
val = sorted(glob.glob("./data/canonical/datasets/fineweb10B_sp16384/fineweb_val_*.bin"))
total_val = sum(int(np.fromfile(f, dtype="<i4", count=3)[2]) for f in val) if val else 0
print(f"Dataset : {len(train)} train shards, {len(val)} val shards, {total_val:,} val tokens")

import shutil
print(f"lrzip binary : {shutil.which('lrzip') or 'NOT FOUND (required for per-stream compression)'}")
EOF

echo ""
echo "=============================================="
echo " Done. To train + evaluate:"
echo ""
echo " bash run.sh"
echo ""
echo " Or with overrides (e.g., a different seed):"
echo ""
echo " SEED=1337 bash run.sh"
echo "=============================================="
@@ -0,0 +1,34 @@
{
"author": "Denis Kurlov",
"github_id": "deniskurlov",
"name": "52.8M Quinary U-Net + Per-Stream-v2 Layout-Aware Compression + Score-First TTT",
"blurb": "Direct quinary fork of the 2026-03-24 ternary record submission by Ciprian-Florin Ifrim (PR #640, 1.1570 sliding BPB). Hypothesis: quinary {-2,-1,0,+1,+2} (5 levels per parameter; raw 8/3 ≈ 2.667 bits/param via base-5 packing 3 quins per byte, log₂5 ≈ 2.322 bits/param entropy floor) buys per-parameter expressivity that outweighs the ~1 bit/param cost vs ternary's 3 levels. Architecture inherited unchanged: 10L (5 enc + 5 dec) U-Net, 4x relu² MLP, factored tied embedding, polynomial-5 softcap, YaRN 2048, Muon, fused QKV, FP8 QAT for non-quantized linears, FlashAttention-3. Adapted: 768d->576d, GQA 8:4->6:3, embed bottleneck 254->380, group_size 128->192, tokenizer SP8192->SP16384, single-blob LZMA->layout-aware per-stream v2 archive (header 0x03; per-quinary-tensor LZMA-screened layout selection over {base5, base5_T, bitmask, bitmask_T} then LZMA-vs-lrzip on the winner — bounded heuristic, not exhaustive 4×2; c_qkv split into Q/K/V sub-payloads; structurally based on parameter-golf PR #1855), and stride-16 sliding eval -> score-first TTT (3 epochs, lr=0.005, adapting only the 42,364 fp16 calibration parameters — per-layer scales, residual mix, Q-gain, skip weights, vocab bias). 52.8M params in 15.72MB max-seed total. Trained 7,800 steps in ~599s on 8xH100 SXM. 3-seed validation (42, 1337, 7): TTT BPB 1.1384 ± 0.0009 std (-22 mBPB from quinary architectural change vs ternary RT 1.1842, -24 mBPB additional from TTT), all FITS with margin ~275 KB under the 16 MB cap. BPB denominator audited end-to-end: verify_bpb.py exact-eval-slice lut_bytes=151,078,879 matches the runtime eval_bytes:151,078,879 printed by train_gpt.py for every seed (delta=+0). Per-stream v2 also solves the seed-dependent lrzip cliff that forced earlier full-blob lrzip artifacts to OVER on ~33% of seeds (with the v2 archive seed=7 is now actually the best-fitting seed at 1.1378 TTT BPB).",
"date": "2026-05-01T22:00:00Z",
"val_loss": 3.2093,
"val_bpb": 1.1384,
"bytes_total": 15724839,
"bytes_total_note": "Max across the 3 verified seeds (seed=7); per-seed values in seed_results. Range across seeds: 15,714,938 - 15,724,839. Margin under 16 MB cap = 275,161 bytes.",
"bytes_code": 79272,
"seed_results": {
"42": { "val_loss": 3.2083, "val_bpb": 1.1381, "val_bpb_roundtrip": 1.1626, "bytes_total": 15714938, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true },
"1337": { "val_loss": 3.2120, "val_bpb": 1.1394, "val_bpb_roundtrip": 1.1633, "bytes_total": 15721124, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true },
"7": { "val_loss": 3.2076, "val_bpb": 1.1378, "val_bpb_roundtrip": 1.1622, "bytes_total": 15724839, "steps_reached": 7800, "eval_tokens": 37146624, "eval_bytes": 151078879, "verified": true }
},
"seed_mean_val_bpb": 1.1384,
"seed_std_val_bpb": 0.00085,
"seed_stderr_val_bpb": 0.00049,
"seed_mean_val_bpb_roundtrip": 1.1627,
"seed_std_val_bpb_roundtrip": 0.00056,
"seed_stderr_val_bpb_roundtrip": 0.00032,
"n_params": 52828668,
"n_quinary_params": 36495360,
"n_fp_params": 6896124,
"training_seconds": 599,
"eval_seconds": 300,
"eval_seconds_breakdown": {
"load_artifact_and_decompress": 5,
"roundtrip_eval": 80,
"ttt_eval": 215
},
"track": "track_non_record_16mb"
}
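As a sanity check on the BPB figures above, here is a minimal sketch of the bits-per-byte bookkeeping the blurb refers to: per-token NLL in nats converted to bits, divided by the UTF-8 byte count of the evaluated text. The helper name is illustrative; this is not the submission's verify_bpb.py.

# Hedged sketch of the BPB bookkeeping; not the submission's verify_bpb.py.
import math

def bits_per_byte(total_nll_nats: float, eval_bytes: int) -> float:
    # BPB = (sum of per-token negative log-likelihoods, in bits) /
    #       (UTF-8 bytes of the evaluated text; the audited denominator above).
    return total_nll_nats / math.log(2) / eval_bytes

# Plugging in the seed-42 numbers reported above:
#   3.2083 nats/token * 37,146,624 tokens / ln(2) / 151,078,879 bytes ~= 1.138,
# consistent with the reported seed-42 TTT BPB of 1.1381.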