
Commit ecabe07

Octavianclaude and Claude committed
Add 1-GPU edge A/B testing script for Stinky Frost optimization
Tests SWA, MLP1376, and MuonWD 0.02/0.03 with a 50-min wallclock cap per run, for quick head-to-head comparisons before full 8xH100 validation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8d25a3b commit ecabe07

1 file changed

Lines changed: 113 additions & 0 deletions

File tree

scripts/run_edge_ab_1gpu.sh

@@ -0,0 +1,113 @@
#!/usr/bin/env bash
set -euo pipefail

# EDGE A/B TESTING — 1 GPU quick comparisons
#
# Runs shorter experiments (50 min each) on 1 GPU to find edges.
# ~5500 steps = enough to see relative trends.
#
# Baseline: Stinky Frost v1 (MLP1344, WD=0.01, no SWA) → 1.1725 on 8xH100
#
# Test A: + SWA (every 50, last 50%) → does SWA help quant BPB?
# Test B: MLP1376 + SWA → use headroom + SWA
# Test C: MuonWD=0.02 + SWA → higher WD sweet spot?
# Test D: MuonWD=0.03 + SWA → even higher WD?
#
# Compare quant BPB across all 4. Winner gets a full 8xH100 run.
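
# Usage (illustrative invocation; every knob below can be overridden via env,
# and the values shown are just the script's own defaults):
#   NPROC=1 DATA_PATH=./data/datasets/fineweb10B_sp1024/ bash scripts/run_edge_ab_1gpu.sh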

export DATA_PATH="${DATA_PATH:-./data/datasets/fineweb10B_sp1024/}"
export TOKENIZER_PATH="${TOKENIZER_PATH:-./data/tokenizers/fineweb_1024_bpe.model}"
export VOCAB_SIZE="${VOCAB_SIZE:-1024}"

# Model shape (the Stinky Frost v1 baseline)
export NUM_LAYERS=9
export MODEL_DIM=512
export NUM_HEADS=8
export NUM_KV_HEADS=4
export MLP_MULT=3
export TIE_EMBEDDINGS=1
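
# NOTE: each test below sets MLP_HIDDEN (1344 or 1376) explicitly; the
# assumption here is that an explicit MLP_HIDDEN takes precedence over
# MLP_MULT=3 (3 x 512 = 1536) in train_gpt.py, inferred from the baseline
# being named "MLP1344" rather than verified in the training code.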

# Training schedule (per-run length is governed by the wallclock cap below)
export TRAIN_BATCH_TOKENS="${TRAIN_BATCH_TOKENS:-524288}"
export TRAIN_SEQ_LEN="${TRAIN_SEQ_LEN:-1024}"
export ITERATIONS="${ITERATIONS:-20000}"
export TRAIN_LOG_EVERY="${TRAIN_LOG_EVERY:-200}"
export VAL_LOSS_EVERY="${VAL_LOSS_EVERY:-500}"

# 50 min per run on 1 GPU ≈ 5500 steps (half of full training)
export MAX_WALLCLOCK_SECONDS="${MAX_WALLCLOCK_SECONDS:-3000}"
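# (Arithmetic behind the estimate: 3000 s / ~5500 steps ≈ 0.55 s/step, so the
#  wallclock cap, not ITERATIONS=20000, is what ends each run.)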

LOGDIR="logs/edge_ab_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOGDIR"

echo "============================================"
echo " EDGE A/B TESTING — 1 GPU"
echo " 50 min per test, 4 tests, ~3.5 hrs total"
echo " Logs: $LOGDIR"
echo "============================================"
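
# All four tests share the same single-GPU launch line; only MLP_HIDDEN,
# MUON_WD, and RUN_ID vary. SWA_EVERY=50 + SWA_START_FRAC=0.5 encodes the
# "every 50, last 50%" recipe from the header (flag semantics assumed from
# that comment, not re-checked against train_gpt.py).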

# --- Test A: v1 + SWA ---
echo ""
echo "[1/4] Test A: Stinky Frost + SWA"
NUM_LAYERS=9 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=3 \
MLP_HIDDEN=1344 MUON_WD=0.01 \
SWA_EVERY=50 SWA_START_FRAC=0.5 \
QUANT_BITS=6 QAT_START_FRAC=0.25 EVAL_STRIDE=64 FP16_EMBED=1 \
SMEAR_GATE=1 BIGRAM_HASH=1 ORTHO_INIT=1 TIE_EMBEDDINGS=1 \
NCCL_IB_DISABLE=1 RUN_ID=edge_a_swa \
torchrun --standalone --nproc_per_node="${NPROC:-1}" train_gpt.py \
  2>&1 | tee "$LOGDIR/test_a_swa.log"

# --- Test B: MLP1376 + SWA ---
echo ""
echo "[2/4] Test B: MLP1376 + SWA"
NUM_LAYERS=9 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=3 \
MLP_HIDDEN=1376 MUON_WD=0.01 \
SWA_EVERY=50 SWA_START_FRAC=0.5 \
QUANT_BITS=6 QAT_START_FRAC=0.25 EVAL_STRIDE=64 FP16_EMBED=1 \
SMEAR_GATE=1 BIGRAM_HASH=1 ORTHO_INIT=1 TIE_EMBEDDINGS=1 \
NCCL_IB_DISABLE=1 RUN_ID=edge_b_mlp1376 \
torchrun --standalone --nproc_per_node="${NPROC:-1}" train_gpt.py \
  2>&1 | tee "$LOGDIR/test_b_mlp1376.log"

# --- Test C: MuonWD=0.02 + SWA ---
echo ""
echo "[3/4] Test C: MuonWD=0.02 + SWA"
NUM_LAYERS=9 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=3 \
MLP_HIDDEN=1344 MUON_WD=0.02 \
SWA_EVERY=50 SWA_START_FRAC=0.5 \
QUANT_BITS=6 QAT_START_FRAC=0.25 EVAL_STRIDE=64 FP16_EMBED=1 \
SMEAR_GATE=1 BIGRAM_HASH=1 ORTHO_INIT=1 TIE_EMBEDDINGS=1 \
NCCL_IB_DISABLE=1 RUN_ID=edge_c_wd02 \
torchrun --standalone --nproc_per_node="${NPROC:-1}" train_gpt.py \
  2>&1 | tee "$LOGDIR/test_c_wd02.log"

# --- Test D: MuonWD=0.03 + SWA ---
echo ""
echo "[4/4] Test D: MuonWD=0.03 + SWA"
NUM_LAYERS=9 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=3 \
MLP_HIDDEN=1344 MUON_WD=0.03 \
SWA_EVERY=50 SWA_START_FRAC=0.5 \
QUANT_BITS=6 QAT_START_FRAC=0.25 EVAL_STRIDE=64 FP16_EMBED=1 \
SMEAR_GATE=1 BIGRAM_HASH=1 ORTHO_INIT=1 TIE_EMBEDDINGS=1 \
NCCL_IB_DISABLE=1 RUN_ID=edge_d_wd03 \
torchrun --standalone --nproc_per_node="${NPROC:-1}" train_gpt.py \
  2>&1 | tee "$LOGDIR/test_d_wd03.log"

# --- Summary ---
echo ""
echo "============================================"
echo " EDGE A/B Complete. Results:"
echo "============================================"
echo " Reference: Stinky Frost v1 (8xH100) = quant_bpb:1.1725"
echo ""
for f in "$LOGDIR"/test_*.log; do
  name=$(basename "$f" .log)
  # "|| true" keeps set -euo pipefail from aborting the loop when a run
  # stopped before printing a given metric (grep then exits non-zero).
  steps=$(grep -oP 'stopping_early.*step:\K\d+' "$f" | tail -1 || true)
  bpb=$(grep -oP 'final_int6_ttt_lora val_loss:\S+ val_bpb:\K\S+' "$f" | tail -1 || true)
  quant_bpb=$(grep -oP 'final_int6_zlib_roundtrip val_loss:\S+ val_bpb:\K\S+' "$f" | tail -1 || true)
  size=$(grep -oP 'Total submission size int6\+zlib: \K\d+' "$f" | tail -1 || true)
  echo "$name: steps=${steps:-N/A} ttt_bpb=${bpb:-N/A} quant_bpb=${quant_bpb:-N/A} bytes=${size:-N/A}"
done | tee "$LOGDIR/summary.txt"
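
# Optional ranking, a sketch that assumes the exact summary line format printed
# above: emit "<quant_bpb> <summary line>" sorted ascending, skipping runs with
# no quant_bpb value.
awk -F'quant_bpb=' 'NF > 1 { split($2, a, " "); if (a[1] != "N/A") print a[1], $0 }' \
  "$LOGDIR/summary.txt" | sort -n || true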

echo ""
echo "Pick the winner → run full 8xH100 validation"
