Skip to content

Commit 80e4ec5

Browse files
Octavianclaude
andcommitted
Add int5-MLP mixed quant, zstd compression, magnitude pruning, and seq ramp
Adapted from PR #180 SOTA (1.1428 BPB):
- INT5 quantization for MLP weights (int6 for attention) — saves ~1.86MB
- zstd-22 compression instead of zlib — better ratio on sparse int5 data
- 3% magnitude pruning before quantization — zeros compress well
- Sequence length ramp: start at 256, ramp to full at 25% of training
- QAT updated to fake-quantize int5 for MLP, int6 for rest

New env vars: INT5_MLP, USE_ZSTD, ZSTD_LEVEL, PRUNE_PCT, SEQ_RAMP_START, SEQ_RAMP_FRAC

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ecabe07 commit 80e4ec5

3 files changed

Lines changed: 306 additions & 17 deletions

File tree

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ setuptools
77
typing-extensions==4.15.0
88
datasets
99
tiktoken
10-
sentencepiece
10+
sentencepiece
11+
zstandard

scripts/run_stinky_frost_v2.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env bash
2+
# Strict mode: exit when a command fails (-e), treat unset variables as errors
# (-u), and make a pipeline fail if any of its stages fails (pipefail).
set -euo pipefail
3+
4+
# STINKY FROST V2 — Adapted from PR #180 SOTA techniques
5+
#
6+
# Key upgrades from v1:
7+
# - INT5 for MLP weights + INT6 for attention (saves ~1.86MB → more params)
8+
# - zstd-22 compression instead of zlib (better compression ratio)
9+
# - 3% magnitude pruning (zeros compress well with zstd)
10+
# - 10 layers (freed space from int5+zstd)
11+
# - SWA every 50 steps, start at 40%
12+
# - MuonWD=0.02 (sweet spot between 0.01 and 0.04)
13+
# - Seq ramp: start at 256 for first 25%, then full 1024 (novel!)
14+
# - BigramHash 10240 buckets (from PR #180)
15+
#
16+
# Reference: v1 = 1.1725 BPB (15.58MB), PR #180 SOTA = 1.1428 BPB (15.52MB)
17+
# Data, tokenizer, and vocabulary settings. Each default is applied only when
# the variable is unset or empty in the caller's environment, so any of them
# can be overridden by exporting it before running this script.
: "${DATA_PATH:=./data/datasets/fineweb10B_sp1024/}"
: "${TOKENIZER_PATH:=./data/tokenizers/fineweb_1024_bpe.model}"
: "${VOCAB_SIZE:=1024}"
export DATA_PATH TOKENIZER_PATH VOCAB_SIZE

# Training-run settings, overridable in the same way.
: "${TRAIN_BATCH_TOKENS:=524288}"
: "${TRAIN_SEQ_LEN:=1024}"
: "${ITERATIONS:=20000}"
: "${TRAIN_LOG_EVERY:=200}"
: "${VAL_LOSS_EVERY:=500}"
: "${MAX_WALLCLOCK_SECONDS:=600}"
export TRAIN_BATCH_TOKENS TRAIN_SEQ_LEN ITERATIONS
export TRAIN_LOG_EVERY VAL_LOSS_EVERY MAX_WALLCLOCK_SECONDS
28+
# Per-run log directory, named with the launch timestamp and created relative
# to the current working directory.
printf -v LOGDIR 'logs/stinky_frost_v2_%s' "$(date +%Y%m%d_%H%M%S)"
mkdir -p "$LOGDIR"

# Startup banner: run name, technique summary, and where the logs will go,
# followed by a blank line and the stage marker.
printf '%s\n' \
  "============================================" \
  " STINKY FROST V2" \
  " Int5-MLP + zstd + 10L + SWA + SeqRamp" \
  " Logs: $LOGDIR" \
  "============================================" \
  "" \
  "[1/1] Stinky Frost V2 — Full Send"
40+
# Fixed settings for this run. They are passed to the trainer through its
# environment only (not exported into this shell), one group per line.
run_settings=(
  NUM_LAYERS=10 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4 MLP_MULT=3
  MLP_HIDDEN=1344 TIE_EMBEDDINGS=1
  QUANT_BITS=6 QAT_START_FRAC=0.25 EVAL_STRIDE=64
  FP16_EMBED=1 SMEAR_GATE=1 BIGRAM_HASH=1 ORTHO_INIT=1
  BIGRAM_BUCKETS=10240
  MUON_WD=0.02
  SWA_EVERY=50 SWA_START_FRAC=0.4
  INT5_MLP=1 USE_ZSTD=1 ZSTD_LEVEL=22 PRUNE_PCT=0.03
  SEQ_RAMP_START=256 SEQ_RAMP_FRAC=0.25
  NCCL_IB_DISABLE=1 RUN_ID=stinky_frost_v2
)

# Launch train_gpt.py via torchrun on NPROC processes (default 8). stdout and
# stderr are merged, shown on the terminal, and copied to the run log by tee.
env "${run_settings[@]}" \
  torchrun --standalone --nproc_per_node="${NPROC:-8}" train_gpt.py \
  2>&1 | tee "$LOGDIR/run1_v2.log"
53+
echo ""
echo "============================================"
echo " STINKY FROST V2 Complete."
echo "============================================"
echo " Reference: v1 = 1.1725 BPB (15.58MB)"
echo " Reference: PR #180 SOTA = 1.1428 BPB (15.52MB)"
echo ""

#######################################
# Print the last match of a PCRE pattern found in a log file.
# A missing/unreadable file or an absent pattern produces empty output and
# status 0. This matters under `set -euo pipefail`: a bare
# `var=$(grep ... | tail -1)` returns grep's status 1 on "no match", which
# makes errexit abort the script before any fallback lookup or the final
# summary line can run.
# Arguments:
#   $1 - grep -P pattern (use \K to select the part to print)
#   $2 - path to the log file
# Outputs:
#   The last match on stdout, or nothing.
# Returns:
#   Always 0.
#######################################
last_match() {
  local pattern=$1
  local logfile=$2
  # 2>/dev/null: a missing or unreadable log is treated the same as "no match".
  grep -oP -- "$pattern" "$logfile" 2>/dev/null | tail -n 1 || true
}

f="$LOGDIR/run1_v2.log"
bpb=$(last_match 'final_int5int6_ttt_lora val_loss:\S+ val_bpb:\K\S+' "$f")
quant_bpb=$(last_match 'final_int5int6_zst\S+ val_loss:\S+ val_bpb:\K\S+' "$f")
# Fallback to int6 labels if mixed label not found
bpb=${bpb:-$(last_match 'final_int6_ttt_lora val_loss:\S+ val_bpb:\K\S+' "$f")}
quant_bpb=${quant_bpb:-$(last_match 'final_int6_zlib_roundtrip val_loss:\S+ val_bpb:\K\S+' "$f")}
size=$(last_match 'Total submission size \S+: \K\d+' "$f")
# Any metric that could not be found is reported as N/A.
echo "stinky_frost_v2: ttt_bpb=${bpb:-N/A} quant_bpb=${quant_bpb:-N/A} artifact_bytes=${size:-N/A}"

0 commit comments

Comments
 (0)