Skip to content

Commit 54a86ba

Browse files
Omrigotlieb and claude committed
feat: V2 submission — SP4096 + PE + MuonEq-R + DR + SLOT on clarkkev base
Based on PR openai#1218 (clarkkev) SP4096/MLP4x/WD0.085 stack. Added: Polar Express NS (4 steps), MuonEq-R, depth recurrence (layers 4,5), SLOT eval-time delta. Target: sub-1.09 BPB. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ae780e2 commit 54a86ba

4 files changed

Lines changed: 3580 additions & 0 deletions

File tree

deploy_v2.sh

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env bash
# =================================================================
# V2 Deploy: SP4096 + Depth Recurrence + Polar Express + SLOT
# One-command deployment on RunPod 8xH100
# =================================================================

# Strict mode: stop on the first failing command, on any unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# RESULTS collects status.txt and the per-run logs; FORK is the Git URL
# registered as the `fork` remote further down.
RESULTS="/workspace/results_v2"
FORK="https://github.com/Omrigotlieb/parameter-golf.git"
mkdir -p "$RESULTS"

# tee without -a truncates, so every deployment starts status.txt afresh
# with a timestamped banner.
printf '%s\n' "=== V2 DEPLOY $(date) ===" | tee "$RESULTS/status.txt"
13+
14+
# Work from the existing checkout of the challenge repository.
cd /workspace/parameter-golf

# Make sure the `fork` remote points at $FORK. A `fork` remote left over
# from an earlier run may carry a different URL; rewriting it (instead of
# ignoring a failed `git remote add`) guarantees the fetch below talks to
# $FORK, and any other git failure now surfaces instead of being swallowed.
if git remote get-url fork >/dev/null 2>&1; then
  git remote set-url fork "$FORK"
else
  git remote add fork "$FORK"
fi

# Fetch the fork's main branch, then check out only the V2 submission
# directory from it.
git fetch fork main --quiet
git checkout fork/main -- records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/
18+
19+
# NGPU is the number of lines printed by `nvidia-smi -L`; the torchrun
# invocations pass it as --nproc_per_node.
NGPU="$(nvidia-smi -L | wc -l)"
printf '%s\n' "GPUs: $NGPU" | tee -a "$RESULTS/status.txt"
21+
22+
# Download SP4096 data if needed

# Print the number of SP4096 training shards currently on disk.
# The parenthesised body runs in a subshell, so `nullglob` does not leak
# into the rest of the script.
# Why a glob rather than `ls ... | wc -l`: when no shard exists yet, `ls`
# exits non-zero, `pipefail` propagates that status through the pipeline,
# and `errexit` would then terminate the script at the assignment, before
# the download branch could ever run.
count_sp4096_shards() (
  shopt -s nullglob
  local -a shards=(data/datasets/fineweb10B_sp4096/fineweb_train_*.bin)
  echo "${#shards[@]}"
)

SP4096_COUNT=$(count_sp4096_shards)
if [[ "$SP4096_COUNT" -lt 10 ]]; then
  echo "Downloading SP4096 data..." | tee -a "$RESULTS/status.txt"
  # The repo id is passed to the download helper through its environment.
  MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 \
    data/cached_challenge_fineweb.py --variant sp4096 --train-shards 143
  echo "SP4096 data ready: $(count_sp4096_shards) shards" | tee -a "$RESULTS/status.txt"
fi
30+
31+
# Training entry point used by every torchrun invocation in this script.
SCRIPT="records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt.py"

# === RUN 1: V2 baseline (no SLOT, no recurrence) ===
# Passes SLOT_ENABLED=0 and an empty RECUR_LAYERS to the training script.
{
  printf '\n'
  printf '%s\n' "=== V2 RUN 1: Baseline SP4096 (seed=1337) ==="
  printf '%s\n' "Started: $(date)"
} | tee -a "$RESULTS/status.txt"

# stdout and stderr are both shown on the console and written to the run log.
MUON_BACKEND_STEPS=4 \
  SLOT_ENABLED=0 \
  RECUR_LAYERS="" \
  ITERATIONS=9000 \
  MAX_WALLCLOCK_SECONDS=600 \
  EVAL_STRIDE=64 \
  SEED=1337 \
  torchrun --standalone --nproc_per_node="$NGPU" "$SCRIPT" \
  |& tee "$RESULTS/v2_run1_baseline.log"
printf '%s\n' "V2 Run 1 done: $(date)" | tee -a "$RESULTS/status.txt"
42+
43+
# === RUN 2: V2 + depth recurrence ===
# Same settings as run 1 except RECUR_LAYERS=4,5 and RECUR_START_STEP=3000.
{
  printf '\n'
  printf '%s\n' "=== V2 RUN 2: + Depth Recurrence (seed=1337) ==="
} | tee -a "$RESULTS/status.txt"

# stdout and stderr are both shown on the console and written to the run log.
MUON_BACKEND_STEPS=4 \
  SLOT_ENABLED=0 \
  RECUR_LAYERS=4,5 \
  RECUR_START_STEP=3000 \
  ITERATIONS=9000 \
  MAX_WALLCLOCK_SECONDS=600 \
  EVAL_STRIDE=64 \
  SEED=1337 \
  torchrun --standalone --nproc_per_node="$NGPU" "$SCRIPT" \
  |& tee "$RESULTS/v2_run2_recur.log"
printf '%s\n' "V2 Run 2 done: $(date)" | tee -a "$RESULTS/status.txt"
51+
52+
# === RUN 3: V2 + depth recurrence + SLOT ===
# Run 2's settings with SLOT_ENABLED=1, SLOT_STEPS=8 and SLOT_LR=0.005.
{
  printf '\n'
  printf '%s\n' "=== V2 RUN 3: + DR + SLOT (seed=1337) ==="
} | tee -a "$RESULTS/status.txt"

# stdout and stderr are both shown on the console and written to the run log.
MUON_BACKEND_STEPS=4 \
  SLOT_ENABLED=1 \
  SLOT_STEPS=8 \
  SLOT_LR=0.005 \
  RECUR_LAYERS=4,5 \
  RECUR_START_STEP=3000 \
  ITERATIONS=9000 \
  MAX_WALLCLOCK_SECONDS=600 \
  EVAL_STRIDE=64 \
  SEED=1337 \
  torchrun --standalone --nproc_per_node="$NGPU" "$SCRIPT" \
  |& tee "$RESULTS/v2_run3_recur_slot.log"
printf '%s\n' "V2 Run 3 done: $(date)" | tee -a "$RESULTS/status.txt"
61+
62+
# === RUN 4-5: 3-seed validation of best config ===
# Repeats run 3's settings with seeds 42 and 2025; together with run 3
# itself (seed 1337) that makes three seeds. Each seed gets its own log.
{
  printf '\n'
  printf '%s\n' "=== V2 3-SEED VALIDATION ==="
} | tee -a "$RESULTS/status.txt"

for SEED in 42 2025; do
  printf '%s\n' "Seed $SEED started: $(date)" | tee -a "$RESULTS/status.txt"
  MUON_BACKEND_STEPS=4 \
    SLOT_ENABLED=1 \
    SLOT_STEPS=8 \
    SLOT_LR=0.005 \
    RECUR_LAYERS=4,5 \
    RECUR_START_STEP=3000 \
    ITERATIONS=9000 \
    MAX_WALLCLOCK_SECONDS=600 \
    EVAL_STRIDE=64 \
    SEED="$SEED" \
    torchrun --standalone --nproc_per_node="$NGPU" "$SCRIPT" \
    |& tee "$RESULTS/v2_seed${SEED}.log"
  printf '%s\n' "Seed $SEED done: $(date)" | tee -a "$RESULTS/status.txt"
done
73+
74+
# Append the closing banner, then print the full status log as a summary.
{
  printf '\n'
  printf '%s\n' "=== ALL V2 RUNS COMPLETE $(date) ==="
} | tee -a "$RESULTS/status.txt"
cat "$RESULTS/status.txt"

0 commit comments

Comments
 (0)