#!/usr/bin/env bash
# =================================================================
# V2 Deploy: SP4096 + Depth Recurrence + Polar Express + SLOT
# One-command deployment on RunPod 8xH100
#
# Steps:
#   1. Check out the V2 record directory from the fork into the
#      existing /workspace/parameter-golf working tree.
#   2. Download the SP4096 training shards when fewer than 10 are
#      present on disk.
#   3. Run the baseline, +depth-recurrence and +DR+SLOT configurations
#      with seed 1337, then repeat the last configuration with seeds
#      42 and 2025.
#
# Progress lines are written to $RESULTS/status.txt; every training
# run additionally gets its own log file in $RESULTS.
# =================================================================
set -euo pipefail

readonly FORK="https://github.com/Omrigotlieb/parameter-golf.git"
readonly RESULTS="/workspace/results_v2"
readonly STATUS="$RESULTS/status.txt"
readonly REPO_DIR="/workspace/parameter-golf"
readonly RECORD_DIR="records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur"
readonly SCRIPT="$RECORD_DIR/train_gpt.py"
readonly DATA_DIR="data/datasets/fineweb10B_sp4096"

# Number of processes handed to torchrun; assigned in main() from nvidia-smi.
NGPU=0

#######################################
# Print a message to stdout and append it to the status file.
# Globals:   STATUS (read)
# Arguments: $* - message text (may be empty to emit a blank line)
#######################################
log() {
  printf '%s\n' "$*" | tee -a "$STATUS"
}

#######################################
# Print how many SP4096 training shards are currently on disk.
# A glob is used instead of `ls | wc -l`: under `set -eo pipefail`, `ls`
# on a pattern that matches nothing exits non-zero, which would abort
# the script on a fresh machine before the download could even start.
# Globals:   DATA_DIR (read, relative to the current directory)
# Outputs:   shard count on stdout (0 when none exist)
#######################################
count_train_shards() {
  local -a shards
  shopt -s nullglob
  shards=("$DATA_DIR"/fineweb_train_*.bin)
  shopt -u nullglob
  printf '%s\n' "${#shards[@]}"
}

#######################################
# Launch one training run via torchrun and tee its combined
# stdout/stderr into a log file.
# Globals:   NGPU, SCRIPT, RESULTS (read)
# Arguments: $1   - log file name, created inside $RESULTS
#            $2.. - NAME=VALUE environment overrides for this run
# Returns:   non-zero if torchrun (or tee) fails; with `set -e` this
#            aborts the whole deployment.
#######################################
run_training() {
  local log_name=$1
  shift
  # Settings shared by every run come first; per-run overrides follow.
  env MUON_BACKEND_STEPS=4 ITERATIONS=9000 MAX_WALLCLOCK_SECONDS=600 \
    EVAL_STRIDE=64 "$@" \
    torchrun --standalone --nproc_per_node="$NGPU" "$SCRIPT" 2>&1 \
    | tee "$RESULTS/$log_name"
}

main() {
  local seed
  # Environment shared by run 3 and the extra-seed runs.
  local -a dr_slot_env=(
    SLOT_ENABLED=1 SLOT_STEPS=8 SLOT_LR=0.005
    RECUR_LAYERS=4,5 RECUR_START_STEP=3000
  )

  mkdir -p "$RESULTS"

  # The first status line truncates any status file from an earlier deploy.
  printf '%s\n' "=== V2 DEPLOY $(date) ===" | tee "$STATUS"

  cd "$REPO_DIR"
  # Add the remote only when it is missing, so that a genuine failure of
  # `git remote add` is reported instead of being silently discarded.
  git remote get-url fork >/dev/null 2>&1 || git remote add fork "$FORK"
  git fetch fork main --quiet
  git checkout fork/main -- "$RECORD_DIR/"

  NGPU=$(nvidia-smi -L | wc -l)
  log "GPUs: $NGPU"
  if (( NGPU < 1 )); then
    printf '%s\n' "ERROR: nvidia-smi listed no GPUs; refusing to launch torchrun" >&2
    exit 1
  fi

  # Download SP4096 data if needed
  if (( $(count_train_shards) < 10 )); then
    log "Downloading SP4096 data..."
    MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 \
      data/cached_challenge_fineweb.py --variant sp4096 --train-shards 143
    log "SP4096 data ready: $(count_train_shards) shards"
  fi

  # === RUN 1: V2 baseline (no SLOT, no recurrence) ===
  log ""
  log "=== V2 RUN 1: Baseline SP4096 (seed=1337) ==="
  log "Started: $(date)"
  run_training v2_run1_baseline.log SLOT_ENABLED=0 RECUR_LAYERS= SEED=1337
  log "V2 Run 1 done: $(date)"

  # === RUN 2: V2 + depth recurrence ===
  log ""
  log "=== V2 RUN 2: + Depth Recurrence (seed=1337) ==="
  run_training v2_run2_recur.log \
    SLOT_ENABLED=0 RECUR_LAYERS=4,5 RECUR_START_STEP=3000 SEED=1337
  log "V2 Run 2 done: $(date)"

  # === RUN 3: V2 + depth recurrence + SLOT ===
  log ""
  log "=== V2 RUN 3: + DR + SLOT (seed=1337) ==="
  run_training v2_run3_recur_slot.log "${dr_slot_env[@]}" SEED=1337
  log "V2 Run 3 done: $(date)"

  # === RUN 4-5: 3-seed validation of best config ===
  # Two more seeds with the run-3 settings; together with seed 1337 from
  # run 3 that gives three seeds for this configuration.
  log ""
  log "=== V2 3-SEED VALIDATION ==="
  for seed in 42 2025; do
    log "Seed $seed started: $(date)"
    run_training "v2_seed${seed}.log" "${dr_slot_env[@]}" SEED="$seed"
    log "Seed $seed done: $(date)"
  done

  log ""
  log "=== ALL V2 RUNS COMPLETE $(date) ==="
  cat "$STATUS"
}

main "$@"