Commit f92b800

Wenhao He and claude committed
Add sweep scripts and updated results through v13
- train_mdlm_combined.py: full MDLM training script (PR openai#1053 infra + PR openai#1106 MDLM + our innovations)
- sweep.sh / sweep2.sh: 12-experiment hyperparameter sweep (eps, arch, loss, seq_len)
- results.tsv: updated with v10-v13 experiments, corrected descriptions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent aa51437 · commit f92b800

4 files changed · 1,039 additions & 7 deletions

File tree: results.tsv · sweep.sh · sweep2.sh · train_mdlm_combined.py

results.tsv

Lines changed: 11 additions & 7 deletions
@@ -1,9 +1,13 @@
 commit val_bpb memory_gb artifact_mb status description
-992f599 2.054 3.2 17.3 discard MDLM v2: 11L 512d 2x cond128 lr=6e-4 (over 16MB)
-8e0b02b 1.955 10.8 19.5 discard MDLM v3: 10L 512d 2x cond64 lr=1e-3 (over 16MB)
-d91e8db 1.924 8.8 15.9 discard MDLM v4: 8L 512d 2x cond64 lr=1e-3 seq2048 batch32
-e1afee7 1.798 8.8 15.9 keep MDLM v5: importance sampling + seq1024 + batch64
-d43b0c3 1.866 17.5 15.5 discard MDLM v6: batch=128 hurt (fewer steps)
+992f599 2.054 3.2 17.3 discard MDLM v2: 11L cond128 lr=6e-4 eps=1e-3 (over 16MB)
+8e0b02b 1.955 10.8 19.5 discard MDLM v3: 10L cond64 lr=1e-3 eps=1e-3 (over 16MB)
+d91e8db 1.924 8.8 15.9 discard MDLM v4: 8L cond64 seq2048 batch32 eps=1e-3
+e1afee7 1.798 8.8 15.9 discard MDLM v5: importance sampling + seq1024 + batch64
+d43b0c3 1.866 17.5 15.5 discard MDLM v6: batch=128 (fewer steps hurt)
 6c0e30f 1.799 8.8 15.9 discard MDLM v7: warmdown=1000 (same as v5)
-0b8f7a2 1.772 8.8 16.7 discard MDLM v8: lr=2e-3 (better BPB but artifact >16MB)
-9db68e4 1.788 8.8 15.7 keep MDLM v9: noise_eps=0.1 (from PR#1106, best so far)
+0b8f7a2 1.772 8.8 16.7 discard MDLM v8: lr=2e-3 (artifact >16MB)
+9db68e4 1.788 8.8 15.7 discard MDLM v9: eps=0.1 (terminal KL too high)
+a18c419 1.771 8.8 15.8 discard MDLM v10: eps=0.01 lr=1e-3
+f9d09c3 1.747 8.8 16.3 discard MDLM v11: eps=0.01 lr=1.5e-3 (artifact >16MB)
+e485e87 1.754 8.8 16.0 discard MDLM v12: eps=0.01 lr=1.2e-3 (artifact >16MB by 75KB)
+75a6064 1.766 8.8 15.96 keep MDLM v13: eps=0.01 lr=1.1e-3 (best valid)
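
Reading the keep/discard column: a run is kept only when its artifact fits the 16 MB budget and it has the lowest val_bpb among the runs that fit, which is why v11 (best BPB overall at 16.3 MB) loses to v13. A minimal Python sketch of that selection rule, assuming results.tsv is tab-separated with the header row shown above; note that the artifact_mb column is rounded (v12 is over by only 75 KB), so the strict inequality on the rounded column only approximates the real byte-level check:

# Sketch of the selection rule implied by the table: lowest val_bpb among
# runs whose artifact fits the 16 MB budget. artifact_mb is rounded, so the
# real check is presumably on exact bytes; strict < approximates it here.
import csv

with open("results.tsv") as fh:
    rows = list(csv.DictReader(fh, delimiter="\t"))

eligible = [r for r in rows if float(r["artifact_mb"]) < 16.0]
best = min(eligible, key=lambda r: float(r["val_bpb"]))
print(best["commit"], best["val_bpb"], best["description"])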

sweep.sh

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
#!/bin/bash
# Hyperparameter sweep for MDLM combined model
# Runs 8 experiments in 2 batches of 4 (one per GPU)
set -e
cd /pscratch/sd/w/whe1/auto-diffusion
source .venv/bin/activate

# Common sweep settings: 500 steps, constant LR, fast eval
COMMON="ITERATIONS=500 MAX_WALLCLOCK_SECONDS=0 WARMDOWN_ITERS=0 WARMUP_STEPS=5 \
TRAIN_BATCH_TOKENS=32768 VAL_LOSS_EVERY=500 TRAIN_LOG_EVERY=100 \
ELBO_EVAL_STEPS=32 MAX_EVAL_SEQS=64 NUM_LAYERS=11"

echo "=== BATCH 1: 4 experiments in parallel ==="

# Exp 1: Baseline
CUDA_VISIBLE_DEVICES=0 env $COMMON \
NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp1_baseline.log 2>&1 &
P1=$!

# Exp 2: noise_eps=0.1
CUDA_VISIBLE_DEVICES=1 env $COMMON \
NOISE_EPS=0.1 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp2_eps01.log 2>&1 &
P2=$!

# Exp 3: seq_len=2048
CUDA_VISIBLE_DEVICES=2 env $COMMON \
NOISE_EPS=0.01 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp3_seq2048.log 2>&1 &
P3=$!

# Exp 4: no softcap
CUDA_VISIBLE_DEVICES=3 env $COMMON \
NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=0 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp4_nocap.log 2>&1 &
P4=$!

echo "Waiting for batch 1..."
wait $P1 $P2 $P3 $P4
echo "Batch 1 done."

echo "=== BATCH 2: 4 experiments in parallel ==="

# Exp 5: cond_dim=128
CUDA_VISIBLE_DEVICES=0 env $COMMON \
NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=128 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp5_cond128.log 2>&1 &
P5=$!

# Exp 6: dsigma loss
CUDA_VISIBLE_DEVICES=1 env $COMMON \
NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=1 \
python train_mdlm_combined.py > sweep_exp6_dsigma.log 2>&1 &
P6=$!

# Exp 7: combo (eps=0.1 + seq=2048 + nocap)
CUDA_VISIBLE_DEVICES=2 env $COMMON \
NOISE_EPS=0.1 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=0 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp7_combo.log 2>&1 &
P7=$!

# Exp 8: combo + cond=128
CUDA_VISIBLE_DEVICES=3 env $COMMON \
NOISE_EPS=0.1 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=0 COND_DIM=128 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp8_combo_cond128.log 2>&1 &
P8=$!

echo "Waiting for batch 2..."
wait $P5 $P6 $P7 $P8
echo "Batch 2 done."

echo ""
echo "=== SWEEP RESULTS ==="
for f in sweep_exp*.log; do
  name=$(echo $f | sed 's/sweep_//;s/.log//')
  bpb=$(grep "val_bpb" $f | tail -1 | grep -oP 'val_bpb:\K[0-9.]+')
  loss=$(grep "val_loss" $f | tail -1 | grep -oP 'val_loss:\K[0-9.]+')
  echo "$name: val_bpb=$bpb val_loss=$loss"
done
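
Every knob the sweep varies reaches train_mdlm_combined.py as an environment variable via env $COMMON. The training script itself is not part of this view, so the snippet below is only an illustrative Python sketch, with an assumed helper name and assumed defaults, of how such overrides could be consumed:

# Illustrative only: train_mdlm_combined.py is not shown in this commit view,
# so the env() helper and the default values below are assumptions.
import os

def env(name, default, cast=float):
    # Use the sweep's override when present, else fall back to the default.
    return cast(os.environ.get(name, str(default)))

noise_eps     = env("NOISE_EPS", 1e-3)
train_seq_len = env("TRAIN_SEQ_LEN", 1024, int)
logit_softcap = env("LOGIT_SOFTCAP", 30.0)   # 0 disables the softcap
cond_dim      = env("COND_DIM", 64, int)
mlp_mult      = env("MLP_MULT", 2, int)
use_dsigma    = bool(env("USE_DSIGMA_LOSS", 0, int))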

sweep2.sh

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash
# Follow-up sweep: eps=0.1 confirmed best, now test architecture variations
set -e
cd /pscratch/sd/w/whe1/auto-diffusion
source .venv/bin/activate

COMMON="ITERATIONS=500 MAX_WALLCLOCK_SECONDS=0 WARMDOWN_ITERS=0 WARMUP_STEPS=5 \
TRAIN_BATCH_TOKENS=32768 VAL_LOSS_EVERY=500 TRAIN_LOG_EVERY=100 \
ELBO_EVAL_STEPS=32 MAX_EVAL_SEQS=64 NOISE_EPS=0.1 LOGIT_SOFTCAP=30 COND_DIM=64"

echo "=== SWEEP 2: 4 experiments in parallel (all with eps=0.1) ==="

# Exp 9: eps=0.1 + seq=2048
CUDA_VISIBLE_DEVICES=0 env $COMMON \
NUM_LAYERS=11 MLP_MULT=2 TRAIN_SEQ_LEN=2048 TIE_EMBEDDINGS=1 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp9_eps01_seq2048.log 2>&1 &
P1=$!

# Exp 10: eps=0.1 + no weight tying
CUDA_VISIBLE_DEVICES=1 env $COMMON \
NUM_LAYERS=11 MLP_MULT=2 TRAIN_SEQ_LEN=1024 TIE_EMBEDDINGS=0 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp10_eps01_notie.log 2>&1 &
P2=$!

# Exp 11: eps=0.1 + 9L + 3x MLP (wider model, fewer layers)
CUDA_VISIBLE_DEVICES=2 env $COMMON \
NUM_LAYERS=9 MLP_MULT=3 TRAIN_SEQ_LEN=1024 TIE_EMBEDDINGS=1 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp11_eps01_9L_3x.log 2>&1 &
P3=$!

# Exp 12: eps=0.1 + 9L + 3x MLP + seq=2048
CUDA_VISIBLE_DEVICES=3 env $COMMON \
NUM_LAYERS=9 MLP_MULT=3 TRAIN_SEQ_LEN=2048 TIE_EMBEDDINGS=1 USE_DSIGMA_LOSS=0 \
python train_mdlm_combined.py > sweep_exp12_eps01_9L_3x_seq2048.log 2>&1 &
P4=$!

echo "Waiting..."
wait $P1 $P2 $P3 $P4

echo ""
echo "=== SWEEP 2 RESULTS ==="
for f in sweep_exp{9,10,11,12}*.log; do
  name=$(echo $f | sed 's/sweep_//;s/.log//')
  bpb=$(grep "val_bpb" $f | tail -1 | grep -oP 'val_bpb:\K[0-9.]+')
  params=$(grep -oP '[0-9,]+ params' $f | head -1)
  artifact=$(grep "artifact:" $f | grep -oP 'artifact:\K[0-9]+ bytes')
  echo "$name: val_bpb=$bpb params=$params artifact=$artifact"
done
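
Both scripts close with the same grep-based summary loop. A Python equivalent for offline analysis, assuming the logs really contain literal val_bpb:<float> and artifact:<int> bytes tokens, as the grep -oP patterns imply (str.removeprefix needs Python 3.9+):

# Assumes log lines carry "val_bpb:<float>" and "artifact:<int> bytes",
# matching the grep -oP patterns used in the shell loops above.
import glob
import re

for path in sorted(glob.glob("sweep_exp*.log")):
    with open(path) as fh:
        text = fh.read()
    bpb = re.findall(r"val_bpb:([0-9.]+)", text)   # take the last report
    art = re.search(r"artifact:(\d+) bytes", text)
    name = path.removeprefix("sweep_").removesuffix(".log")
    print(f"{name}: val_bpb={bpb[-1] if bpb else 'n/a'}",
          f"artifact={art.group(1) + ' bytes' if art else 'n/a'}")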
