#!/bin/bash
# Hyperparameter sweep for the MDLM combined model.
# Runs 8 experiments in 2 batches of 4 (one experiment per GPU, devices 0-3).
# Requires: /pscratch/.../auto-diffusion checkout with a .venv containing the
# training dependencies, and train_mdlm_combined.py driven entirely by env vars.
#
# -e: abort on unhandled failure; -u: error on unset vars;
# -o pipefail: a pipeline fails if any stage fails (not just the last).
set -euo pipefail

cd /pscratch/sd/w/whe1/auto-diffusion || exit 1
source .venv/bin/activate

# Common sweep settings: 500 steps, constant LR (no warmdown), fast eval.
# NOTE: deliberately a flat string expanded UNQUOTED via `env $COMMON` so each
# KEY=VALUE becomes its own argument — this relies on no value containing
# whitespace. Keep values whitespace-free when editing.
COMMON="ITERATIONS=500 MAX_WALLCLOCK_SECONDS=0 WARMDOWN_ITERS=0 WARMUP_STEPS=5 \
TRAIN_BATCH_TOKENS=32768 VAL_LOSS_EVERY=500 TRAIN_LOG_EVERY=100 \
ELBO_EVAL_STEPS=32 MAX_EVAL_SEQS=64 NUM_LAYERS=11"
echo "=== BATCH 1: 4 experiments in parallel ==="

# Exp 1: Baseline
CUDA_VISIBLE_DEVICES=0 env $COMMON \
  NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp1_baseline.log 2>&1 &
P1=$!

# Exp 2: noise_eps=0.1
CUDA_VISIBLE_DEVICES=1 env $COMMON \
  NOISE_EPS=0.1 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp2_eps01.log 2>&1 &
P2=$!

# Exp 3: seq_len=2048
CUDA_VISIBLE_DEVICES=2 env $COMMON \
  NOISE_EPS=0.01 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp3_seq2048.log 2>&1 &
P3=$!

# Exp 4: no softcap
CUDA_VISIBLE_DEVICES=3 env $COMMON \
  NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=0 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp4_nocap.log 2>&1 &
P4=$!

echo "Waiting for batch 1..."
# `wait p1 p2 p3 p4` only returns the status of the LAST pid, silently
# swallowing failures of the others (even under set -e). Wait on each job
# individually so every failed experiment is surfaced; keep going so the
# remaining batch and the summary still run.
batch1_fail=0
for pid in "$P1" "$P2" "$P3" "$P4"; do
  wait "$pid" || { echo "batch 1: job pid=$pid exited non-zero (check its sweep_exp*.log)" >&2; batch1_fail=1; }
done
[ "$batch1_fail" -eq 0 ] || echo "WARNING: one or more batch-1 experiments failed." >&2
echo "Batch 1 done."
echo "=== BATCH 2: 4 experiments in parallel ==="

# Exp 5: cond_dim=128
CUDA_VISIBLE_DEVICES=0 env $COMMON \
  NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=128 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp5_cond128.log 2>&1 &
P5=$!

# Exp 6: dsigma loss
CUDA_VISIBLE_DEVICES=1 env $COMMON \
  NOISE_EPS=0.01 TRAIN_SEQ_LEN=1024 LOGIT_SOFTCAP=30 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=1 \
  python train_mdlm_combined.py > sweep_exp6_dsigma.log 2>&1 &
P6=$!

# Exp 7: combo (eps=0.1 + seq=2048 + nocap)
CUDA_VISIBLE_DEVICES=2 env $COMMON \
  NOISE_EPS=0.1 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=0 COND_DIM=64 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp7_combo.log 2>&1 &
P7=$!

# Exp 8: combo + cond=128
CUDA_VISIBLE_DEVICES=3 env $COMMON \
  NOISE_EPS=0.1 TRAIN_SEQ_LEN=2048 LOGIT_SOFTCAP=0 COND_DIM=128 MLP_MULT=2 USE_DSIGMA_LOSS=0 \
  python train_mdlm_combined.py > sweep_exp8_combo_cond128.log 2>&1 &
P8=$!

echo "Waiting for batch 2..."
# As with batch 1: a multi-pid `wait` only reports the last pid's status, so
# wait on each job individually to surface every failure without aborting
# before the summary.
batch2_fail=0
for pid in "$P5" "$P6" "$P7" "$P8"; do
  wait "$pid" || { echo "batch 2: job pid=$pid exited non-zero (check its sweep_exp*.log)" >&2; batch2_fail=1; }
done
[ "$batch2_fail" -eq 0 ] || echo "WARNING: one or more batch-2 experiments failed." >&2
echo "Batch 2 done."
echo ""
echo "=== SWEEP RESULTS ==="

# extract_metric KEY FILE
# Print the numeric value following "KEY:" on the last matching line of FILE;
# prints nothing (status 0) when the key or file is absent — must not fail,
# or set -e would kill the summary on the first experiment with no metrics.
extract_metric() {
  local key=$1 file=$2
  sed -n "s/.*${key}:\([0-9.]*\).*/\1/p" "$file" 2>/dev/null | tail -n 1
}

for f in sweep_exp*.log; do
  [ -e "$f" ] || continue   # no logs at all: glob stayed literal, skip
  # Strip "sweep_" prefix and ".log" suffix with parameter expansion
  # (the old `echo $f | sed 's/.log//'` also had an unescaped '.').
  name=${f#sweep_}
  name=${name%.log}
  bpb=$(extract_metric val_bpb "$f")
  loss=$(extract_metric val_loss "$f")
  printf '%s: val_bpb=%s val_loss=%s\n' "$name" "${bpb:-N/A}" "${loss:-N/A}"
done