openai · Omrigotlieb · Apr 4, 2026
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/submission.json b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/submission.json
@@ -0,0 +1,10 @@
+{
+  "name": "SP4096 + Polar Express + MuonEq-R + Depth Recurrence + WD=0.105",
+  "val_bpb": 1.0923,
+  "bytes_total": 15694101,
+  "blurb": "On clarkkev PR #1218 SP4096 base: Polar Express NS 4-step, MuonEq-R, depth recurrence layers 3,4,5 (shared MLP weights), WD=0.105, MLR=0.022. 3-seed mean 1.0923 (1337=1.0927, 42=1.0917, 2025=1.0925). Clean — no SLOT, no TTT.",
+  "author": "Omri Gotlieb",
+  "github_id": "Omrigotlieb",
+  "date": "2026-04-04",
+  "seeds": {"1337": 1.0927, "42": 1.0917, "2025": 1.0925}
+}
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt.py b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt.py
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt.py.bak b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt.py.bak
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_clarkkev.py b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_clarkkev.py
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_dexhunter.py b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_dexhunter.py
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_lzma.py b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_gpt_lzma.py
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed1337.log b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed1337.log
@@ -0,0 +1,125 @@
+W0404 07:28:31.900000 3359 torch/distributed/run.py:803] 
+W0404 07:28:31.900000 3359 torch/distributed/run.py:803] *****************************************
+W0404 07:28:31.900000 3359 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0404 07:28:31.900000 3359 torch/distributed/run.py:803] *****************************************
+Hyperparameters:
+  adam_eps: 1e-08
+  adam_wd: 0.02
+  beta1: 0.9
+  beta2: 0.95
+  compressor: brotli
+  data_dir: ./data/
+  datasets_dir: ./data/datasets/fineweb10B_sp4096
+  distributed: True
+  ema_decay: 0.997
+  embed_lr: 0.6
+  embed_wd: 0.09
+  embedding_dim: 512
+  eval_seq_len: 2048
+  eval_stride: 64
+  gptq_calibration_batches: 64
+  gptq_enabled: True
+  gptq_reserve_seconds: 10.0
+  grad_accum_steps: 1
+  grad_clip_norm: 0.3
+  head_lr: 0.008
+  is_main_process: True
+  iterations: 20000
+  ln_scale: True
+  local_rank: 0
+  logfile: logs/e3461e57-df16-4f49-a2cf-df0e019e5a94.txt
+  logit_softcap: 30.0
+  matrix_lr: 0.02
+  max_wallclock_seconds: 600.0
+  min_lr: 0.0
+  mlp_mult: 4.0
+  model_dim: 512
+  model_path: final_model.pt
+  muon_backend_steps: 4
+  muon_beta2: 0.95
+  muon_momentum: 0.99
+  muon_momentum_warmup_start: 0.92
+  muon_momentum_warmup_steps: 1500
+  muon_wd: 0.09
+  num_heads: 8
+  num_kv_heads: 4
+  num_layers: 11
+  qk_gain_init: 4.0
+  quantized_model_path: final_model.int6.ptz
+  rank: 0
+  recur_layers: 
+  recur_start_step: 3000
+  rope_base: 10000.0
+  rope_dims: 16
+  rope_train_seq_len: 2048
+  run_id: e3461e57-df16-4f49-a2cf-df0e019e5a94
+  scalar_lr: 0.02
+  seed: 1337
+  skip_gates_enabled: True
+  sliding_window_enabled: True
+  slot_enabled: False
+  slot_lr: 0.005
+  slot_steps: 8
+  tie_embeddings: True
+  tied_embed_init_std: 0.005
+  tied_embed_lr: 0.03
+  tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model
+  train_batch_tokens: 786432
+  train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin
+  train_log_every: 500
+  train_seq_len: 2048
+  val_batch_tokens: 524288
+  val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin
+  val_loss_every: 4000
+  ve_dim: 128
+  ve_enabled: True
+  ve_layers: 9,10
+  vocab_size: 4096
+  warmdown_frac: 0.667
+  warmup_steps: 20
+  world_size: 8
+  xsa_last_n: 11
+train_shards: 105
+val_tokens: 45508608
+model_params:34401371
+gptq:reserving 10s, effective=590000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+0/20000 val_loss: 8.3169 val_bpb: 3.6144
+1/20000 train_loss: 8.3173 train_time: 0.0m tok/s: 8077780
+2/20000 train_loss: 12.3118 train_time: 0.0m tok/s: 8078957
+3/20000 train_loss: 10.8329 train_time: 0.0m tok/s: 8055106
+4/20000 train_loss: 9.0273 train_time: 0.0m tok/s: 8040628
+5/20000 train_loss: 7.7310 train_time: 0.0m tok/s: 8020990
+500/20000 train_loss: 3.0318 train_time: 0.8m tok/s: 7833903
+1000/20000 train_loss: 2.8039 train_time: 1.7m tok/s: 7850824
+1500/20000 train_loss: 2.7812 train_time: 2.5m tok/s: 7855251
+2000/20000 train_loss: 2.7374 train_time: 3.3m tok/s: 7859423
+2500/20000 train_loss: 2.7390 train_time: 4.2m tok/s: 7863572
+3000/20000 train_loss: 2.7083 train_time: 5.0m tok/s: 7865532
+3500/20000 train_loss: 2.7651 train_time: 5.8m tok/s: 7868093
+4000/20000 train_loss: 2.6938 train_time: 6.7m tok/s: 7870189
+4000/20000 val_loss: 2.6706 val_bpb: 1.1606
+4500/20000 train_loss: 2.6816 train_time: 7.5m tok/s: 7871903
+5000/20000 train_loss: 2.5679 train_time: 8.3m tok/s: 7874047
+5500/20000 train_loss: 2.5748 train_time: 9.2m tok/s: 7875988
+5910/20000 val_loss: 2.5401 val_bpb: 1.1039
+stopping_early: wallclock_cap train_time: 590094ms step: 5910/20000
+peak memory allocated: 25776 MiB reserved: 25848 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.53732900 val_bpb:1.10269395 eval_time:1723ms
+Serialized model: 132405827 bytes
+Code size: 52689 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 66 Hessians in 8.3s
+GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search
+Serialized model int6+brotli: 15922137 bytes
+Total submission size int6+brotli: 15974826 bytes
+final_int6_roundtrip val_loss:2.56430026 val_bpb:1.11441535 eval_time:18127ms
+final_int6_sliding_window val_loss:2.52218645 val_bpb:1.09611317 eval_time:89542ms
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed2025.log b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed2025.log
@@ -0,0 +1,125 @@
+W0404 07:57:49.512000 46862 torch/distributed/run.py:803] 
+W0404 07:57:49.512000 46862 torch/distributed/run.py:803] *****************************************
+W0404 07:57:49.512000 46862 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0404 07:57:49.512000 46862 torch/distributed/run.py:803] *****************************************
+Hyperparameters:
+  adam_eps: 1e-08
+  adam_wd: 0.02
+  beta1: 0.9
+  beta2: 0.95
+  compressor: brotli
+  data_dir: ./data/
+  datasets_dir: ./data/datasets/fineweb10B_sp4096
+  distributed: True
+  ema_decay: 0.997
+  embed_lr: 0.6
+  embed_wd: 0.09
+  embedding_dim: 512
+  eval_seq_len: 2048
+  eval_stride: 64
+  gptq_calibration_batches: 64
+  gptq_enabled: True
+  gptq_reserve_seconds: 10.0
+  grad_accum_steps: 1
+  grad_clip_norm: 0.3
+  head_lr: 0.008
+  is_main_process: True
+  iterations: 20000
+  ln_scale: True
+  local_rank: 0
+  logfile: logs/8be1a0dc-3325-4e29-a747-de250317023e.txt
+  logit_softcap: 30.0
+  matrix_lr: 0.02
+  max_wallclock_seconds: 600.0
+  min_lr: 0.0
+  mlp_mult: 4.0
+  model_dim: 512
+  model_path: final_model.pt
+  muon_backend_steps: 4
+  muon_beta2: 0.95
+  muon_momentum: 0.99
+  muon_momentum_warmup_start: 0.92
+  muon_momentum_warmup_steps: 1500
+  muon_wd: 0.09
+  num_heads: 8
+  num_kv_heads: 4
+  num_layers: 11
+  qk_gain_init: 4.0
+  quantized_model_path: final_model.int6.ptz
+  rank: 0
+  recur_layers: 
+  recur_start_step: 3000
+  rope_base: 10000.0
+  rope_dims: 16
+  rope_train_seq_len: 2048
+  run_id: 8be1a0dc-3325-4e29-a747-de250317023e
+  scalar_lr: 0.02
+  seed: 2025
+  skip_gates_enabled: True
+  sliding_window_enabled: True
+  slot_enabled: False
+  slot_lr: 0.005
+  slot_steps: 8
+  tie_embeddings: True
+  tied_embed_init_std: 0.005
+  tied_embed_lr: 0.03
+  tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model
+  train_batch_tokens: 786432
+  train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin
+  train_log_every: 500
+  train_seq_len: 2048
+  val_batch_tokens: 524288
+  val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin
+  val_loss_every: 4000
+  ve_dim: 128
+  ve_enabled: True
+  ve_layers: 9,10
+  vocab_size: 4096
+  warmdown_frac: 0.667
+  warmup_steps: 20
+  world_size: 8
+  xsa_last_n: 11
+train_shards: 143
+val_tokens: 45508608
+model_params:34401371
+gptq:reserving 10s, effective=590000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+0/20000 val_loss: 8.3157 val_bpb: 3.6139
+1/20000 train_loss: 8.3152 train_time: 0.0m tok/s: 8497508
+2/20000 train_loss: 12.2836 train_time: 0.0m tok/s: 8361985
+3/20000 train_loss: 10.8162 train_time: 0.0m tok/s: 8246061
+4/20000 train_loss: 9.0635 train_time: 0.0m tok/s: 8188349
+5/20000 train_loss: 7.7942 train_time: 0.0m tok/s: 8153099
+500/20000 train_loss: 3.0027 train_time: 0.8m tok/s: 7878519
+1000/20000 train_loss: 2.9989 train_time: 1.7m tok/s: 7866583
+1500/20000 train_loss: 2.9094 train_time: 2.5m tok/s: 7866417
+2000/20000 train_loss: 2.7554 train_time: 3.3m tok/s: 7869223
+2500/20000 train_loss: 2.7623 train_time: 4.2m tok/s: 7871668
+3000/20000 train_loss: 2.7322 train_time: 5.0m tok/s: 7874038
+3500/20000 train_loss: 2.6598 train_time: 5.8m tok/s: 7875895
+4000/20000 train_loss: 2.6695 train_time: 6.7m tok/s: 7877977
+4000/20000 val_loss: 2.6703 val_bpb: 1.1605
+4500/20000 train_loss: 2.6197 train_time: 7.5m tok/s: 7878855
+5000/20000 train_loss: 2.5924 train_time: 8.3m tok/s: 7880038
+5500/20000 train_loss: 2.5615 train_time: 9.1m tok/s: 7881157
+5914/20000 val_loss: 2.5393 val_bpb: 1.1035
+stopping_early: wallclock_cap train_time: 590078ms step: 5914/20000
+peak memory allocated: 25773 MiB reserved: 25882 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.53646569 val_bpb:1.10231877 eval_time:1719ms
+Serialized model: 132405827 bytes
+Code size: 52689 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 66 Hessians in 8.3s
+GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search
+Serialized model int6+brotli: 15917226 bytes
+Total submission size int6+brotli: 15969915 bytes
+final_int6_roundtrip val_loss:2.56297595 val_bpb:1.11383982 eval_time:6449ms
+final_int6_sliding_window val_loss:2.52096094 val_bpb:1.09558058 eval_time:66692ms
diff --git a/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed42.log b/records/track_10min_16mb/2026-04-03_V2_SP4096_DepthRecur/train_seed42.log
@@ -0,0 +1,125 @@
+W0404 07:43:59.954000 42021 torch/distributed/run.py:803] 
+W0404 07:43:59.954000 42021 torch/distributed/run.py:803] *****************************************
+W0404 07:43:59.954000 42021 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0404 07:43:59.954000 42021 torch/distributed/run.py:803] *****************************************
+Hyperparameters:
+  adam_eps: 1e-08
+  adam_wd: 0.02
+  beta1: 0.9
+  beta2: 0.95
+  compressor: brotli
+  data_dir: ./data/
+  datasets_dir: ./data/datasets/fineweb10B_sp4096
+  distributed: True
+  ema_decay: 0.997
+  embed_lr: 0.6
+  embed_wd: 0.09
+  embedding_dim: 512
+  eval_seq_len: 2048
+  eval_stride: 64
+  gptq_calibration_batches: 64
+  gptq_enabled: True
+  gptq_reserve_seconds: 10.0
+  grad_accum_steps: 1
+  grad_clip_norm: 0.3
+  head_lr: 0.008
+  is_main_process: True
+  iterations: 20000
+  ln_scale: True
+  local_rank: 0
+  logfile: logs/1deef4c5-5c32-4a7f-b79c-300d44472448.txt
+  logit_softcap: 30.0
+  matrix_lr: 0.02
+  max_wallclock_seconds: 600.0
+  min_lr: 0.0
+  mlp_mult: 4.0
+  model_dim: 512
+  model_path: final_model.pt
+  muon_backend_steps: 4
+  muon_beta2: 0.95
+  muon_momentum: 0.99
+  muon_momentum_warmup_start: 0.92
+  muon_momentum_warmup_steps: 1500
+  muon_wd: 0.09
+  num_heads: 8
+  num_kv_heads: 4
+  num_layers: 11
+  qk_gain_init: 4.0
+  quantized_model_path: final_model.int6.ptz
+  rank: 0
+  recur_layers: 
+  recur_start_step: 3000
+  rope_base: 10000.0
+  rope_dims: 16
+  rope_train_seq_len: 2048
+  run_id: 1deef4c5-5c32-4a7f-b79c-300d44472448
+  scalar_lr: 0.02
+  seed: 42
+  skip_gates_enabled: True
+  sliding_window_enabled: True
+  slot_enabled: False
+  slot_lr: 0.005
+  slot_steps: 8
+  tie_embeddings: True
+  tied_embed_init_std: 0.005
+  tied_embed_lr: 0.03
+  tokenizer_path: ./data/tokenizers/fineweb_4096_bpe.model
+  train_batch_tokens: 786432
+  train_files: ./data/datasets/fineweb10B_sp4096/fineweb_train_*.bin
+  train_log_every: 500
+  train_seq_len: 2048
+  val_batch_tokens: 524288
+  val_files: ./data/datasets/fineweb10B_sp4096/fineweb_val_*.bin
+  val_loss_every: 4000
+  ve_dim: 128
+  ve_enabled: True
+  ve_layers: 9,10
+  vocab_size: 4096
+  warmdown_frac: 0.667
+  warmup_steps: 20
+  world_size: 8
+  xsa_last_n: 11
+train_shards: 143
+val_tokens: 45508608
+model_params:34401371
+gptq:reserving 10s, effective=590000ms
+warmup_step: 1/20
+warmup_step: 2/20
+warmup_step: 3/20
+warmup_step: 4/20
+warmup_step: 5/20
+warmup_step: 6/20
+warmup_step: 10/20
+warmup_step: 20/20
+0/20000 val_loss: 8.3187 val_bpb: 3.6152
+1/20000 train_loss: 8.3178 train_time: 0.0m tok/s: 8428254
+2/20000 train_loss: 12.2740 train_time: 0.0m tok/s: 8320211
+3/20000 train_loss: 10.8397 train_time: 0.0m tok/s: 8228668
+4/20000 train_loss: 9.0791 train_time: 0.0m tok/s: 8175946
+5/20000 train_loss: 7.8026 train_time: 0.0m tok/s: 8142297
+500/20000 train_loss: 2.9997 train_time: 0.8m tok/s: 7892437
+1000/20000 train_loss: 3.0009 train_time: 1.7m tok/s: 7881175
+1500/20000 train_loss: 2.9133 train_time: 2.5m tok/s: 7880352
+2000/20000 train_loss: 2.7586 train_time: 3.3m tok/s: 7879069
+2500/20000 train_loss: 2.7596 train_time: 4.2m tok/s: 7876718
+3000/20000 train_loss: 2.7333 train_time: 5.0m tok/s: 7877002
+3500/20000 train_loss: 2.6583 train_time: 5.8m tok/s: 7876271
+4000/20000 train_loss: 2.6694 train_time: 6.7m tok/s: 7877532
+4000/20000 val_loss: 2.6698 val_bpb: 1.1603
+4500/20000 train_loss: 2.6216 train_time: 7.5m tok/s: 7879398
+5000/20000 train_loss: 2.5937 train_time: 8.3m tok/s: 7880764
+5500/20000 train_loss: 2.5632 train_time: 9.1m tok/s: 7882133
+5915/20000 val_loss: 2.5396 val_bpb: 1.1037
+stopping_early: wallclock_cap train_time: 590101ms step: 5915/20000
+peak memory allocated: 25773 MiB reserved: 25882 MiB
+ema:applying EMA weights
+pre-quantization post-ema val_loss:2.53685127 val_bpb:1.10248634 eval_time:1724ms
+Serialized model: 132405827 bytes
+Code size: 52689 bytes
+GPTQ:collecting Hessians from calibration data...
+GPTQ:collected 66 Hessians in 8.3s
+GPTQ quantization: 66 layers with full GPTQ, 0 fallback to clip-search
+Serialized model int6+brotli: 15924719 bytes
+Total submission size int6+brotli: 15977408 bytes
+final_int6_roundtrip val_loss:2.56379873 val_bpb:1.11419739 eval_time:6449ms
+final_int6_sliding_window val_loss:2.52178284 val_bpb:1.09593777 eval_time:67034ms