openai · lkk688 · Apr 22, 2026 · May 1, 2026 · May 1, 2026
diff --git a/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/ train.log b/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/ train.log
@@ -0,0 +1,92 @@
+output/run_sweep_record_try_sp1024_v1/record_try_sp1024_512d_mlp2_value_resid_last2_ppm_hi075_steps2200_20260501_051628/20260501_051631.txt
+val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model
+train_loader:dataset:fineweb10B_sp1024 train_shards:80
+val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632
+Architecture: Discrete N-Gram Hash (Max N=2)
+lora_params:0
+model_params:17385709
+world_size:1 grad_accum_steps:4
+attention_mode:gqa num_heads:8 num_kv_heads:4
+tie_embeddings:True matrix_lr:0.04 scalar_lr:0.04
+ttt_enabled:False ttt_mode:lora lora_ttt_enabled:False
+parallel_v2_enabled:0 mode:dual_add second_lane:mlp active_layers:[] second_lane_params:0
+W0501 05:17:04.134000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/0_1] q0 is not in var_ranges, defaulting to unknown range.
+W0501 05:17:04.175000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/0_1] z0 is not in var_ranges, defaulting to unknown range.
+W0501 05:17:06.797000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/0_1] x0 is not in var_ranges, defaulting to unknown range.
+W0501 05:17:43.264000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] q0 is not in var_ranges, defaulting to unknown range.
+W0501 05:17:43.279000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] z0 is not in var_ranges, defaulting to unknown range.
+W0501 05:17:45.391000 130660118263360 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] x0 is not in var_ranges, defaulting to unknown range.
+warmup_step:1/20
+warmup_step:2/20
+warmup_step:3/20
+warmup_step:4/20
+warmup_step:5/20
+warmup_step:6/20
+warmup_step:7/20
+warmup_step:8/20
+warmup_step:9/20
+warmup_step:10/20
+warmup_step:11/20
+warmup_step:12/20
+warmup_step:13/20
+warmup_step:14/20
+warmup_step:15/20
+warmup_step:16/20
+warmup_step:17/20
+warmup_step:18/20
+warmup_step:19/20
+warmup_step:20/20
+EMA Enabled: decay=0.997
+Scheduled Late QAT to start at step 1870 (last 15.0%)
+step:0/2200 val_loss:6.9311 val_bpb:4.1050 train_time:3ms step_avg:3.24ms
+step:1/2200 train_loss:6.9310 train_time:5646ms step_avg:5645.96ms
+step:2/2200 train_loss:6.7587 train_time:6260ms step_avg:3130.22ms
+step:3/2200 train_loss:6.3280 train_time:6876ms step_avg:2291.92ms
+step:4/2200 train_loss:6.0142 train_time:7491ms step_avg:1872.66ms
+step:5/2200 train_loss:5.8538 train_time:8105ms step_avg:1621.03ms
+step:6/2200 train_loss:5.7285 train_time:8720ms step_avg:1453.37ms
+step:7/2200 train_loss:5.6112 train_time:9335ms step_avg:1333.56ms
+step:8/2200 train_loss:5.5435 train_time:9950ms step_avg:1243.71ms
+step:9/2200 train_loss:5.4271 train_time:10564ms step_avg:1173.82ms
+step:10/2200 train_loss:5.3285 train_time:11179ms step_avg:1117.95ms
+step:200/2200 train_loss:2.6610 train_time:127902ms step_avg:639.51ms
+step:400/2200 train_loss:2.3472 train_time:250871ms step_avg:627.18ms
+step:600/2200 train_loss:2.4710 train_time:373898ms step_avg:623.16ms
+step:800/2200 train_loss:2.3273 train_time:496924ms step_avg:621.15ms
+step:1000/2200 train_loss:2.3827 train_time:619883ms step_avg:619.88ms
+step:1000/2200 val_loss:2.3569 val_bpb:1.3959 train_time:619884ms step_avg:619.88ms
+step:1200/2200 train_loss:2.2925 train_time:742828ms step_avg:619.02ms
+step:1400/2200 train_loss:2.3217 train_time:865823ms step_avg:618.44ms
+step:1600/2200 train_loss:2.1926 train_time:988786ms step_avg:617.99ms
+step:1800/2200 train_loss:2.2291 train_time:1111692ms step_avg:617.61ms
+[Step 1870] Activating Late QAT — enabling branchless STE quantization.
+step:2000/2200 train_loss:2.1763 train_time:1234584ms step_avg:617.29ms
+step:2000/2200 val_loss:2.1922 val_bpb:1.2984 train_time:1234584ms step_avg:617.29ms
+step:2200/2200 train_loss:2.1102 train_time:1357536ms step_avg:617.06ms
+step:2200/2200 val_loss:2.1771 val_bpb:1.2894 train_time:1357536ms step_avg:617.06ms
+peak memory allocated: 29237 MiB reserved: 30568 MiB
+Applying EMA weights for final evaluation...
+saved raw checkpoint: output/run_sweep_record_try_sp1024_v1/record_try_sp1024_512d_mlp2_value_resid_last2_ppm_hi075_steps2200_20260501_051628/final_model.pt (67,874,184 bytes)
+model_size int8+zlib:15650103 bytes  code:156032 bytes  total:15806135 bytes  limit:16MB(16777216)  FITS
+  payload:17568410  raw_torch:17615294  compression_ratio:3.86x
+/workspace/parameter-golf/mytrain_gpt_v6_1.py:3216: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  quant_state = torch.load(io.BytesIO(zlib.decompress(quant_blob_disk)), map_location="cpu")
+final_int8_zlib_roundtrip val_loss:2.1839 val_bpb:1.2934 eval_time:14683ms
+final_int8_zlib_roundtrip_exact val_loss:2.18385094 val_bpb:1.29339954
+Starting PPM byte mixture evaluation...
+ppm_mix_progress seq:500/60568 tokens:513024 bytes:1223626 contexts:330630 skipped_ctx:0 mix_bpb:0.772718
+ppm_mix_progress seq:1000/60568 tokens:1025024 bytes:2455049 contexts:498547 skipped_ctx:0 mix_bpb:0.791232
+ppm_mix_progress seq:1500/60568 tokens:1537024 bytes:3681252 contexts:626070 skipped_ctx:0 mix_bpb:0.798705
+ppm_mix_progress seq:2000/60568 tokens:2049024 bytes:4901593 contexts:743558 skipped_ctx:0 mix_bpb:0.806348
+ppm_mix_progress seq:2500/60568 tokens:2561024 bytes:6124160 contexts:844658 skipped_ctx:0 mix_bpb:0.809410
+ppm_mix_progress seq:3000/60568 tokens:3073024 bytes:7342115 contexts:940062 skipped_ctx:0 mix_bpb:0.812034
+ppm_mix_progress seq:3500/60568 tokens:3585024 bytes:8556619 contexts:1034335 skipped_ctx:0 mix_bpb:0.815524
+ppm_mix_progress seq:4000/60568 tokens:4097024 bytes:9782949 contexts:1114667 skipped_ctx:0 mix_bpb:0.817923
+ppm_mix_progress seq:4500/60568 tokens:4609024 bytes:11023557 contexts:1185904 skipped_ctx:0 mix_bpb:0.820011
+ppm_mix_progress seq:5000/60568 tokens:5121024 bytes:12250013 contexts:1260459 skipped_ctx:0 mix_bpb:0.821949
+ppm_mix_progress seq:5500/60568 tokens:5633024 bytes:13478107 contexts:1335757 skipped_ctx:0 mix_bpb:0.823841
+ppm_mix_progress seq:6000/60568 tokens:6145024 bytes:14694159 contexts:1408314 skipped_ctx:0 mix_bpb:0.825123
+ppm_mix_progress seq:6500/60568 tokens:6657024 bytes:15917346 contexts:1482153 skipped_ctx:0 mix_bpb:0.826809
+ppm_mix_progress seq:7000/60568 tokens:7169024 bytes:17147554 contexts:1545185 skipped_ctx:0 mix_bpb:0.827852
+ppm_mix_progress seq:7500/60568 tokens:7681024 bytes:18359660 contexts:1612554 skipped_ctx:0 mix_bpb:0.828923
+ppm_mix_bpb:0.829467
diff --git a/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/README.md b/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/README.md
@@ -0,0 +1,132 @@
+# SP1024 + Value Residual + Byte-Level PPM Mixture
+
+## Overview
+
+This submission is the result of an incremental research process rather than a single clean-sheet design.
+
+The training script was built step by step across many rounds of experiments. Instead of hard-coding one fixed model, we kept most architecture, optimization, tokenizer, and evaluation ideas behind environment-controlled switches so we could run controlled ablations quickly and compare many alternatives within one stable framework.
+
+The final submission in this folder is a **record 16MB submission** based on:
+
+- SentencePiece 1024 tokenizer
+- 9-layer Transformer
+- model dimension 512
+- 8 attention heads / 4 KV heads
+- MLP multiplier 2
+- Value Residual enabled in the last 2 layers
+- byte-level PPM mixture during final evaluation
+
+## Submission Type
+
+This is a **record submission**.
+
+The included best run was produced on **1×H100**, with 600s as the wall clock. We do **not** claim verified compliance with the official **8×H100 / 10-minute** leaderboard requirement in this folder.
+
+However, this run does satisfy the artifact-size requirement:
+
+- compressed model: `15,650,103 bytes`
+- code size: `156,032 bytes`
+- total submission size: `15,806,135 bytes`
+
+This fits under the 16MB limit.
+
+## Best Included Result
+
+### Neural roundtrip score
+- `final_int8_zlib_roundtrip_exact val_bpb = 1.29339954`
+
+### Final mixed score
+- `ppm_mix_bpb = 0.829467`
+
+This was the strongest included result for the SP1024 compact line.
+
+## Main Idea
+
+Our final direction is intentionally simple:
+
+1. keep a compact Transformer backbone
+2. improve the late value path with **Value Residual**
+3. combine the neural model with a **byte-level PPM mixture** at evaluation time
+
+In our experiments, this combination was more useful than continuing to add more complicated architectural branches.
+
+## How the Code Evolved
+
+This codebase was not written as a minimal one-off competition script.  
+It evolved as a research scaffold.
+
+Over time, we added switches for many ideas so that the same script could be reused for many sweeps and fair ablations. The broader script supports experimentation with:
+
+- tokenizer variants
+- BiFPN / BiFPN2 skip fusion
+- XSA
+- N-gram augmentation
+- Value Residual
+- cross-layer V and KV sharing
+- PLE
+- MTP
+- parallel residual variants
+- parallel-v2 side lanes
+- LoRA-TTT
+- byte-level PPM mixture
+
+Many of these ideas were explored, but the strongest compact SP1024 line for this submission ended up being:
+
+**compact backbone + value residual + byte-level mixture**
+
+## Experimental Summary
+
+A short summary of the findings that most influenced this submission:
+
+### 1. Tokenizer choice mattered
+Earlier sweeps showed that tokenizer choice had a large impact on compression performance. We explored SP1024, SP4096, and SP8192. For this submission, we chose SP1024 because it provided a compact, size-friendly line suitable for a 16MB submission.
+
+### 2. Capacity still mattered
+Increasing backbone capacity often helped, but for this submission we prioritized a compact model that still achieved a strong mixed score while fitting under the 16MB limit.
+
+### 3. Value Residual was the strongest late-layer architectural improvement
+Across many later Transformer ablations, **Value Residual** was the most consistent improvement that survived repeated testing. In this submission we enable it only in the last 2 layers.
+
+### 4. Byte-level PPM mixture produced the largest final gain
+The final score improvement came primarily from combining the neural model with a **byte-level PPM mixture** rather than from continuing to add more neural-only complexity.
+
+## Final Configuration
+
+Key settings for the included run:
+
+- `VOCAB_SIZE=1024`
+- `NUM_LAYERS=9`
+- `MODEL_DIM=512`
+- `NUM_HEADS=8`
+- `NUM_KV_HEADS=4`
+- `MLP_MULT=2`
+- `VALUE_RESIDUAL_ENABLED=1`
+- `VALUE_RESIDUAL_LAST_N_LAYERS=2`
+- `BIFPN2_MODE=1`
+- `XSA_ENABLED=1`
+- `NGRAM_MAX_N=2`
+- `EMA_ENABLED=1`
+- `LATE_QAT_RATIO=0.15`
+- `PPM_ENABLED=1`
+- `PPM_ORDER=5`
+- `PPM_CONF_THRESHOLD=0.9`
+- `LAMBDA_LO=0.10`
+- `LAMBDA_HI=0.75`
+
+## Included Files
+
+This folder contains:
+
+- `train_gpt.py` — final training and evaluation script
+- `submission.json` — submission metadata
+- `config.json` — selected configuration for the included run
+- `requirements.txt` — Python dependencies
+- `train.log` — log from the included best run
+- `seed_runs.csv` — representative run summary
+
+## Reproduction
+
+A representative launch for the included run is equivalent to:
+
+```bash
+torchrun --nproc_per_node=1 train_gpt.py
diff --git a/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/config.json b/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/config.json
@@ -0,0 +1,110 @@
+{
+  "DATA_PATH": "./data/datasets/fineweb10B_sp1024",
+  "TOKENIZER_PATH": "./data/tokenizers/fineweb_1024_bpe.model",
+  "NUM_LAYERS": 9,
+  "MODEL_DIM": 512,
+  "NUM_HEADS": 8,
+  "NUM_KV_HEADS": 4,
+  "MLP_MULT": 2,
+  "VOCAB_SIZE": 1024,
+  "TIE_EMBEDDINGS": 1,
+  "ROPE_BASE": 10000.0,
+  "ROPE_DIMS": -1,
+  "LEARNABLE_ROPE": 0,
+  "LOGIT_SOFTCAP": 30.0,
+  "QK_GAIN_INIT": 4.0,
+  "GRAD_ACCUM_STEPS": 4,
+  "TRAIN_BATCH_TOKENS": 524288,
+  "TRAIN_SEQ_LEN": 1024,
+  "ITERATIONS": 20000,
+  "WARMUP_STEPS": 20,
+  "WARMDOWN_ITERS": 900,
+  "MAX_WALLCLOCK_SECONDS": 600.0,
+  "VAL_BATCH_SIZE": 524288,
+  "VAL_LOSS_EVERY": 1000,
+  "TRAIN_LOG_EVERY": 200,
+  "MATRIX_LR": 0.04,
+  "SCALAR_LR": 0.04,
+  "EMBED_LR": 0.6,
+  "HEAD_LR": 0.008,
+  "TIED_EMBED_LR": 0.05,
+  "TIED_EMBED_INIT_STD": 0.005,
+  "MUON_MOMENTUM": 0.95,
+  "MUON_BACKEND_STEPS": 5,
+  "MUON_MOMENTUM_WARMUP_START": 0.85,
+  "MUON_MOMENTUM_WARMUP_STEPS": 500,
+  "BETA1": 0.9,
+  "BETA2": 0.95,
+  "ADAM_EPS": 1e-08,
+  "GRAD_CLIP_NORM": 0.0,
+  "FDA_MODE": 0,
+  "BIFPN_MODE": 0,
+  "BIFPN2_MODE": 1,
+  "BIFPN_GROUP_COUNT": 8,
+  "BIFPN_BAND_WIDTH": 1,
+  "BIFPN_NORM_EPS": 0.0001,
+  "BIFPN_INIT_MAIN": 1.0,
+  "BIFPN_INIT_NEIGHBOR": 0.15,
+  "BIFPN_INIT_FAR": 0.0,
+  "SMEAR_MODE": 0,
+  "SMEAR_WINDOW": 4,
+  "SMEAR_GATE": 0,
+  "LN_SCALE": 1,
+  "LEARNABLE_LN_SCALE": 0,
+  "AFFINE_NORM": 0,
+  "SCALEDLM_HEAD": 1,
+  "XSA_ENABLED": 1,
+  "XSA_LAST_N_LAYERS": 4,
+  "XSA_EPS": 1e-06,
+  "V_SKIP_ENABLED": 0,
+  "V_SKIP_LAST_N_LAYERS": 0,
+  "V_SKIP_MODE": "scalar",
+  "V_SKIP_GROUP_COUNT": 8,
+  "CROSS_LAYER_V_ENABLED": 0,
+  "CROSS_LAYER_V_LAST_N_LAYERS": 4,
+  "CROSS_LAYER_V_MODE": "residual",
+  "CROSS_LAYER_V_GROUP_COUNT": 4,
+  "CROSS_LAYER_KV_SHARING_ENABLED": 0,
+  "CROSS_LAYER_KV_LAST_N_LAYERS": 0,
+  "CROSS_LAYER_KV_SHARE_K": 1,
+  "CROSS_LAYER_KV_SHARE_V": 1,
+  "CROSS_LAYER_KV_PAIRWISE": 0,
+  "CROSS_LAYER_KV_PARTIAL_HEAD": 0,
+  "CROSS_LAYER_KV_PARTIAL_HEAD_COUNT": 2,
+  "VALUE_RESIDUAL_ENABLED": 1,
+  "VALUE_RESIDUAL_LAST_N_LAYERS": 2,
+  "VALUE_RESIDUAL_INIT_V0": 0.5,
+  "VALUE_RESIDUAL_INIT_CUR": 0.5,
+  "PLE_ENABLED": 0,
+  "MTP_NUM_HEADS": 0,
+  "NGRAM_VOCAB_SIZE": 2048,
+  "NGRAM_DIM": 128,
+  "NGRAM_MAX_N": 2,
+  "NGRAM_FADE_ENABLE": 1,
+  "NGRAM_FADE_START_FRAC": 0.15,
+  "NGRAM_FADE_END_FRAC": 0.45,
+  "NGRAM_FADE_MIN_SCALE": 0.0,
+  "EMA_ENABLED": 1,
+  "EMA_DECAY": 0.997,
+  "LATE_QAT_RATIO": 0.15,
+  "DYNAMIC_CLIP_PERCENTILES": "100.0,99.9999,99.9995,99.995,99.99,99.95,99.9,99.8",
+  "EVAL_USE_SLIDING_WINDOW": 0,
+  "EVAL_STRIDE": 1024,
+  "EVAL_BATCH_SEQS": 16,
+  "TELEMETRY_EVERY": 50,
+  "PROFILE_RUN": 0,
+  "PROFILE_WARMUP_STEPS": 5,
+  "PROFILE_ACTIVE_STEPS": 10,
+  "TTT_ENABLED": 0,
+  "LORA_TTT_ENABLED": 0,
+  "PPM_ENABLED": 1,
+  "PPM_ORDER": 5,
+  "PPM_SUBSET_TOKENS": 8000000,
+  "PPM_CONF_THRESHOLD": 0.9,
+  "LAMBDA_LO": 0.1,
+  "LAMBDA_HI": 0.75,
+  "NN_BYTE_PROJECTION": "spread_root",
+  "NN_BYTE_UNIFORM_FLOOR": 1e-06,
+  "STOP_MODE": "steps",
+  "MAX_TRAIN_STEPS": 2200
+}
diff --git a/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/requirements.txt b/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/requirements.txt
@@ -0,0 +1,10 @@
+numpy
+tqdm
+torch
+huggingface-hub
+kernels
+setuptools
+typing-extensions==4.15.0
+datasets
+tiktoken
+sentencepiece
diff --git a/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/seed_runs.csv b/records/track_10min_16mb/2026-04-30_SP1024_ValueResid_PPMMix/seed_runs.csv
@@ -0,0 +1,2 @@
+run_name,seed,tokenizer,model_dim,mlp_mult,num_layers,num_heads,num_kv_heads,value_residual_last_n_layers,ppm_order,ppm_conf_threshold,lambda_lo,lambda_hi,max_train_steps,world_size,grad_accum_steps,last_val_bpb,roundtrip_exact_val_bpb,ppm_mix_bpb,compressed_model_bytes,code_bytes,total_submission_bytes,fits_16mb,notes
+record_try_sp1024_512d_mlp2_value_resid_last2_ppm_hi075_steps2200,1337,sp1024,512,2,9,8,4,2,5,0.9,0.10,0.75,2200,1,4,1.2894,1.29339954,0.829467,15650103,156032,15806135,true,"Best included single-H100 non-record run; artifact fits under 16MB"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		run_name,seed,tokenizer,model_dim,mlp_mult,num_layers,num_heads,num_kv_heads,value_residual_last_n_layers,ppm_order,ppm_conf_threshold,lambda_lo,lambda_hi,max_train_steps,world_size,grad_accum_steps,last_val_bpb,roundtrip_exact_val_bpb,ppm_mix_bpb,compressed_model_bytes,code_bytes,total_submission_bytes,fits_16mb,notes
		record_try_sp1024_512d_mlp2_value_resid_last2_ppm_hi075_steps2200,1337,sp1024,512,2,9,8,4,2,5,0.9,0.10,0.75,2200,1,4,1.2894,1.29339954,0.829467,15650103,156032,15806135,true,"Best included single-H100 non-record run; artifact fits under 16MB"