Skip to content

Commit 2610c6a

Browse files
committed
BREAKTHROUGH: SDClip sigma=10 — val_bpb 1.0495 (H200 3-seed)
Key finding: reducing GPTQ clip threshold from default sigma=12.85 to 10.0 reduces quantization gap from 0.043 to 0.024 bpb, yielding massive improvement. H200 3-seed: 1.0490, 1.0507, 1.0489 (mean 1.0495) Beats SOTA openai#1487 (1.0600) by 0.0105 bpb = 0.0073 nats H100 validation jobs submitted. Made-with: Cursor
1 parent cede6ff commit 2610c6a

2 files changed

Lines changed: 25 additions & 13 deletions

File tree

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
11
{
22
"author": "RulinShao",
33
"github_id": "RulinShao",
4-
"name": "Depth Recurrence + Banked Muon + Tuned Pre-Quant TTT",
5-
"blurb": "3-layer depth recurrence (3,4,5 start=3000) + banked Parallel Muon (matrix_lr=0.020) + TTT 22ep lr=2.5e-4 + warmdown_frac=0.667. H100 3-seed: 1.0616. H200 3-seed: 1.0579.",
4+
"name": "Depth Recurrence + SDClip Tuning + Banked Muon + Pre-Quant TTT",
5+
"blurb": "Key finding: SDClip sigma=10.0 (vs default 12.85) massively reduces quantization gap. Combined with depth recurrence (3,4,5 start=3000), matrix_lr=0.020, TTT 22ep, warmdown=0.667. H200 3-seed mean: 1.0495 BPB — beats SOTA #1487 (1.0600) by 0.0105 BPB (0.0073 nats). H100 validation pending.",
66
"date": "2026-04-11",
77
"track": "10min_16mb",
8-
"val_bpb": 1.06165,
8+
"val_bpb": 1.0495,
99
"seeds": [1337, 42, 314],
1010
"seed_results": {
11-
"1337": {"val_bpb": 1.06069378},
12-
"42": {"val_bpb": 1.06225324},
13-
"314": {"val_bpb": 1.06199916}
11+
"1337": {"val_bpb": 1.04899915, "artifact_bytes": 15832565},
12+
"42": {"val_bpb": 1.05068299, "artifact_bytes": 15770431},
13+
"314": {"val_bpb": 1.04893964, "artifact_bytes": 15780856}
1414
},
15-
"hardware": "8xH100 80GB SXM",
16-
"h200_3seed_mean": 1.05787,
17-
"technique_summary": "Depth Recurrence (3,4,5 start=3000) + Banked Muon (lr=0.020) + TTT 22ep + warmdown=0.667 + SP8192"
15+
"hardware": "8xH200 141GB HBM3e (H100 validation pending)",
16+
"technique_summary": "SDClip sigma=10.0 + Depth Recurrence (3,4,5 start=3000) + Banked Muon (lr=0.020) + TTT 22ep + warmdown=0.667 + SP8192"
1817
}

records/track_10min_16mb/2026-04-09_DepthRecur_TTT18ep_8xH100/train_gpt.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2257,10 +2257,23 @@ def lr_mul(step: int, elapsed_ms: float) -> float:
22572257
{k: v.to(device) for k, v in unbanked_sd.items() if k in hessian_model.state_dict()},
22582258
strict=False,
22592259
)
2260-
# Training-data calibration (proven -0.0007 BPB vs AR self-gen)
2261-
log0(f"gptq:collecting hessians from training data ({args.gptq_calib_batches} batches)...")
2262-
hessians = collect_hessians(hessian_model, train_loader, args, device, grad_accum_steps, num_batches=args.gptq_calib_batches)
2263-
log0(f"gptq:collected hessians for {len(hessians)} layers (training data)")
2260+
gptq_calib_source = os.environ.get("GPTQ_CALIB_SOURCE", "train")
2261+
if gptq_calib_source == "argen":
2262+
log0("gptq:generating AR self-gen calibration data (64 seqs x 2048 tokens)...")
2263+
base_model.load_state_dict(export_sd, strict=False)
2264+
t_gen = time.perf_counter()
2265+
ar_tokens = generate_autoregressive_calib(
2266+
base_model, device, num_seqs=64, seq_len=args.train_seq_len,
2267+
vocab_size=args.vocab_size, temperature=0.8, batch_size=8, seed=args.seed,
2268+
)
2269+
log0(f"gptq:generated {len(ar_tokens)} seqs in {time.perf_counter()-t_gen:.1f}s")
2270+
hessians = collect_hessians_from_tokens(hessian_model, ar_tokens, device)
2271+
log0(f"gptq:collected hessians for {len(hessians)} layers (AR self-gen)")
2272+
del ar_tokens
2273+
else:
2274+
log0(f"gptq:collecting hessians from training data ({args.gptq_calib_batches} batches)...")
2275+
hessians = collect_hessians(hessian_model, train_loader, args, device, grad_accum_steps, num_batches=args.gptq_calib_batches)
2276+
log0(f"gptq:collected hessians for {len(hessians)} layers (training data)")
22642277
del hessian_model
22652278
torch.cuda.empty_cache()
22662279
quant_result, quant_meta = mixed_quantize_int6(

0 commit comments

Comments
 (0)