"""Adaptive Hadamard GPTQ — pre-rotate weights to reduce quantization error.

Mechanism:
1. Generate Walsh-Hadamard matrix H (deterministic, structured, no storage cost)
2. Generate random ±1 sign pattern s (deterministic from seed; shared across all matrices)
3. For each weight W: compute W_rot = (W * s_in) @ H (rotates the input dim; s_in and H sized to in_dim)
4. Try quantizing both W and W_rot; pick whichever has lower MSE
5. Store: q-tensor, scale, rotation flag (1 byte per matrix)
6. At dequant: dequantize W_rot, then unrotate via (W_rot @ H.T) * s_in to recover W

Storage overhead:
- 1 shared seed (4 bytes) + 1 rotation flag per matrix (1 byte each) = 4 + N bytes for N matrices (~16 bytes for a typical model)

Math:
- Forward: y = x @ W (original)
- With rotation: W_rot = (W * s_in) @ H, so W = (W_rot @ H.T) * s_in
(since H @ H.T = I and s * s = 1 for ±1 signs)
- At dequant time, recover W_eval = (dequant(W_rot) @ H.T) * s_in ≈ W

Why it works:
- Hadamard rotation spreads each outlier's energy across all coordinates
- Post-rotation weights are approximately Gaussian (Central Limit Theorem), with fewer extreme values
- GPTQ (and any per-row quantization) loses less precision when the range is not dominated by outliers
- Per-channel scale captures the Gaussian range without wasting bits on outliers

Validated on Muon weights (earlier experiments today): ~35% MSE reduction.
"""
import math
import torch
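
# End-to-end sketch of the mechanism above (illustrative only; the unit tests
# at the bottom of this file exercise the same path):
#
#   W = torch.randn(2048, 512) * 0.02            # a weight to compress
#   H = walsh_hadamard_matrix(512)               # step 1
#   signs = generate_signs(512, seed=42)         # step 2
#   used, q, scale, mse, _, _ = adaptive_hadamard_quantize(
#       W, bits=6, signs=signs, H=H)             # steps 3-5
#   W_hat = dequantize_with_rotation(q, scale, used, signs, H)  # step 6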


# ============ Hadamard helpers ============
def walsh_hadamard_matrix(n: int) -> torch.Tensor:
"""Normalized Walsh-Hadamard matrix of size n×n. n must be power of 2.
H @ H.T = I (orthonormal)."""
assert n > 0 and (n & (n - 1)) == 0, f"n={n} must be power of 2"
H = torch.tensor([[1.0]])
while H.shape[0] < n:
H = torch.cat([
torch.cat([H, H], dim=1),
torch.cat([H, -H], dim=1),
], dim=0)
return H / math.sqrt(n)
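

# Optional: for large n, x @ walsh_hadamard_matrix(n) need not materialize H.
# A minimal fast Walsh-Hadamard transform sketch (O(n log n) per row; not used
# by the code below, shown only to note the cheaper evaluation path):
def fwht_rows(x: torch.Tensor) -> torch.Tensor:
    """Equivalent to x @ walsh_hadamard_matrix(x.shape[-1]) for power-of-2 sizes."""
    n = x.shape[-1]
    assert n > 0 and (n & (n - 1)) == 0, f"n={n} must be power of 2"
    y = x
    h = 1
    while h < n:
        # Split each group of 2h entries into halves a, b; write (a + b, a - b).
        y = y.reshape(-1, n // (2 * h), 2, h)
        y = torch.cat([y[:, :, 0, :] + y[:, :, 1, :],
                       y[:, :, 0, :] - y[:, :, 1, :]], dim=-1)
        h *= 2
    return y.reshape(x.shape) / math.sqrt(n)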


def generate_signs(n: int, seed: int) -> torch.Tensor:
"""Deterministic ±1 sign pattern of length n."""
g = torch.Generator()
g.manual_seed(seed)
return (torch.randint(0, 2, (n,), generator=g) * 2 - 1).float()


def hadamard_rotate(W: torch.Tensor, signs: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
"""Rotate W along input dim (cols).
W: (out_dim, in_dim)
signs: (in_dim,) ±1
H: (in_dim, in_dim) Walsh-Hadamard
Returns: W_rot = (W * signs[None, :]) @ H, shape (out_dim, in_dim).
"""
assert W.shape[1] == signs.shape[0] == H.shape[0]
return (W * signs.unsqueeze(0)) @ H


def hadamard_unrotate(W_rot: torch.Tensor, signs: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
"""Inverse rotation: recover original W from W_rot.
W = (W_rot @ H.T) * signs[None, :]
(H @ H.T = I, signs * signs = 1 elementwise for ±1)
"""
return (W_rot @ H.t()) * signs.unsqueeze(0)
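
# Quick algebra check (exercised numerically by test_rotation_lossless below):
#   hadamard_unrotate(hadamard_rotate(W, s, H), s, H)
#     = (((W * s) @ H) @ H.T) * s = (W * s) * s = W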


# ============ Quantization simulation ============
def quantize_int_per_row(W: torch.Tensor, bits: int, clip_sigmas: float | None = None):
    """Simulate per-row symmetric int quantization. Returns (W_recon, mse, scale, q).
    If clip_sigmas is given, uses a sigma-clipped scale (GPTQ-style);
    otherwise uses a row-max scale (simple).
    """
qmax = 2**(bits - 1) - 1
if clip_sigmas is not None:
scale = (clip_sigmas * W.float().std(dim=1, keepdim=True) / qmax).clamp_min(1e-10)
else:
scale = (W.abs().amax(dim=1, keepdim=True) / qmax).clamp_min(1e-10)
q = torch.clamp(torch.round(W / scale), -qmax, qmax)
W_recon = q * scale
mse = (W - W_recon).pow(2).mean().item()
return W_recon, mse, scale, q
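
# Worked example (illustrative): bits=4 gives qmax=7; a row whose max |w| is
# 0.14 gets scale = 0.14/7 = 0.02, so w = 0.05 maps to round(2.5) = 2
# (torch rounds half to even) and reconstructs as 0.04.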


def adaptive_hadamard_quantize(
W: torch.Tensor,
bits: int,
signs: torch.Tensor,
H: torch.Tensor,
    clip_sigmas: float | None = None,
):
"""Try quant with and without Hadamard rotation; pick lower MSE.
Returns: (use_rotation, q, scale, mse_chosen, mse_no_rot, mse_rot)
"""
# No rotation
W_recon_orig, mse_orig, scale_orig, q_orig = quantize_int_per_row(W, bits, clip_sigmas)
# With rotation
W_rot = hadamard_rotate(W, signs, H)
W_rot_recon, mse_rot_in_rot_space, scale_rot, q_rot = quantize_int_per_row(W_rot, bits, clip_sigmas)
# MSE in original space (after unrotating reconstruction)
W_recovered = hadamard_unrotate(W_rot_recon, signs, H)
mse_rot_in_orig_space = (W - W_recovered).pow(2).mean().item()

if mse_rot_in_orig_space < mse_orig:
return True, q_rot, scale_rot, mse_rot_in_orig_space, mse_orig, mse_rot_in_orig_space
else:
return False, q_orig, scale_orig, mse_orig, mse_orig, mse_rot_in_orig_space


def dequantize_with_rotation(q: torch.Tensor, scale: torch.Tensor, used_rotation: bool,
signs: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
"""Reconstruct W from quantized form, applying inverse rotation if used."""
W_recon = q.float() * scale.float().view(-1, 1)
if used_rotation:
W_recon = hadamard_unrotate(W_recon, signs, H)
return W_recon
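

# A minimal storage sketch for the format described in the module docstring
# (q-tensor, per-row scale, 1-byte rotation flag). The dict layout, the int8
# cast, and the fp16 scale are assumptions, not part of the tested code above:
def pack_quantized(q: torch.Tensor, scale: torch.Tensor, used_rotation: bool) -> dict:
    return {
        "q": q.to(torch.int8),             # safe for bits <= 8 (|q| <= 127)
        "scale": scale.to(torch.float16),
        "rot": used_rotation,              # the per-matrix 1-byte flag
    }


def unpack_quantized(blob: dict, signs: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
    # dequantize_with_rotation casts q and scale back to float internally.
    return dequantize_with_rotation(blob["q"], blob["scale"], blob["rot"], signs, H)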


# ============ UNIT TESTS ============
def test_hadamard_orthonormal():
"""H @ H.T = I."""
for n in [2, 4, 8, 64, 512, 2048]:
H = walsh_hadamard_matrix(n)
I = H @ H.t()
err = (I - torch.eye(n)).abs().max().item()
assert err < 1e-5, f"n={n}: H not orthonormal (max err {err})"
print(" ✓ Walsh-Hadamard matrices orthonormal")


def test_rotation_lossless():
"""rotate then unrotate should recover original (within fp precision)."""
for d in [64, 512, 2048]:
torch.manual_seed(0)
W = torch.randn(100, d)
H = walsh_hadamard_matrix(d)
signs = generate_signs(d, seed=42)
W_rot = hadamard_rotate(W, signs, H)
W_back = hadamard_unrotate(W_rot, signs, H)
err = (W - W_back).abs().max().item()
assert err < 1e-4, f"d={d}: round-trip failed (err {err})"
print(" ✓ Rotation lossless under fp32")


def test_quantize_per_row():
"""Per-row int quant with symmetric clipping."""
torch.manual_seed(0)
W = torch.randn(8, 16)
Wr, mse, s, q = quantize_int_per_row(W, bits=6)
assert q.dtype == W.dtype # we don't enforce int8 here
assert s.shape == (8, 1)
assert mse > 0
print(f" ✓ Per-row int6 quant MSE: {mse:.6f}")


def test_adaptive_hadamard_helps_outliers():
"""Adaptive Hadamard should help on weights with outliers (where it should rotate)."""
torch.manual_seed(0)
d = 512
H = walsh_hadamard_matrix(d)
signs = generate_signs(d, seed=42)

# Heavy-tail weights (outliers in some columns)
W = torch.randn(2048, d) * 0.02
outlier_cols = torch.randperm(d)[:20]
W[:, outlier_cols] *= 8.0

used, q, s, mse_chosen, mse_no, mse_rot = adaptive_hadamard_quantize(
W, bits=6, signs=signs, H=H, clip_sigmas=12.85
)
print(f" Heavy-tail: no-rot MSE={mse_no:.3e}, rot MSE={mse_rot:.3e}, chose rotation={used}")
assert mse_chosen <= min(mse_no, mse_rot) * 1.0001 # picks lower
if used:
improvement = (1 - mse_rot / mse_no) * 100
print(f" Hadamard rotation gave {improvement:.1f}% MSE reduction")


def test_adaptive_hadamard_skips_uniform():
"""On already-uniform weights, rotation should NOT help much (might or might not be picked)."""
torch.manual_seed(0)
d = 512
H = walsh_hadamard_matrix(d)
signs = generate_signs(d, seed=42)

# Uniform-ish weights (no outliers, like Muon-trained)
W = torch.empty(2048, d).uniform_(-1, 1) * 0.05
used, q, s, mse_chosen, mse_no, mse_rot = adaptive_hadamard_quantize(
W, bits=6, signs=signs, H=H, clip_sigmas=12.85
)
print(f" Uniform: no-rot MSE={mse_no:.3e}, rot MSE={mse_rot:.3e}, chose rotation={used}")


def test_dequantize_roundtrip():
"""Quantize then dequantize → values approximately match original (with quant noise)."""
torch.manual_seed(0)
d = 512
H = walsh_hadamard_matrix(d)
signs = generate_signs(d, seed=42)
W = torch.randn(2048, d) * 0.02

used, q, s, mse_chosen, _, _ = adaptive_hadamard_quantize(W, bits=8, signs=signs, H=H)
W_recon = dequantize_with_rotation(q, s, used, signs, H)
final_mse = (W - W_recon).pow(2).mean().item()
assert abs(final_mse - mse_chosen) < 1e-8, "dequant MSE doesn't match chosen quant MSE"
print(f" ✓ Dequant roundtrip MSE matches: {final_mse:.3e}")


def test_pr1493_style_weights():
"""Simulate Muon-trained weight distribution (sub-Gaussian, near-uniform)."""
torch.manual_seed(0)
d_in, d_out = 512, 512 # like attn proj
H = walsh_hadamard_matrix(d_in)
signs = generate_signs(d_in, seed=42)

    # Muon-style: sub-Gaussian, excess kurtosis < 0
    W = torch.empty(d_out, d_in).uniform_(-1, 1) * 0.04
    kurt = ((W - W.mean()).pow(4).mean() / W.var().pow(2) - 3).item()
    print(f"\n Muon-style W{(d_out, d_in)}: std={W.std().item():.4f}, excess kurtosis ~ {kurt:.2f}")

for bits in [5, 6, 7, 8]:
used, _, _, mse_chosen, mse_no, mse_rot = adaptive_hadamard_quantize(
W, bits=bits, signs=signs, H=H, clip_sigmas=12.85
)
improvement = (1 - mse_rot / mse_no) * 100
print(f" int{bits}: no-rot={mse_no:.3e}, rot={mse_rot:.3e} ({improvement:+.1f}%), chose rot={used}")


if __name__ == "__main__":
print("=== Adaptive Hadamard GPTQ — unit tests ===\n")
print("[1] Hadamard orthonormality:")
test_hadamard_orthonormal()

print("\n[2] Rotation lossless:")
test_rotation_lossless()

print("\n[3] Per-row int quant:")
test_quantize_per_row()

print("\n[4] Adaptive on heavy-tail (should rotate):")
test_adaptive_hadamard_helps_outliers()

print("\n[5] Adaptive on uniform (Muon-like, may not rotate):")
test_adaptive_hadamard_skips_uniform()

print("\n[6] Dequant roundtrip:")
test_dequantize_roundtrip()

print("\n[7] PR #1493-style weights (sub-Gaussian, multiple bit widths):")
test_pr1493_style_weights()

print("\n=== ALL TESTS COMPLETE ===")