fix(submission): use canonical byte-count exporter in prepare_caseops_data.py

dexhunter · dexhunter · commit 04d35edaad74 · 2026-04-24T02:01:17.000Z
The shipped `_token_original_byte_counts` used a try/except surface-walk that attributed 3 UTF-8 bytes to bare U+E001..U+E004 operator pieces AND failed to advance `cursor_o`, over-counting validation bytes by ~8.37% on FineWeb. The training sidecar actually used (built from a different internal path via `surface_piece_original_byte_counts`) is correct, so the submitted 1.06157 / 1.06549 metrics are unaffected — but the shipped prep script could not reproduce the sidecar from a cold checkout. Swap the buggy inline walker for a direct delegation to `surface_piece_original_byte_counts` from `lossless_caps.py` (the same canonical exporter used by PR openai#1729 / the HF-hosted dataset). Verified on 500 FineWeb val docs: patched output matches the shipped sidecar token-for-token (0 mismatches) and byte-sum matches true UTF-8 exactly. Also clean up README prose for the 04-24 record: SmearGate is a gate on the first GATE_WINDOW=12 feature dims of x_t adding a 1-token causal lookback (not a 12-token residual window); LQER asymmetric stores A as INT2 per-matrix and B as INT4 per-group-64 and selects K=3 whole tensors globally (not per-row output columns).
diff --git a/records/track_10min_16mb/2026-04-22_SP8192_CaseOps_GatedAttn_QuantGate_Loop45_PhasedTTT_MLPClip12/prepare_caseops_data.py b/records/track_10min_16mb/2026-04-22_SP8192_CaseOps_GatedAttn_QuantGate_Loop45_PhasedTTT_MLPClip12/prepare_caseops_data.py
@@ -54,7 +54,11 @@
 
 # Local import — lossless_caps.py ships next to this script.
 sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent))
-from lossless_caps import encode_lossless_caps_v2  # noqa: E402
+from lossless_caps import (  # noqa: E402
+    LOSSLESS_CAPS_CASEOPS_V1,
+    encode_lossless_caps_v2,
+    surface_piece_original_byte_counts,
+)
 
 
 SHARD_MAGIC = 20240520
@@ -92,44 +96,19 @@ def _token_original_byte_counts(
     original_text: str,
     transformed_text: str,
 ) -> np.ndarray:
-    """Compute per-token canonical (pre-transform) UTF-8 byte counts.
-
-    The tokenizer runs on the TRANSFORMED text (so operator tokens exist in
-    the vocabulary), but BPB must be scored on the ORIGINAL byte stream.
-    We tokenize the transformed text, then walk each token's surface form
-    through the decoder to recover the pre-transform substring, and count
-    the UTF-8 bytes of that.
-
-    This is an APPROXIMATION — it assumes every token maps cleanly back to
-    a contiguous original substring. For caseops_v1 (which is character-
-    level and bijective) this holds exactly, because operator tokens
-    correspond to positions in the original string where the case was
-    derived from surrounding letters rather than materialised bytes.
+    """Per-token canonical (pre-transform) UTF-8 byte counts.
+
+    Delegates to ``surface_piece_original_byte_counts`` in ``lossless_caps.py``
+    — the canonical exporter used by the PR #1729 / HF-hosted CaseOps dataset.
+    Operator pieces (U+E001..U+E004) contribute 0 original bytes; letter pieces
+    contribute their pre-transform UTF-8 byte count.
     """
-    # Re-encode via the SP model and get pieces (surface strings with the
-    # leading ▁ preserved, as in the BPE vocabulary).
-    piece_ids = sp.encode(transformed_text, out_type=int)
-    pieces = [sp.id_to_piece(int(pid)) for pid in piece_ids]
-    # Walk pieces and match against the transformed text to find byte spans.
-    counts = np.empty(len(piece_ids), dtype=np.uint16)
-    cursor_t = 0
-    cursor_o = 0
-    from lossless_caps import decode_lossless_caps_v2 as _decode
-    for i, piece in enumerate(pieces):
-        # SentencePiece uses ▁ as the whitespace marker.
-        surface = piece.replace("\u2581", " ")
-        span = transformed_text[cursor_t:cursor_t + len(surface)]
-        cursor_t += len(span)
-        # Decode just this span to find the original bytes it came from.
-        try:
-            decoded_prefix = _decode(transformed_text[:cursor_t])
-            original_bytes = len(decoded_prefix.encode("utf-8")) - cursor_o
-            cursor_o += original_bytes
-        except Exception:
-            # Fall back to counting the transformed surface.
-            original_bytes = len(span.encode("utf-8"))
-        counts[i] = max(0, min(65535, original_bytes))
-    return counts
+    proto = sp.encode_as_immutable_proto(transformed_text)
+    byte_counts = surface_piece_original_byte_counts(
+        (piece.surface for piece in proto.pieces),
+        text_transform_name=LOSSLESS_CAPS_CASEOPS_V1,
+    )
+    return np.asarray(list(byte_counts), dtype=np.uint16)
 
 
 def main() -> None:
diff --git a/records/track_10min_16mb/2026-04-24_PR1787Base_Smear_LQERAsym_PhasedTTT_1.06157/README.md b/records/track_10min_16mb/2026-04-24_PR1787Base_Smear_LQERAsym_PhasedTTT_1.06157/README.md
@@ -29,17 +29,17 @@ All 3 seeds clear both 600s budgets (train + eval) and the 16,000,000-byte decim
 This submission combines three components on top of the PR #1787 (nprime06) upstream base:
 
 1. **Native PR #1787 base stack** (CaseOps + SparseAttnGate + PolarNS + MIN_LR + FusedCE + PR #1767-style TTT with `TTT_WARM_START_A=1`). The SparseAttnGate (`SPARSE_ATTN_GATE_ENABLED=1`) is PR #1787's replacement for the earlier QuantGate — it's a sparse per-head multiplicative gate applied inside attention.
-2. **Smear gate** (`SMEAR_GATE_ENABLED=1`, `GATE_WINDOW=12`): a lightweight content-conditioned gate over the last 12 tokens of the residual stream. Orthogonal to SparseAttnGate because it operates on the residual (not on attention outputs) and uses causal local context, not the full attention window.
-3. **LQER asymmetric rank-k correction** (`LQER_ENABLED=1`, `LQER_RANK=4`, `LQER_TOP_K=3`, `LQER_FACTOR_BITS=4`, `LQER_ASYM_ENABLED=1`, `LQER_ASYM_GROUP=64`): inline post-GPTQ asymmetric low-rank error compensation. For the top-K output columns of each quantized MLP weight row the int6 quantization residual is factored as `W - W_q ≈ U Σ Vᵀ` with rank 4 and per-group (group size 64) asymmetric scaling on the factor side. Factors are stored int4 (4 bits/element) and Brotli-compressed with the model. Recovers ≈0.009 BPB of the int6 quantization tax at a ≈30 KB artifact cost.
+2. **Smear gate** (`SMEAR_GATE_ENABLED=1`, `GATE_WINDOW=12`): a lightweight content-conditioned gate over the **first `GATE_WINDOW=12` feature dimensions** of the current-token residual, modulating a **1-token causal lookback** `x_t ← x_t + λ · sigmoid(W · x_t[:12]) · x_{t-1}`. Orthogonal to SparseAttnGate because it operates on the residual (not on attention outputs) and uses only the previous token, not the full attention window.
+3. **LQER asymmetric rank-k correction** (`LQER_ENABLED=1`, `LQER_RANK=4`, `LQER_TOP_K=3`, `LQER_ASYM_ENABLED=1`, `LQER_ASYM_GROUP=64`): inline post-GPTQ asymmetric low-rank error compensation. The **top-K entire weight tensors (K=3)** are selected globally by Frobenius norm of the quantization residual `E = W - W_q`; each selected tensor is factored as `E ≈ A · B` via rank-4 SVD. In asymmetric mode, `A` is stored as **INT2 per-matrix (single fp16 scalar scale)** and `B` as **INT4 per-group-64**; both are Brotli-compressed with the model. Recovers ≈0.009 BPB of the int6 quantization tax at a ≈30 KB artifact cost. (`LQER_FACTOR_BITS=4` is consumed only by the symmetric fallback path and is unused here.)
 
 ### Mechanism stack
 
 | Component | Origin | Role |
 |-----------|--------|------|
 | CaseOps bijective case transform | PR #1729 (romeerp) / PR #1736 (ours) | ~1.5% token savings, full byte-level bijection |
 | SparseAttnGate | PR #1787 (nprime06) | sparse per-head gate inside attention |
-| Smear gate | this submission | causal content-conditioned gate over last 12 residual tokens |
-| LQER asymmetric rank-4 correction | this submission | post-GPTQ int6 residual recovery, int4 factors |
+| Smear gate | this submission | causal content-conditioned gate on first 12 residual dims, adding 1-token lookback |
+| LQER asymmetric rank-4 correction | this submission | post-GPTQ int6 residual recovery, INT2/INT4 asym factors on top-3 tensors |
 | Phased TTT (score-first, 3 phases, 2000-doc prefix) | PR #1394 / PR #1736 | per-document LoRA adapter, score-before-update |
 | Int6 GPTQ + Brotli compressor | PR #1019 / PR #1530 | fits int6 model + factors + code under 16,000,000 bytes |
 
@@ -84,7 +84,7 @@ Net on 3-seed mean: **−0.00392 BPB / −0.00856 val_loss (nats/token)** vs PR
 | parallel_start_layer | 8 |
 | eval_seq_len / eval_stride | 2048 / 64 |
 | matrix_bits / embed_bits | 6 / 7 |
-| LQER rank / top-K / factor bits / asym group | 4 / 3 / 4 / 64 |
+| LQER rank / top-K / A-bits / B-bits / asym group | 4 / 3 / 2 / 4 / 64 |
 | smear gate window | 12 |
 | compressor | brotli |
 
@@ -93,7 +93,7 @@ Net on 3-seed mean: **−0.00392 BPB / −0.00856 val_loss (nats/token)** vs PR
 - **Artifact ≤ 16,000,000 bytes DECIMAL**: all 3 seeds 15,951,189–15,953,718 bytes (~46–49 KB headroom).
 - **train_time ≤ 600s**: all 3 seeds 599.47–599.64s (`stopping_early: wallclock_cap`).
 - **total_eval_time ≤ 600s**: all 3 seeds 423.3–494.8s.
-- **Issue #1017 Condition 1 (causal dependence)**: (a) SparseAttnGate and Smear gate are pure functions of previous-token context (the smear window is strictly `tokens[t-GATE_WINDOW:t]`). (b) Phased TTT updates the per-document LoRA adapter AFTER scoring every chunk; no position-t prediction is ever conditioned on y_t or on positions > t.
+- **Issue #1017 Condition 1 (causal dependence)**: (a) SparseAttnGate and Smear gate are pure functions of previous-token context (the Smear gate reads only the first `GATE_WINDOW` feature dimensions of the current token's residual, `x_t[:GATE_WINDOW]`, and the immediately previous token `x_{t-1}`). (b) Phased TTT updates the per-document LoRA adapter AFTER scoring every chunk; no position-t prediction is ever conditioned on y_t or on positions > t.
 - **Issue #1017 Condition 2 (full normalized distribution)**: CE over the full 8192-token softmax at each position; no x_t-dependent restriction of Σ.
 - **Issue #1017 Condition 3 (score-before-update)**: the TTT path snapshots the pre-update per-chunk logits and scores them BEFORE the adapter SGD step. Per-document LoRA reset (`reusable_lora.reset()`) prevents cross-document leakage.
 - **Issue #1017 Condition 4 (single left-to-right pass)**: eval is one left-to-right pass with sliding stride 64; no rescore/selection.
diff --git a/records/track_10min_16mb/2026-04-24_PR1787Base_Smear_LQERAsym_PhasedTTT_1.06157/prepare_caseops_data.py b/records/track_10min_16mb/2026-04-24_PR1787Base_Smear_LQERAsym_PhasedTTT_1.06157/prepare_caseops_data.py
@@ -54,7 +54,11 @@
 
 # Local import — lossless_caps.py ships next to this script.
 sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent))
-from lossless_caps import encode_lossless_caps_v2  # noqa: E402
+from lossless_caps import (  # noqa: E402
+    LOSSLESS_CAPS_CASEOPS_V1,
+    encode_lossless_caps_v2,
+    surface_piece_original_byte_counts,
+)
 
 
 SHARD_MAGIC = 20240520
@@ -92,44 +96,19 @@ def _token_original_byte_counts(
     original_text: str,
     transformed_text: str,
 ) -> np.ndarray:
-    """Compute per-token canonical (pre-transform) UTF-8 byte counts.
-
-    The tokenizer runs on the TRANSFORMED text (so operator tokens exist in
-    the vocabulary), but BPB must be scored on the ORIGINAL byte stream.
-    We tokenize the transformed text, then walk each token's surface form
-    through the decoder to recover the pre-transform substring, and count
-    the UTF-8 bytes of that.
-
-    This is an APPROXIMATION — it assumes every token maps cleanly back to
-    a contiguous original substring. For caseops_v1 (which is character-
-    level and bijective) this holds exactly, because operator tokens
-    correspond to positions in the original string where the case was
-    derived from surrounding letters rather than materialised bytes.
+    """Per-token canonical (pre-transform) UTF-8 byte counts.
+
+    Delegates to ``surface_piece_original_byte_counts`` in ``lossless_caps.py``
+    — the canonical exporter used by the PR #1729 / HF-hosted CaseOps dataset.
+    Operator pieces (U+E001..U+E004) contribute 0 original bytes; letter pieces
+    contribute their pre-transform UTF-8 byte count.
     """
-    # Re-encode via the SP model and get pieces (surface strings with the
-    # leading ▁ preserved, as in the BPE vocabulary).
-    piece_ids = sp.encode(transformed_text, out_type=int)
-    pieces = [sp.id_to_piece(int(pid)) for pid in piece_ids]
-    # Walk pieces and match against the transformed text to find byte spans.
-    counts = np.empty(len(piece_ids), dtype=np.uint16)
-    cursor_t = 0
-    cursor_o = 0
-    from lossless_caps import decode_lossless_caps_v2 as _decode
-    for i, piece in enumerate(pieces):
-        # SentencePiece uses ▁ as the whitespace marker.
-        surface = piece.replace("\u2581", " ")
-        span = transformed_text[cursor_t:cursor_t + len(surface)]
-        cursor_t += len(span)
-        # Decode just this span to find the original bytes it came from.
-        try:
-            decoded_prefix = _decode(transformed_text[:cursor_t])
-            original_bytes = len(decoded_prefix.encode("utf-8")) - cursor_o
-            cursor_o += original_bytes
-        except Exception:
-            # Fall back to counting the transformed surface.
-            original_bytes = len(span.encode("utf-8"))
-        counts[i] = max(0, min(65535, original_bytes))
-    return counts
+    proto = sp.encode_as_immutable_proto(transformed_text)
+    byte_counts = surface_piece_original_byte_counts(
+        (piece.surface for piece in proto.pieces),
+        text_transform_name=LOSSLESS_CAPS_CASEOPS_V1,
+    )
+    return np.asarray(list(byte_counts), dtype=np.uint16)
 
 
 def main() -> None: