275 changes: 275 additions & 0 deletions records/submission_2026_04_29_b180_tlr56_SUB106/README.md


833 changes: 833 additions & 0 deletions records/submission_2026_04_29_b180_tlr56_SUB106/lossless_caps.py


177 changes: 177 additions & 0 deletions records/submission_2026_04_29_b180_tlr56_SUB106/prepare_caseops_data.py
@@ -0,0 +1,177 @@
"""Prepare CaseOps-tokenized FineWeb shards + per-token byte sidecar.

CaseOps (``lossless_caps_caseops_v1``) is a bijective, character-level text
transform that introduces four operator tokens in place of explicit
capitalization: TITLE, ALLCAPS, CAPNEXT, ESC. The transform is fully
reversible — no information is lost relative to the untransformed UTF-8
text, so BPB stays computable on TRUE byte counts.
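
Reversibility, concretely (decoder name below is illustrative; the actual
inverse lives in ``lossless_caps.py``):

    decode_lossless_caps_v2(encode_lossless_caps_v2(text)) == text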

Forward pipeline:
1. Read the canonical FineWeb-10B doc stream (``docs_selected.jsonl``
produced by ``data/download_hf_docs_and_tokenize.py`` in the root repo).
2. Apply ``encode_lossless_caps_v2`` (the caseops_v1 alias) to each doc.
3. Tokenize with the shipped SP model
``tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model``
(reserves TITLE/ALLCAPS/CAPNEXT/ESC + sentinel as user_defined_symbols).
4. Write uint16 train/val shards (``fineweb_{train,val}_XXXXXX.bin``).
5. For the VAL stream only, emit per-token byte sidecar shards
(``fineweb_val_bytes_XXXXXX.bin``, uint16 parallel arrays) that record
each token's ORIGINAL pre-transform UTF-8 byte count. BPB is computed
from these canonical bytes so the score is on the untransformed text
(not the transformed representation).
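
Downstream, BPB is then presumably computed over the val stream as

    BPB = sum_i nll_i / (ln(2) * sum_i bytes_i)

where ``nll_i`` is the model's per-token negative log-likelihood in nats and
``bytes_i`` is the matching sidecar entry (0 for BOS and for the operator
pieces).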

Output layout — matches what ``train_gpt.py`` expects under
``DATA_DIR=./data`` with ``CASEOPS_ENABLED=1``:

data/datasets/fineweb10B_sp8192_caseops/datasets/
tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model
datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved/
fineweb_train_000000.bin
fineweb_train_000001.bin
...
fineweb_val_000000.bin
fineweb_val_bytes_000000.bin

Usage:

python3 prepare_caseops_data.py \\
--docs ./fineweb10B_raw/docs_selected.jsonl \\
--out ./data/datasets/fineweb10B_sp8192_caseops/datasets \\
--sp ./tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model

Requirements: sentencepiece, numpy. CPU-only. Runs once; reused across seeds.
"""
from __future__ import annotations

import argparse
import json
import pathlib
import struct
import sys

import numpy as np
import sentencepiece as spm

# Local import — lossless_caps.py ships next to this script.
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent))
from lossless_caps import ( # noqa: E402
LOSSLESS_CAPS_CASEOPS_V1,
encode_lossless_caps_v2,
surface_piece_original_byte_counts,
)


SHARD_MAGIC = 20240520
SHARD_VERSION = 1
SHARD_TOKENS = 10_000_000 # tokens per shard — matches the main pipeline
BOS_ID = 1 # SP model's <s> control token; train_gpt.py:_find_docs requires BOS per doc


def _write_shard(out_path: pathlib.Path, arr: np.ndarray) -> None:
"""Write a uint16 shard in the standard header-prefixed format."""
assert arr.dtype == np.uint16
header = np.zeros(256, dtype=np.int32)
header[0] = SHARD_MAGIC
header[1] = SHARD_VERSION
header[2] = int(arr.size)
with out_path.open("wb") as fh:
fh.write(header.tobytes())
fh.write(arr.tobytes())
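

# Not used by the pipeline itself: a small verification sketch, assuming the
# 256 x int32 header layout produced by _write_shard above.
def _read_shard(path: pathlib.Path) -> np.ndarray:
    """Read a header-prefixed uint16 shard back (sketch, for spot checks only)."""
    with path.open("rb") as fh:
        header = np.frombuffer(fh.read(256 * 4), dtype=np.int32)
        assert header[0] == SHARD_MAGIC and header[1] == SHARD_VERSION
        return np.frombuffer(fh.read(int(header[2]) * 2), dtype=np.uint16)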


def _iter_docs(docs_path: pathlib.Path):
"""Yield doc strings from a jsonl file (one json object per line)."""
with docs_path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
obj = json.loads(line)
# Support both {"text": ...} and raw strings.
yield obj["text"] if isinstance(obj, dict) else obj


def _token_original_byte_counts(
sp: spm.SentencePieceProcessor,
original_text: str,
transformed_text: str,
) -> np.ndarray:
"""Per-token canonical (pre-transform) UTF-8 byte counts.

Delegates to ``surface_piece_original_byte_counts`` in ``lossless_caps.py``
— the canonical exporter used by the PR #1729 / HF-hosted CaseOps dataset.
Operator pieces (U+E001..U+E004) contribute 0 original bytes; letter pieces
contribute their pre-transform UTF-8 byte count.
"""
proto = sp.encode_as_immutable_proto(transformed_text)
byte_counts = surface_piece_original_byte_counts(
(piece.surface for piece in proto.pieces),
text_transform_name=LOSSLESS_CAPS_CASEOPS_V1,
)
return np.asarray(list(byte_counts), dtype=np.uint16)


def main() -> None:
ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument("--docs", required=True, type=pathlib.Path, help="Path to docs_selected.jsonl")
ap.add_argument("--out", required=True, type=pathlib.Path, help="Output datasets dir")
ap.add_argument("--sp", required=True, type=pathlib.Path, help="Path to CaseOps SP model")
ap.add_argument("--val-docs", type=int, default=10_000, help="Validation docs count")
args = ap.parse_args()

sp = spm.SentencePieceProcessor(model_file=str(args.sp))
print(f"loaded sp: vocab={sp.vocab_size()}", flush=True)

train_out = args.out / "datasets" / "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"
train_out.mkdir(parents=True, exist_ok=True)

val_buf_tokens: list[int] = []
val_buf_bytes: list[int] = []
train_buf: list[int] = []
val_written = 0
train_written = 0
n_docs = 0

for text in _iter_docs(args.docs):
transformed = encode_lossless_caps_v2(text)
token_ids = [BOS_ID] + sp.encode(transformed, out_type=int)
if n_docs < args.val_docs:
# Validation doc — also compute byte sidecar
byte_counts = _token_original_byte_counts(sp, text, transformed)
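            # Keep the sidecar token-aligned: byte_counts has one entry per
            # non-BOS token; the BOS slot gets an explicit 0 below.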
val_buf_tokens.extend(token_ids)
val_buf_bytes.append(0) # BOS contributes 0 original bytes
val_buf_bytes.extend(int(b) for b in byte_counts)
if len(val_buf_tokens) >= SHARD_TOKENS:
_write_shard(train_out / f"fineweb_val_{val_written:06d}.bin",
np.array(val_buf_tokens[:SHARD_TOKENS], dtype=np.uint16))
_write_shard(train_out / f"fineweb_val_bytes_{val_written:06d}.bin",
np.array(val_buf_bytes[:SHARD_TOKENS], dtype=np.uint16))
val_buf_tokens = val_buf_tokens[SHARD_TOKENS:]
val_buf_bytes = val_buf_bytes[SHARD_TOKENS:]
val_written += 1
else:
train_buf.extend(token_ids)
if len(train_buf) >= SHARD_TOKENS:
_write_shard(train_out / f"fineweb_train_{train_written:06d}.bin",
np.array(train_buf[:SHARD_TOKENS], dtype=np.uint16))
train_buf = train_buf[SHARD_TOKENS:]
train_written += 1
n_docs += 1
if n_docs % 10_000 == 0:
print(f" processed {n_docs} docs train_shards={train_written} val_shards={val_written}", flush=True)

# Flush tail buffers into final (possibly short) shards.
if val_buf_tokens:
_write_shard(train_out / f"fineweb_val_{val_written:06d}.bin",
np.array(val_buf_tokens, dtype=np.uint16))
_write_shard(train_out / f"fineweb_val_bytes_{val_written:06d}.bin",
np.array(val_buf_bytes, dtype=np.uint16))
if train_buf:
_write_shard(train_out / f"fineweb_train_{train_written:06d}.bin",
np.array(train_buf, dtype=np.uint16))

print(f"done. docs={n_docs} train_shards={train_written + (1 if train_buf else 0)} val_shards={val_written + (1 if val_buf_tokens else 0)}")


if __name__ == "__main__":
main()
29 changes: 29 additions & 0 deletions records/submission_2026_04_29_b180_tlr56_SUB106/submission.json
@@ -0,0 +1,29 @@
{
"author": "vimeto",
"name": "PR #1797 SparseAttnGate + #1855 hparams + QK_GAIN=6.0 + TTT_LORA_RANK=56 + Per-Group lrzip",
"blurb": "🎉 SUB-1.06 result. Best stack: SparseAttnGate + PR #1855 9-pack + QK_GAIN=6.0 + TTT_LORA_RANK=56 (alpha=112) — sweet spot for low-rank LoRA TTT regularization. Single seed=42 BPB **1.05997** (FIRST sub-1.06000; beats #1855 3-seed mean 1.06108 by -0.00111, beats #1797 mean 1.06157 by -0.00160). Per-group lrzip artifact 15,920,473 bytes + 33KB wrapper = ~15.95MB total. Eval host needs `apt install lrzip`.",
"date": "2026-04-29",
"track": "10min_16mb",
"val_bpb": 1.05997,
"submitted_seed": 42,
"seed_results": {
"42": {"val_bpb": 1.05997, "artifact_bytes_pergroup_lrzip": 15920473}
},
"ttt_lora_rank_sweep_seed_42": {
"48": 1.06005,
"56": 1.05997,
"64": 1.06014,
"80": 1.06046,
"96": 1.06246
},
"key_levers": {
"vs_b178_sparse_gate": "QK_GAIN=6.0 (-0.00015), TTT_LORA_RANK=56 (vs 80, -0.00049)",
"ttt_lora_rank_observation": "rank=56 is regularizer-optimum; sweep shows clear inverted-U around 56."
},
"artifact_bytes": 15953743,
"code_bytes_compressed_pyminified": 33270,
"quant_bytes_pergroup_lrzip": 15920473,
"cap_margin_bytes": 46257,
"compression": "per-group lrzip (PR #1855 style ZPAQ)",
"deps_runtime": ["lrzip", "brotli", "torch", "numpy", "sentencepiece"]
}