1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.13
32 changes: 32 additions & 0 deletions RTK.md
@@ -0,0 +1,32 @@
# RTK - Rust Token Killer (Codex CLI)

**Usage**: Token-optimized CLI proxy for shell commands.

## Rule

Always prefix shell commands with `rtk`.

Examples:

```bash
rtk git status
rtk cargo test
rtk npm run build
rtk pytest -q
```

## Meta Commands

```bash
rtk gain # Token savings analytics
rtk gain --history # Recent command savings history
rtk proxy <cmd> # Run raw command without filtering
```

## Verification

```bash
rtk --version
rtk gain
which rtk
```
103 changes: 69 additions & 34 deletions data/cached_challenge_fineweb.py
@@ -1,18 +1,45 @@
import argparse
from dataclasses import dataclass
import json
import os
import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download


REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
DEFAULT_REPO_ID = "willdepueoai/parameter-golf"
DEFAULT_REMOTE_ROOT_PREFIX = "datasets"
SP8192_REPO_ID = "Jaikirat/fineweb10B_sp8192"
SP8192_REMOTE_ROOT_PREFIX = ""
ROOT = Path(__file__).resolve().parent
DATASETS_DIR = ROOT / "datasets"
TOKENIZERS_DIR = ROOT / "tokenizers"


@dataclass(frozen=True)
class DatasetSource:
repo_id: str
remote_root_prefix: str


def source_for_variant(variant: str) -> DatasetSource:
if variant == "sp8192":
default_repo_id = SP8192_REPO_ID
default_remote_root_prefix = SP8192_REMOTE_ROOT_PREFIX
else:
default_repo_id = DEFAULT_REPO_ID
default_remote_root_prefix = DEFAULT_REMOTE_ROOT_PREFIX
return DatasetSource(
repo_id=os.environ.get("MATCHED_FINEWEB_REPO_ID", default_repo_id),
remote_root_prefix=os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", default_remote_root_prefix),
)


def remote_path(source: DatasetSource, *parts: str) -> str:
path_parts = [source.remote_root_prefix, *parts]
return "/".join(part.strip("/") for part in path_parts if part.strip("/"))


def dataset_dir_for_variant(name: str) -> str:
if name == "byte260":
return "fineweb10B_byte260"
@@ -21,55 +48,62 @@ def dataset_dir_for_variant(name: str) -> str:
raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")


def local_path_for_remote(relative_path: str) -> Path:
def local_path_for_remote(relative_path: str, source: DatasetSource) -> Path:
remote_path = Path(relative_path)
if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
if source.remote_root_prefix and remote_path.parts[:1] == (source.remote_root_prefix,):
remote_path = remote_path.relative_to(source.remote_root_prefix)
if remote_path.parts[:1] == ("datasets",):
return DATASETS_DIR.joinpath(*remote_path.parts[1:])
if remote_path.parts[:1] == ("tokenizers",):
return TOKENIZERS_DIR.joinpath(*remote_path.parts[1:])
return ROOT / remote_path


def get(relative_path: str) -> None:
destination = local_path_for_remote(relative_path)
if destination.exists():
def get(relative_path: str, source: DatasetSource, *, force: bool = False) -> None:
destination = local_path_for_remote(relative_path, source)
if destination.exists() and not force:
return
if destination.is_symlink():
if destination.exists() or destination.is_symlink():
destination.unlink()

remote_path = Path(relative_path)
cached_path = Path(
downloaded_path = Path(
hf_hub_download(
repo_id=REPO_ID,
repo_id=source.repo_id,
filename=remote_path.name,
subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
repo_type="dataset",
local_dir=ROOT,
force_download=force,
)
)
# HF cache entries may be snapshot symlinks. Resolve to the underlying blob so we
# always materialize a real file in data/, not a broken relative symlink.
cached_source = cached_path.resolve(strict=True)
if downloaded_path == destination:
return

destination.parent.mkdir(parents=True, exist_ok=True)
try:
os.link(cached_source, destination)
except OSError:
shutil.copy2(cached_source, destination)
downloaded_path.replace(destination)


def manifest_path(source: DatasetSource) -> Path:
return local_path_for_remote(remote_path(source, "manifest.json"), source)


def manifest_path() -> Path:
return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
def manifest_has_dataset(manifest: dict, dataset_dir: str) -> bool:
return any(entry.get("name") == dataset_dir for entry in manifest.get("datasets", []))


def load_manifest(*, skip_manifest_download: bool) -> dict:
path = manifest_path()
def load_manifest(source: DatasetSource, dataset_dir: str, *, skip_manifest_download: bool) -> dict:
path = manifest_path(source)
if not path.is_file():
if skip_manifest_download:
raise FileNotFoundError(
f"manifest.json is required for manifest-driven shard counts but is not present locally at {path}"
)
get(f"{REMOTE_ROOT_PREFIX}/manifest.json")
get(remote_path(source, "manifest.json"), source)
manifest = json.loads(path.read_text(encoding="utf-8"))
if manifest_has_dataset(manifest, dataset_dir) or skip_manifest_download:
return manifest
get(remote_path(source, "manifest.json"), source, force=True)
return json.loads(path.read_text(encoding="utf-8"))


@@ -119,38 +153,39 @@ def build_parser() -> argparse.ArgumentParser:

def main() -> None:
args = build_parser().parse_args()
source = source_for_variant(args.variant)
dataset_dir = dataset_dir_for_variant(args.variant)
train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
if train_shards < 0:
raise ValueError("train_shards must be non-negative")

manifest = load_manifest(skip_manifest_download=args.skip_manifest)
manifest = load_manifest(source, dataset_dir, skip_manifest_download=args.skip_manifest)
dataset_entry = next((x for x in manifest.get("datasets", []) if x.get("name") == dataset_dir), None)
if dataset_entry is None:
raise ValueError(f"dataset {dataset_dir} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
raise ValueError(f"dataset {dataset_dir} not found in {remote_path(source, 'manifest.json')}")
max_train_shards = int((dataset_entry.get("stats") or {}).get("files_train"))
val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
if train_shards > max_train_shards:
raise ValueError(
f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
f"{args.variant} only has {max_train_shards} training shards on {source.repo_id}, requested {train_shards}"
)
tokenizer_name = dataset_entry.get("tokenizer_name")
tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
if tokenizer_entry is None:
raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
raise ValueError(f"tokenizer {tokenizer_name} not found in {remote_path(source, 'manifest.json')}")

if args.with_docs:
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
get(remote_path(source, "docs_selected.jsonl"), source)
get(remote_path(source, "docs_selected.source_manifest.json"), source)

dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
dataset_prefix = remote_path(source, "datasets", dataset_dir)
for i in range(val_shards):
get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin", source)
for i in range(train_shards):
get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin", source)

for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
get(remote_path(source, artifact_path), source)


if __name__ == "__main__":
6 changes: 6 additions & 0 deletions main.py
@@ -0,0 +1,6 @@
def main():
print("Hello from parameter-golf!")


if __name__ == "__main__":
main()
21 changes: 21 additions & 0 deletions pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "parameter-golf"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"brotli>=1.2.0",
"datasets>=4.8.4",
"huggingface-hub>=1.9.0",
"kernels>=0.12.3",
"mlx>=0.31.1",
"numpy>=2.4.4",
"sentencepiece>=0.2.1",
"setuptools>=82.0.1",
"tiktoken>=0.12.0",
"torch>=2.10.0",
"tqdm>=4.67.3",
"typing-extensions==4.15.0",
"wandb>=0.23.0",
]
@@ -0,0 +1,101 @@
# Non-Record Submission: SP8192 GPTQ Embeddings + SDClip + Loop45x2 + PLE

This is a non-record exploratory submission adding per-layer embeddings (PLE) to Kevin Clark's SP8192 GPTQ embeddings + SDClip + Loop45x2 stack from [PR #1394](https://github.com/openai/parameter-golf/pull/1394).

It is not a leaderboard record attempt. The run used a 20-minute wallclock cap (`MAX_WALLCLOCK_SECONDS=1200`), and the exported submission (quantized+brotli model plus code) totaled `20,886,863` bytes, `4,886,863` bytes over the `16,000,000` byte artifact cap. The result is still included because it is a useful datapoint for PLE on top of PR #1394.

## Provenance

- Base submission: Kevin Clark's [PR #1394](https://github.com/openai/parameter-golf/pull/1394), "SP8192 + GPTQ Embeddings + Depth Recurrence + MuonEq-R + SDClip"
- Local implementation commit: `54bb087` (`54bb087ea167d7a23d95d4638e91783c574b2388`)
- PLE commit author: `BumaldaOverTheWater94`
- Run ID: `baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1`

## What Changed

The run keeps the baseline shape of PR #1394 (SP8192 tokenizer, GPTQ embeddings, standard-deviation clipping, MuonEq-R, and Loop45x2) and adds PLE (sketched below):

- `PER_LAYER_EMBED_DIM=64`
- `PER_LAYER_EMBED_INIT_STD=0.02`
- Learned token-side per-layer embeddings in `embed_tokens_per_layer`
- A learned model-side `per_layer_model_projection`
- Per-block gated PLE injection after attention and MLP updates
- Rowwise int8 export for `embed_tokens_per_layer.weight`

The provided run used `MTP=1`, so this is a next-token objective run despite the PLE architecture change.
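
For readers unfamiliar with PLE, the sketch below shows one way the gated injection described in the list above can be wired. The names `embed_tokens_per_layer` and `per_layer_model_projection` come from that list; the table layout, the `out_proj` back-projection, the zero-initialized gate, and the SiLU nonlinearity are illustrative assumptions, not the implementation in `train_gpt.py` at commit `54bb087`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sketch only: module names follow the bullets above; the table layout,
# gate, and nonlinearity are illustrative assumptions, not train_gpt.py.
NUM_LAYERS, VOCAB_SIZE, MODEL_DIM, PER_LAYER_EMBED_DIM = 11, 8192, 512, 64


class PerLayerEmbeddings(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Token-side table: a small embedding per (token, layer) pair.
        self.embed_tokens_per_layer = nn.Embedding(
            VOCAB_SIZE, NUM_LAYERS * PER_LAYER_EMBED_DIM
        )
        nn.init.normal_(self.embed_tokens_per_layer.weight, std=0.02)
        # Model-side projection from the hidden state into the PLE space.
        self.per_layer_model_projection = nn.Linear(
            MODEL_DIM, NUM_LAYERS * PER_LAYER_EMBED_DIM, bias=False
        )
        # Back-projection into the residual stream plus a learned per-layer
        # gate, zero-initialized so the PLE branch starts as a no-op.
        self.out_proj = nn.Linear(PER_LAYER_EMBED_DIM, MODEL_DIM, bias=False)
        self.gate = nn.Parameter(torch.zeros(NUM_LAYERS, MODEL_DIM))

    def inject(self, hidden: torch.Tensor, token_ids: torch.Tensor, layer: int) -> torch.Tensor:
        # Slice out this layer's chunk of the token-side and model-side embeddings.
        tok = self.embed_tokens_per_layer(token_ids).view(
            *token_ids.shape, NUM_LAYERS, PER_LAYER_EMBED_DIM
        )[..., layer, :]
        mdl = self.per_layer_model_projection(hidden).view(
            *hidden.shape[:-1], NUM_LAYERS, PER_LAYER_EMBED_DIM
        )[..., layer, :]
        # Gated additive update; called once after attention and once after the MLP.
        update = self.out_proj(F.silu(mdl) * tok)
        return hidden + torch.sigmoid(self.gate[layer]) * update
```

Zero-initializing the gate lets the PLE path start as a no-op, a common way to add a new residual branch without disturbing the base model early in training; whether the submission gates this way cannot be confirmed from this README alone.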

## Results

| Metric | Value |
|--------|------:|
| Quantized exact val_bpb | `1.21951793` |
| Quantized exact val_loss | `3.15010472` |
| Pre-quant post-EMA val_bpb | `1.21469745` |
| Pre-quant post-EMA val_loss | `3.13765307` |
| Stopped step | `1101 / 20000` |
| Train time | `1,188,555 ms` |
| Wallclock cap | `1200 s` |
| Model params | `42,792,024` |
| Quantized+brotli model bytes | `20,795,676` |
| Code bytes | `91,187` |
| Total submission bytes | `20,886,863` |

For comparison, PR #1394 reported a 5-seed mean sliding BPB of `1.08563` under the 16MB cap. This PLE run is therefore a negative result in this exact configuration: it increases artifact size substantially and does not improve quality within the logged 20-minute single-run setup.
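
As a sanity check on how the loss and bpb columns relate, the two quantized-exact figures above are mutually consistent under the usual assumption (not stated explicitly in the log) that bpb is the cross-entropy converted from nats to bits and rescaled by the tokenizer's tokens-per-byte ratio:

```python
import math

# Assumed relationship (not stated in the log):
#   val_bpb = val_loss / ln(2) * tokens_per_byte
val_loss = 3.15010472  # nats per token, quantized exact
val_bpb = 1.21951793   # bits per byte, quantized exact

bits_per_token = val_loss / math.log(2)     # ~4.54 bits per token
tokens_per_byte = val_bpb / bits_per_token  # ~0.268
print(f"implied bytes per token: {1 / tokens_per_byte:.2f}")  # ~3.73
```

An implied ~3.7 bytes per token is plausible for an 8,192-entry SentencePiece vocabulary on FineWeb text, so the two columns at least agree with each other.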

## Run Command

The log was produced with the defaults from commit `54bb087` plus the explicit run identity and validation cadence shown below:

```bash
RUN_ID=baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1 \
WANDB=1 \
WANDB_PROJECT=parameter-golf \
WANDB_RUN_NAME=baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1 \
SEED=1337 \
MAX_WALLCLOCK_SECONDS=1200 \
VAL_LOSS_EVERY=250 \
torchrun --standalone --nproc_per_node=1 train_gpt.py
```

Track-relevant defaults from the logged hyperparameters:

```text
DATA_PATH=./data/datasets/fineweb10B_sp8192/
TOKENIZER_PATH=./data/tokenizers/fineweb_8192_bpe.model
VOCAB_SIZE=8192
NUM_LAYERS=11
MODEL_DIM=512
EMBEDDING_DIM=512
NUM_HEADS=8
NUM_KV_HEADS=4
MLP_MULT=4.0
TIE_EMBEDDINGS=1
TRAIN_BATCH_TOKENS=786432
TRAIN_SEQ_LEN=2048
EVAL_SEQ_LEN=2048
EVAL_STRIDE=64
MTP=1
NUM_LOOPS=2
LOOP_START=4
LOOP_END=5
ENABLE_LOOPING_AT=0.5
PER_LAYER_EMBED_DIM=64
PER_LAYER_EMBED_INIT_STD=0.02
MATRIX_BITS=6
EMBED_BITS=8
MATRIX_CLIP_SIGMAS=12.85
EMBED_CLIP_SIGMAS=20.0
GPTQ_CALIBRATION_BATCHES=64
GPTQ_RESERVE_SECONDS=12.0
COMPRESSOR=brotli
EMA_DECAY=0.997
MUON_ROW_NORMALIZE=1
MUON_WD=0.085
EMBED_WD=0.085
```
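
The `EMBED_BITS=8` and `EMBED_CLIP_SIGMAS=20.0` settings, together with the rowwise int8 export of `embed_tokens_per_layer.weight` listed earlier, point to per-row symmetric quantization of the embedding tables. The sketch below shows what such an export step can look like; the function name and the exact clipping and scaling conventions are assumptions, not the export path in `train_gpt.py`.

```python
import torch


def rowwise_int8_export(weight: torch.Tensor, clip_sigmas: float = 20.0):
    """Sketch of per-row symmetric int8 quantization with sigma clipping.

    Assumed convention: each row is clipped to +/- clip_sigmas * row_std, then
    scaled so the clipped row maximum maps to 127. Returns int8 rows plus a
    per-row float scale, so dequantization is roughly q.float() * scale.
    """
    w = weight.detach().float()
    row_std = w.std(dim=1, keepdim=True).clamp_min(1e-12)
    clipped = w.clamp(-clip_sigmas * row_std, clip_sigmas * row_std)
    scale = clipped.abs().amax(dim=1, keepdim=True).clamp_min(1e-12) / 127.0
    q = torch.round(clipped / scale).clamp(-127, 127).to(torch.int8)
    return q, scale.squeeze(1)


# Hypothetical usage before brotli compression of the exported state dict:
# q, scale = rowwise_int8_export(model.embed_tokens_per_layer.weight)
```

With `EMBED_CLIP_SIGMAS=20.0`, the clip only touches extreme outlier entries in a row.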

## Included Files

- `train_gpt.py` - exact code snapshot from commit `54bb087`
- `train_seed1337.log` - provided training and export log
- `submission.json` - non-record metadata, including explicit over-cap status
@@ -0,0 +1,30 @@
{
"author": "BumaldaOverTheWater94",
"github_id": "BumaldaOverTheWater94",
"name": "SP8192 + GPTQ Embeddings + SDClip + Loop45x2 + PLE",
"blurb": "Non-record exploratory run adding per-layer embeddings (PLE) to Kevin Clark's PR #1394 SP8192 GPTQ embeddings + SDClip + Loop45x2 stack. The run trained for a 20-minute wallclock cap and produced a 20,886,863 byte quantized+brotli artifact, so it exceeds both the 10-minute record limit and the 16,000,000 byte artifact cap. Quantized exact val_bpb was 1.21951793.",
"date": "2026-04-30T08:28:06Z",
"track": "non-record-over-16mb",
"base_pr": 1394,
"base_url": "https://github.com/openai/parameter-golf/pull/1394",
"base_author": "Kevin Clark",
"base_github_id": "clarkkev",
"commit": "54bb087",
"commit_full": "54bb087ea167d7a23d95d4638e91783c574b2388",
"val_loss": 3.15010472,
"val_bpb": 1.21951793,
"pre_quant_val_loss": 3.13765307,
"pre_quant_val_bpb": 1.21469745,
"step_stop": 1101,
"wallclock_seconds": 1200.0,
"train_time_ms": 1188555,
"seed": 1337,
"model_params": 42792024,
"bytes_total": 20886863,
"bytes_model_quantized_brotli": 20795676,
"bytes_code": 91187,
"artifact_cap_bytes": 16000000,
"bytes_over_cap": 4886863,
"run_id": "baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1",
"gpu": "1xH100-class run from provided log"
}