1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.13
32 changes: 32 additions & 0 deletions RTK.md
@@ -0,0 +1,32 @@
# RTK - Rust Token Killer (Codex CLI)

**Usage**: Token-optimized CLI proxy for shell commands.

## Rule

Always prefix shell commands with `rtk`.

Examples:

```bash
rtk git status
rtk cargo test
rtk npm run build
rtk pytest -q
```

## Meta Commands

```bash
rtk gain # Token savings analytics
rtk gain --history # Recent command savings history
rtk proxy <cmd> # Run raw command without filtering
```

## Verification

```bash
rtk --version
rtk gain
which rtk
```
103 changes: 69 additions & 34 deletions data/cached_challenge_fineweb.py
@@ -1,18 +1,45 @@
import argparse
from dataclasses import dataclass
import json
import os
import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download


REPO_ID = os.environ.get("MATCHED_FINEWEB_REPO_ID", "willdepueoai/parameter-golf")
REMOTE_ROOT_PREFIX = os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", "datasets")
DEFAULT_REPO_ID = "willdepueoai/parameter-golf"
DEFAULT_REMOTE_ROOT_PREFIX = "datasets"
SP8192_REPO_ID = "Jaikirat/fineweb10B_sp8192"
SP8192_REMOTE_ROOT_PREFIX = ""
ROOT = Path(__file__).resolve().parent
DATASETS_DIR = ROOT / "datasets"
TOKENIZERS_DIR = ROOT / "tokenizers"


@dataclass(frozen=True)
class DatasetSource:
repo_id: str
remote_root_prefix: str


def source_for_variant(variant: str) -> DatasetSource:
if variant == "sp8192":
default_repo_id = SP8192_REPO_ID
default_remote_root_prefix = SP8192_REMOTE_ROOT_PREFIX
else:
default_repo_id = DEFAULT_REPO_ID
default_remote_root_prefix = DEFAULT_REMOTE_ROOT_PREFIX
return DatasetSource(
repo_id=os.environ.get("MATCHED_FINEWEB_REPO_ID", default_repo_id),
remote_root_prefix=os.environ.get("MATCHED_FINEWEB_REMOTE_ROOT_PREFIX", default_remote_root_prefix),
)


def remote_path(source: DatasetSource, *parts: str) -> str:
path_parts = [source.remote_root_prefix, *parts]
return "/".join(part.strip("/") for part in path_parts if part.strip("/"))


def dataset_dir_for_variant(name: str) -> str:
if name == "byte260":
return "fineweb10B_byte260"
@@ -21,55 +48,62 @@ def dataset_dir_for_variant(name: str) -> str:
raise ValueError(f"unsupported variant {name!r}; expected byte260 or sp<VOCAB_SIZE>")


def local_path_for_remote(relative_path: str) -> Path:
def local_path_for_remote(relative_path: str, source: DatasetSource) -> Path:
remote_path = Path(relative_path)
if REMOTE_ROOT_PREFIX and remote_path.parts[:1] == (REMOTE_ROOT_PREFIX,):
remote_path = remote_path.relative_to(REMOTE_ROOT_PREFIX)
if source.remote_root_prefix and remote_path.parts[:1] == (source.remote_root_prefix,):
remote_path = remote_path.relative_to(source.remote_root_prefix)
if remote_path.parts[:1] == ("datasets",):
return DATASETS_DIR.joinpath(*remote_path.parts[1:])
if remote_path.parts[:1] == ("tokenizers",):
return TOKENIZERS_DIR.joinpath(*remote_path.parts[1:])
return ROOT / remote_path


def get(relative_path: str) -> None:
destination = local_path_for_remote(relative_path)
if destination.exists():
def get(relative_path: str, source: DatasetSource, *, force: bool = False) -> None:
destination = local_path_for_remote(relative_path, source)
if destination.exists() and not force:
return
if destination.is_symlink():
if destination.exists() or destination.is_symlink():
destination.unlink()

remote_path = Path(relative_path)
cached_path = Path(
downloaded_path = Path(
hf_hub_download(
repo_id=REPO_ID,
repo_id=source.repo_id,
filename=remote_path.name,
subfolder=remote_path.parent.as_posix() if remote_path.parent != Path(".") else None,
repo_type="dataset",
local_dir=ROOT,
force_download=force,
)
)
# HF cache entries may be snapshot symlinks. Resolve to the underlying blob so we
# always materialize a real file in data/, not a broken relative symlink.
cached_source = cached_path.resolve(strict=True)
if downloaded_path == destination:
return

destination.parent.mkdir(parents=True, exist_ok=True)
try:
os.link(cached_source, destination)
except OSError:
shutil.copy2(cached_source, destination)
downloaded_path.replace(destination)


def manifest_path(source: DatasetSource) -> Path:
return local_path_for_remote(remote_path(source, "manifest.json"), source)


def manifest_path() -> Path:
return local_path_for_remote(f"{REMOTE_ROOT_PREFIX}/manifest.json")
def manifest_has_dataset(manifest: dict, dataset_dir: str) -> bool:
return any(entry.get("name") == dataset_dir for entry in manifest.get("datasets", []))


def load_manifest(*, skip_manifest_download: bool) -> dict:
path = manifest_path()
def load_manifest(source: DatasetSource, dataset_dir: str, *, skip_manifest_download: bool) -> dict:
path = manifest_path(source)
if not path.is_file():
if skip_manifest_download:
raise FileNotFoundError(
f"manifest.json is required for manifest-driven shard counts but is not present locally at {path}"
)
get(f"{REMOTE_ROOT_PREFIX}/manifest.json")
get(remote_path(source, "manifest.json"), source)
manifest = json.loads(path.read_text(encoding="utf-8"))
if manifest_has_dataset(manifest, dataset_dir) or skip_manifest_download:
return manifest
get(remote_path(source, "manifest.json"), source, force=True)
return json.loads(path.read_text(encoding="utf-8"))


@@ -119,38 +153,39 @@ def build_parser() -> argparse.ArgumentParser:

def main() -> None:
args = build_parser().parse_args()
source = source_for_variant(args.variant)
dataset_dir = dataset_dir_for_variant(args.variant)
train_shards = args.train_shards_positional if args.train_shards_positional is not None else args.train_shards
if train_shards < 0:
raise ValueError("train_shards must be non-negative")

manifest = load_manifest(skip_manifest_download=args.skip_manifest)
manifest = load_manifest(source, dataset_dir, skip_manifest_download=args.skip_manifest)
dataset_entry = next((x for x in manifest.get("datasets", []) if x.get("name") == dataset_dir), None)
if dataset_entry is None:
raise ValueError(f"dataset {dataset_dir} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
raise ValueError(f"dataset {dataset_dir} not found in {remote_path(source, 'manifest.json')}")
max_train_shards = int((dataset_entry.get("stats") or {}).get("files_train"))
val_shards = int((dataset_entry.get("stats") or {}).get("files_val"))
if train_shards > max_train_shards:
raise ValueError(
f"{args.variant} only has {max_train_shards} training shards on {REPO_ID}, requested {train_shards}"
f"{args.variant} only has {max_train_shards} training shards on {source.repo_id}, requested {train_shards}"
)
tokenizer_name = dataset_entry.get("tokenizer_name")
tokenizer_entry = next((x for x in manifest.get("tokenizers", []) if x.get("name") == tokenizer_name), None)
if tokenizer_entry is None:
raise ValueError(f"tokenizer {tokenizer_name} not found in {REMOTE_ROOT_PREFIX}/manifest.json")
raise ValueError(f"tokenizer {tokenizer_name} not found in {remote_path(source, 'manifest.json')}")

if args.with_docs:
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.jsonl")
get(f"{REMOTE_ROOT_PREFIX}/docs_selected.source_manifest.json")
get(remote_path(source, "docs_selected.jsonl"), source)
get(remote_path(source, "docs_selected.source_manifest.json"), source)

dataset_prefix = f"{REMOTE_ROOT_PREFIX}/datasets/{dataset_dir}"
dataset_prefix = remote_path(source, "datasets", dataset_dir)
for i in range(val_shards):
get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin")
get(f"{dataset_prefix}/fineweb_val_{i:06d}.bin", source)
for i in range(train_shards):
get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin")
get(f"{dataset_prefix}/fineweb_train_{i:06d}.bin", source)

for artifact_path in artifact_paths_for_tokenizer(tokenizer_entry):
get(f"{REMOTE_ROOT_PREFIX}/{artifact_path}")
get(remote_path(source, artifact_path), source)


if __name__ == "__main__":
6 changes: 6 additions & 0 deletions main.py
@@ -0,0 +1,6 @@
def main():
print("Hello from parameter-golf!")


if __name__ == "__main__":
main()
21 changes: 21 additions & 0 deletions pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "parameter-golf"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"brotli>=1.2.0",
"datasets>=4.8.4",
"huggingface-hub>=1.9.0",
"kernels>=0.12.3",
"mlx>=0.31.1",
"numpy>=2.4.4",
"sentencepiece>=0.2.1",
"setuptools>=82.0.1",
"tiktoken>=0.12.0",
"torch>=2.10.0",
"tqdm>=4.67.3",
"typing-extensions==4.15.0",
"wandb>=0.23.0",
]
@@ -0,0 +1,101 @@
# Non-Record Submission: SP8192 GPTQ Embeddings + SDClip + Loop45x2 + PLE

This is a non-record exploratory submission adding per-layer embeddings (PLE) to Kevin Clark's SP8192 GPTQ embeddings + SDClip + Loop45x2 stack from [PR #1394](https://github.com/openai/parameter-golf/pull/1394).

It is not a leaderboard record attempt. The run used a 20-minute wallclock cap (`MAX_WALLCLOCK_SECONDS=1200`), and the exported submission (quantized+brotli model plus code) totaled `20,886,863` bytes, `4,886,863` bytes over the `16,000,000` byte artifact cap. The result is still included because it is a useful datapoint for PLE on top of PR #1394.

## Provenance

- Base submission: Kevin Clark's [PR #1394](https://github.com/openai/parameter-golf/pull/1394), "SP8192 + GPTQ Embeddings + Depth Recurrence + MuonEq-R + SDClip"
- Local implementation commit: `54bb087` (`54bb087ea167d7a23d95d4638e91783c574b2388`)
- PLE commit author: `BumaldaOverTheWater94`
- Run ID: `baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1`

## What Changed

The run keeps the baseline shape of PR #1394 (SP8192 tokenizer, GPTQ embeddings, standard-deviation clipping, MuonEq-R, and Loop45x2) and adds PLE (sketched below):

- `PER_LAYER_EMBED_DIM=64`
- `PER_LAYER_EMBED_INIT_STD=0.02`
- Learned token-side per-layer embeddings in `embed_tokens_per_layer`
- A learned model-side `per_layer_model_projection`
- Per-block gated PLE injection after attention and MLP updates
- Rowwise int8 export for `embed_tokens_per_layer.weight`

The provided run used `MTP=1`, so this is a next-token objective run despite the PLE architecture change.
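
For readers unfamiliar with PLE, the sketch below shows one way the gated injection described in the list above can be wired. The names `embed_tokens_per_layer` and `per_layer_model_projection` come from that list; the table layout, the `out_proj` back-projection, the zero-initialized gate, and the SiLU nonlinearity are illustrative assumptions, not the implementation in `train_gpt.py` at commit `54bb087`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sketch only: module names follow the bullets above; the table layout,
# gate, and nonlinearity are illustrative assumptions, not train_gpt.py.
NUM_LAYERS, VOCAB_SIZE, MODEL_DIM, PER_LAYER_EMBED_DIM = 11, 8192, 512, 64


class PerLayerEmbeddings(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Token-side table: a small embedding per (token, layer) pair.
        self.embed_tokens_per_layer = nn.Embedding(
            VOCAB_SIZE, NUM_LAYERS * PER_LAYER_EMBED_DIM
        )
        nn.init.normal_(self.embed_tokens_per_layer.weight, std=0.02)
        # Model-side projection from the hidden state into the PLE space.
        self.per_layer_model_projection = nn.Linear(
            MODEL_DIM, NUM_LAYERS * PER_LAYER_EMBED_DIM, bias=False
        )
        # Back-projection into the residual stream plus a learned per-layer
        # gate, zero-initialized so the PLE branch starts as a no-op.
        self.out_proj = nn.Linear(PER_LAYER_EMBED_DIM, MODEL_DIM, bias=False)
        self.gate = nn.Parameter(torch.zeros(NUM_LAYERS, MODEL_DIM))

    def inject(self, hidden: torch.Tensor, token_ids: torch.Tensor, layer: int) -> torch.Tensor:
        # Slice out this layer's chunk of the token-side and model-side embeddings.
        tok = self.embed_tokens_per_layer(token_ids).view(
            *token_ids.shape, NUM_LAYERS, PER_LAYER_EMBED_DIM
        )[..., layer, :]
        mdl = self.per_layer_model_projection(hidden).view(
            *hidden.shape[:-1], NUM_LAYERS, PER_LAYER_EMBED_DIM
        )[..., layer, :]
        # Gated additive update; called once after attention and once after the MLP.
        update = self.out_proj(F.silu(mdl) * tok)
        return hidden + torch.sigmoid(self.gate[layer]) * update
```

Zero-initializing the gate lets the PLE path start as a no-op, a common way to add a new residual branch without disturbing the base model early in training; whether the submission gates this way cannot be confirmed from this README alone.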

## Results

| Metric | Value |
|--------|------:|
| Quantized exact val_bpb | `1.21951793` |
| Quantized exact val_loss | `3.15010472` |
| Pre-quant post-EMA val_bpb | `1.21469745` |
| Pre-quant post-EMA val_loss | `3.13765307` |
| Stopped step | `1101 / 20000` |
| Train time | `1,188,555 ms` |
| Wallclock cap | `1200 s` |
| Model params | `42,792,024` |
| Quantized+brotli model bytes | `20,795,676` |
| Code bytes | `91,187` |
| Total submission bytes | `20,886,863` |

For comparison, PR #1394 reported a 5-seed mean sliding BPB of `1.08563` under the 16MB cap. This PLE run is therefore a negative result in this exact configuration: it increases artifact size substantially and does not improve quality within the logged 20-minute single-run setup.
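
As a sanity check on how the loss and bpb columns relate, the two quantized-exact figures above are mutually consistent under the usual assumption (not stated explicitly in the log) that bpb is the cross-entropy converted from nats to bits and rescaled by the tokenizer's tokens-per-byte ratio:

```python
import math

# Assumed relationship (not stated in the log):
#   val_bpb = val_loss / ln(2) * tokens_per_byte
val_loss = 3.15010472  # nats per token, quantized exact
val_bpb = 1.21951793   # bits per byte, quantized exact

bits_per_token = val_loss / math.log(2)     # ~4.54 bits per token
tokens_per_byte = val_bpb / bits_per_token  # ~0.268
print(f"implied bytes per token: {1 / tokens_per_byte:.2f}")  # ~3.73
```

An implied ~3.7 bytes per token is plausible for an 8,192-entry SentencePiece vocabulary on FineWeb text, so the two columns at least agree with each other.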

## Run Command

The log was produced with the defaults from commit `54bb087` plus the explicit run identity and validation cadence shown below:

```bash
RUN_ID=baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1 \
WANDB=1 \
WANDB_PROJECT=parameter-golf \
WANDB_RUN_NAME=baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1 \
SEED=1337 \
MAX_WALLCLOCK_SECONDS=1200 \
VAL_LOSS_EVERY=250 \
torchrun --standalone --nproc_per_node=1 train_gpt.py
```

Track-relevant defaults from the logged hyperparameters:

```text
DATA_PATH=./data/datasets/fineweb10B_sp8192/
TOKENIZER_PATH=./data/tokenizers/fineweb_8192_bpe.model
VOCAB_SIZE=8192
NUM_LAYERS=11
MODEL_DIM=512
EMBEDDING_DIM=512
NUM_HEADS=8
NUM_KV_HEADS=4
MLP_MULT=4.0
TIE_EMBEDDINGS=1
TRAIN_BATCH_TOKENS=786432
TRAIN_SEQ_LEN=2048
EVAL_SEQ_LEN=2048
EVAL_STRIDE=64
MTP=1
NUM_LOOPS=2
LOOP_START=4
LOOP_END=5
ENABLE_LOOPING_AT=0.5
PER_LAYER_EMBED_DIM=64
PER_LAYER_EMBED_INIT_STD=0.02
MATRIX_BITS=6
EMBED_BITS=8
MATRIX_CLIP_SIGMAS=12.85
EMBED_CLIP_SIGMAS=20.0
GPTQ_CALIBRATION_BATCHES=64
GPTQ_RESERVE_SECONDS=12.0
COMPRESSOR=brotli
EMA_DECAY=0.997
MUON_ROW_NORMALIZE=1
MUON_WD=0.085
EMBED_WD=0.085
```
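
The `EMBED_BITS=8` and `EMBED_CLIP_SIGMAS=20.0` settings, together with the rowwise int8 export of `embed_tokens_per_layer.weight` listed earlier, point to per-row symmetric quantization of the embedding tables. The sketch below shows what such an export step can look like; the function name and the exact clipping and scaling conventions are assumptions, not the export path in `train_gpt.py`.

```python
import torch


def rowwise_int8_export(weight: torch.Tensor, clip_sigmas: float = 20.0):
    """Sketch of per-row symmetric int8 quantization with sigma clipping.

    Assumed convention: each row is clipped to +/- clip_sigmas * row_std, then
    scaled so the clipped row maximum maps to 127. Returns int8 rows plus a
    per-row float scale, so dequantization is roughly q.float() * scale.
    """
    w = weight.detach().float()
    row_std = w.std(dim=1, keepdim=True).clamp_min(1e-12)
    clipped = w.clamp(-clip_sigmas * row_std, clip_sigmas * row_std)
    scale = clipped.abs().amax(dim=1, keepdim=True).clamp_min(1e-12) / 127.0
    q = torch.round(clipped / scale).clamp(-127, 127).to(torch.int8)
    return q, scale.squeeze(1)


# Hypothetical usage before brotli compression of the exported state dict:
# q, scale = rowwise_int8_export(model.embed_tokens_per_layer.weight)
```

With `EMBED_CLIP_SIGMAS=20.0`, the clip only touches extreme outlier entries in a row.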

## Included Files

- `train_gpt.py` - exact code snapshot from commit `54bb087`
- `train_seed1337.log` - provided training and export log
- `submission.json` - non-record metadata, including explicit over-cap status
@@ -0,0 +1,30 @@
{
"author": "BumaldaOverTheWater94",
"github_id": "BumaldaOverTheWater94",
"name": "SP8192 + GPTQ Embeddings + SDClip + Loop45x2 + PLE",
"blurb": "Non-record exploratory run adding per-layer embeddings (PLE) to Kevin Clark's PR #1394 SP8192 GPTQ embeddings + SDClip + Loop45x2 stack. The run trained for a 20-minute wallclock cap and produced a 20,886,863 byte quantized+brotli artifact, so it exceeds both the 10-minute record limit and the 16,000,000 byte artifact cap. Quantized exact val_bpb was 1.21951793.",
"date": "2026-04-30T08:28:06Z",
"track": "non-record-over-16mb",
"base_pr": 1394,
"base_url": "https://github.com/openai/parameter-golf/pull/1394",
"base_author": "Kevin Clark",
"base_github_id": "clarkkev",
"commit": "54bb087",
"commit_full": "54bb087ea167d7a23d95d4638e91783c574b2388",
"val_loss": 3.15010472,
"val_bpb": 1.21951793,
"pre_quant_val_loss": 3.13765307,
"pre_quant_val_bpb": 1.21469745,
"step_stop": 1101,
"wallclock_seconds": 1200.0,
"train_time_ms": 1188555,
"seed": 1337,
"model_params": 42792024,
"bytes_total": 20886863,
"bytes_model_quantized_brotli": 20795676,
"bytes_code": 91187,
"artifact_cap_bytes": 16000000,
"bytes_over_cap": 4886863,
"run_id": "baseline_sp8192_GPTQ_embeddings_SDClip_loop_PLE_r1",
"gpu": "1xH100-class run from provided log"
}