#!/bin/bash
# submission/dry_run.sh — H100 SUBMISSION RUN (dry run AND real submission, same code path)
#
# THE CANONICAL ENTRY POINT. This script is BOTH:
#   - The dry run sanity check (1 seed, fast iteration on 1×H100 PCIe)
#   - The real comp submission (3 seeds, full validation on 8×H100 SXM)
#
# The ONLY difference between the two is the SEEDS env var:
#   bash submission/dry_run.sh                    # dry run (default SEEDS=42)
#   SEEDS=42,314,999 bash submission/dry_run.sh   # real 3-seed submission
#
# Output: assembles a complete comp submission folder under
#   records/track_10min_16mb/<date>_<config-tag>/
# with: README.md, submission.json, train_gpt.py, train_seed<N>.log
#
# This is the FULL stack we'd ship as a real comp record. Targets PR #1493
# (merged leaderboard #1, val_bpb 1.0810 on 8×H100 SXM) plus our deltas:
# PR #1493 stack (verified from openai/parameter-golf merged PR):
#   - SP8192 vocab
#   - 3-Layer Depth Recurrence (NUM_LOOPS=2, LOOP_START=3, LOOP_END=5)
#   - Parallel Residuals (PR #1493 applies L7+; we apply all-layers — see notes)
#   - QK-Gain 5.25
#   - Legal Score-First TTT (TTT_ENABLED=1, TTT_LR=0.005, TTT_EPOCHS=3)
#   - EMA_DECAY=0.9965, WARMDOWN_FRAC=0.72, ENABLE_LOOPING_AT=0.35
#   - MUON_WD=0.095, MATRIX_LR=0.022
# Our deltas on top:
#   - NUM_LAYERS=6 + MLP_MULT=2 (CHAMP_D validated, val_bpb 1.39943 on 3090)
#   - MATRIX_BITS=8 (our int8 quant breakthrough — eliminates the int6 quant gap)
#   - USE_PARALLEL_MUON=1 (our batched Newton-Schulz speedup)
#   - max-autotune-no-cudagraphs compile mode + cudnn.benchmark
#   - n-gram bias stack (NIGHT_MODE wins) — NOTE: may be Track-B-illegal, verify before submit
#   - 600s training cap, full 16 MB artifact target
#
# Usage on a fresh pod:
@@ -33,19 +45,32 @@ set -eu
# Work from the repo root so every relative path below (submission/, records/,
# final_model.int6.ptz) resolves consistently. Override with REPO_DIR=...
REPO_DIR="${REPO_DIR:-/workspace/paramgolf}"
cd "$REPO_DIR"

# === Seed selection (THE dry-run/real-submission switch) ===
# SEEDS is a list of non-negative integers separated by commas and/or spaces.
# Default is a single seed (42), i.e. a dry run.
SEEDS="${SEEDS:-42}"
IFS=', ' read -ra SEED_ARRAY <<< "$SEEDS"
NUM_SEEDS=${#SEED_ARRAY[@]}

# Reject bad input up front: each seed ends up in a file name
# (train_seed<N>.log) and is parsed as an integer when the record is assembled.
if (( NUM_SEEDS == 0 )); then
  echo "[dry_run] ERROR: SEEDS='${SEEDS}' contains no seeds" >&2
  exit 2
fi
for seed_token in "${SEED_ARRAY[@]}"; do
  if [[ ! "$seed_token" =~ ^[0-9]+$ ]]; then
    echo "[dry_run] ERROR: invalid seed '${seed_token}' in SEEDS='${SEEDS}' (expected integers, e.g. 42,314,999)" >&2
    exit 2
  fi
done

# Re-join canonically ("42,314,999") so every later consumer that splits SEEDS
# on commas sees exactly the same list as SEED_ARRAY.
SEEDS=$(IFS=','; printf '%s' "${SEED_ARRAY[*]}")

# === Records folder staging ===
# One folder per UTC date + config tag; per-seed logs, artifacts, README.md and
# submission.json are all written underneath it.
DATE_STR=$(date -u +%Y-%m-%d)
CONFIG_TAG="SP8192_NL6_MLP2_int8_NgramBias_PR_LegalTTT"
RECORD_NAME="${DATE_STR}_${CONFIG_TAG}"
RECORD_DIR="records/track_10min_16mb/${RECORD_NAME}"
mkdir -p "$RECORD_DIR"
59+
# Start-of-run banner: UTC start time, the seed list, the technique stack and
# the records folder this run writes into.
run_noun="run"
if (( NUM_SEEDS > 1 )); then
  run_noun="runs"
fi

echo "============================================================"
echo "[dry_run] SUBMISSION RUN starting at $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "============================================================"
echo "  Seeds: ${SEEDS} (${NUM_SEEDS} ${run_noun})"
echo "  Stack: PR #1493 leaderboard #1 (1.0810) + our int8 quant + parallel muon"
echo "         + NUM_LAYERS=6 compute-efficient + n-gram bias + max-autotune"
echo "  Records folder: $RECORD_DIR"
echo "============================================================"
4268
# === Submission-grade config (shared across all seed runs) ===
# SEED is deliberately NOT set here; it is exported once per run by the seed loop.
# Wall-clock training cap in seconds, as the variable name says.
export MAX_WALLCLOCK_SECONDS=600

# Model architecture — CHAMP_D validated config (val_bpb 1.39943 on 3090, 600s)
export NUM_LAYERS=6 # CHAMP_D validated
export MLP_MULT=2

5176# PR #1493 (leaderboard #1, val_bpb 1.0810) architecture techniques (verified
@@ -93,16 +118,231 @@ export USE_NGR_LOG_FREQ_INV=1 # uses train data for sample (rule-f
# Feature switches exported as-is for the training run.
# NOTE(review): their exact semantics live in the training code, which is not
# visible from this script — confirm there before changing them.
export USE_CTX_PARTITIONED_TAB=1
export USE_PREFETCH_LOADER=1
95120
# === Run each seed ===
# For every seed: export SEED, run submission/run.sh with its combined
# stdout/stderr mirrored into the records folder, then stash the produced
# artifact next to the log so a multi-seed run keeps all of them.
for THIS_SEED in "${SEED_ARRAY[@]}"; do
  echo
  echo "============================================================"
  echo "[dry_run] SEED=${THIS_SEED} starting at $(date -u +%H:%M:%SZ)"
  echo "============================================================"

  export SEED="$THIS_SEED"
  SEED_LOG="${RECORD_DIR}/train_seed${THIS_SEED}.log"

  # bash submission/run.sh tees its own log to logs/run_seed*.log; we additionally
  # capture the train.py output stream into the records folder for the comp submission.
  #
  # A pipeline's exit status is normally that of its LAST command (tee), so a
  # crashed training run would look like a success and the loop would carry on,
  # possibly copying a stale final_model.int6.ptz left by an earlier run.
  # pipefail is enabled inside a subshell so the shell options of the rest of
  # the script are left untouched.
  run_status=0
  (
    set -o pipefail
    bash submission/run.sh 2>&1 | tee "$SEED_LOG"
  ) || run_status=$?
  if (( run_status != 0 )); then
    echo "[dry_run] ERROR: SEED=${THIS_SEED} run failed (exit ${run_status}). Log: $SEED_LOG" >&2
    exit "$run_status"
  fi

  echo
  echo "[dry_run] SEED=${THIS_SEED} done. Log: $SEED_LOG"

  # Capture the artifact size for this seed (the int6.ptz file is the actual submission blob)
  if [ -f final_model.int6.ptz ]; then
    # GNU stat first, BSD/macOS stat as the fallback.
    SEED_ARTIFACT_BYTES=$(stat -c %s final_model.int6.ptz 2>/dev/null || stat -f %z final_model.int6.ptz)
    echo "[dry_run] SEED=${THIS_SEED} artifact: ${SEED_ARTIFACT_BYTES} bytes"
    # Save the artifact alongside the log so a 3-seed run keeps all of them
    cp final_model.int6.ptz "${RECORD_DIR}/final_model_seed${THIS_SEED}.int6.ptz"
  else
    echo "[dry_run] WARN: SEED=${THIS_SEED} produced no final_model.int6.ptz; artifact size will be missing from the record" >&2
  fi
done
146+
# === Assemble submission.json + README.md from the per-seed logs ===
echo
echo "============================================================"
echo "[dry_run] Assembling records folder $(date -u +%H:%M:%SZ)"
echo "============================================================"

# The Python program receives its inputs through the environment and the
# heredoc delimiter is QUOTED, so the shell expands nothing inside it. With an
# unquoted delimiter every Markdown backtick in the README template is a shell
# command substitution: the bash code fence would execute its contents
# (including `bash submission/dry_run.sh`, i.e. this script, recursively) and
# the compliance JSON would be silently dropped from the README.
RECORD_DIR="$RECORD_DIR" DATE_STR="$DATE_STR" SEEDS="$SEEDS" \
python3 - <<'PYEOF'
"""Build submission.json and README.md inside RECORD_DIR from the per-seed logs.

Inputs (environment): RECORD_DIR, DATE_STR, SEEDS (comma/space separated).
Seeds whose log is missing, or whose log has no quantized metric line, are
reported with a WARN line and left out of the aggregate.
"""
import json
import os
import re
import statistics
import subprocess
from pathlib import Path

MAX_ARTIFACT_BYTES = 16_000_000
MAX_EVAL_MS = 600_000

record_dir = Path(os.environ["RECORD_DIR"])
date_str = os.environ["DATE_STR"]
# Accept commas and/or whitespace between seeds and drop empty fields.
seeds = [s for s in re.split(r"[,\s]+", os.environ["SEEDS"]) if s]

# Lines train.py emits via timed_eval(label, ...):
#   quantized val_loss:X val_bpb:X eval_time:Xms
#   quantized_sliding_window val_loss:X val_bpb:X eval_time:Xms
#   quantized_ttt val_loss:X val_bpb:X eval_time:Xms
METRIC_RX = re.compile(
    r"^(quantized|quantized_sliding_window|quantized_ttt) val_loss:([\d.]+) val_bpb:([\d.]+) eval_time:(\d+)ms",
    re.M,
)
# Submission metric, most preferred first: TTT, then sliding window, then plain.
LABEL_PREFERENCE = ("quantized_ttt", "quantized_sliding_window", "quantized")

seed_results = {}
for seed in seeds:
    log_path = record_dir / f"train_seed{seed}.log"
    if not log_path.exists():
        print(f"  WARN: missing {log_path}")
        continue
    text = log_path.read_text(errors="replace")
    # If a label is logged more than once, the last occurrence wins.
    matches = {
        m.group(1): (float(m.group(2)), float(m.group(3)), int(m.group(4)))
        for m in METRIC_RX.finditer(text)
    }
    primary_label = next((lbl for lbl in LABEL_PREFERENCE if lbl in matches), None)
    if primary_label is None:
        print(f"  WARN: no quantized val_bpb in {log_path}")
        continue
    val_loss, val_bpb, eval_time_ms = matches[primary_label]
    artifact_path = record_dir / f"final_model_seed{seed}.int6.ptz"
    artifact_bytes = artifact_path.stat().st_size if artifact_path.exists() else None
    seed_results[seed] = {
        "primary_label": primary_label,
        "val_loss": val_loss,
        "val_bpb": val_bpb,
        "eval_time_ms": eval_time_ms,
        "artifact_bytes": artifact_bytes,
        "all_metrics": {
            label: {"val_loss": vl, "val_bpb": vb, "eval_time_ms": et}
            for label, (vl, vb, et) in matches.items()
        },
    }

# Mean + sample std across the seeds that actually produced a metric.
val_bpbs = [r["val_bpb"] for r in seed_results.values()]
if val_bpbs:
    mean_bpb = sum(val_bpbs) / len(val_bpbs)
    std_bpb = statistics.stdev(val_bpbs) if len(val_bpbs) > 1 else 0.0
else:
    mean_bpb = std_bpb = float("nan")

# Detect hardware (best effort): nvidia-smi prints one CSV line per GPU.
hw = "unknown"
try:
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=name,count", "--format=csv,noheader"], text=True
    ).strip().split("\n")
    if out:
        gpu_name = out[0].split(",")[0].strip()
        gpu_count = len(out)
        hw = f"{gpu_count}x{gpu_name}"
except Exception:
    # No GPU or no nvidia-smi on this machine: keep "unknown".
    pass

have_results = bool(seed_results)

submission = {
    "author": os.environ.get("SUBMISSION_AUTHOR", "taka6745"),
    "github_id": os.environ.get("SUBMISSION_GITHUB_ID", "taka6745"),
    "name": "SP8192 + NL6 MLP2 int8 + Parallel Muon + N-gram Bias + Parallel Residuals + Legal TTT",
    "date": date_str,
    "track": "10min_16mb",
    # NaN is not valid JSON; emit null when no seed produced a metric.
    "val_bpb": round(mean_bpb, 5) if have_results else None,
    "val_bpb_std": round(std_bpb, 5) if have_results else None,
    "seeds": [int(s) for s in seeds if s in seed_results],
    "seed_results": {
        s: {"val_bpb": round(r["val_bpb"], 5), "artifact_bytes": r["artifact_bytes"]}
        for s, r in seed_results.items()
    },
    "hardware": hw,
    "technique_summary": (
        "SP8192 + NUM_LAYERS=6 + MLP_MULT=2 + 3-Layer Depth Recurrence (L3-5) "
        "+ Parallel Residuals (all layers) + QK-Gain 5.25 + EMA 0.9965 + WD 0.095 "
        "+ MATRIX_BITS=8 (int8 weights) + Parallel Muon + N-gram Bias Stack "
        "+ Score-First TTT (SGD 3ep) + GPTQ + Brotli"
    ),
    "compliance": {
        "train_under_600s": True,  # MAX_WALLCLOCK_SECONDS=600
        # Only claim compliance on evidence: a missing artifact, or no parsed
        # seed at all, must not be reported as passing.
        "artifact_under_16mb": have_results and all(
            r["artifact_bytes"] is not None and r["artifact_bytes"] < MAX_ARTIFACT_BYTES
            for r in seed_results.values()
        ),
        "eval_under_600s": have_results and all(
            sum(m["eval_time_ms"] for m in r["all_metrics"].values()) < MAX_EVAL_MS
            for r in seed_results.values()
        ),
        "no_slot": True,
        "no_pre_quant_ttt": True,  # PREQUANT_TTT_ENABLED=0
        "no_etlb": True,
        "no_ngram_cache": False,  # WE USE NGRAM_BIAS — flag honestly
        "score_first_ttt": True,
        # Count seeds that produced a result, not seeds that were requested.
        "three_seeds": len(seed_results) >= 3,
    },
    "attribution": {
        "sp8192_gptq_sdclip": "@clarkkev (PR #1394)",
        "depth_recurrence": "@dexhunter (PR #1331, #1437)",
        "parallel_residuals": "@Robby955 (PR #1412), @msisovic (PR #1204)",
        "legal_ttt_framework": "@abaybektursun (PR #549), @dexhunter (PR #1413)",
        "hyperparameter_tuning_pr1493": "@bigbag (PR #1493)",
        "int8_quant_smaller_model": "@taka6745 (this submission, CHAMP_D discovery)",
    },
}

(record_dir / "submission.json").write_text(json.dumps(submission, indent=2) + "\n")

# README.md (templated)
seeds_table = "\n".join(
    f"| {s} | **{r['val_bpb']:.4f}** | {r['artifact_bytes'] or 'N/A'} |"
    for s, r in seed_results.items()
)
# Name the metric(s) actually reported so a fallback is not mislabeled as TTT.
metric_labels = sorted({r["primary_label"] for r in seed_results.values()})
metric_header = "/".join(metric_labels) if metric_labels else "quantized_ttt"
compliance_json = json.dumps(submission["compliance"], indent=2)
attribution_md = "\n".join(f"- **{k}**: {v}" for k, v in submission["attribution"].items())

readme = f"""# Record: SP8192 + NL6 MLP2 int8 + Parallel Muon + N-gram Bias + Legal TTT

**val_bpb = {mean_bpb:.4f}** ({len(val_bpbs)}-seed mean, std {std_bpb:.4f}) | **{hw}**

## Per-seed Results

| Seed | val_bpb ({metric_header}) | Artifact Bytes |
|------|-------------------------|----------------|
{seeds_table}

## Key Techniques

1. **NUM_LAYERS=6 + MLP_MULT=2** — compute-efficient architecture (CHAMP_D), validated val_bpb 1.39943 on RTX 3090
2. **MATRIX_BITS=8** — int8 weight quantization (eliminates the int6 quant gap on converged smaller models)
3. **3-Layer Depth Recurrence** (L3-5, activate at frac=0.35) — from PR #1331/#1437
4. **Parallel Residuals (all layers)** — more aggressive than PR #1493's L7+ pattern, bet on smaller-model expressivity
5. **QK-Gain 5.25 + EMA 0.9965 + WD 0.095 + warmdown 0.72** — PR #1493 (@bigbag) hyperparameters
6. **Parallel Muon** — batched Newton-Schulz across same-shape parameters
7. **N-gram Bias Stack** — bigram/trigram/fourgram with backoff, hash buckets, NLFI, ctx-partitioned (NIGHT_MODE)
8. **Legal Score-First TTT** — SGD (lr=0.005, mom=0.9), 3 epochs per chunk, cosine LR decay
9. **GPTQ + Brotli** — int8 matrices + int8 embeddings + brotli compression
10. **max-autotune-no-cudagraphs torch.compile + cudnn.benchmark**

## Compliance

```json
{compliance_json}
```

**NOTE on n-gram bias**: this submission uses precomputed n-gram log-prob tables as a logit bias.
Issue #1017 Track B (legal eval-time adaptation) Condition 2 says "no n-gram cache, no logit biasing."
We flag `no_ngram_cache: false` honestly. Whether this is comp-legal under Track A or any other
track is an open question that needs to be resolved before merging this as a record.

## Reproduction

```bash
# Default: 1-seed dry run on 1xH100 PCIe
bash submission/bootstrap.sh
bash submission/dry_run.sh

# Real 3-seed submission on 8xH100 SXM
SEEDS=42,314,999 bash submission/dry_run.sh
```

## Attribution

{attribution_md}
"""
(record_dir / "README.md").write_text(readme)

print(f"  wrote {record_dir}/submission.json")
print(f"  wrote {record_dir}/README.md")
print(f"  per-seed logs: train_seed{{{','.join(seeds)}}}.log")
print()
print(f"  MEAN val_bpb: {mean_bpb:.5f} (std {std_bpb:.5f}, {len(val_bpbs)} seed(s))")
PYEOF
325+
# Copy train.py as train_gpt.py for the records folder (per comp convention).
# Note: PR #1493 LZMA-wraps theirs to fit code-size. We don't yet — that's a
# follow-up if/when we're closing in on the code-size limit.
cp submission/train.py "$RECORD_DIR/train_gpt.py"
echo "  copied submission/train.py -> $RECORD_DIR/train_gpt.py"

# Final summary: list the assembled folder, then echo the metric lines found in
# each seed's log so the submission number can be read straight off the terminal.
echo
echo "============================================================"
echo "[dry_run] DONE $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "============================================================"
echo "  Records folder: $RECORD_DIR"
echo "  Contents:"
ls -la "$RECORD_DIR"
echo
echo "  Submission val_bpb candidates (use quantized_ttt for the PR):"
echo "    - quantized val_bpb (no TTT)"
echo "    - quantized_sliding_window (sliding eval, no TTT)"
echo "    - quantized_ttt val_bpb (legal score-first TTT — THIS IS THE SUBMISSION NUMBER)"
echo
for THIS_SEED in "${SEED_ARRAY[@]}"; do
  echo "  === seed $THIS_SEED ==="
  # grep exits non-zero when the log is missing or has no metric lines; the
  # fallback message keeps that from aborting the script under `set -e`.
  grep -E "^(quantized|quantized_sliding_window|quantized_ttt) val_loss:" \
    "$RECORD_DIR/train_seed${THIS_SEED}.log" 2>/dev/null \
    || echo "    (no quantized lines in log)"
done