diff --git a/.gitignore b/.gitignore index 3423c416a7..b95d119d63 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,14 @@ __pycache__/ modded-nanogpt/ modded-nanogpt data/datasets +data/*_local_build/ data/manifest.json data/docs_selected.jsonl .mypy_cache/ .venv -logs/ \ No newline at end of file +logs/ +final_model.pt +final_model.int8.ptz +tools/RunMonitor/bin/ +tools/RunMonitor/obj/ +tinkering/ diff --git a/FunProject.sln b/FunProject.sln new file mode 100644 index 0000000000..36e62cf975 --- /dev/null +++ b/FunProject.sln @@ -0,0 +1,29 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.5.2.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{07C2787E-EAC7-C090-1BA3-A61EC2A24D84}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RunMonitor", "tools\RunMonitor\RunMonitor.csproj", "{DE6DDB0B-4D95-4B24-381D-64E74C9B1199}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Debug|Any CPU.Build.0 = Debug|Any CPU + {DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Release|Any CPU.ActiveCfg = Release|Any CPU + {DE6DDB0B-4D95-4B24-381D-64E74C9B1199}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {DE6DDB0B-4D95-4B24-381D-64E74C9B1199} = {07C2787E-EAC7-C090-1BA3-A61EC2A24D84} + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {96B455A5-4CC6-4905-9123-7FCEC67532C8} + EndGlobalSection +EndGlobal diff --git a/currentspecs.txt b/currentspecs.txt new 
file mode 100644 index 0000000000..fcd844b79d --- /dev/null +++ b/currentspecs.txt @@ -0,0 +1,4 @@ +Baseline after initial easy wins: +val_loss: 3.4910 +val_bpb: 2.0916 +artifact_bytes: 6831983 \ No newline at end of file diff --git a/docs/research_tracks.md b/docs/research_tracks.md new file mode 100644 index 0000000000..0236399d19 --- /dev/null +++ b/docs/research_tracks.md @@ -0,0 +1,377 @@ +# Parameter Golf Research Tracks + +Priority order is dictated by the challenge rules: + +1. stay under the `16,000,000` byte artifact cap +2. stay within the `10 minute / 8xH100` training budget for record attempts +3. optimize post-roundtrip `val_bpb`, not pre-quant loss + +## Integrated now + +- Post-compression-aware training: + - sampled int8 reconstruction regularizer + - optional ternary-weight regularizer + - optional outlier suppression penalty +- Weight sharing / recurrence: + - shared-block transformer via `NUM_UNIQUE_BLOCKS` +- Sparse attention: + - optional sliding-window attention via `WINDOW_SIZE` +- Factorized embeddings: + - optional `EMBED_DIM < MODEL_DIM` +- Hybrid eval-time compute: + - optional recent-token cache bias during validation / roundtrip eval +- Local proxy iteration: + - capped validation + - optional skip of expensive final roundtrip eval + - proxy sweep launcher + +## Current knobs + +- `NUM_UNIQUE_BLOCKS` +- `WINDOW_SIZE` +- `EMBED_DIM` +- `COMPRESSION_REG_WEIGHT` +- `COMPRESSION_GRID_REG_WEIGHT` +- `COMPRESSION_SCALE_REG_WEIGHT` +- `COMPRESSION_RANK1_REG_WEIGHT` +- `TERNARY_REG_WEIGHT` +- `OUTLIER_REG_WEIGHT` +- `EVAL_CACHE_MIX_WEIGHT` +- `EVAL_BIGRAM_MIX_WEIGHT` +- `EVAL_CACHE_SIZE` +- `FINAL_ROUNDTRIP_EVAL` +- `ROUNDTRIP_VAL_MAX_TOKENS` + +## Local proxy reference point + +All local comparisons below use the same quick 3090 proxy envelope: + +- `MAX_WALLCLOCK_SECONDS=180` +- `TRAIN_BATCH_TOKENS=32768` +- `VAL_MAX_TOKENS=1048576` +- `FINAL_ROUNDTRIP_EVAL=0` +- baseline architecture: + - `NUM_LAYERS=12` + - `NUM_UNIQUE_BLOCKS=12` + - 
`MODEL_DIM=384` + - `EMBED_DIM=0` + - `NUM_HEADS=6` + - `NUM_KV_HEADS=3` + +## Roundtrip proxy track + +Use this when ranking experiments on a more faithful local objective: + +- keep the same baseline architecture unless explicitly testing architecture +- enable `FINAL_ROUNDTRIP_EVAL=1` +- keep `ROUNDTRIP_VAL_MAX_TOKENS` capped so the run stays practical on a 3090 +- treat this as the local approximation to the actual challenge metric + +## Latest findings + +- Quick local baseline: + - run: `baseline3090_20260318_170251` + - result: `val_bpb=2.0916`, `val_loss=3.4910` + - total artifact: `6,831,983` bytes + - interpretation: current local number to beat +- Hybrid eval sidecar, recent-token + bigram continuation bias: + - run: `sidecar3090_20260318_172524` + - knobs: `EVAL_CACHE_MIX_WEIGHT=0.03`, `EVAL_BIGRAM_MIX_WEIGHT=0.05`, `EVAL_CACHE_SIZE=16` + - result: `val_bpb=2.0970`, `val_loss=3.5000` + - total artifact: `6,810,819` bytes + - delta vs baseline: `+0.0054 bpb` worse, `21,164` bytes smaller + - interpretation: close enough to keep around for later tuning, not good enough to become the default path +- Compression-aware baseline, reconstruction regularization `0.01`: + - run: `compress3090_20260318_174132` + - result: `val_bpb=2.0943`, `val_loss=3.4954` + - total artifact: `6,812,935` bytes + - delta vs baseline: `+0.0027 bpb` worse, `19,048` bytes smaller + - interpretation: strongest experimental branch so far +- Compression-aware baseline, reconstruction regularization `0.005`: + - run: `compress3090_half_20260318_1750` + - result: `val_bpb=2.0928`, `val_loss=3.4930` + - total artifact: `6,829,073` bytes + - delta vs baseline: `+0.0012 bpb` worse, `2,910` bytes smaller + - interpretation: best pre-roundtrip proxy result outside the plain baseline +- Matched roundtrip-proxy baseline: + - run: `baselinert3090_20260318_181344` + - exact final roundtrip result: `val_bpb=2.11089617`, `val_loss=3.56464830` + - total artifact: `6,705,058` bytes +- Matched 
roundtrip-proxy compression baseline: + - run: `compressrt3090_20260318_175828` + - knobs: `COMPRESSION_REG_WEIGHT=0.005` + - exact final roundtrip result: `val_bpb=2.06085837`, `val_loss=3.48014999` + - total artifact: `6,839,798` bytes + - delta vs matched roundtrip baseline: `-0.05003780 bpb`, about `2.37%` better + - interpretation: compression-aware training is now the leading local research branch when measured on a more faithful objective +- Sparse-attention probe on the winning compression setup: + - run: `compressrt_sparse512_20260318_1842` + - knobs: `WINDOW_SIZE=512`, `COMPRESSION_REG_WEIGHT=0.005` + - exact final roundtrip result: `val_bpb=2.07004634`, `val_loss=3.49566562` + - delta vs best compression baseline: `+0.00918797 bpb` worse + - interpretation: not good enough to displace the dense compression-aware path; sparse attention stays experimental for later +- Focused QAT roundtrip sweep around the winning compression point: + - sweep: `qatrtsweep_20260318_1906` + - best result in sweep: + - run: `qatrtsweep_20260318_1906_w0045_o0000` + - knobs: `COMPRESSION_REG_WEIGHT=0.0045`, `OUTLIER_REG_WEIGHT=0.0` + - exact final roundtrip result: `val_bpb=2.06804196`, `val_loss=3.49228084` + - total artifact: `6,814,995` bytes + - interpretation: + - tiny outlier regularization did not help on this local roundtrip track + - none of the focused QAT sweep runs beat the standing best dense compression-aware run at `2.06085837` + - the dense compression-aware baseline remains the current best local result +- Recurrent/shared-block roundtrip sweep: + - sweep: `recurtsweep_20260318_1925` + - tested: + - `16 layers / 8 unique / embed 0` -> `2.25452146` + - `18 layers / 6 unique / embed 0` -> `2.28804085` + - `16 layers / 8 unique / embed 256` -> `2.28260194` + - `18 layers / 6 unique / embed 256` -> `2.34886036` + - interpretation: + - this branch cuts artifact size aggressively, but quality collapses on the current local roundtrip track + - none of these shapes are 
close to the dense compression-aware baseline + - shared-block recurrence stays interesting for the 16 MB objective, but this first pass is not competitive enough to prioritize locally +- Roundtrip sidecar revisit on top of the winning dense compression setup: + - sweep: `sidecarrtsweep_20260318_1942` + - best usable result in sweep: + - run: `sidecarrtsweep_20260318_1942_c0020_b0030_s8` + - knobs: `EVAL_CACHE_MIX_WEIGHT=0.02`, `EVAL_BIGRAM_MIX_WEIGHT=0.03`, `EVAL_CACHE_SIZE=8` + - exact final roundtrip result: `val_bpb=2.06132482`, `val_loss=3.48093767` + - total artifact: `6,864,315` bytes + - delta vs best dense compression baseline: `+0.00046645 bpb` worse + - sweep reliability notes: + - `c0015_b0020_s8` and `c0020_b0020_s8` stopped before a usable roundtrip result was written + - `c0020_b0020_s16` reached artifact export but never wrote `final_int8_zlib_roundtrip_exact` + - interpretation: + - the sidecar branch is the closest secondary idea so far + - it still did not beat the plain dense compression-aware winner + - keep it parked as a late-stage add-on, not the current pivot +- Conservative ternary / low-bit sweep on top of the winning dense compression setup: + - sweep: `ternaryrtsweep_20260318_201412` + - tested: + - `TERNARY_REG_WEIGHT=0.0005` -> `2.07311732` + - `TERNARY_REG_WEIGHT=0.0010` -> `2.07009530` + - `TERNARY_REG_WEIGHT=0.0020` -> `2.07025558` + - `TERNARY_REG_WEIGHT=0.0035` -> `2.08786263` + - `TERNARY_REG_WEIGHT=0.0050` -> `2.07821685` + - interpretation: + - native low-bit pressure in this form clearly hurts the local roundtrip metric + - very small ternary weights degrade less, but still do not approach the current leader + - do not prioritize ternary shaping again until a stronger baseline exists or the training formulation changes +- Quantization residual-budget sweep on top of the winning dense compression setup: + - sweep: `residualrtsweep_20260318_203241` + - tested: + - `residual_rank=0, residual_budget=0` -> baseline export control 
for this sweep + - `residual_rank=1, residual_budget=65536` -> `2.08312093` + - `residual_rank=1, residual_budget=262144` -> `2.08187280` + - `residual_rank=1, residual_budget=524288` -> `2.08285302` + - `residual_rank=1, residual_budget=1048576` -> `2.07731235` + - interpretation: + - spending more bytes on rank-1 residual export corrections did not improve the local roundtrip metric + - the export-side residual mechanism is not currently a better lever than the plain dense compression-aware setup + - quantization-budget tuning should be deprioritized for now +- Refined sidecar micro-sweep around the prior near-win: + - sweep: `sidecarrefine_20260318_205219` + - completed exact results: + - `cache=0.018, bigram=0.030, size=8` -> `2.08080110` + - `cache=0.020, bigram=0.028, size=8` -> `2.07489103` + - `cache=0.020, bigram=0.030, size=8` rerun -> `2.08947255` + - `cache=0.020, bigram=0.032, size=8` -> `2.07840275` + - incomplete run: + - `cache=0.022, bigram=0.030, size=8` reached artifact export but did not write `final_int8_zlib_roundtrip_exact` + - interpretation: + - the earlier `2.06132482` sidecar near-win did not reproduce + - the sidecar branch now looks unstable on the local roundtrip track + - measuring repeatability is more important than additional sidecar micro-tuning right now +- Corrected wallclock repeatability sweep: + - sweep: `repeatrtsweepfix_20260318_215301` + - dense compression-aware runs: + - `base_a` -> `2.06761597` + - `base_b` -> `2.07369637` + - `base_c` -> `2.08956232` + - sidecar near-win reruns: + - `side_a` -> `2.05608381` + - `side_b` -> `2.09377262` + - `side_c` -> `2.07285932` + - interpretation: + - both branches show too much spread on the local `180s` wallclock track + - the best sidecar rerun did beat the standing leader, but the worst sidecar rerun was much worse + - the dominant local noise source now looks methodological, not architectural + - the next step should be a fixed-step local roundtrip track, not more wallclock 
micro-sweeps +- Fixed-step roundtrip sweep: + - sweep: `fixedsteprtsweep_20260318_221632` + - dense compression-aware runs: + - `base_a` -> `2.04299145` + - `base_b` -> `2.04299145` + - sidecar near-win reruns: + - `side_a` -> `2.04300345` + - `side_b` -> `2.04300345` + - interpretation: + - once wallclock variance is removed, the sidecar branch is effectively identical to the dense baseline + - the dense compression-aware branch remains the cleanest local control + - future local search should use fixed-step comparison first, then wallclock only as a secondary sanity check +- Export-aware fixed-step compression probe: + - sweep: `exportaware_fixedstep_20260318_223456` + - completed result: + - `g010_r000` -> `2.04288777` + - knobs: `COMPRESSION_REG_WEIGHT=0.005`, `COMPRESSION_GRID_REG_WEIGHT=0.10`, `COMPRESSION_RANK1_REG_WEIGHT=0.0` + - total artifact: `6,663,470` bytes + - delta vs fixed-step dense control: `-0.00010368 bpb` better + - execution note: + - the broader coarse sweep was aborted after the first positive signal to avoid spending more 3090 time on low-probability points + - interpretation: + - export-aware grid alignment is the first post-fixed-step change that improved the dense compression-aware control + - the gain is small, but it is deterministic and points in the right direction + - the next compression-native pivot should stay inside export-aware regularization, not revisit sidecar or architectural branches +- Scale-aware fixed-step compression sweep: + - sweep: `scaleaware_fixedstep_20260318_224233` + - completed results: + - `g010_s0010` -> `2.04313626` + - `g010_s0025` -> `2.04358127` + - interpretation: + - adding explicit adjacent-scale smoothing made the roundtripped result slightly worse at both tested weights + - this version of scale-aware pressure does not improve on the grid-aligned winner + - the next best move is to refine the grid-alignment weight itself, not add more compression-native terms yet +- Grid-refinement fixed-step 
sweep: + - sweep: `gridrefine_fixedstep_20260318_225110` + - completed results: + - `g0080` -> `2.04396986` + - `g0120` -> `2.04350611` + - interpretation: + - both nearby grid weights regressed versus the `0.10` winner + - `COMPRESSION_GRID_REG_WEIGHT=0.10` currently looks like a real local optimum on the fixed-step track + - the next compression-aware pivot should keep `grid=0.10` fixed and test only very small outlier pressure around it +- Tiny outlier sweep on top of the grid-aligned winner: + - sweep: `gridoutlier_fixedstep_20260318_225946` + - completed results: + - `o00010` -> `2.04373218` + - `o00025` -> `2.04372289` + - interpretation: + - even very small outlier pressure still regresses + - outlier suppression should stay parked unless it becomes tensor-targeted +- Dense iso-byte frontier sweep: + - sweep: `isobyte_fixedstep_20260318_234805` + - completed results: + - `b10` -> `2.02814871` at `9,683,932` bytes + - `b12` -> `2.05262920` at `11,334,608` bytes + - `b14` -> `2.03768242` at `13,094,288` bytes + - `b155` -> `2.00290272` at `13,741,308` bytes + - interpretation: + - dense scaling dominates the small-model micro-ideas by a wide margin + - the current best result is no longer the 6.66 MB regime; it is the larger dense `b155` run + - the frontier is not monotonic with size alone, so geometry still matters, but the main lesson is clear: under-byte-spent local negatives were misleading + - the next step should stay on the dense high-cap frontier and compare width-vs-depth near the byte ceiling +- High-cap dense frontier: + - recovered / rerun results: + - `w608_l12` -> `2.00551677` at `14,371,393` bytes + - `w624_l12` -> `2.01128088` at `15,024,114` bytes + - `d576_l14` -> `1.99806297` at `15,222,128` bytes + - `w640_l12` -> `2.00505534` at `15,658,993` bytes + - interpretation: + - depth beat width at roughly the same byte spend in this near-cap regime + - the first sub-`2.0` local fixed-step result came from the deeper dense model, not the wider 
one + - near the byte cap, width is not obviously the best place to spend additional budget +- Tokenizer sanity check on the current best dense recipe: + - matched local subset controls built from the same `120k` selected-doc prefix + - SP1024 subset control: + - `sp1024subsetbest_20260319_020125` -> `1.99806297` + - total artifact: `15,222,128` bytes + - dataset stats: `149,659,022` total tokens on the subset + - SP4096 subset swap on the same trainer: + - `sp4096best_20260319_015500` -> `1.89591231` + - total artifact: `16,627,470` bytes + - dataset stats: `109,783,049` total tokens on the same subset + - interpretation: + - moving from SP1024 to SP4096 on the same local subset improved exact roundtrip BPB by `0.10215066`, about `5.11%` + - the same subset needed about `26.64%` fewer tokens with SP4096, which matches the expected compression benefit + - the merged `14x576` SP4096 run broke the `16,000,000` byte cap by `627,470` bytes, so it is a strong signal but not yet a submission-shape replacement + - tokenizer work is no longer purely deferred; it is now a real frontier lever, but it must be co-optimized with model size to stay under cap +- Iso-byte SP4096 dense sweep: + - sweep: `sp4096isobyte_fixedstep_20260319_022236` + - completed results: + - `l15_d544` -> `1.90194008` at `16,090,675` bytes + - `l14_d560` -> `1.89329916` at `15,869,071` bytes + - `l12_d608` -> `1.89424125` at `15,844,603` bytes + - interpretation: + - SP4096 is now a cap-compliant win, not just an over-budget curiosity + - the best cap-legal SP4096 point beat the SP1024 `14x576` control by `0.10476381` bpb, about `5.24%` + - `l14_d560` is the current best local result overall + - `l12_d608` is slightly worse on fixed-step BPB but notably faster per step, so it remains a plausible wallclock-oriented backup shape + - the first deeper SP4096 point (`l15_d544`) pushed just over the cap, which suggests the next useful local refinement is a slightly narrower deeper sweep + +## Current leader 
+ +- `sp4096isobyte_fixedstep_20260319_022236_l14_d560` +- dense attention, no sidecar, no recurrence, no factorized embedding +- `VOCAB_SIZE=4096`, tied embeddings +- `COMPRESSION_REG_WEIGHT=0.005` +- `COMPRESSION_GRID_REG_WEIGHT=0.10` +- fixed-step exact final roundtrip result: `val_bpb=1.89329916` +- total artifact: `15,869,071` bytes +- best wallclock-track reference remains `compressrt3090_20260318_175828` at `2.06085837` + +## Regime correction + +- The trusted local dense control is now in the near-cap regime, not the old `6.66 MB` regime. +- That is why the dense iso-byte and high-cap frontier sweeps changed the project direction so much. +- Many earlier negative results were gathered in an under-byte-spent regime and should not be treated as globally final. +- The trustworthy questions now are: + - how should the remaining byte budget be spent near the cap? + - which export-aware or tokenizer-aware changes still help once the dense control is already strong? + - how should the remaining cap headroom be spent inside the stronger SP4096 regime? 
+ +## Immediate next step + +- Keep the SP1024 `14x576` run as the baseline control and the SP4096 `14x560` run as the new frontier control +- keep `COMPRESSION_REG_WEIGHT=0.005` and `COMPRESSION_GRID_REG_WEIGHT=0.10` +- treat tokenizer changes as a real branch, not a deferred curiosity +- next tokenizer-aware experiments should stay near the cap: + - refine the SP4096 depth/width trade in the `15.7 MB` to `16.0 MB` band + - keep a close eye on step time, because `l12_d608` was materially faster than `l14_d560` +- continue ranking ideas by `final_int8_zlib_roundtrip_exact val_bpb` + +## Next experiments + +- SP4096 frontier refinement: + - test slightly narrower deeper shapes like `15x528` and `16x512` + - compare them against the current `14x560` leader and the faster `12x608` backup + - stay under `16,000,000` bytes +- Export-side symmetry-aware permutation: + - apply function-preserving reordering of MLP channels / attention heads before export + - test whether the grid-alignment hint can be turned into a larger zlib win without quality loss +- Tensor sensitivity mapping / heterogeneous export allocation: + - measure which tensors hurt roundtrip BPB most when quantized + - spend residual / protection budget selectively instead of globally +- Export-aware compression regularizer: + - continue aligning sampled training-time regularization with the actual export path + - hold `COMPRESSION_GRID_REG_WEIGHT=0.10` fixed unless new evidence suggests otherwise +- Scale-aware compression regularizer: + - parked for now after the first two weights regressed + - revisit only if a different formulation of scale entropy or scale clustering becomes compelling +- Fixed-step compression sweeps: + - keep using the fixed-step roundtrip track as the local ranking metric + - only move promising compression-native changes back onto the 180s wallclock track later +- Sidecar branch is parked: + - fixed-step results say it is not moving the needle in a reliable way + - do not spend more 
3090 time on sidecar micro-tuning for now +- Other tested branches remain parked: + - residual-budget tuning did not help + - sparse attention did not help + - shared-block recurrence did not help +- Low-bit shaping remains parked: + - revisit only if the training objective changes materially or H100 results suggest a different regime + +## Medium-term work + +- Dense winner + sidecar + low-bit combined into one trainer once the individual branches are measured cleanly +- Global/shared codebook quantization across layers +- Basis-generated per-layer weights or hypernetwork-style weight generation +- Test-time adaptation with strict reset semantics +- Token-adaptive recurrent depth / halting policy + +## Deferred until the model is stronger + +- full tokenizer redesign beyond the SP1024 vs SP4096 sanity branch +- aggressive code-size golf +- heavy hyperparameter brute force diff --git a/run_monitor.bat b/run_monitor.bat new file mode 100644 index 0000000000..5fb83e3c65 --- /dev/null +++ b/run_monitor.bat @@ -0,0 +1,3 @@ +@echo off +setlocal +powershell -NoProfile -ExecutionPolicy Bypass -File "%~dp0scripts\run_monitor.ps1" %* diff --git a/scripts/build_sp1024_local_subset.py b/scripts/build_sp1024_local_subset.py new file mode 100644 index 0000000000..d527b7daff --- /dev/null +++ b/scripts/build_sp1024_local_subset.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import argparse +import importlib.util +import json +import shutil +import sys +from pathlib import Path + + +def load_export_module(repo_root: Path): + module_path = repo_root / "data" / "download_hf_docs_and_tokenize.py" + spec = importlib.util.spec_from_file_location("local_export_module_sp1024", module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"failed to load export module from {module_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def build_parser() -> 
def main() -> None:
    """Export a local SP-1024 dataset subset and install it under ``data/datasets``.

    Workflow:
      1. Parse CLI arguments and validate the doc counts and the input JSONL.
      2. Create ``<output-root>/tokenizers`` and ``<output-root>/datasets``
         (default output root: ``<repo>/data/sp1024_local_build``).
      3. Build the tokenizer and export shards through the helper module
         loaded from ``data/download_hf_docs_and_tokenize.py``.
      4. Write ``<output-root>/manifest.json``.
      5. Copy the exported dataset to ``<repo>/data/datasets/<dataset-name>``.

    Raises:
        ValueError: if ``--num-docs`` is not strictly larger than ``--num-val-docs``.
        FileNotFoundError: if the docs JSONL or the repo's SP-1024 tokenizer
            model file does not exist.
    """
    args = build_parser().parse_args()
    if args.num_docs <= args.num_val_docs:
        raise ValueError("--num-docs must be larger than --num-val-docs")

    repo_root = Path(__file__).resolve().parents[1]
    docs_jsonl = Path(args.docs_jsonl).expanduser().resolve()
    if not docs_jsonl.is_file():
        raise FileNotFoundError(docs_jsonl)

    output_root = (
        Path(args.output_root).expanduser().resolve()
        if args.output_root
        else (repo_root / "data" / "sp1024_local_build").resolve()
    )
    output_root.mkdir(parents=True, exist_ok=True)
    tokenizers_dir = output_root / "tokenizers"
    datasets_dir = output_root / "datasets"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)
    datasets_dir.mkdir(parents=True, exist_ok=True)

    export_module = load_export_module(repo_root)
    source_tokenizer = repo_root / "data" / "tokenizers" / "fineweb_1024_bpe.model"
    if not source_tokenizer.is_file():
        raise FileNotFoundError(source_tokenizer)

    # The spec hands the repo's existing model to the export helper via
    # "reuse_model_path"; presumably the helper reuses it instead of training a
    # new tokenizer -- confirm against build_sentencepiece_tokenizer.
    spec = {
        "name": "sp_bpe_1024_local",
        "dataset_suffix": "sp1024_local120k",
        "vocab_size": 1024,
        "model_prefix": "fineweb_1024_bpe_local",
        "reuse_model_path": str(source_tokenizer),
    }
    tok = export_module.build_sentencepiece_tokenizer(
        spec=spec,
        docs_jsonl=docs_jsonl,
        tokenizers_dir=tokenizers_dir,
    )

    output_dir = datasets_dir / args.dataset_name
    stats = export_module.export_shards(
        docs_jsonl,
        tok,
        output_dir,
        num_val_docs=int(args.num_val_docs),
        shard_size=int(args.chunk_tokens),
        docs_total=int(args.num_docs),
    )

    # Vocab size rounded up to a multiple of 128, times 5. Computed once and
    # shared by the tokenizer and dataset manifest entries.
    recommended_bigram_vocab_size = int(((int(tok["vocab_size"]) + 127) // 128) * 128 * 5)
    manifest = {
        "version": "local_subset",
        "num_docs": int(args.num_docs),
        "num_val_docs": int(args.num_val_docs),
        "docs_jsonl": str(docs_jsonl),
        "tokenizers": [
            {
                "name": tok["name"],
                "kind": tok["kind"],
                "vocab_size": int(tok["vocab_size"]),
                "bos_id": int(tok["bos_id"]),
                "eos_id": int(tok["eos_id"]),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "source_spec": spec,
                **tok["manifest"],
            }
        ],
        "datasets": [
            {
                "name": args.dataset_name,
                "tokenizer_name": tok["name"],
                "tokenizer_kind": tok["kind"],
                "path": str(output_dir),
                "train_glob": str(output_dir / "fineweb_train_*.bin"),
                "val_glob": str(output_dir / "fineweb_val_*.bin"),
                "vocab_size": int(tok["vocab_size"]),
                "bos_id": int(tok["bos_id"]),
                "eos_id": int(tok["eos_id"]),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "stats": stats,
            }
        ],
    }
    manifest = export_module.relativize_manifest_paths(manifest, output_root)
    (output_root / "manifest.json").write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

    target_dataset_dir = repo_root / "data" / "datasets" / args.dataset_name
    # If --output-root points at <repo>/data, the export already lives at the
    # install location. Removing the target would delete the fresh export and
    # the copy would then fail, so only install when the two paths differ.
    if output_dir.resolve() != target_dataset_dir.resolve():
        if target_dataset_dir.exists():
            shutil.rmtree(target_dataset_dir)
        shutil.copytree(output_dir, target_dataset_dir)

    print(f"dataset_dir:{target_dataset_dir}", flush=True)
    print(f"dataset_stats:{json.dumps(stats, sort_keys=True)}", flush=True)
def stream_doc_prefix(
    *,
    docs_url: str,
    tokenizer_docs_path: Path,
    export_docs_path: Path,
    tokenizer_train_docs: int,
    export_docs: int,
) -> None:
    """Stream the leading lines of a remote JSONL file into two local prefix files.

    The first ``tokenizer_train_docs`` lines go to ``tokenizer_docs_path`` and
    the first ``export_docs`` lines go to ``export_docs_path``. Reading stops
    once ``max(tokenizer_train_docs, export_docs)`` lines have been consumed.

    Output is written to ``<name>.partial`` siblings and only renamed onto the
    final paths after the stream finishes cleanly. A failed or interrupted
    download therefore never leaves a truncated file at a final path, where a
    later "file exists" check would mistake it for a complete prefix.

    Args:
        docs_url: URL opened with ``urllib.request.urlopen`` (60 s timeout).
        tokenizer_docs_path: destination for the tokenizer-training prefix.
        export_docs_path: destination for the export prefix.
        tokenizer_train_docs: number of leading lines for the tokenizer file.
        export_docs: number of leading lines for the export file.

    Raises:
        UnicodeDecodeError: if a streamed line is not valid UTF-8.
        Exception: whatever ``urlopen`` or the file operations raise; partial
            files are removed before the exception propagates.
    """
    max_docs = max(tokenizer_train_docs, export_docs)
    tokenizer_docs_path.parent.mkdir(parents=True, exist_ok=True)
    export_docs_path.parent.mkdir(parents=True, exist_ok=True)
    tok_partial = tokenizer_docs_path.with_name(tokenizer_docs_path.name + ".partial")
    exp_partial = export_docs_path.with_name(export_docs_path.name + ".partial")
    seen = 0
    try:
        # newline="" disables newline translation so the local prefix keeps the
        # source's line endings on every platform (no "\n" -> "\r\n" on Windows).
        with urllib.request.urlopen(docs_url, timeout=60) as response, \
                tok_partial.open("w", encoding="utf-8", newline="") as tok_out, \
                exp_partial.open("w", encoding="utf-8", newline="") as exp_out:
            for idx, raw_line in enumerate(response, start=1):
                line = raw_line.decode("utf-8")
                if idx <= tokenizer_train_docs:
                    tok_out.write(line)
                if idx <= export_docs:
                    exp_out.write(line)
                seen = idx
                if idx % 50000 == 0:
                    print(f"downloaded_docs:{idx}", flush=True)
                if idx >= max_docs:
                    break
            if seen < max_docs:
                # The source ended before the requested prefix length; surface
                # it instead of silently producing a shorter prefix.
                print(f"warning:doc_stream_ended_early:{seen}<{max_docs}", flush=True)
        tok_partial.replace(tokenizer_docs_path)
        exp_partial.replace(export_docs_path)
    except BaseException:
        # Includes KeyboardInterrupt: never leave partial output behind.
        for partial in (tok_partial, exp_partial):
            partial.unlink(missing_ok=True)
        raise
def main() -> None:
    """Build a local SP-4096 tokenizer and dataset subset and install both into the repo.

    Workflow:
      1. Parse CLI arguments and create ``<output-root>/tokenizers`` and
         ``<output-root>/datasets``.
      2. Download the doc prefixes with ``stream_doc_prefix`` unless both
         prefix files already exist under the output root.
      3. Fetch the upstream sidecar manifest and write a subset sidecar next
         to the export docs.
      4. Build the tokenizer and export shards through the helper module
         loaded from ``data/download_hf_docs_and_tokenize.py``.
      5. Write ``<output-root>/manifest.json``.
      6. Copy the tokenizer model/vocab to ``<repo>/data/tokenizers`` and the
         dataset to ``<repo>/data/datasets/fineweb10B_sp4096_local``.

    Raises:
        ValueError: if ``--export-docs`` is not strictly larger than ``--num-val-docs``.
    """
    args = build_parser().parse_args()
    if args.export_docs <= args.num_val_docs:
        raise ValueError("--export-docs must be larger than --num-val-docs")

    repo_root = Path(__file__).resolve().parents[1]
    output_root = Path(args.output_root).expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    tokenizers_dir = output_root / "tokenizers"
    datasets_dir = output_root / "datasets"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)
    datasets_dir.mkdir(parents=True, exist_ok=True)

    tokenizer_docs_jsonl = output_root / "docs_selected_tokenizer_train.jsonl"
    export_docs_jsonl = output_root / "docs_selected.jsonl"
    export_sidecar = output_root / "docs_selected.source_manifest.json"

    # NOTE(review): cached prefix files are reused purely on existence; their
    # line counts are not checked against the requested doc counts, so a rerun
    # with different --tokenizer-train-docs / --export-docs reuses the old files.
    if not tokenizer_docs_jsonl.is_file() or not export_docs_jsonl.is_file():
        stream_doc_prefix(
            docs_url=DOCS_URL,
            tokenizer_docs_path=tokenizer_docs_jsonl,
            export_docs_path=export_docs_jsonl,
            tokenizer_train_docs=args.tokenizer_train_docs,
            export_docs=args.export_docs,
        )

    source_sidecar = download_json(SIDECAR_URL)
    subset_sidecar = {
        "source_repo_id": HF_REPO_ID,
        "source_remote_root": HF_ROOT,
        "source_num_docs": source_sidecar.get("num_docs"),
        "source_docs_val": source_sidecar.get("docs_val"),
        "num_docs": int(args.export_docs),
        "docs_val": int(args.num_val_docs),
        "docs_sha256": None,
        "subset_kind": "prefix",
        "tokenizer_train_docs": int(args.tokenizer_train_docs),
    }
    export_sidecar.write_text(json.dumps(subset_sidecar, indent=2) + "\n", encoding="utf-8")

    export_module = load_export_module(repo_root)
    spec = {
        "name": "sp_bpe_4096",
        "dataset_suffix": "sp4096_local",
        "vocab_size": 4096,
        "model_prefix": "fineweb_4096_bpe",
        "tokenizer_train_docs": int(args.tokenizer_train_docs),
    }
    tok = export_module.build_sentencepiece_tokenizer(
        spec=spec,
        docs_jsonl=tokenizer_docs_jsonl,
        tokenizers_dir=tokenizers_dir,
    )
    dataset_name = "fineweb10B_sp4096_local"
    output_dir = datasets_dir / dataset_name
    stats = export_module.export_shards(
        export_docs_jsonl,
        tok,
        output_dir,
        num_val_docs=int(args.num_val_docs),
        shard_size=int(args.chunk_tokens),
        docs_total=int(args.export_docs),
    )

    # Vocab size rounded up to a multiple of 128, times 5.
    recommended_bigram_vocab_size = int(((int(tok["vocab_size"]) + 127) // 128) * 128 * 5)
    manifest = {
        "version": "local_subset",
        "num_docs": int(args.export_docs),
        "num_val_docs": int(args.num_val_docs),
        "tokenizer_train_docs": int(args.tokenizer_train_docs),
        "shard_size": int(args.chunk_tokens),
        "docs_jsonl": str(export_docs_jsonl),
        "tokenizers": [
            {
                "name": tok["name"],
                "kind": tok["kind"],
                "vocab_size": int(tok["vocab_size"]),
                "bos_id": int(tok["bos_id"]),
                "eos_id": int(tok["eos_id"]),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "source_spec": spec,
                **tok["manifest"],
            }
        ],
        "datasets": [
            {
                "name": dataset_name,
                "tokenizer_name": tok["name"],
                "tokenizer_kind": tok["kind"],
                "path": str(output_dir),
                "train_glob": str(output_dir / "fineweb_train_*.bin"),
                "val_glob": str(output_dir / "fineweb_val_*.bin"),
                "vocab_size": int(tok["vocab_size"]),
                "bos_id": int(tok["bos_id"]),
                "eos_id": int(tok["eos_id"]),
                "recommended_bigram_vocab_size": recommended_bigram_vocab_size,
                "stats": stats,
            }
        ],
    }
    manifest = export_module.relativize_manifest_paths(manifest, output_root)
    (output_root / "manifest.json").write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

    target_tokenizer_dir = repo_root / "data" / "tokenizers"
    target_dataset_dir = repo_root / "data" / "datasets" / dataset_name
    target_tokenizer_dir.mkdir(parents=True, exist_ok=True)
    target_dataset_dir.parent.mkdir(parents=True, exist_ok=True)

    # If --output-root points at <repo>/data, the build outputs already live
    # at the install locations. Copying a file onto itself raises
    # shutil.SameFileError, and removing the target dataset would delete the
    # fresh export, so only install when source and destination differ.
    for filename in ("fineweb_4096_bpe.model", "fineweb_4096_bpe.vocab"):
        built_file = tokenizers_dir / filename
        installed_file = target_tokenizer_dir / filename
        if built_file.resolve() != installed_file.resolve():
            shutil.copy2(built_file, installed_file)
    if output_dir.resolve() != target_dataset_dir.resolve():
        if target_dataset_dir.exists():
            shutil.rmtree(target_dataset_dir)
        shutil.copytree(output_dir, target_dataset_dir)

    print(f"tokenizer_model:{target_tokenizer_dir / 'fineweb_4096_bpe.model'}", flush=True)
    print(f"dataset_dir:{target_dataset_dir}", flush=True)
    print(f"dataset_stats:{json.dumps(stats, sort_keys=True)}", flush=True)
parser.add_argument("--commit-message", default="Replace dataset export") + parser.add_argument("--dry-run", action="store_true") + return parser + + +def main() -> None: + args = build_parser().parse_args() + api = HfApi() + local_export_root = Path(args.local_export_root).expanduser().resolve() + if not local_export_root.is_dir(): + raise FileNotFoundError(local_export_root) + + prefix = args.path_in_repo.strip("/") + top_level_local = {path.name for path in local_export_root.iterdir()} + delete_names = sorted(DATA_ARTIFACT_NAMES | top_level_local) + root_entries = { + entry.path: entry + for entry in api.list_repo_tree( + repo_id=args.repo_id, + recursive=False, + repo_type=args.repo_type, + revision=args.revision, + ) + } + + if prefix: + if prefix in root_entries: + print(f"delete {prefix}") + if not args.dry_run: + api.delete_folder( + prefix, + repo_id=args.repo_id, + repo_type=args.repo_type, + revision=args.revision, + commit_message=f"Delete {prefix}", + ) + + remote_entries = root_entries if not prefix else {} + + for name in delete_names: + if prefix: + break + remote_path = repo_path(prefix, name) + entry = remote_entries.get(remote_path) + if entry is None: + continue + print(f"delete {remote_path}") + if args.dry_run: + continue + if entry.__class__.__name__ == "RepoFolder": + api.delete_folder( + remote_path, + repo_id=args.repo_id, + repo_type=args.repo_type, + revision=args.revision, + commit_message=f"Delete {remote_path}", + ) + else: + api.delete_file( + remote_path, + repo_id=args.repo_id, + repo_type=args.repo_type, + revision=args.revision, + commit_message=f"Delete {remote_path}", + ) + + print(f"upload {local_export_root} -> {prefix or '/'}") + if args.dry_run: + return + api.upload_folder( + repo_id=args.repo_id, + repo_type=args.repo_type, + revision=args.revision, + folder_path=local_export_root, + path_in_repo=prefix or None, + commit_message=args.commit_message, + ) + + +if __name__ == "__main__": + main() diff --git 
a/scripts/run_8xh100_recur.sh b/scripts/run_8xh100_recur.sh new file mode 100644 index 0000000000..bb7e0b207e --- /dev/null +++ b/scripts/run_8xh100_recur.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +export RUN_ID="${RUN_ID:-recur18_u5_d672_$(date +%Y%m%d_%H%M%S)}" +export DATA_PATH="${DATA_PATH:-$ROOT/data/datasets/fineweb10B_sp1024}" +export TOKENIZER_PATH="${TOKENIZER_PATH:-$ROOT/data/tokenizers/fineweb_1024_bpe.model}" + +export VOCAB_SIZE="${VOCAB_SIZE:-1024}" +export TIE_EMBEDDINGS="${TIE_EMBEDDINGS:-1}" +export NUM_LAYERS="${NUM_LAYERS:-18}" +export NUM_UNIQUE_BLOCKS="${NUM_UNIQUE_BLOCKS:-5}" +export MODEL_DIM="${MODEL_DIM:-672}" +export NUM_HEADS="${NUM_HEADS:-12}" +export NUM_KV_HEADS="${NUM_KV_HEADS:-6}" +export MLP_MULT="${MLP_MULT:-2}" + +export TRAIN_BATCH_TOKENS="${TRAIN_BATCH_TOKENS:-524288}" +export VAL_BATCH_SIZE="${VAL_BATCH_SIZE:-524288}" +export TRAIN_LOG_EVERY="${TRAIN_LOG_EVERY:-50}" +export VAL_LOSS_EVERY="${VAL_LOSS_EVERY:-200}" +export MAX_WALLCLOCK_SECONDS="${MAX_WALLCLOCK_SECONDS:-600}" + +export INT8_AXIS_MODE="${INT8_AXIS_MODE:-auto}" +export INT8_RESIDUAL_RANK="${INT8_RESIDUAL_RANK:-1}" +export INT8_RESIDUAL_BUDGET_BYTES="${INT8_RESIDUAL_BUDGET_BYTES:-65536}" +export SDP_BACKEND="${SDP_BACKEND:-flash}" +export ENABLE_TORCH_COMPILE="${ENABLE_TORCH_COMPILE:-1}" + +cd "$ROOT" +exec torchrun --standalone --nproc_per_node=8 train_gpt.py diff --git a/scripts/run_local_3090.ps1 b/scripts/run_local_3090.ps1 new file mode 100644 index 0000000000..ceb1b97369 --- /dev/null +++ b/scripts/run_local_3090.ps1 @@ -0,0 +1,123 @@ +param( + [string]$RunId = "", + [string]$DataPath = "", + [string]$TokenizerPath = "", + [int]$VocabSize = 1024, + [int]$TieEmbeddings = 1, + [int]$MaxWallclockSeconds = 420, + [int]$Iterations = 20000, + [int]$TrainBatchTokens = 65536, + [int]$ValBatchSize = 65536, + [int]$ValMaxTokens = 4194304, + [int]$RoundtripValMaxTokens = 2097152, + 
[int]$TrainLogEvery = 10, + [int]$ValLossEvery = 50, + [int]$WarmupSteps = 0, + [int]$NumLayers = 16, + [int]$NumUniqueBlocks = 4, + [int]$ModelDim = 512, + [int]$EmbedDim = 0, + [int]$NumHeads = 8, + [int]$NumKvHeads = 4, + [int]$MlpMult = 2, + [int]$WindowSize = 0, + [string]$Int8AxisMode = "auto", + [int]$Int8ResidualRank = 1, + [int]$Int8ResidualBudgetBytes = 65536, + [double]$CompressionRegWeight = 0.02, + [int]$CompressionRegInterval = 4, + [int]$CompressionRegWarmupSteps = 10, + [int]$CompressionRegSampleTensors = 4, + [int]$CompressionRegMaxCols = 128, + [double]$CompressionGridRegWeight = 0.0, + [double]$CompressionRank1RegWeight = 0.0, + [double]$CompressionScaleRegWeight = 0.0, + [double]$TernaryRegWeight = 0.15, + [double]$OutlierRegWeight = 0.01, + [double]$EvalCacheMixWeight = 0.03, + [double]$EvalBigramMixWeight = 0.0, + [int]$EvalCacheSize = 8, + [int]$SaveRawCheckpoint = 0, + [int]$FinalRoundtripEval = 0, + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$python = Join-Path $root ".venv\Scripts\python.exe" +if (-not (Test-Path $python)) { + throw "Python venv not found at $python" +} + +if ([string]::IsNullOrWhiteSpace($RunId)) { + $RunId = "local3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$resolvedDataPath = if ([string]::IsNullOrWhiteSpace($DataPath)) { Join-Path $root "data\datasets\fineweb10B_sp1024" } else { (Resolve-Path $DataPath).Path } +$resolvedTokenizerPath = if ([string]::IsNullOrWhiteSpace($TokenizerPath)) { Join-Path $root "data\tokenizers\fineweb_1024_bpe.model" } else { (Resolve-Path $TokenizerPath).Path } + +$env:RUN_ID = $RunId +$env:DATA_PATH = $resolvedDataPath +$env:TOKENIZER_PATH = $resolvedTokenizerPath +$env:VOCAB_SIZE = $VocabSize.ToString() +$env:TIE_EMBEDDINGS = $TieEmbeddings.ToString() +$env:NUM_LAYERS = $NumLayers.ToString() +$env:NUM_UNIQUE_BLOCKS = $NumUniqueBlocks.ToString() +$env:MODEL_DIM = $ModelDim.ToString() +$env:EMBED_DIM = 
$EmbedDim.ToString() +$env:NUM_HEADS = $NumHeads.ToString() +$env:NUM_KV_HEADS = $NumKvHeads.ToString() +$env:MLP_MULT = $MlpMult.ToString() +$env:WINDOW_SIZE = $WindowSize.ToString() +$env:ENABLE_TORCH_COMPILE = "0" +$env:SDP_BACKEND = "math" +$env:INT8_AXIS_MODE = $Int8AxisMode +$env:INT8_RESIDUAL_RANK = $Int8ResidualRank.ToString() +$env:INT8_RESIDUAL_BUDGET_BYTES = $Int8ResidualBudgetBytes.ToString() +$env:TRAIN_BATCH_TOKENS = $TrainBatchTokens.ToString() +$env:ITERATIONS = $Iterations.ToString() +$env:VAL_BATCH_SIZE = $ValBatchSize.ToString() +$env:VAL_MAX_TOKENS = $ValMaxTokens.ToString() +$env:ROUNDTRIP_VAL_MAX_TOKENS = $RoundtripValMaxTokens.ToString() +$env:TRAIN_LOG_EVERY = $TrainLogEvery.ToString() +$env:VAL_LOSS_EVERY = $ValLossEvery.ToString() +$env:WARMUP_STEPS = $WarmupSteps.ToString() +$env:MAX_WALLCLOCK_SECONDS = $MaxWallclockSeconds.ToString() +$env:SAVE_RAW_CHECKPOINT = $SaveRawCheckpoint.ToString() +$env:FINAL_ROUNDTRIP_EVAL = $FinalRoundtripEval.ToString() +$env:COMPRESSION_REG_WEIGHT = $CompressionRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) +$env:COMPRESSION_REG_INTERVAL = $CompressionRegInterval.ToString() +$env:COMPRESSION_REG_WARMUP_STEPS = $CompressionRegWarmupSteps.ToString() +$env:COMPRESSION_REG_SAMPLE_TENSORS = $CompressionRegSampleTensors.ToString() +$env:COMPRESSION_REG_MAX_COLS = $CompressionRegMaxCols.ToString() +$env:COMPRESSION_GRID_REG_WEIGHT = $CompressionGridRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) +$env:COMPRESSION_RANK1_REG_WEIGHT = $CompressionRank1RegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) +$env:COMPRESSION_SCALE_REG_WEIGHT = $CompressionScaleRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) +$env:TERNARY_REG_WEIGHT = $TernaryRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) +$env:OUTLIER_REG_WEIGHT = $OutlierRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture) 
+$env:EVAL_CACHE_MIX_WEIGHT = $EvalCacheMixWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture)
+$env:EVAL_BIGRAM_MIX_WEIGHT = $EvalBigramMixWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture)
+$env:EVAL_CACHE_SIZE = $EvalCacheSize.ToString()
+
+$scriptPath = Join-Path $root "train_gpt.py"
+$stdoutPath = Join-Path $root "logs\$RunId.stdout.txt"
+$stderrPath = Join-Path $root "logs\$RunId.stderr.txt"
+New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null
+
+if ($Background) {  # Detached mode: start the trainer, print RUN_ID/PID/log paths for the caller, and return immediately.
+    $proc = Start-Process `
+        -FilePath $python `
+        -ArgumentList ('"{0}"' -f $scriptPath) `
+        -WorkingDirectory $root `
+        -RedirectStandardOutput $stdoutPath `
+        -RedirectStandardError $stderrPath `
+        -PassThru  # ArgumentList is wrapped in embedded double quotes because Start-Process does not quote values containing spaces.
+    Write-Output ("RUN_ID={0}" -f $RunId)
+    Write-Output ("PID={0}" -f $proc.Id)
+    Write-Output ("STDOUT={0}" -f $stdoutPath)
+    Write-Output ("STDERR={0}" -f $stderrPath)
+    exit 0
+}
+
+& $python $scriptPath; exit $LASTEXITCODE  # Foreground mode: forward the trainer's exit code; without an explicit exit, "powershell -File" reports 0 on normal termination and callers checking $LASTEXITCODE never see a failure.
diff --git a/scripts/run_local_baseline.ps1 b/scripts/run_local_baseline.ps1
new file mode 100644
index 0000000000..756dc97da5
--- /dev/null
+++ b/scripts/run_local_baseline.ps1
@@ -0,0 +1,50 @@
+param(
+    [string]$RunId = "",
+    [switch]$Background
+)
+
+$ErrorActionPreference = "Stop"
+
+$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path
+$launcher = Join-Path $root "scripts\run_local_3090.ps1"
+if (-not (Test-Path $launcher)) {
+    throw "Launcher not found at $launcher"
+}
+
+$args = @(
+    "-ExecutionPolicy", "Bypass",
+    "-File", $launcher,
+    "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "baseline3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }),
+    "-MaxWallclockSeconds", "180",
+    "-TrainBatchTokens", "32768",
+    "-ValBatchSize", "32768",
+    "-ValMaxTokens", "1048576",
+    "-RoundtripValMaxTokens", "524288",
+    "-TrainLogEvery", "10",
+    "-ValLossEvery", "25",
+    "-WarmupSteps", "0",
+    "-NumLayers", "12",
+    "-NumUniqueBlocks", "12",
+    "-ModelDim", "384",
+    "-EmbedDim", "0",
+    "-NumHeads", "6",
+    "-NumKvHeads", "3", +
"-MlpMult", "2", + "-CompressionRegWeight", "0", + "-CompressionRegInterval", "1", + "-CompressionRegWarmupSteps", "0", + "-CompressionRegSampleTensors", "0", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "0" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_baseline_roundtrip.ps1 b/scripts/run_local_baseline_roundtrip.ps1 new file mode 100644 index 0000000000..0973d523bd --- /dev/null +++ b/scripts/run_local_baseline_roundtrip.ps1 @@ -0,0 +1,51 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "baselinert3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", "0", + "-CompressionRegInterval", "1", + "-CompressionRegWarmupSteps", "0", + "-CompressionRegSampleTensors", "0", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args 
diff --git a/scripts/run_local_compress_roundtrip.ps1 b/scripts/run_local_compress_roundtrip.ps1 new file mode 100644 index 0000000000..df3e8eab00 --- /dev/null +++ b/scripts/run_local_compress_roundtrip.ps1 @@ -0,0 +1,51 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "compressrt3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_compress_roundtrip_sweep.ps1 b/scripts/run_local_compress_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..665d298871 --- /dev/null +++ b/scripts/run_local_compress_roundtrip_sweep.ps1 @@ -0,0 +1,70 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if 
(-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "compressrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "w0040_o0010"; CompressionRegWeight = "0.004"; OutlierRegWeight = "0.001" }, + @{ Suffix = "w0050_o0010"; CompressionRegWeight = "0.005"; OutlierRegWeight = "0.001" }, + @{ Suffix = "w0060_o0010"; CompressionRegWeight = "0.006"; OutlierRegWeight = "0.001" }, + @{ Suffix = "w0050_o0020"; CompressionRegWeight = "0.005"; OutlierRegWeight = "0.002" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} compression={2} outlier={3}" -f (Get-Date -Format s), $runId, $experiment.CompressionRegWeight, $experiment.OutlierRegWeight) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", $experiment.CompressionRegWeight, + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", $experiment.OutlierRegWeight, + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path 
$controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_compression.ps1 b/scripts/run_local_compression.ps1 new file mode 100644 index 0000000000..8ae0a53e1a --- /dev/null +++ b/scripts/run_local_compression.ps1 @@ -0,0 +1,51 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "compress3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "1048576", + "-RoundtripValMaxTokens", "524288", + "-TrainLogEvery", "10", + "-ValLossEvery", "25", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", "0.01", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "0" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_fixedstep_exportaware_sweep.ps1 b/scripts/run_local_fixedstep_exportaware_sweep.ps1 new file mode 100644 index 
0000000000..33ea5c99ec --- /dev/null +++ b/scripts/run_local_fixedstep_exportaware_sweep.ps1 @@ -0,0 +1,94 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "exportaware_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "g010_r000"; Grid = "0.10"; Rank1 = "0.0" }, + @{ Suffix = "g025_r000"; Grid = "0.25"; Rank1 = "0.0" }, + @{ Suffix = "g010_r050"; Grid = "0.10"; Rank1 = "0.5" }, + @{ Suffix = "g025_r050"; Grid = "0.25"; Rank1 = "0.5" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} grid={2} rank1={3}" -f (Get-Date -Format s), $runId, $experiment.Grid, $experiment.Rank1) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + 
"-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", $experiment.Grid, + "-CompressionRank1RegWeight", $experiment.Rank1, + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_fixedstep_grid_outlier_sweep.ps1 b/scripts/run_local_fixedstep_grid_outlier_sweep.ps1 new file mode 100644 index 0000000000..28f275a39a --- /dev/null +++ b/scripts/run_local_fixedstep_grid_outlier_sweep.ps1 @@ -0,0 +1,93 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "gridoutlier_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function 
Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "o00010"; Outlier = "0.00010" }, + @{ Suffix = "o00025"; Outlier = "0.00025" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} outlier={2}" -f (Get-Date -Format s), $runId, $experiment.Outlier) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", $experiment.Outlier, + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path 
$controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_fixedstep_grid_refine_sweep.ps1 b/scripts/run_local_fixedstep_grid_refine_sweep.ps1 new file mode 100644 index 0000000000..ea11195acc --- /dev/null +++ b/scripts/run_local_fixedstep_grid_refine_sweep.ps1 @@ -0,0 +1,93 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "gridrefine_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "g0080"; Grid = "0.08" }, + @{ Suffix = "g0120"; Grid = "0.12" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} grid={2}" -f (Get-Date -Format s), $runId, $experiment.Grid) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + 
"-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", $experiment.Grid, + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_fixedstep_highcap_dense_sweep.ps1 b/scripts/run_local_fixedstep_highcap_dense_sweep.ps1 new file mode 100644 index 0000000000..c2cc9945b1 --- /dev/null +++ b/scripts/run_local_fixedstep_highcap_dense_sweep.ps1 @@ -0,0 +1,124 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) 
{
+    throw "Launcher not found at $launcher"
+}
+
+if ([string]::IsNullOrWhiteSpace($SweepId)) {
+    $SweepId = "highcapdense_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss")
+}
+
+$controllerLog = Join-Path $root "logs\$SweepId.controller.txt"
+New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null
+
+function Test-RoundtripRunComplete {  # Returns $true when the run log exists and contains a line starting with the final roundtrip metric tag.
+    param(
+        [string]$LogPath
+    )
+
+    if (-not (Test-Path $LogPath)) {
+        return $false
+    }
+
+    return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet)
+}
+
+function Start-LauncherAndWait {  # Starts the launcher with -Background, reads the "PID=<n>" line it prints, and blocks until that process exits.
+    param(
+        [string[]]$LauncherArgs
+    )
+
+    $output = & powershell @($LauncherArgs + "-Background")
+    if ($LASTEXITCODE -ne 0) {
+        throw "Launcher failed before background start"
+    }
+    $pidLine = $output | Where-Object { $_ -match '^PID=' } | Select-Object -First 1
+    if (-not $pidLine) {
+        throw "Launcher did not report a PID"
+    }
+    $launchedPid = [int]($pidLine -replace '^PID=', '')  # Must not be named $pid: $PID is a constant automatic variable and assigning to it throws.
+    try {
+        Wait-Process -Id $launchedPid -ErrorAction Stop
+    } catch {
+        $proc = Get-Process -Id $launchedPid -ErrorAction SilentlyContinue  # A process that already exited is not an error; rethrow only if it is still alive.
+        if ($proc) {
+            throw
+        }
+    }
+}
+
+$experiments = @(
+    @{ Suffix = "w608_l12"; Label = "width_14p4MB"; Layers = "12"; Dim = "608"; Heads = "8"; KvHeads = "4" },
+    @{ Suffix = "w624_l12"; Label = "width_15p0MB"; Layers = "12"; Dim = "624"; Heads = "8"; KvHeads = "4" },
+    @{ Suffix = "d576_l14"; Label = "depth_15p0MB"; Layers = "14"; Dim = "576"; Heads = "8"; KvHeads = "4" },
+    @{ Suffix = "w640_l12"; Label = "width_15p7MB"; Layers = "12"; Dim = "640"; Heads = "8"; KvHeads = "4" }
+)
+
+foreach ($experiment in $experiments) {
+    $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix
+    $runLog = Join-Path $root "logs\$runId.txt"
+    if (Test-RoundtripRunComplete -LogPath $runLog) {  # Resume support: skip runs whose log already holds the final metric.
+        Add-Content -Path $controllerLog -Value ("[{0}] SKIP {1} already_complete" -f (Get-Date -Format s), $runId)
+        continue
+    }
+    Add-Content -Path $controllerLog -Value ("[{0}] START {1} label={2} layers={3} dim={4} heads={5}
kv={6}" -f (Get-Date -Format s), $runId, $experiment.Label, $experiment.Layers, $experiment.Dim, $experiment.Heads, $experiment.KvHeads) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", $experiment.Layers, + "-NumUniqueBlocks", $experiment.Layers, + "-ModelDim", $experiment.Dim, + "-EmbedDim", "0", + "-NumHeads", $experiment.Heads, + "-NumKvHeads", $experiment.KvHeads, + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + try { + Start-LauncherAndWait -LauncherArgs $args + } catch { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git 
a/scripts/run_local_fixedstep_isobyte_sweep.ps1 b/scripts/run_local_fixedstep_isobyte_sweep.ps1 new file mode 100644 index 0000000000..7e79329ac9 --- /dev/null +++ b/scripts/run_local_fixedstep_isobyte_sweep.ps1 @@ -0,0 +1,95 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "isobyte_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "b10"; Target = "10MB"; Layers = "12"; Dim = "480"; Heads = "8"; KvHeads = "4" }, + @{ Suffix = "b12"; Target = "12MB"; Layers = "12"; Dim = "528"; Heads = "12"; KvHeads = "6" }, + @{ Suffix = "b14"; Target = "14MB"; Layers = "12"; Dim = "576"; Heads = "12"; KvHeads = "6" }, + @{ Suffix = "b155"; Target = "15.5MB"; Layers = "12"; Dim = "592"; Heads = "8"; KvHeads = "4" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} target={2} layers={3} dim={4} heads={5} kv={6}" -f (Get-Date -Format s), $runId, $experiment.Target, $experiment.Layers, $experiment.Dim, $experiment.Heads, $experiment.KvHeads) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + 
"-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", $experiment.Layers, + "-NumUniqueBlocks", $experiment.Layers, + "-ModelDim", $experiment.Dim, + "-EmbedDim", "0", + "-NumHeads", $experiment.Heads, + "-NumKvHeads", $experiment.KvHeads, + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_fixedstep_roundtrip_sweep.ps1 b/scripts/run_local_fixedstep_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..e99eab5fb6 --- /dev/null +++ b/scripts/run_local_fixedstep_roundtrip_sweep.ps1 @@ -0,0 +1,92 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = 
Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "fixedsteprtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "base_a"; CacheMix = "0.0"; BigramMix = "0.0"; CacheSize = "0" }, + @{ Suffix = "side_a"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" }, + @{ Suffix = "base_b"; CacheMix = "0.0"; BigramMix = "0.0"; CacheSize = "0" }, + @{ Suffix = "side_b"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} cache={2} bigram={3} size={4}" -f (Get-Date -Format s), $runId, $experiment.CacheMix, $experiment.BigramMix, $experiment.CacheSize) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + 
"-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", $experiment.CacheMix, + "-EvalBigramMixWeight", $experiment.BigramMix, + "-EvalCacheSize", $experiment.CacheSize, + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_fixedstep_scaleaware_sweep.ps1 b/scripts/run_local_fixedstep_scaleaware_sweep.ps1 new file mode 100644 index 0000000000..6842246f3d --- /dev/null +++ b/scripts/run_local_fixedstep_scaleaware_sweep.ps1 @@ -0,0 +1,93 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "scaleaware_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern 
'^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "g010_s0010"; Grid = "0.10"; Scale = "0.0010" }, + @{ Suffix = "g010_s0025"; Grid = "0.10"; Scale = "0.0025" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} grid={2} scale={3}" -f (Get-Date -Format s), $runId, $experiment.Grid, $experiment.Scale) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", $experiment.Grid, + "-CompressionScaleRegWeight", $experiment.Scale, + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw 
"Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_proxy_sweep.ps1 b/scripts/run_local_proxy_sweep.ps1 new file mode 100644 index 0000000000..d94d183614 --- /dev/null +++ b/scripts/run_local_proxy_sweep.ps1 @@ -0,0 +1,134 @@ +param( + [string]$Only = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +$summarizer = Join-Path $root "scripts\summarize_log.py" +$python = Join-Path $root ".venv\Scripts\python.exe" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ($Background -and [string]::IsNullOrWhiteSpace($Only)) { + throw "Background mode requires -Only so we do not start multiple GPU jobs at once." 
+} + +$stamp = Get-Date -Format "yyyyMMdd_HHmmss" +$experiments = @( + @{ + Name = "proxy_qat_s" + MaxWallclockSeconds = 180 + TrainBatchTokens = 32768 + ValBatchSize = 32768 + ValMaxTokens = 1048576 + RoundtripValMaxTokens = 524288 + ValLossEvery = 25 + NumLayers = 12 + NumUniqueBlocks = 3 + ModelDim = 384 + EmbedDim = 192 + NumHeads = 6 + NumKvHeads = 3 + CompressionRegWeight = 0.01 + TernaryRegWeight = 0.1 + EvalCacheMixWeight = 0.03 + }, + @{ + Name = "proxy_factored_m" + MaxWallclockSeconds = 240 + TrainBatchTokens = 49152 + ValBatchSize = 49152 + ValMaxTokens = 2097152 + RoundtripValMaxTokens = 1048576 + ValLossEvery = 30 + NumLayers = 14 + NumUniqueBlocks = 4 + ModelDim = 448 + EmbedDim = 224 + NumHeads = 8 + NumKvHeads = 4 + CompressionRegWeight = 0.015 + TernaryRegWeight = 0.12 + EvalCacheMixWeight = 0.04 + }, + @{ + Name = "proxy_qat_l" + MaxWallclockSeconds = 300 + TrainBatchTokens = 65536 + ValBatchSize = 65536 + ValMaxTokens = 4194304 + RoundtripValMaxTokens = 2097152 + ValLossEvery = 40 + NumLayers = 16 + NumUniqueBlocks = 4 + ModelDim = 512 + EmbedDim = 0 + NumHeads = 8 + NumKvHeads = 4 + CompressionRegWeight = 0.02 + TernaryRegWeight = 0.15 + EvalCacheMixWeight = 0.05 + } +) + +if (-not [string]::IsNullOrWhiteSpace($Only)) { + $experiments = @($experiments | Where-Object { $_.Name -eq $Only }) + if ($experiments.Count -eq 0) { + throw "No sweep preset matched -Only '$Only'" + } +} + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $experiment.Name, $stamp + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", $experiment.MaxWallclockSeconds.ToString(), + "-TrainBatchTokens", $experiment.TrainBatchTokens.ToString(), + "-ValBatchSize", $experiment.ValBatchSize.ToString(), + "-ValMaxTokens", $experiment.ValMaxTokens.ToString(), + "-RoundtripValMaxTokens", $experiment.RoundtripValMaxTokens.ToString(), + "-TrainLogEvery", "10", + "-ValLossEvery", 
$experiment.ValLossEvery.ToString(), + "-WarmupSteps", "0", + "-NumLayers", $experiment.NumLayers.ToString(), + "-NumUniqueBlocks", $experiment.NumUniqueBlocks.ToString(), + "-ModelDim", $experiment.ModelDim.ToString(), + "-EmbedDim", $experiment.EmbedDim.ToString(), + "-NumHeads", $experiment.NumHeads.ToString(), + "-NumKvHeads", $experiment.NumKvHeads.ToString(), + "-MlpMult", "2", + "-CompressionRegWeight", $experiment.CompressionRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture), + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "8", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", $experiment.TernaryRegWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture), + "-OutlierRegWeight", "0.01", + "-EvalCacheMixWeight", $experiment.EvalCacheMixWeight.ToString([System.Globalization.CultureInfo]::InvariantCulture), + "-EvalCacheSize", "8", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "0" + ) + + if ($Background) { + $args += "-Background" + } + + Write-Output ("Launching {0}" -f $runId) + & powershell @args + + if ($Background) { + break + } + + $logPath = Join-Path $root ("logs\{0}.txt" -f $runId) + if ((Test-Path $python) -and (Test-Path $summarizer) -and (Test-Path $logPath)) { + & $python $summarizer $logPath + } +} diff --git a/scripts/run_local_qat_roundtrip_sweep.ps1 b/scripts/run_local_qat_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..2779bf7f64 --- /dev/null +++ b/scripts/run_local_qat_roundtrip_sweep.ps1 @@ -0,0 +1,72 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "qatrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root 
"logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "w0045_o0000"; CompressionRegWeight = "0.0045"; OutlierRegWeight = "0.0" }, + @{ Suffix = "w0050_o0000"; CompressionRegWeight = "0.0050"; OutlierRegWeight = "0.0" }, + @{ Suffix = "w0055_o0000"; CompressionRegWeight = "0.0055"; OutlierRegWeight = "0.0" }, + @{ Suffix = "w0050_o00025"; CompressionRegWeight = "0.0050"; OutlierRegWeight = "0.00025" }, + @{ Suffix = "w0050_o00050"; CompressionRegWeight = "0.0050"; OutlierRegWeight = "0.0005" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} compression={2} outlier={3}" -f (Get-Date -Format s), $runId, $experiment.CompressionRegWeight, $experiment.OutlierRegWeight) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-CompressionRegWeight", $experiment.CompressionRegWeight, + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", $experiment.OutlierRegWeight, + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + 
Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_recurrent_roundtrip_sweep.ps1 b/scripts/run_local_recurrent_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..1ab2cea3af --- /dev/null +++ b/scripts/run_local_recurrent_roundtrip_sweep.ps1 @@ -0,0 +1,71 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "recurtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "l16_u8_e000"; NumLayers = "16"; NumUniqueBlocks = "8"; EmbedDim = "0" }, + @{ Suffix = "l18_u6_e000"; NumLayers = "18"; NumUniqueBlocks = "6"; EmbedDim = "0" }, + @{ Suffix = "l16_u8_e256"; NumLayers = "16"; NumUniqueBlocks = "8"; EmbedDim = "256" }, + @{ Suffix = "l18_u6_e256"; NumLayers = "18"; NumUniqueBlocks = "6"; EmbedDim = "256" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} layers={2} unique={3} embed={4}" -f (Get-Date -Format s), $runId, $experiment.NumLayers, $experiment.NumUniqueBlocks, $experiment.EmbedDim) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", 
$experiment.NumLayers, + "-NumUniqueBlocks", $experiment.NumUniqueBlocks, + "-ModelDim", "384", + "-EmbedDim", $experiment.EmbedDim, + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_repeatability_roundtrip_sweep.ps1 b/scripts/run_local_repeatability_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..06b90dfcac --- /dev/null +++ b/scripts/run_local_repeatability_roundtrip_sweep.ps1 @@ -0,0 +1,98 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "repeatrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + $finalExact = Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' 
-Quiet + if ($finalExact) { + return $true + } + + return $false +} + +$experiments = @( + @{ Suffix = "base_a"; CacheMix = "0.0"; BigramMix = "0.0"; CacheSize = "0" }, + @{ Suffix = "side_a"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" }, + @{ Suffix = "base_b"; CacheMix = "0.0"; BigramMix = "0.0"; CacheSize = "0" }, + @{ Suffix = "side_b"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" }, + @{ Suffix = "base_c"; CacheMix = "0.0"; BigramMix = "0.0"; CacheSize = "0" }, + @{ Suffix = "side_c"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} cache={2} bigram={3} size={4}" -f (Get-Date -Format s), $runId, $experiment.CacheMix, $experiment.BigramMix, $experiment.CacheSize) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", $experiment.CacheMix, + "-EvalBigramMixWeight", $experiment.BigramMix, + "-EvalCacheSize", $experiment.CacheSize, + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + 
Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_residual_roundtrip_sweep.ps1 b/scripts/run_local_residual_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..c563dddde0 --- /dev/null +++ b/scripts/run_local_residual_roundtrip_sweep.ps1 @@ -0,0 +1,75 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "residualrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "r0_b000000"; Rank = "0"; Budget = "0" }, + @{ Suffix = "r1_b065536"; Rank = "1"; Budget = "65536" }, + @{ Suffix = "r1_b262144"; Rank = "1"; Budget = "262144" }, + @{ Suffix = "r1_b524288"; Rank = "1"; Budget = "524288" }, + @{ Suffix = "r1_b1048576"; Rank = "1"; Budget = "1048576" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} rank={2} residual_budget_bytes={3}" -f (Get-Date -Format s), $runId, $experiment.Rank, $experiment.Budget) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", 
$launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", $experiment.Rank, + "-Int8ResidualBudgetBytes", $experiment.Budget, + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_sidecar.ps1 b/scripts/run_local_sidecar.ps1 new file mode 100644 index 0000000000..1986869c22 --- /dev/null +++ b/scripts/run_local_sidecar.ps1 @@ -0,0 +1,51 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "sidecar3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { 
$RunId }), + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "1048576", + "-RoundtripValMaxTokens", "524288", + "-TrainLogEvery", "10", + "-ValLossEvery", "25", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", "0", + "-CompressionRegInterval", "1", + "-CompressionRegWarmupSteps", "0", + "-CompressionRegSampleTensors", "0", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0.03", + "-EvalBigramMixWeight", "0.05", + "-EvalCacheSize", "16", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "0" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_sidecar_refine_roundtrip_sweep.ps1 b/scripts/run_local_sidecar_refine_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..d2d552d693 --- /dev/null +++ b/scripts/run_local_sidecar_refine_roundtrip_sweep.ps1 @@ -0,0 +1,75 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "sidecarrefine_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "c0018_b0030_s8"; CacheMix = "0.018"; BigramMix = "0.030"; CacheSize = "8" }, + @{ Suffix = "c0020_b0028_s8"; CacheMix = "0.020"; BigramMix = "0.028"; CacheSize = "8" }, + @{ Suffix = "c0020_b0030_s8"; CacheMix = "0.020"; BigramMix = "0.030"; CacheSize = "8" }, + @{ Suffix = "c0020_b0032_s8"; CacheMix = 
"0.020"; BigramMix = "0.032"; CacheSize = "8" }, + @{ Suffix = "c0022_b0030_s8"; CacheMix = "0.022"; BigramMix = "0.030"; CacheSize = "8" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} cache={2} bigram={3} size={4}" -f (Get-Date -Format s), $runId, $experiment.CacheMix, $experiment.BigramMix, $experiment.CacheSize) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", $experiment.CacheMix, + "-EvalBigramMixWeight", $experiment.BigramMix, + "-EvalCacheSize", $experiment.CacheSize, + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_sidecar_roundtrip_sweep.ps1 b/scripts/run_local_sidecar_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..20cdb12c77 
--- /dev/null +++ b/scripts/run_local_sidecar_roundtrip_sweep.ps1 @@ -0,0 +1,71 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "sidecarrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "c0015_b0020_s8"; CacheMix = "0.015"; BigramMix = "0.02"; CacheSize = "8" }, + @{ Suffix = "c0020_b0020_s8"; CacheMix = "0.02"; BigramMix = "0.02"; CacheSize = "8" }, + @{ Suffix = "c0020_b0030_s8"; CacheMix = "0.02"; BigramMix = "0.03"; CacheSize = "8" }, + @{ Suffix = "c0020_b0020_s16"; CacheMix = "0.02"; BigramMix = "0.02"; CacheSize = "16" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} cache={2} bigram={3} size={4}" -f (Get-Date -Format s), $runId, $experiment.CacheMix, $experiment.BigramMix, $experiment.CacheSize) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", 
"0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", $experiment.CacheMix, + "-EvalBigramMixWeight", $experiment.BigramMix, + "-EvalCacheSize", $experiment.CacheSize, + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_smoke.ps1 b/scripts/run_local_smoke.ps1 new file mode 100644 index 0000000000..8a012e1f52 --- /dev/null +++ b/scripts/run_local_smoke.ps1 @@ -0,0 +1,50 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "smoke3090_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "1048576", + "-RoundtripValMaxTokens", "524288", + "-TrainLogEvery", "10", + "-ValLossEvery", "25", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "3", + "-ModelDim", "384", + "-EmbedDim", "192", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-CompressionRegWeight", "0.01", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "8", + "-CompressionRegSampleTensors", "3", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", "0.1", + "-OutlierRegWeight", "0.01", + "-EvalCacheMixWeight", "0.03", + "-EvalCacheSize", "8", + 
"-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "0" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_sp1024_subset_current_best.ps1 b/scripts/run_local_sp1024_subset_current_best.ps1 new file mode 100644 index 0000000000..b47425880e --- /dev/null +++ b/scripts/run_local_sp1024_subset_current_best.ps1 @@ -0,0 +1,68 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$dataPath = Join-Path $root "data\datasets\fineweb10B_sp1024_local120k" +$tokenizerPath = Join-Path $root "data\tokenizers\fineweb_1024_bpe.model" + +if (-not (Test-Path $dataPath)) { + throw "SP1024 local subset dataset not found at $dataPath" +} +if (-not (Test-Path $tokenizerPath)) { + throw "SP1024 tokenizer not found at $tokenizerPath" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", (Join-Path $root "scripts\run_local_3090.ps1"), + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "sp1024subsetbest_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-DataPath", $dataPath, + "-TokenizerPath", $tokenizerPath, + "-VocabSize", "1024", + "-TieEmbeddings", "1", + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "14", + "-NumUniqueBlocks", "14", + "-ModelDim", "576", + "-EmbedDim", "0", + "-NumHeads", "8", + "-NumKvHeads", "4", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + 
"-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_sp4096_current_best.ps1 b/scripts/run_local_sp4096_current_best.ps1 new file mode 100644 index 0000000000..ab4a79a966 --- /dev/null +++ b/scripts/run_local_sp4096_current_best.ps1 @@ -0,0 +1,68 @@ +param( + [string]$RunId = "", + [switch]$Background +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$dataPath = Join-Path $root "data\datasets\fineweb10B_sp4096_local" +$tokenizerPath = Join-Path $root "data\tokenizers\fineweb_4096_bpe.model" + +if (-not (Test-Path $dataPath)) { + throw "SP4096 local dataset not found at $dataPath" +} +if (-not (Test-Path $tokenizerPath)) { + throw "SP4096 tokenizer not found at $tokenizerPath" +} + +$args = @( + "-ExecutionPolicy", "Bypass", + "-File", (Join-Path $root "scripts\run_local_3090.ps1"), + "-RunId", $(if ([string]::IsNullOrWhiteSpace($RunId)) { "sp4096best_" + (Get-Date -Format "yyyyMMdd_HHmmss") } else { $RunId }), + "-DataPath", $dataPath, + "-TokenizerPath", $tokenizerPath, + "-VocabSize", "4096", + "-TieEmbeddings", "1", + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "14", + "-NumUniqueBlocks", "14", + "-ModelDim", "576", + "-EmbedDim", "0", + "-NumHeads", "8", + "-NumKvHeads", "4", + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + 
"-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" +) + +if ($Background) { + $args += "-Background" +} + +& powershell @args diff --git a/scripts/run_local_sp4096_frontier_refine_sweep.ps1 b/scripts/run_local_sp4096_frontier_refine_sweep.ps1 new file mode 100644 index 0000000000..bdc91d3e74 --- /dev/null +++ b/scripts/run_local_sp4096_frontier_refine_sweep.ps1 @@ -0,0 +1,106 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +$dataPath = Join-Path $root "data\datasets\fineweb10B_sp4096_local" +$tokenizerPath = Join-Path $root "data\tokenizers\fineweb_4096_bpe.model" + +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} +if (-not (Test-Path $dataPath)) { + throw "SP4096 local dataset not found at $dataPath" +} +if (-not (Test-Path $tokenizerPath)) { + throw "SP4096 tokenizer not found at $tokenizerPath" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "sp4096frontier_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "l15_d528"; Layers = "15"; Dim = "528"; Heads = "8"; 
KvHeads = "4"; Note = "deeper_narrower_under_cap" }, + @{ Suffix = "l16_d512"; Layers = "16"; Dim = "512"; Heads = "8"; KvHeads = "4"; Note = "deepest_narrow_under_cap" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} note={2} layers={3} dim={4} heads={5} kv={6}" -f (Get-Date -Format s), $runId, $experiment.Note, $experiment.Layers, $experiment.Dim, $experiment.Heads, $experiment.KvHeads) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-DataPath", $dataPath, + "-TokenizerPath", $tokenizerPath, + "-VocabSize", "4096", + "-TieEmbeddings", "1", + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", $experiment.Layers, + "-NumUniqueBlocks", $experiment.Layers, + "-ModelDim", $experiment.Dim, + "-EmbedDim", "0", + "-NumHeads", $experiment.Heads, + "-NumKvHeads", $experiment.KvHeads, + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw 
"Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_sp4096_isobyte_sweep.ps1 b/scripts/run_local_sp4096_isobyte_sweep.ps1 new file mode 100644 index 0000000000..5ee0848765 --- /dev/null +++ b/scripts/run_local_sp4096_isobyte_sweep.ps1 @@ -0,0 +1,107 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +$dataPath = Join-Path $root "data\datasets\fineweb10B_sp4096_local" +$tokenizerPath = Join-Path $root "data\tokenizers\fineweb_4096_bpe.model" + +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} +if (-not (Test-Path $dataPath)) { + throw "SP4096 local dataset not found at $dataPath" +} +if (-not (Test-Path $tokenizerPath)) { + throw "SP4096 tokenizer not found at $tokenizerPath" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "sp4096isobyte_fixedstep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +function Test-RoundtripRunComplete { + param( + [string]$LogPath + ) + + if (-not (Test-Path $LogPath)) { + return $false + } + + return [bool](Select-String -Path $LogPath -Pattern '^final_int8_zlib_roundtrip_exact ' -Quiet) +} + +$experiments = @( + @{ Suffix = "l15_d544"; Layers = "15"; Dim = "544"; Heads = "8"; KvHeads = "4"; Note = "depth_lean_near_cap" }, + @{ Suffix = "l14_d560"; Layers = "14"; 
Dim = "560"; Heads = "8"; KvHeads = "4"; Note = "trimmed_width_control" }, + @{ Suffix = "l12_d608"; Layers = "12"; Dim = "608"; Heads = "8"; KvHeads = "4"; Note = "width_lean_near_cap" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + $runLog = Join-Path $root "logs\$runId.txt" + Add-Content -Path $controllerLog -Value ("[{0}] START {1} note={2} layers={3} dim={4} heads={5} kv={6}" -f (Get-Date -Format s), $runId, $experiment.Note, $experiment.Layers, $experiment.Dim, $experiment.Heads, $experiment.KvHeads) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-DataPath", $dataPath, + "-TokenizerPath", $tokenizerPath, + "-VocabSize", "4096", + "-TieEmbeddings", "1", + "-MaxWallclockSeconds", "0", + "-Iterations", "300", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", "524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", $experiment.Layers, + "-NumUniqueBlocks", $experiment.Layers, + "-ModelDim", $experiment.Dim, + "-EmbedDim", "0", + "-NumHeads", $experiment.Heads, + "-NumKvHeads", $experiment.KvHeads, + "-MlpMult", "2", + "-WindowSize", "0", + "-Int8AxisMode", "auto", + "-Int8ResidualRank", "1", + "-Int8ResidualBudgetBytes", "65536", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-CompressionGridRegWeight", "0.10", + "-CompressionScaleRegWeight", "0.0", + "-CompressionRank1RegWeight", "0.0", + "-TernaryRegWeight", "0", + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), 
$runId) + throw "Sweep run failed: $runId" + } + if (-not (Test-RoundtripRunComplete -LogPath $runLog)) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1} missing_final_roundtrip_metric" -f (Get-Date -Format s), $runId) + throw "Sweep run missing final roundtrip metric: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_local_ternary_roundtrip_sweep.ps1 b/scripts/run_local_ternary_roundtrip_sweep.ps1 new file mode 100644 index 0000000000..0d7a64f35e --- /dev/null +++ b/scripts/run_local_ternary_roundtrip_sweep.ps1 @@ -0,0 +1,72 @@ +param( + [string]$SweepId = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$launcher = Join-Path $root "scripts\run_local_3090.ps1" +if (-not (Test-Path $launcher)) { + throw "Launcher not found at $launcher" +} + +if ([string]::IsNullOrWhiteSpace($SweepId)) { + $SweepId = "ternaryrtsweep_" + (Get-Date -Format "yyyyMMdd_HHmmss") +} + +$controllerLog = Join-Path $root "logs\$SweepId.controller.txt" +New-Item -ItemType Directory -Force -Path (Join-Path $root "logs") | Out-Null + +$experiments = @( + @{ Suffix = "t0005"; TernaryRegWeight = "0.0005" }, + @{ Suffix = "t0010"; TernaryRegWeight = "0.0010" }, + @{ Suffix = "t0020"; TernaryRegWeight = "0.0020" }, + @{ Suffix = "t0035"; TernaryRegWeight = "0.0035" }, + @{ Suffix = "t0050"; TernaryRegWeight = "0.0050" } +) + +foreach ($experiment in $experiments) { + $runId = "{0}_{1}" -f $SweepId, $experiment.Suffix + Add-Content -Path $controllerLog -Value ("[{0}] START {1} ternary={2}" -f (Get-Date -Format s), $runId, $experiment.TernaryRegWeight) + $args = @( + "-ExecutionPolicy", "Bypass", + "-File", $launcher, + "-RunId", $runId, + "-MaxWallclockSeconds", "180", + "-TrainBatchTokens", "32768", + "-ValBatchSize", "32768", + "-ValMaxTokens", 
"524288", + "-RoundtripValMaxTokens", "262144", + "-TrainLogEvery", "10", + "-ValLossEvery", "50", + "-WarmupSteps", "0", + "-NumLayers", "12", + "-NumUniqueBlocks", "12", + "-ModelDim", "384", + "-EmbedDim", "0", + "-NumHeads", "6", + "-NumKvHeads", "3", + "-MlpMult", "2", + "-WindowSize", "0", + "-CompressionRegWeight", "0.005", + "-CompressionRegInterval", "4", + "-CompressionRegWarmupSteps", "32", + "-CompressionRegSampleTensors", "4", + "-CompressionRegMaxCols", "128", + "-TernaryRegWeight", $experiment.TernaryRegWeight, + "-OutlierRegWeight", "0", + "-EvalCacheMixWeight", "0", + "-EvalBigramMixWeight", "0", + "-EvalCacheSize", "0", + "-SaveRawCheckpoint", "0", + "-FinalRoundtripEval", "1" + ) + & powershell @args + if ($LASTEXITCODE -ne 0) { + Add-Content -Path $controllerLog -Value ("[{0}] FAIL {1}" -f (Get-Date -Format s), $runId) + throw "Sweep run failed: $runId" + } + Add-Content -Path $controllerLog -Value ("[{0}] DONE {1}" -f (Get-Date -Format s), $runId) +} + +Add-Content -Path $controllerLog -Value ("[{0}] SWEEP_DONE {1}" -f (Get-Date -Format s), $SweepId) diff --git a/scripts/run_monitor.ps1 b/scripts/run_monitor.ps1 new file mode 100644 index 0000000000..2d99b70128 --- /dev/null +++ b/scripts/run_monitor.ps1 @@ -0,0 +1,49 @@ +param( + [string]$LogPath = "" +) + +$ErrorActionPreference = "Stop" + +$root = (Resolve-Path (Join-Path $PSScriptRoot "..")).Path +$projectDir = Join-Path $root "tools\RunMonitor" +$project = Join-Path $projectDir "RunMonitor.csproj" +$exe = Join-Path $projectDir "bin\Debug\net9.0-windows\RunMonitor.exe" + +if (-not (Test-Path $project)) { + throw "RunMonitor project not found at $project" +} + +Get-Process -Name "RunMonitor" -ErrorAction SilentlyContinue | Stop-Process -Force + +$needsBuild = -not (Test-Path $exe) +if (-not $needsBuild) { + $latestSource = Get-ChildItem -Path $projectDir -Recurse -File -Include *.cs,*.xaml,*.csproj | + Sort-Object LastWriteTimeUtc -Descending | + Select-Object -First 1 + if ($null -ne 
$latestSource) { + $needsBuild = $latestSource.LastWriteTimeUtc -gt (Get-Item $exe).LastWriteTimeUtc + } +} + +if ($needsBuild) { + & dotnet build $project | Out-Host + if ($LASTEXITCODE -ne 0) { + throw "RunMonitor build failed." + } +} + +$arguments = @() +if (-not [string]::IsNullOrWhiteSpace($LogPath)) { + if (-not (Test-Path $LogPath)) { + throw "Log path not found: $LogPath" + } + $arguments += (Resolve-Path $LogPath).Path +} + +$process = if ($arguments.Count -gt 0) { + Start-Process -FilePath $exe -ArgumentList $arguments -WorkingDirectory $root -PassThru +} +else { + Start-Process -FilePath $exe -WorkingDirectory $root -PassThru +} +Write-Output ("RunMonitor PID={0}" -f $process.Id) diff --git a/scripts/summarize_log.py b/scripts/summarize_log.py new file mode 100644 index 0000000000..1b3c01176c --- /dev/null +++ b/scripts/summarize_log.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import argparse +import re +from pathlib import Path + + +TRAIN_RE = re.compile( + r"step:(?P<step>\d+)/(?P<iters>\d+) train_loss:(?P<train_loss>[0-9.]+) " + r"train_time:(?P<train_time_ms>[0-9.]+)ms step_avg:(?P<step_avg_ms>[0-9.]+)ms" +) +VAL_RE = re.compile( + r"step:(?P<step>\d+)/(?P<iters>\d+) val_loss:(?P<val_loss>[0-9.]+) val_bpb:(?P<val_bpb>[0-9.]+) " + r"train_time:(?P<train_time_ms>[0-9.]+)ms step_avg:(?P<step_avg_ms>[0-9.]+)ms" +) +FINAL_RE = re.compile( + r"final_int8_zlib_roundtrip_exact val_loss:(?P<val_loss>[0-9.]+) val_bpb:(?P<val_bpb>[0-9.]+)" +) +SIZE_RE = re.compile(r"Total submission size int8\+zlib: (?P<bytes_total>\d+) bytes") + + +def latest_match(pattern: re.Pattern[str], lines: list[str]) -> dict[str, str] | None: + for line in reversed(lines): + match = pattern.search(line) + if match is not None: + return match.groupdict() + return None + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("log_path") + args = parser.parse_args() + + path = Path(args.log_path).expanduser().resolve() + lines = path.read_text(encoding="utf-8").splitlines() + + train = latest_match(TRAIN_RE, lines) + val = latest_match(VAL_RE, lines) + final = latest_match(FINAL_RE,
lines) + size = latest_match(SIZE_RE, lines) + + print(f"log={path}") + if train is not None: + print( + "latest_train " + f"step={train['step']}/{train['iters']} " + f"loss={train['train_loss']} step_avg_ms={train['step_avg_ms']} " + f"train_time_ms={train['train_time_ms']}" + ) + if val is not None: + print( + "latest_val " + f"step={val['step']}/{val['iters']} " + f"loss={val['val_loss']} bpb={val['val_bpb']} " + f"step_avg_ms={val['step_avg_ms']} train_time_ms={val['train_time_ms']}" + ) + if final is not None: + print(f"final_int8 val_loss={final['val_loss']} val_bpb={final['val_bpb']}") + if size is not None: + print(f"artifact_bytes_total={size['bytes_total']}") + if train is None and val is None and final is None: + print("no_step_metrics_found") + + +if __name__ == "__main__": + main() diff --git a/scripts/upload_when_ready.sh b/scripts/upload_when_ready.sh new file mode 100644 index 0000000000..570ad0a120 --- /dev/null +++ b/scripts/upload_when_ready.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -lt 2 || $# -gt 3 ]]; then + echo "usage: $0 <src_root> <dest_root> [poll_seconds]" >&2 + exit 1 +fi + +src_root=$1 +dest_root=$2 +poll_seconds=${3:-120} +manifest_path="${src_root%/}/manifest.json" + +if [[ "$poll_seconds" -le 0 ]]; then + echo "poll_seconds must be positive" >&2 + exit 1 +fi + +while [[ !
-f "$manifest_path" ]]; do + sleep "$poll_seconds" +done + +bbb cptree "$src_root" "$dest_root" diff --git a/tools/RunMonitor/App.xaml b/tools/RunMonitor/App.xaml new file mode 100644 index 0000000000..28ed9e15cc --- /dev/null +++ b/tools/RunMonitor/App.xaml @@ -0,0 +1,5 @@ + + diff --git a/tools/RunMonitor/App.xaml.cs b/tools/RunMonitor/App.xaml.cs new file mode 100644 index 0000000000..0bc7707cf0 --- /dev/null +++ b/tools/RunMonitor/App.xaml.cs @@ -0,0 +1,7 @@ +using System.Windows; + +namespace RunMonitor; + +public partial class App : Application +{ +} diff --git a/tools/RunMonitor/MainWindow.xaml b/tools/RunMonitor/MainWindow.xaml new file mode 100644 index 0000000000..ad734abb3f --- /dev/null +++ b/tools/RunMonitor/MainWindow.xaml @@ -0,0 +1,407 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/RunMonitor/MainWindow.xaml.cs b/tools/RunMonitor/MainWindow.xaml.cs new file mode 100644 index 0000000000..d31e7406ee --- /dev/null +++ b/tools/RunMonitor/MainWindow.xaml.cs @@ -0,0 +1,1318 @@ +using System.Diagnostics; +using System.Globalization; +using System.IO; +using System.Linq; +using System.Text.RegularExpressions; +using System.Windows; +using System.Windows.Controls; +using System.Windows.Threading; +using System.Windows.Media; +using Microsoft.Win32; + +namespace RunMonitor; + +public partial class MainWindow : Window +{ + private static readonly Regex ConfigRegex = new( + @"^train_batch_tokens:\d+\s+train_seq_len:\d+\s+iterations:(?\d+)\s+warmup_steps:(?\d+)", + RegexOptions.Compiled + ); + + private static readonly Regex MaxWallclockRegex = new( + @"^train_batch_tokens:\d+\s+train_seq_len:\d+\s+iterations:\d+\s+warmup_steps:\d+\s+max_wallclock_seconds:(?[0-9.]+)", + 
RegexOptions.Compiled + ); + + private static readonly Regex WarmupRegex = new( + @"^warmup_step:(?\d+)/(?\d+)$", + RegexOptions.Compiled + ); + + private static readonly Regex TrainRegex = new( + @"^step:(?\d+)/(?\d+)\s+train_loss:(?[0-9.]+)\s+train_time:(?