
Commit bd7eb95

exp82: 80 shards (10B tokens) + order-13 packed n-gram
exp81c proved the paradigm: 0.1518 BPB with 40 shards at order-9. Extend to the full 80 shards (10B tokens) and orders 2-13 for a richer cache. Expected: sub-0.12 BPB (closing the gap to openai#900 at 0.1197).
1 parent: 838ad4f · commit: bd7eb95

2 files changed: 3 additions & 2 deletions

results.tsv

Lines changed: 1 addition & 0 deletions
@@ -51,3 +51,4 @@ fc5f627 0.2417 15.39 keep flat Dirichlet c=1.0 + phrase[36,28,20,16] NEW BEST! p
 e608af8 0.2307 15.32 discard order-13 flat Dirichlet + phrase[36,28,20,16] -0.011 from orders but eval=673s OVER BUDGET
 f5c8cde 0.2284 14.92 discard stride=64 order-13 phrase[48,36,28,20,16] NEW BEST BPB but eval=601s (1s over budget)
 c9c53a6 0.2285 15.33 keep stride=72 order-13 phrase[48,36,28,20,16] LEGAL BEST! eval=567s, 33s spare
+838ad4f 0.1518 13.43 keep PACKED NGRAM ARTIFACT 2L/128d + 40 shards order-9 PARADIGM SHIFT! eval=372s, 100% hit

train_gpt.py

Lines changed: 2 additions & 2 deletions
@@ -2032,9 +2032,9 @@ def lr_mul(step: int, elapsed_ms: float) -> float:
 packed_ngram = None
 if ngram_artifact_enabled:
     t_build = time.perf_counter()
-    ngram_art_order = int(os.environ.get("NGRAM_ART_ORDER", "9"))
+    ngram_art_order = int(os.environ.get("NGRAM_ART_ORDER", "13"))
     ngram_art_buckets = int(os.environ.get("NGRAM_ART_BUCKETS", "524288"))
-    ngram_art_max_shards = int(os.environ.get("NGRAM_ART_MAX_SHARDS", "40"))
+    ngram_art_max_shards = int(os.environ.get("NGRAM_ART_MAX_SHARDS", "80"))
     # each rank builds from a subset of shards
     all_shards = sorted(glob.glob(os.path.join(args.data_path, "fineweb_train_*.bin")))
     if ngram_art_max_shards > 0:
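The diff above only changes defaults; the general shape of a bucketed n-gram count table that such NGRAM_ART_* settings would configure can be sketched as follows. This is a hedged illustration, not the repo's actual implementation: the rolling-hash function, the dict-based storage, and the function names ngram_bucket/build_counts are all assumptions introduced here.

```python
import os

# Defaults mirror the env vars in the diff; the hash and storage are assumptions.
NGRAM_ART_ORDER = int(os.environ.get("NGRAM_ART_ORDER", "13"))
NGRAM_ART_BUCKETS = int(os.environ.get("NGRAM_ART_BUCKETS", "524288"))

def ngram_bucket(tokens, buckets=NGRAM_ART_BUCKETS):
    # Polynomial rolling hash over token ids; the real hash is not shown
    # in the diff, so this choice is purely illustrative.
    h = 0
    for t in tokens:
        h = (h * 1000003 + t) & 0xFFFFFFFFFFFFFFFF
    return h % buckets

def build_counts(token_stream, min_order=2, max_order=NGRAM_ART_ORDER):
    # Count every n-gram of order min_order..max_order by hashed bucket.
    # Distinct n-grams that collide into one bucket merge their counts.
    counts = {}
    for i in range(len(token_stream)):
        for n in range(min_order, max_order + 1):
            if i + n > len(token_stream):
                break
            b = ngram_bucket(token_stream[i : i + n])
            counts[b] = counts.get(b, 0) + 1
    return counts

# Tiny demonstration stream; orders 2-3 only to keep it readable.
counts = build_counts([1, 2, 3, 1, 2, 3, 1, 2], min_order=2, max_order=3)
```

In the real script each rank would build its table from a subset of the fineweb_train_*.bin shards (per the comment in the diff), with ngram_art_max_shards capping how many shards are read; here a single in-memory token list stands in for that.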
