diff --git a/final_model.int8.ptz b/final_model.int8.ptz new file mode 100644 index 0000000000..2e53d8d8a4 Binary files /dev/null and b/final_model.int8.ptz differ diff --git a/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/README.md b/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/README.md new file mode 100644 index 0000000000..531dcf1c02 --- /dev/null +++ b/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/README.md @@ -0,0 +1,50 @@ +# Parameter-Golf V8 WebSignal BPE Entropy MicroMix + +Dataset repo: `8Planetterraforming/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix` + +This repository is laid out as a **flat Hugging Face dataset repo**. + +## File list + +- `train.jsonl` +- `validation.jsonl` +- `test.jsonl` +- `train.txt` +- `validation.txt` +- `test.txt` +- `v8_micro_0p02pct.txt` +- `v8_micro_0p05pct.txt` +- `v8_micro_0p10pct.txt` +- `build_v8_micro_mix.py` +- `run_v8_seed42_probe.sh` +- `probe_plan.md` +- `dataset_design.md` +- `stats.json` +- `dataset_infos.json` +- `source_sanitization.md` +- `upload_to_hf.md` + +## Plain-text artifacts + +Use the flat text files directly: + +- `train.txt` +- `validation.txt` +- `test.txt` +- `v8_micro_*.txt` + +## Recommended micro-mix rates + +- `0.02%` +- `0.05%` +- `0.10%` + +## Probe pass condition + +For seed-42 probing, keep the gate unchanged: + +- seed42 must beat `1.08041364` before running a 3-seed proof. + +## Notes + +This dataset documentation does **not** claim this dataset already beats SOTA. 
diff --git a/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/run_v8_seed42_probe.sh b/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/run_v8_seed42_probe.sh new file mode 100755 index 0000000000..80ad30a0df --- /dev/null +++ b/hf_datasets/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix/run_v8_seed42_probe.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +HF_DATASET_REPO="8Planetterraforming/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix" + +DATASET_DIR="$({ python - <<'PY' +from huggingface_hub import snapshot_download + +repo_id = "8Planetterraforming/Parameter-Golf-V8-WebSignal-BPE-Entropy-MicroMix" +path = snapshot_download(repo_id=repo_id, repo_type="dataset") +print(path) +PY +} | tail -n 1)" + +export DATASET_DIR + +echo "Using dataset snapshot: $DATASET_DIR" +python "$DATASET_DIR/build_v8_micro_mix.py" + +echo +echo "Recommended micro-mix rates (unchanged): 0.02%, 0.05%, 0.10%" +echo "Pass condition (unchanged): seed42 must beat 1.08041364 before 3-seed proof" diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/README.md b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/README.md new file mode 100644 index 0000000000..3f93a4cd1a --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/README.md @@ -0,0 +1,30 @@ +# Near-SOTA Reproduction: SP8192 + QK-Gain 5.25 + Legal TTT + +This submission is an independent 3-seed reproduction of the current SP8192 + 3-layer recurrence + parallel residuals + QK-Gain 5.25 + Legal TTT stack. + +This does **not** claim a new SOTA record because the 3-seed mean does not beat the current 1.0810 record. 
+ +## Results (3 seeds) + +- seed 42: val_loss 2.79082063, val_bpb 1.08041364 +- seed 314: val_loss 2.79411035, val_bpb 1.08168719 +- seed 999: val_loss 2.79443824, val_bpb 1.08181413 +- mean val_bpb: 1.08130499 +- population std val_bpb: 0.00063240 + +## Hardware + +- 8xH100 80GB + +## Notes + +- All runs were under the 10-minute training target based on logs. +- Run from the official `openai/parameter-golf` code path. + +## Included files + +- `train_seed42.log` +- `train_seed314.log` +- `train_seed999.log` +- `train_gpt.py` +- `submission.json` diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/submission.json b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/submission.json new file mode 100644 index 0000000000..4ffb4f0873 --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/submission.json @@ -0,0 +1,38 @@ +{ + "author": "Sebastian Laskowski", + "github_id": "Terraforming-Planet", + "name": "SP8192 + QK-Gain 5.25 + Legal TTT Reproduction", + "date": "2026-04-18", + "track": "10min_16mb", + "val_bpb": 1.08130499, + "val_bpb_std": 0.00063240, + "seeds": [ + 42, + 314, + 999 + ], + "seed_results": { + "42": { + "val_loss": 2.79082063, + "val_bpb": 1.08041364 + }, + "314": { + "val_loss": 2.79411035, + "val_bpb": 1.08168719 + }, + "999": { + "val_loss": 2.79443824, + "val_bpb": 1.08181413 + } + }, + "hardware": "8xH100 80GB", + "technique_summary": "Independent 3-seed reproduction of the SP8192 + 3-layer recurrence + parallel residuals + QK-Gain 5.25 + legal score-first TTT stack.", + "compliance": { + "train_under_600s": true, + "artifact_under_16mb": true, + "eval_under_600s": true, + "score_first_ttt": true, + "three_seeds": true, + "not_claiming_new_sota": true + } +} diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_gpt.py 
b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_gpt.py new file mode 100644 index 0000000000..bc965bee09 --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_gpt.py @@ -0,0 +1,2 @@ +import lzma as L,base64 as B +exec(L.decompress(B.b85decode(";JwB(bzJ~7n@VT6Qap3bt~@<3h>ok~)Km^%c^ys%R{D_%yAk9-_tV7^coUOo3$w>`(`ci)t`2F7>r>Ltx>>S2CRw|7ov>Wn1e~_!RLQ=%V9g?)G3yPsu%SBy!lj1PaC-x%dDmCDOZ^r^!)+WWz}ejKXTJ#^U6Ra!};QocHHXQC+4UM!QQ!-N5Xd|%~a(9)bTYIO+>B~8~@lqmri%^qEkQUy074Rh6w7V_#^s9J-3BNA`G;qyR$LYcI?e+loZVWi~B$n=TKFp{%SeHYp{oNWh;U@Ahk8M2$OU%K8B$lb*dRQXd-GR_@*KAZdRdwSd#v=LSq1v@Puul=a7WXDmh1^kBj}Y2XlER!D2E{&{%lV(hz$#n5%+%sk&Q}>{y0xpRgiQQBJeVV0hy8UD3ntyo@(Pv+K7^zVRDt4bah(r8kfsZThb+H1)~K-lIr4`|V#-2R>G7pP*N!fwWd&Dq8C)y=NrG_U_Oz6Q?+@ok1?(VJ5?ZT~&}C4Ks38WRB>3i=I!}H-8qq=&yKJ;tbpwwn~lAseD^q1C*u5T;lKQtF;?zv@u0f36%6SXU~txi3v5iSPK*`fNE9531KaQDL`zTPF$MX4U(-3sY-&?>QJe)giBQzpor7H)AZ#4=Hn#`AoAL7tT){&bw(fgz|eQRt`#6-<>;m*+&$!nf|od6&lVKYYHuOoNgZU_L>E@!O%__mlt=);Hwdc43+CM?sh5y+my3XSVYMO8F1pXuq$fvTU<$mpDjr>Lm){DeV)>4AKAhA?jxjH<-3yYQ#5qz+4c`Utifny+Ydmr4?c_z60#9@FU+U1&O$Lfg$WrX7gCj50O1t`1A`k04LVr;^*~{|@(TS5>#TAjL(B`umc8bVA$bS|F?^2A7E}z7IIgZlY(8Ex#K+nLh0vzlKK=74U!g+sX4T?e3_^_7XB1A(HB{pYd{vHYcak_P3DZ2LAB20wAP+C_9p7R|0}wA=p~JFi&xD8H}n(LxCc5rcmwF`!s(tSf_l_TXk(cPZJ_z`)iV4#r^gzawYQ%HE1iaUF=(KAcKXE%%6Hx0i;?;p1w#dN7!-y!(2GUw()t4|BXt%+05bu$yea+{f!deuk%(g-o}&XEEWm!lO+1^On#i#4rhP{bDYb9ZnbGd5n{*P->hZxI<{=3c-92I#g*mTey8O>cuw%hdwjB!#=GH_0?hY|Lf@L$0Qp+k03PROh)o-cMOQ!b>qfvPNJLTVvCBX24BGgI=_|35Bd%&Vq&!LWECF4!1&J?@uRfe=N2mi{l-S0aW101I*cY_A&2R~zfij0IZST;@;xJYti>{)weL@A2ZOGrq(U-ibnWz0BL;s!=S;`!K6@M501z-dU(OqY0?!!!Tk6Z{9!iH*tDZsjYG`J8UEqJz~7cNEPh1A#iz_$2*Nxo;S{UehRA^{Mf3GV^9hBEKSL(iD!=aKpjYCTqmhYA|h4zASL-v?9UWx8tzm#N5eHo2h3w*`(kHM2e}vDviz3$~a(Y#jlJL?*}m9&(Oqs1+CNUy7g~Z#lRN#>g9{7u~tot;|0qTu4G4xwdXk3eT+l1l$)Vq%}j^^1b(jIvF|OcNb1Jz&)>b)qGiC5P7yS}AvC}VDKORD$#^Ydjg!zDuM#$J+k<}O|o#9dvGrp)*yShv3->joMiF%~orV4^0cl9F!@VqwD`5fjekV@3E`STlX!=JDxbOQiv?J
p)$Xy|Z>g)@q_QYKopPeu&ghhPNw0y&{j?$GHwDQoztHvVU)a0ca6}7{#3^KK1uTcBkMSF$IDQp#Nhy>JTHPK2w+%N#FZ(D)=sw?BkfduBK{Owa(SkBq^_S*|NP@DI(VEWNqETjYsZ@bch5-dlWjP>|)xv+AknhsqQ!j3!=TT%CYvl>o#XU4AApoVBJ;db=W0m0#FH7by8-Z*V~$!QptJuPqLkH-bA#`L?*g#-60qO9x7)rWh{~YY77{NX_v_!Mc-#`(n{>OHy|HxotTLyFAdqCe^bQsveKyNdxf%^ECJDw1jQaV{3doP1nC-IuYoJBS)BjwI+@*WRZpBQq--|WAHx2WWVue@lE`*T9AY1=3wKyIT}9Ss;d##=nZ>!%19!lx_0W91se3dXzq5oIE}=)Lf4xkby*McKg=z&Qh>gK5l~kV7t^lQK_s8TXQqiwiAz+UT7qOmRWvI~~2yt~5_%swZRW#&SB~-uWRFRPsi5WZAzJp1&o@T%9?d1EUEpyP1v5zSN9`Nzg4<9J>%D?ZP~-T(dwGEKqZMPuhgN<|KigW>x{H0%t_&ya;8v^0F=-)sK*R85LA|5>|ZYqA#XmVbFc92&H9WjeO5C!FX%LsiXg6}0#l(Pg(=6hjd7H_7$6NLIA+rCa;GE_3R75D#&J(7=>z|LN$87?M}UpavJo5jeYlJy>3UxeQ{duojamIjZmWv|*!Tjr5rT-K7C#w~_vZ!oIz>O;(D%nYcBK)`IjO`=SDZo*4vJ4V2bFcFg(2@0lt|4uUCPbO&N6^dv4y}sPBwT(0$|M|*?y;Jv@#8^JCr!hxD0=c#R8ALJkOUZ5;?_TS5GI^kyB>q;{eo<-=N|;JU?~G80$0+y}Bn>nRaoX5bq_lK8&2G2D0K(N6U_xX}HirikYywzHoCpo)+j^d}t`9sXluV$o6?ewHe5Ui+m5Y9oyhGHXI2OTu~#~ow24E&_|NZmvkjEEo{?lrj>I+3}kwNN$<|WFHD7&hT*J`96C?gGpoC>Df4aU8P&s$90m&Ugy{5AY@?hVDTc{$QAjsoHqS{ck6snhl_)o^474tl{Idqaq1M4gTm}}RWDGq`oxuuW;}<=ge*lXX+3Fk7GOExz3(~A%nd`>nKrLxi-U((%)yeb9`|*{}XN04z>c7Ok#4FmP|M+baT*Fuu4_Vg~%D*h%0S&xmIOVhF)nXWYKj;F_kSpCA=E?|IY3TP731i4AmJ9{uIj}nvkZA~PCqrx=X%FiI(pX*UhQoq9-g$`cDVZp7ZcaGt$}M)fe&uhR9-a*yxklx#6Au8ICI}z@KWrk3Obx%(^tG4C1D@?bgq2jBnZ(O?j&jyR!8J6j%~%zfEtIAPDKu$5hd4V~`Wco06Vlpvp}HuyKK}fz6t3mD@K4unNqV2DlC&6KFhx<%Ai%h1TodO4gAJ51=G}q7(kO9N(i4X$8MWnd^UE!-v`^1{5*9tbIpjgUqjwfygmDW~97EWp8~n;>1xM9pw3FxHUd_tgGkMlT)SNNWx(w9cg>9zUZzX{Q^w*d#U194JF-wwlAz0yZ)tH+4r_JXO{=3ej6LQd~!naPgtZ;`)wmc&a4dppCSVTiQcc8UfugJge8Gerop`ck3K6;T*^kZp<9Hf-jlYfu+Ncv+`5i&JOwCg|NE;{ox`VY;Ub%E~4C%G@CLJi@0X_co$N5u2iD|I6taml4pwulJ)p!O69folAr>W$wc_@9v9VvU6N+O#a=gX;BimpVU*u)q2XscX=lx^Zn-BN-;_}UkjSaA)uW&+!Y^2T_W8ydgx(`~YE%eEmgcn-wY0id8}v=bSq#xRdSY89OVSts;`59NKaqnBJzq!px7xcxQkwC!%nzq;ACIq57xa9CNC~WPR=_N|sDZ*mo*d={3!J1&)iMAr6Ji^XPO(S$Grm|U4{YYnUtbS1j`5tgw6*^XZGHcn{g#{1)Vn2E4|r#ek2MWQ9#>rzQ3zDgm9txa-#PSX0d6dM`yebz%Twh(oT!pmA+0Mo^hZ(f*S=n@m;h!I4LZqoenOsP%
;j#RNdY2pvZFx4)uHo@w&tRGaMg#!x08$i%Dckl;@(Bn`nq>l-xa+7FYu&~4Yq@Vdt+c}5e4r$M9h_8xqG>aHp&?q8)HWa0Sz{jdpt`RF2TQ#ak`F0Ku`0Z2b&|K}-j9!Ag!ByLopEz?X#Iz-hH-XI{L8;k+U1StEfZI9`UEgh8v|1fNGF!KlPDs)l0lp4YCF?m_QdcJRn{uyNTlYhoL&(q5rP2Z9SU>2tYy;UDp^wa%Fg~Hhk(2S`|te$Gx+Ii0m3i){{7ug{x+aM|ib=hj3X>-y&H^d1>Yk&CE~y^DBjF0EgaKtEaKI7Wky8h&Y@UJ7P7{NwNv$D(&at=BZzc4W-VudsUAV!}qh{Bx~xqASW>QiwS&Pp?nj~MhW!&p&ZK!3XYPt|0r=y$NiXqT5r|a3t~awh`ELbbKti#lGw4_Z}(|4CO9mS3kjLy9es(hZKys_D+DbXrMuaf)dQf+fd=>LNl`QmMe|5HAZu3&M|Qc$e&*iiiM1kpmZL3f%xon_T*7vFZe9~Mf+S+JioV)e9`9pu;^&wln&a4*Ffme0YR3eyDzaPl4eJOEcxo}+_<$hvv>BL)>?w7w?zq|h-nRo`f!5Z^Z0EYf3QZs73GD`>p#=LV@%sLW3~V7UBob6G;zP|6$TYwVbdcqN#-p-y330KXa?YLk+7|%B|s655R+FA)bcA;K2bp!effB5mS5M{$`4RUF)D-}S2zDLqE^vwi-W%}HG7E`-9#iZY98C2*wag^L8FGhTg>4~1t<927~iA%U4&gdRuPFnqSz#Pp^sW|GtL`42h!3vJLPs{Yk+FgK!+g~a3qqe(;LMP&qfS#hT%mKhNhcUPsWZ`K`DcL?sHamF=Hhu*!?d1u{s#h*t!VqsV3y_mGfSdb^JScFk8jUdO?g(o(2gC%_^uh1$<-K*o~`R)Y+;0K`JnLDY6s#;ZWB1vtnry0W&135=PkgRY7sTp#l@E7$0*w4Kr<2ke+_dq6&O6JW$*9x7%6WHy4`^yxw#Al;YVTb3CPy~Z;@}r_c~jG3g)%+iPOR2*zg^<$yYfn!U2MNM&RjuDjCBg+-)-Z5SNK$@YUY3LX2>lQFN54`UZRmO7GTUTn(e0S)DjFC0qZn#`W~1Tlo=z>)mCXdAK+UBFc44FEMl2e8`+5f&30d-K)8VmW}JZU`gXNq<|eO?7R^m97LpKwJUX;MC3lNB6M=aQZ4k|KAi4}mlh$J>ZI31_j^-*scF&{J^lIKauwG>6f7cviw1wPmS*>ozfDBu}n&hyDs>b^#XsvZ6btR8gNpO*h-+X7gLNr4n;Gb4JJKwCEXrC|9KHY`Ml?jmzn}65x`!Z9+@iql=}{4Sy>*h6-b&m)4X&g069dds355-{$xntPJ3I?LHZW}y2gLcFlfsp-|cUJQ&8%}s)J*SWRvBXz8D#>Iq4gVhCIUE!HeV@S_J2Dur1!X`9pd4?`0|?>oic&j~O{wG^t#stUrKuH6`vf2HSEUw6s0YDS2H)1kTD&9r^(5Qv(H~=!>`wBE$nA1xS@CN9|rYBQ*A$Qq0u0CJW$8-(-8+kascp$ekk)Q&nC3GRn=<2>E=P{Pyu$WKo(P}w#Ev_QQhD`4O6wVp{Mlj>5E?0R|`+6E(*x4_XsAG7WwH{+OyB?=ghZ@+HKNED%{1R->XEtpR39%RfO}y&~Gd4Za-_fyku%XTl!YIn_7xmt;p>Sa2&&w%UZM^kj0SYG(|W);}iWTOn3K+J$i-2zRX8sXx*y6dS@(}ufe>$Uz?NfWN(#!ifPmpZgXdw_7O>%j_22is^1IeqCfLH9t{@P8g87^ivbb@wKFj2^e}tN(KeUrP8j9I;T&Htn`L1$uT#j*m8QV%*=6l2#CLn_{d&x*L(SHHrfk(@zl+sA`8%&J6h%+LKk409)E83-HAcGBHdA=gP4(Ejt2;;%*Bsrr~L4C69Vu~evF{x7H+zM)QqA?I)_vIV*?PhK-EXVsG%&TJrwZL-kU5)Hb_A5!MFXh=aX|#Ch|gVNS*LWR|DtrBT=n03RP}e@3
qiS>LeIKWv`n4>Lp5A3A|oZDQN8f4YLl0|Xo+)2kyWF_3tQB>g?YL69?$9rVe0*e|h_mYIQtQ@Nr`+n#Jpupgn(3qhzsS+S=bniZdRV%q>WR6We6Z3y_O>GitqnkrZ%n&Zv`5}ANvM`Z%Wqsdb4CiWJ80w_@d$0o*uEutYe_>#QH;E=Jt#$%`22Dm6la?)Q!AYPAJI6^-jd3}0vG`y4~t!iWU@C@WY_)6t{}K*Tw^!mrdMz0C-Qs+6KnYkgaQIpQinp4ybr13`soBJxjD*zfjwFjN(90&1Tb)%ssq%NDXFd&wyWQ!ff_Jx#i9g6mpp2UoHbF=#F!I9%(cTajqvE=9^5TEdMjoe-p4S?;>3urimS1<{3)vL2o2sD`WVjTtZk;B%azGTCVbpq(K=M=zpQ2pfR2VOOF8O`B=fvWO}LPJI2)YXhduxYeS58MA|e`z1l_C4af?0DM=hV&77&eK4hLNa!C-+*9el4sHKSW8mx6QI&P4^TCNr;na8jouqxDI_gg9+&PaR6uuy0kX+y>Z7ol}UA_$}a1hajh`6(Lkz!sHo%rRy9&bQqR!F`AUNI<&@w4=g#)AdbNWa=y;Shlf&Ub}?bii#;w*e=GGtsJ$(anmWG46}d)%8UN#04~LZ1@1AC7e#wB5oK5bSm=;v@l46vYr5<-t(d`rE2T5ild@WFcEH{-e)BkFo8cV$!`Dv!rxp)qCch+I8Y~4e#Ud#I|M}LLJgSM0NeUv}8FvrwtL4$X_O$wJybTIUyp7I3?tEiMkR|_U9UqDPU!YQF}Wsg^5%>ArjK{@-hYQ$VTD!_k~Q%3%EVJv;~S0Z1;=)mif+b?l>WC)YA$1vN+r3@>41_3`qR+DPKr=inwgPDlKcF?xsRl9Ty11f7|qm_%Q?BvSXuP(`vARP;^0jRy#I8_4-31Z8L?R7OqkHB9O5?y~}>$CFqEY*Sm+0``e?j6>#Ig-Gcg(uCnAPxl|RBOi<((`X1ZmMd(Hj^>|+AbcGqBzm&?Lu8ZX-L`JCdYK1o5{X|71cL!hX`7hf6X+G-P@iAE_x!+)y;mr=Ud_lc+MZ(T_%(c*IFfTP!+%RR6<2UAzCXCg^}^HN5qjG=qSib-uhE}oenz$u$>B^#;*a|$f8f*uqy365bw<|;d6a5AbU4gj&yu>)j4gF0C*(RgpD;Ynta?d+2uah7SDCB8$a~%b>0p*&W3)^lbt^S9`mu?)O<@xZ`0mn-ti3eCMq}Qs$=gQ^9Q0&BYYQ`wb;g(YP`9+d-mf5HBMKe%Y<6B^kz`PszHSL;thiEkqhAuq^hzqQT*@5`D=i+)kljv7m)5{KuyofVvbQ9S2fzy;%5zK+^go_Ei=;OY3p-!gL#o`5J0E>?1vZW9U=B~)+@08TbT7G*^cnCtd6+?$9((s?JH&L_Lyt!KLLYZmv}Wwp0WG*OlGoo8a9%Tr0ftKTK1Z&Y+*lV{~iQ*9Xe^OASRTtJgT)r;2m=%$l@G#wj?d2`N-|-#3_X;p_E5&>WvfnOO&s$zUT1Qe%#Ax)C9;)2uPXNF^P^9hp(vWD_73O{(@Caj_@mH1Gj>8EFvkF&EO-xiipj^p}c>;Go#yf3VRwYiX$^%$+R)owts?Hw6g>D(^-17Kgwh4m?~iK-r1!vfkJVq^J`L4|NbB!VIg&oix^}w|6<|>xvKMphlnx*YpoWHojB{=GKz?(&WrzR24)Ef{W~D5w!Ers${^fPo*5dUjGKqst|=SeIhxoKqCAVVNT5MnsNPsEL4AdpNN3D`&-?`JUvmj#?gTXJXc<>d&yiuY|C+O@kq2cJI!cb$dy*;0nkEwKbaQUhZk;h9c>Rzq_g^mh=JL234tKpzZ4OgW>FEpvo<;zkpo7lm(ZiW$WRWhRm1)s=#L{IK-3qh2-imsiW8T?S+y`b=uvIkJ;o8D(`cs7sJHa#+x)NgmM_%!a9H7SZ&imLw%)!5eLlR;Xzf7z>icyIA@2S9x`v(varTLZu9kDupp@Lgud{n+O789ynsG>m4!r^Ku6oV5DawC0L%L_xO
tgHU|DJQAr@HOkUkSQ9eBbHQ*#s@VnU_!U%u^2!s5sXvKnQQhElE=iMW~jq!=VA&V<%rs>p<)X~MCKH-wD<7w)hANC8zJ#J;Non%*)jfJJWHp08#j`E_UgK-w5SzlSbptP)BQzcR)-*2d?kZ_WA$f2rKt7jZN80sc&09EAk8=oYdyr}TAAMu+E~P=Lf1!SD@T&T{p!mLlPO!JSSOlIG&8uXbsN~rv*JuOpD|_Qd2GWt$*3^hxq~0s7AwHx7o|a(0i;}+;9@eeirC0hGg=-$(*B3Hwx^ph#UJ5cJmFw0BM2gpD|v5eH7xQla3uWVWIN;R(-*fWG?cvYVrNs>iJn3ky4S_J_T{-)P;H!56U7PyPK23k88;@307qSNT|?=0^u=!y0uNOtx{;&7%lR-Mk+-olpQZXhW_o(wE=RKwgLUBKVumuuKW3>L?*1W@;g4eDr5@@vg=7Qg>t-zon|i)RZC*@@ycVvPzwD|}YC#yfq-}-)R-?!cyM9RW+3$O%opq0g^+}6G>nbEB2yp_lzC$cW@?b8%ST0d<g`L7998nQH6W_QbtPlwEb4ZIv@JA=&f3&3!H4rqg1r;G{#R6a+}<8V%f92?6^b(CfY)U^u<>e5r!F25t&EXeru>UzJha5BN1(V7^)`NG>j@d_{v~hQj;2R7gxgxInUj9rtI7l}`YqGHVe@I;Onw2-)}l0G@}i@vARM$U+zE@eJeTkonEAk9G)_6J}e0VD6bTB*!Ox&$>~zrYh#b8W;+|xZsr97Z;->_zujTjbKj0fjLhtA>iV112RswXbHv`UbM!7P}T|*jNkow6Nk2QCxCoS)7JMJWMvcUmbD?V!ZZo>uo84(Kof;eF+Qd=|dwA03p4+?(dRSZ&RD(>1XsqA{VaVjlAyF)a30Cb<%FDz?+|Ixx>i4h%wpXbF8RW5hi_>7ue%4SQHY7(fHzGB(?F3?`X{`a^p}j9NVQ;9?NGJxsO!Z?H8yDPxn%!@v4T$>PbP0y>I_wQQn24%lJTkKp6H+vJV2)M(dWf|iC+LErDy6C-de=uB&MoUjeL2XO0L4bZ`4vP3uVbytPNN6NDg3zLUHYX0^Imij2Fi`d4?$(7ZeUBYF)3&s@2KjOP#}RQZe4)MIFxaVQFK{L>46-9O=&!>KAT8xgqRcWNyl5AMK}a-gotG4Yw%ipxT>>fwY>M{_0HI24D0&89=IB>v=yT$EouK#X%GY1)R#2^jpx;T>jArIc}U_o!D=Lc5JG%h>x*#2(S_Nj|z+&GkRdR0=1{xCpa6q?iLhBx_p_rNll4IDEnS?txCSXARF_VWH&Stv~W#|GA|7hz==#>cl_M9Pf5P;ENrVnk{>6|VxH^Io)I@t4R;#|Q~F9wC!Njq=*t#XiqxluDz4oXZxJ7>d&jC4rMo9O(V^~NiI2C&WG*^$v2SJrm1&q`^f^8f-FtExv+~yOP%ztp7gVF#Y<1vIQ>Weq(Cu`8C7!_pfd^?*6~p3Sqh2Y;F3m^0T~_#91U*km55Yl1i5WS*j80m_v97fn4_021v!PoUb#Q4KyZ1F>W5)DsPJJGNI%F*O`r#kDje4y{AK2wiQ#&n2ZgoDQI({BL_tk(4UhEsjLJoy!BgCpjdkRgNUm^RGFx1)24FU8IipRaK0R22R+3g?hqUF6X(1e@kDQK{Kg#T#^=@mpxgviq*rYRp{+rsoPWV{vu4L)BrxS9R3Gwfi0xw_5dpRSTZKPb5_W`ZiSbHD*cGsY#00LLUDiQjkNdlpcsN|4PnStEUy)CGE~0gJbosWzUsfz~4cWVtIU}aR<=o`m*cj(CRpK6&tP?YOs1@_Iu5=vkpT&*jHk+w%={xIz7yZ`hUr&%HidU^Z1hV0S0nJ(qhgbS6sGduZ%(hy#Qkvl9wn*dLUEnjkRO=LcHBf@=Op*+^t%XWBkjy}LT8UgdT;{OZE^$e`0{itTiLxmgkjvhT8;Q;H+=u&r;uoa&-z*q8rF#mZjv1l+Ia{Y$sXMYuUbG;OHUJRLQVo1RnQw4T7`fQNp!8j4kyw>G
XZKQ7K_@Je3UvVCvf%Pm#7jo0GSefG2kZE%1GmGHy0xJ96ine<;RH52z6dtU4TI0TajFmTtVOnwM9edIowW{vpoKit$&0;$HEwx4Jh>D^=c!z#iYzDy9KxT*9EP#s?=nbEc(tZRElIQUh!#K-LLI7Zu4U=pMG-T6_dkx>e6%G?(S5)>$LTdldWtd9ukX%KmG7Za&f&rjka&__V@X;pJld?TFyTfux3Bd82v_oTi@h%?{s06D!U&-R6V__BBu>$4+Tb*Q(Hu+yjnkAjU7B8F$%l5Bs#6o&%*tDRN5jGE_FgMn%F6pysWzvxmW(W-Ag67LA0+LNQwVp#iJFkwFpjXxf>wnGh+Wvjpgf?#%YQ?c=!g`%m(30L8=Y@GiP9ImTjUXkmmw7og|VY#2Kh_lUpQR#h@r5RxQ^>02z^UDEh>!3I~wWNJ%mqmX#j@w-iU4$iv4Iyr}t`yb69?Iq7TEH_duNVi1Z&6SN+q!mV3fcRKeLpLnuIjmx#Hj2MaHzd598UtC~}jw04N)JUc?Gg@yx{N_ma@9z8=PviXXTAdst0azc!s*{EI{Su%6yK#;93;L1`#OQG>n9XZr{+Z>yr`d6<}AvejxjLF{%iY~J5Ig30PcW07&yR?9M(*l#uLM&dIEwnZQfqlZ>6PH)AMdo@Ymn!F)kbVOe*tkmNaHmcn{xt307BAzpI;1{6+O}qhC0#`-p0Rm{yRX|j*`T+O0iQ_#I9Cc@dF1z6AIBvk8luW&SH6z*FwgHWe?tv5e3mn=md;e9DKE6HkR^Jz9;kJ_00"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed314.log b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed314.log new file mode 100644 index 0000000000..ac76bef9dd --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed314.log @@ -0,0 +1,149 @@ +nohup: ignoring input +W0418 14:47:38.710000 1618 torch/distributed/run.py:851] +W0418 14:47:38.710000 1618 torch/distributed/run.py:851] ***************************************** +W0418 14:47:38.710000 1618 torch/distributed/run.py:851] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0418 14:47:38.710000 1618 torch/distributed/run.py:851] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/b954d466-a996-4848-a0d4-3f69a237bc70.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: b954d466-a996-4848-a0d4-3f69a237bc70 + scalar_lr: 0.02 + seed: 314 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0096 val_bpb: 3.4879 +1/20000 train_loss: 9.0109 train_time: 0.0m tok/s: 8278628 +2/20000 train_loss: 12.3533 train_time: 0.0m tok/s: 8131273 +3/20000 train_loss: 11.0251 train_time: 0.0m tok/s: 8038845 +4/20000 train_loss: 9.4762 train_time: 0.0m tok/s: 7993262 +5/20000 train_loss: 8.3404 train_time: 0.0m tok/s: 7969928 +500/20000 train_loss: 3.3850 train_time: 0.8m tok/s: 7767134 +1000/20000 train_loss: 3.2888 train_time: 1.7m tok/s: 7760918 +1500/20000 train_loss: 3.1878 train_time: 2.5m tok/s: 7765583 +2000/20000 train_loss: 3.0712 train_time: 3.4m tok/s: 7771584 +layer_loop:enabled step:2034 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1233 train_time: 4.6m tok/s: 7075217 +3000/20000 train_loss: 2.8994 train_time: 5.9m tok/s: 6680374 +3500/20000 train_loss: 2.9406 train_time: 7.1m tok/s: 6433847 +4000/20000 train_loss: 2.8215 train_time: 8.4m tok/s: 6260888 +4000/20000 val_loss: 2.8768 val_bpb: 1.1137 +4500/20000 train_loss: 2.8422 train_time: 9.7m tok/s: 6101788 +4554/20000 val_loss: 2.8146 val_bpb: 1.0896 +stopping_early: wallclock_cap train_time: 588062ms step: 4554/20000 +peak memory allocated: 39045 MiB reserved: 39120 MiB +ema:applying EMA weights +pre-quantization post-ema 
val_loss:2.81149855 val_bpb:1.08841870 eval_time:7016ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.8s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15976703 bytes +Total submission size quantized+brotli: 15993297 bytes +quantized val_loss:2.84063375 val_bpb:1.09969785 eval_time:25612ms +quantized_sliding_window val_loss:2.79767937 val_bpb:1.08306887 eval_time:126853ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.79411035 val_bpb:1.08168719 eval_time:378266ms diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed42.log b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed42.log new file mode 100644 index 0000000000..91004414ea --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed42.log @@ -0,0 +1,149 @@ +nohup: ignoring input +W0418 10:26:34.641000 1830 torch/distributed/run.py:851] +W0418 10:26:34.641000 1830 torch/distributed/run.py:851] ***************************************** +W0418 10:26:34.641000 1830 torch/distributed/run.py:851] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0418 10:26:34.641000 1830 torch/distributed/run.py:851] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/1ea1d177-1eda-4a5a-8276-5290ec9c8556.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 1ea1d177-1eda-4a5a-8276-5290ec9c8556 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0090 val_bpb: 3.4877 +1/20000 train_loss: 9.0104 train_time: 0.0m tok/s: 8274791 +2/20000 train_loss: 12.3645 train_time: 0.0m tok/s: 8170285 +3/20000 train_loss: 11.0074 train_time: 0.0m tok/s: 8075936 +4/20000 train_loss: 9.4552 train_time: 0.0m tok/s: 8040054 +5/20000 train_loss: 8.3277 train_time: 0.0m tok/s: 8020491 +500/20000 train_loss: 3.3767 train_time: 0.8m tok/s: 7817197 +1000/20000 train_loss: 3.2871 train_time: 1.7m tok/s: 7793069 +1500/20000 train_loss: 3.1835 train_time: 2.5m tok/s: 7791725 +2000/20000 train_loss: 3.0692 train_time: 3.4m tok/s: 7792558 +layer_loop:enabled step:2040 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1198 train_time: 4.6m tok/s: 7105369 +3000/20000 train_loss: 2.8946 train_time: 5.9m tok/s: 6695240 +3500/20000 train_loss: 2.9412 train_time: 7.1m tok/s: 6449357 +4000/20000 train_loss: 2.8197 train_time: 8.4m tok/s: 6276692 +4000/20000 val_loss: 2.8753 val_bpb: 1.1131 +4500/20000 train_loss: 2.8407 train_time: 9.6m tok/s: 6134547 +4575/20000 val_loss: 2.8113 val_bpb: 1.0883 +stopping_early: wallclock_cap train_time: 588067ms step: 4575/20000 +peak memory allocated: 39045 MiB reserved: 39120 MiB +ema:applying EMA weights +pre-quantization post-ema 
val_loss:2.80831093 val_bpb:1.08718468 eval_time:7454ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.8s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15974954 bytes +Total submission size quantized+brotli: 15991548 bytes +quantized val_loss:2.83711447 val_bpb:1.09833542 eval_time:27793ms +quantized_sliding_window val_loss:2.79419118 val_bpb:1.08171848 eval_time:129617ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.79082063 val_bpb:1.08041364 eval_time:386589ms diff --git a/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed999.log b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed999.log new file mode 100644 index 0000000000..d8c534c834 --- /dev/null +++ b/records/track_10min_16mb/2026-04-18_SP8192_QK525_LegalTTT_Repro_3Seed_108130/train_seed999.log @@ -0,0 +1,149 @@ +nohup: ignoring input +W0418 16:17:41.721000 15312 torch/distributed/run.py:851] +W0418 16:17:41.721000 15312 torch/distributed/run.py:851] ***************************************** +W0418 16:17:41.721000 15312 torch/distributed/run.py:851] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0418 16:17:41.721000 15312 torch/distributed/run.py:851] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data/ + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/78836d58-9a4f-4220-95c6-ff35b18a6211.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: 78836d58-9a4f-4220-95c6-ff35b18a6211 + scalar_lr: 0.02 + seed: 999 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0076 val_bpb: 3.4871 +1/20000 train_loss: 9.0093 train_time: 0.0m tok/s: 8339453 +2/20000 train_loss: 12.2930 train_time: 0.0m tok/s: 8212948 +3/20000 train_loss: 11.0066 train_time: 0.0m tok/s: 8085999 +4/20000 train_loss: 9.5049 train_time: 0.0m tok/s: 8012495 +5/20000 train_loss: 8.3695 train_time: 0.0m tok/s: 7980747 +500/20000 train_loss: 3.3807 train_time: 0.8m tok/s: 7755779 +1000/20000 train_loss: 3.2820 train_time: 1.7m tok/s: 7739016 +1500/20000 train_loss: 3.1865 train_time: 2.5m tok/s: 7743679 +2000/20000 train_loss: 3.0766 train_time: 3.4m tok/s: 7748185 +layer_loop:enabled step:2028 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1240 train_time: 4.6m tok/s: 7109085 +3000/20000 train_loss: 2.9023 train_time: 5.9m tok/s: 6714057 +3500/20000 train_loss: 2.9451 train_time: 7.1m tok/s: 6459430 +4000/20000 train_loss: 2.8267 train_time: 8.3m tok/s: 6281217 +4000/20000 val_loss: 2.8796 val_bpb: 1.1148 +4500/20000 train_loss: 2.8463 train_time: 9.6m tok/s: 6149898 +4585/20000 val_loss: 2.8150 val_bpb: 1.0898 +stopping_early: wallclock_cap train_time: 588145ms step: 4585/20000 +peak memory allocated: 39034 MiB reserved: 39058 MiB +ema:applying EMA weights +pre-quantization post-ema 
val_loss:2.81189936 val_bpb:1.08857387 eval_time:6689ms +Serialized model: 135431033 bytes +Code size: 16594 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.8s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15975421 bytes +Total submission size quantized+brotli: 15992015 bytes +quantized val_loss:2.84092842 val_bpb:1.09981192 eval_time:8391ms +quantized_sliding_window val_loss:2.79800522 val_bpb:1.08319501 eval_time:91749ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.79443824 val_bpb:1.08181413 eval_time:322067ms diff --git a/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/README.md b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/README.md new file mode 100644 index 0000000000..991975a79a --- /dev/null +++ b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/README.md @@ -0,0 +1,18 @@ +# 2026-04-20 SP8192 LegalTTT W104 Faithful Replay Candidate + +This folder is **not a new submission yet**. + +It is a faithful replay / bad-seed-variance reducer candidate based on the current SP8192 + LegalTTT near-SOTA stack (3-layer recurrence, parallel residuals, QK gain 5.25, legal score-first TTT, and quantized+brotli artifact target under 16 MB). + +## Intent + +- Preserve architecture/compression surface from the SP8192 LegalTTT stack. +- Make key defaults source-visible for evaluator/replay clarity. +- Probe bad-seed behavior starting with seed 314 only. + +## Pass condition (seed 314) + +- Must improve meaningfully below **1.08168719**. +- Strong pass: **seed314 < 1.0812**. 
+ +Only after seed314 passes should seeds **42** and **999** be run. diff --git a/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/run_w104_seed314_probe.sh b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/run_w104_seed314_probe.sh new file mode 100755 index 0000000000..cb0d76099e --- /dev/null +++ b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/run_w104_seed314_probe.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +TARGET_DIR="$REPO_ROOT/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay" +LOG_PATH="/workspace/w104_seed314.log" +VENV_PATH="/workspace/venv" + +if [[ ! -d "$VENV_PATH" ]]; then + python3 -m venv "$VENV_PATH" --system-site-packages +fi +# shellcheck disable=SC1091 +source "$VENV_PATH/bin/activate" + +python -m pip install --upgrade pip +python -m pip install -r "$REPO_ROOT/requirements.txt" huggingface_hub datasets sentencepiece + +cd "$REPO_ROOT" +MATCHED_FINEWEB_REPO_ID="kevclark/parameter-golf" \ + python data/cached_challenge_fineweb.py --variant sp8192 --train-shards 80 + +cd "$TARGET_DIR" +SEED=314 python train_gpt.py 2>&1 | tee "$LOG_PATH" + +python - <<'PY' +import re +from pathlib import Path +log_path = Path('/workspace/w104_seed314.log') +text = log_path.read_text(encoding='utf-8', errors='replace') +matches = re.findall(r'quantized_ttt[^\n]*val_bpb\s*[:=]\s*([0-9]+\.[0-9]+)', text) +if not matches: + matches = re.findall(r'val_bpb\s*[:=]\s*([0-9]+\.[0-9]+)', text) +if not matches: + raise SystemExit('Could not find val_bpb in log') +print(f'final quantized_ttt val_bpb: {matches[-1]}') +PY diff --git a/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/train_gpt.py b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/train_gpt.py new file mode 100644 index 0000000000..8b5114ecf7 --- /dev/null +++ 
b/records/track_10min_16mb/2026-04-20_SP8192_LegalTTT_W104_FaithfulReplay/train_gpt.py @@ -0,0 +1,23 @@ +import os + +# Source-visible faithful replay defaults (W104 probe) +VOCAB_SIZE = 8192 +TOKENIZER_PATH = "./data/tokenizers/fineweb_8192_bpe.model" +DATA_PATH = "./data/datasets/fineweb10B_sp8192" +TRAIN_SHARDS_OVERRIDE = 80 +QK_GAIN_INIT = 5.25 +TTT_ENABLED = True +TTT_LR = 0.005 +TTT_EPOCHS = 3 + +os.environ.setdefault("VOCAB_SIZE", str(VOCAB_SIZE)) +os.environ.setdefault("TOKENIZER_PATH", TOKENIZER_PATH) +os.environ.setdefault("DATA_PATH", DATA_PATH) +os.environ.setdefault("TRAIN_SHARDS_OVERRIDE", str(TRAIN_SHARDS_OVERRIDE)) +os.environ.setdefault("QK_GAIN_INIT", str(QK_GAIN_INIT)) +os.environ.setdefault("TTT_ENABLED", "1" if TTT_ENABLED else "0") +os.environ.setdefault("TTT_LR", str(TTT_LR)) +os.environ.setdefault("TTT_EPOCHS", str(TTT_EPOCHS)) + +import lzma as L,base64 as B +exec(L.decompress(B.b85decode(";JwB(bzJ~7n@VT6Qap3bt~@<3h>ok~)Km^%c^ys%R{D_%yAk9-_tV7^coUOo3$w>`(`ci)t`2F7>r>Ltx>>S2CRw|7ov>Wn1e~_!RLQ=%V9g?)G3yPsu%SBy!lj1PaC-x%dDmCDOZ^r^!)+WWz}ejKXTJ#^U6Ra!};QocHHXQC+4UM!QQ!-N5Xd|%~a(9)bTYIO+>B~8~@lqmri%^qEkQUy074Rh6w7V_#^s9J-3BNA`G;qyR$LYcI?e+loZVWi~B$n=TKFp{%SeHYp{oNWh;U@Ahk8M2$OU%K8B$lb*dRQXd-GR_@*KAZdRdwSd#v=LSq1v@Puul=a7WXDmh1^kBj}Y2XlER!D2E{&{%lV(hz$#n5%+%sk&Q}>{y0xpRgiQQBJeVV0hy8UD3ntyo@(Pv+K7^zVRDt4bah(r8kfsZThb+H1)~K-lIr4`|V#-2R>G7pP*N!fwWd&Dq8C)y=NrG_U_Oz6Q?+@ok1?(VJ5?ZT~&}C4Ks38WRB>3i=I!}H-8qq=&yKJ;tbpwwn~lAseD^q1C*u5T;lKQtF;?zv@u0f36%6SXU~txi3v5iSPK*`fNE9531KaQDL`zTPF$MX4U(-3sY-&?>QJe)giBQzpor7H)AZ#4=Hn#`AoAL7tT){&bw(fgz|eQRt`#6-<>;m*+&$!nf|od6&lVKYYHuOoNgZU_L>E@!O%__mlt=);Hwdc43+CM?sh5y+my3XSVYMO8F1pXuq$fvTU<$mpDjr>Lm){DeV)>4AKAhA?jxjH<-3yYQ#5qz+4c`Utifny+Ydmr4?c_z60#9@FU+U1&O$Lfg$WrX7gCj50O1t`1A`k04LVr;^*~{|@(TS5>#TAjL(B`umc8bVA$bS|F?^2A7E}z7IIgZlY(8Ex#K+nLh0vzlKK=74U!g+sX4T?e3_^_7XB1A(HB{pYd{vHYcak_P3DZ2LAB20wAP+C_9p7R|0}wA=p~JFi&xD8H}n(LxCc5rcmwF`!s(tSf_l_TXk(cPZJ_z`)iV4#r^gzawYQ%HE1iaUF=(KAcKXE%%6Hx0i;?;p1w#dN7!-y!(2GUw()t4|BXt%+05bu$
yea+{f!deuk%(g-o}&XEEWm!lO+1^On#i#4rhP{bDYb9ZnbGd5n{*P->hZxI<{=3c-92I#g*mTey8O>cuw%hdwjB!#=GH_0?hY|Lf@L$0Qp+k03PROh)o-cMOQ!b>qfvPNJLTVvCBX24BGgI=_|35Bd%&Vq&!LWECF4!1&J?@uRfe=N2mi{l-S0aW101I*cY_A&2R~zfij0IZST;@;xJYti>{)weL@A2ZOGrq(U-ibnWz0BL;s!=S;`!K6@M501z-dU(OqY0?!!!Tk6Z{9!iH*tDZsjYG`J8UEqJz~7cNEPh1A#iz_$2*Nxo;S{UehRA^{Mf3GV^9hBEKSL(iD!=aKpjYCTqmhYA|h4zASL-v?9UWx8tzm#N5eHo2h3w*`(kHM2e}vDviz3$~a(Y#jlJL?*}m9&(Oqs1+CNUy7g~Z#lRN#>g9{7u~tot;|0qTu4G4xwdXk3eT+l1l$)Vq%}j^^1b(jIvF|OcNb1Jz&)>b)qGiC5P7yS}AvC}VDKORD$#^Ydjg!zDuM#$J+k<}O|o#9dvGrp)*yShv3->joMiF%~orV4^0cl9F!@VqwD`5fjekV@3E`STlX!=JDxbOQiv?Jp)$Xy|Z>g)@q_QYKopPeu&ghhPNw0y&{j?$GHwDQoztHvVU)a0ca6}7{#3^KK1uTcBkMSF$IDQp#Nhy>JTHPK2w+%N#FZ(D)=sw?BkfduBK{Owa(SkBq^_S*|NP@DI(VEWNqETjYsZ@bch5-dlWjP>|)xv+AknhsqQ!j3!=TT%CYvl>o#XU4AApoVBJ;db=W0m0#FH7by8-Z*V~$!QptJuPqLkH-bA#`L?*g#-60qO9x7)rWh{~YY77{NX_v_!Mc-#`(n{>OHy|HxotTLyFAdqCe^bQsveKyNdxf%^ECJDw1jQaV{3doP1nC-IuYoJBS)BjwI+@*WRZpBQq--|WAHx2WWVue@lE`*T9AY1=3wKyIT}9Ss;d##=nZ>!%19!lx_0W91se3dXzq5oIE}=)Lf4xkby*McKg=z&Qh>gK5l~kV7t^lQK_s8TXQqiwiAz+UT7qOmRWvI~~2yt~5_%swZRW#&SB~-uWRFRPsi5WZAzJp1&o@T%9?d1EUEpyP1v5zSN9`Nzg4<9J>%D?ZP~-T(dwGEKqZMPuhgN<|KigW>x{H0%t_&ya;8v^0F=-)sK*R85LA|5>|ZYqA#XmVbFc92&H9WjeO5C!FX%LsiXg6}0#l(Pg(=6hjd7H_7$6NLIA+rCa;GE_3R75D#&J(7=>z|LN$87?M}UpavJo5jeYlJy>3UxeQ{duojamIjZmWv|*!Tjr5rT-K7C#w~_vZ!oIz>O;(D%nYcBK)`IjO`=SDZo*4vJ4V2bFcFg(2@0lt|4uUCPbO&N6^dv4y}sPBwT(0$|M|*?y;Jv@#8^JCr!hxD0=c#R8ALJkOUZ5;?_TS5GI^kyB>q;{eo<-=N|;JU?~G80$0+y}Bn>nRaoX5bq_lK8&2G2D0K(N6U_xX}HirikYywzHoCpo)+j^d}t`9sXluV$o6?ewHe5Ui+m5Y9oyhGHXI2OTu~#~ow24E&_|NZmvkjEEo{?lrj>I+3}kwNN$<|WFHD7&hT*J`96C?gGpoC>Df4aU8P&s$90m&Ugy{5AY@?hVDTc{$QAjsoHqS{ck6snhl_)o^474tl{Idqaq1M4gTm}}RWDGq`oxuuW;}<=ge*lXX+3Fk7GOExz3(~A%nd`>nKrLxi-U((%)yeb9`|*{}XN04z>c7Ok#4FmP|M+baT*Fuu4_Vg~%D*h%0S&xmIOVhF)nXWYKj;F_kSpCA=E?|IY3TP731i4AmJ9{uIj}nvkZA~PCqrx=X%FiI(pX*UhQoq9-g$`cDVZp7ZcaGt$}M)fe&uhR9-a*yxklx#6Au8ICI}z@KWrk3Obx%(^tG4C1D@?bgq2jBnZ(O?j&jyR!8J6j%~%zfEtIAPDKu$5hd4V~`Wco06Vlpvp}HuyKK}fz6t3mD@K4unNqV2DlC&6KFhx<%Ai%h1TodO4
gAJ51=G}q7(kO9N(i4X$8MWnd^UE!-v`^1{5*9tbIpjgUqjwfygmDW~97EWp8~n;>1xM9pw3FxHUd_tgGkMlT)SNNWx(w9cg>9zUZzX{Q^w*d#U194JF-wwlAz0yZ)tH+4r_JXO{=3ej6LQd~!naPgtZ;`)wmc&a4dppCSVTiQcc8UfugJge8Gerop`ck3K6;T*^kZp<9Hf-jlYfu+Ncv+`5i&JOwCg|NE;{ox`VY;Ub%E~4C%G@CLJi@0X_co$N5u2iD|I6taml4pwulJ)p!O69folAr>W$wc_@9v9VvU6N+O#a=gX;BimpVU*u)q2XscX=lx^Zn-BN-;_}UkjSaA)uW&+!Y^2T_W8ydgx(`~YE%eEmgcn-wY0id8}v=bSq#xRdSY89OVSts;`59NKaqnBJzq!px7xcxQkwC!%nzq;ACIq57xa9CNC~WPR=_N|sDZ*mo*d={3!J1&)iMAr6Ji^XPO(S$Grm|U4{YYnUtbS1j`5tgw6*^XZGHcn{g#{1)Vn2E4|r#ek2MWQ9#>rzQ3zDgm9txa-#PSX0d6dM`yebz%Twh(oT!pmA+0Mo^hZ(f*S=n@m;h!I4LZqoenOsP%;j#RNdY2pvZFx4)uHo@w&tRGaMg#!x08$i%Dckl;@(Bn`nq>l-xa+7FYu&~4Yq@Vdt+c}5e4r$M9h_8xqG>aHp&?q8)HWa0Sz{jdpt`RF2TQ#ak`F0Ku`0Z2b&|K}-j9!Ag!ByLopEz?X#Iz-hH-XI{L8;k+U1StEfZI9`UEgh8v|1fNGF!KlPDs)l0lp4YCF?m_QdcJRn{uyNTlYhoL&(q5rP2Z9SU>2tYy;UDp^wa%Fg~Hhk(2S`|te$Gx+Ii0m3i){{7ug{x+aM|ib=hj3X>-y&H^d1>Yk&CE~y^DBjF0EgaKtEaKI7Wky8h&Y@UJ7P7{NwNv$D(&at=BZzc4W-VudsUAV!}qh{Bx~xqASW>QiwS&Pp?nj~MhW!&p&ZK!3XYPt|0r=y$NiXqT5r|a3t~awh`ELbbKti#lGw4_Z}(|4CO9mS3kjLy9es(hZKys_D+DbXrMuaf)dQf+fd=>LNl`QmMe|5HAZu3&M|Qc$e&*iiiM1kpmZL3f%xon_T*7vFZe9~Mf+S+JioV)e9`9pu;^&wln&a4*Ffme0YR3eyDzaPl4eJOEcxo}+_<$hvv>BL)>?w7w?zq|h-nRo`f!5Z^Z0EYf3QZs73GD`>p#=LV@%sLW3~V7UBob6G;zP|6$TYwVbdcqN#-p-y330KXa?YLk+7|%B|s655R+FA)bcA;K2bp!effB5mS5M{$`4RUF)D-}S2zDLqE^vwi-W%}HG7E`-9#iZY98C2*wag^L8FGhTg>4~1t<927~iA%U4&gdRuPFnqSz#Pp^sW|GtL`42h!3vJLPs{Yk+FgK!+g~a3qqe(;LMP&qfS#hT%mKhNhcUPsWZ`K`DcL?sHamF=Hhu*!?d1u{s#h*t!VqsV3y_mGfSdb^JScFk8jUdO?g(o(2gC%_^uh1$<-K*o~`R)Y+;0K`JnLDY6s#;ZWB1vtnry0W&135=PkgRY7sTp#l@E7$0*w4Kr<2ke+_dq6&O6JW$*9x7%6WHy4`^yxw#Al;YVTb3CPy~Z;@}r_c~jG3g)%+iPOR2*zg^<$yYfn!U2MNM&RjuDjCBg+-)-Z5SNK$@YUY3LX2>lQFN54`UZRmO7GTUTn(e0S)DjFC0qZn#`W~1Tlo=z>)mCXdAK+UBFc44FEMl2e8`+5f&30d-K)8VmW}JZU`gXNq<|eO?7R^m97LpKwJUX;MC3lNB6M=aQZ4k|KAi4}mlh$J>ZI31_j^-*scF&{J^lIKauwG>6f7cviw1wPmS*>ozfDBu}n&hyDs>b^#XsvZ6btR8gNpO*h-+X7gLNr4n;Gb4JJKwCEXrC|9KHY`Ml?jmzn}65x`!Z9+@iql=}{4Sy>*h6-b&m)4X&g069dds355-{$xntPJ3I?LHZW}y2gLcFlfsp-|cUJQ&8%}s)J*SWRvBXz8D#
>Iq4gVhCIUE!HeV@S_J2Dur1!X`9pd4?`0|?>oic&j~O{wG^t#stUrKuH6`vf2HSEUw6s0YDS2H)1kTD&9r^(5Qv(H~=!>`wBE$nA1xS@CN9|rYBQ*A$Qq0u0CJW$8-(-8+kascp$ekk)Q&nC3GRn=<2>E=P{Pyu$WKo(P}w#Ev_QQhD`4O6wVp{Mlj>5E?0R|`+6E(*x4_XsAG7WwH{+OyB?=ghZ@+HKNED%{1R->XEtpR39%RfO}y&~Gd4Za-_fyku%XTl!YIn_7xmt;p>Sa2&&w%UZM^kj0SYG(|W);}iWTOn3K+J$i-2zRX8sXx*y6dS@(}ufe>$Uz?NfWN(#!ifPmpZgXdw_7O>%j_22is^1IeqCfLH9t{@P8g87^ivbb@wKFj2^e}tN(KeUrP8j9I;T&Htn`L1$uT#j*m8QV%*=6l2#CLn_{d&x*L(SHHrfk(@zl+sA`8%&J6h%+LKk409)E83-HAcGBHdA=gP4(Ejt2;;%*Bsrr~L4C69Vu~evF{x7H+zM)QqA?I)_vIV*?PhK-EXVsG%&TJrwZL-kU5)Hb_A5!MFXh=aX|#Ch|gVNS*LWR|DtrBT=n03RP}e@3qiS>LeIKWv`n4>Lp5A3A|oZDQN8f4YLl0|Xo+)2kyWF_3tQB>g?YL69?$9rVe0*e|h_mYIQtQ@Nr`+n#Jpupgn(3qhzsS+S=bniZdRV%q>WR6We6Z3y_O>GitqnkrZ%n&Zv`5}ANvM`Z%Wqsdb4CiWJ80w_@d$0o*uEutYe_>#QH;E=Jt#$%`22Dm6la?)Q!AYPAJI6^-jd3}0vG`y4~t!iWU@C@WY_)6t{}K*Tw^!mrdMz0C-Qs+6KnYkgaQIpQinp4ybr13`soBJxjD*zfjwFjN(90&1Tb)%ssq%NDXFd&wyWQ!ff_Jx#i9g6mpp2UoHbF=#F!I9%(cTajqvE=9^5TEdMjoe-p4S?;>3urimS1<{3)vL2o2sD`WVjTtZk;B%azGTCVbpq(K=M=zpQ2pfR2VOOF8O`B=fvWO}LPJI2)YXhduxYeS58MA|e`z1l_C4af?0DM=hV&77&eK4hLNa!C-+*9el4sHKSW8mx6QI&P4^TCNr;na8jouqxDI_gg9+&PaR6uuy0kX+y>Z7ol}UA_$}a1hajh`6(Lkz!sHo%rRy9&bQqR!F`AUNI<&@w4=g#)AdbNWa=y;Shlf&Ub}?bii#;w*e=GGtsJ$(anmWG46}d)%8UN#04~LZ1@1AC7e#wB5oK5bSm=;v@l46vYr5<-t(d`rE2T5ild@WFcEH{-e)BkFo8cV$!`Dv!rxp)qCch+I8Y~4e#Ud#I|M}LLJgSM0NeUv}8FvrwtL4$X_O$wJybTIUyp7I3?tEiMkR|_U9UqDPU!YQF}Wsg^5%>ArjK{@-hYQ$VTD!_k~Q%3%EVJv;~S0Z1;=)mif+b?l>WC)YA$1vN+r3@>41_3`qR+DPKr=inwgPDlKcF?xsRl9Ty11f7|qm_%Q?BvSXuP(`vARP;^0jRy#I8_4-31Z8L?R7OqkHB9O5?y~}>$CFqEY*Sm+0``e?j6>#Ig-Gcg(uCnAPxl|RBOi<((`X1ZmMd(Hj^>|+AbcGqBzm&?Lu8ZX-L`JCdYK1o5{X|71cL!hX`7hf6X+G-P@iAE_x!+)y;mr=Ud_lc+MZ(T_%(c*IFfTP!+%RR6<2UAzCXCg^}^HN5qjG=qSib-uhE}oenz$u$>B^#;*a|$f8f*uqy365bw<|;d6a5AbU4gj&yu>)j4gF0C*(RgpD;Ynta?d+2uah7SDCB8$a~%b>0p*&W3)^lbt^S9`mu?)O<@xZ`0mn-ti3eCMq}Qs$=gQ^9Q0&BYYQ`wb;g(YP`9+d-mf5HBMKe%Y<6B^kz`PszHSL;thiEkqhAuq^hzqQT*@5`D=i+)kljv7m)5{KuyofVvbQ9S2fzy;%5zK+^go_Ei=;OY3p-!gL#o`5J0E>?1vZW9U=B~)+@08TbT7G*^cnCtd6+?$9((s?JH&L_Lyt!KLLYZmv
}Wwp0WG*OlGoo8a9%Tr0ftKTK1Z&Y+*lV{~iQ*9Xe^OASRTtJgT)r;2m=%$l@G#wj?d2`N-|-#3_X;p_E5&>WvfnOO&s$zUT1Qe%#Ax)C9;)2uPXNF^P^9hp(vWD_73O{(@Caj_@mH1Gj>8EFvkF&EO-xiipj^p}c>;Go#yf3VRwYiX$^%$+R)owts?Hw6g>D(^-17Kgwh4m?~iK-r1!vfkJVq^J`L4|NbB!VIg&oix^}w|6<|>xvKMphlnx*YpoWHojB{=GKz?(&WrzR24)Ef{W~D5w!Ers${^fPo*5dUjGKqst|=SeIhxoKqCAVVNT5MnsNPsEL4AdpNN3D`&-?`JUvmj#?gTXJXc<>d&yiuY|C+O@kq2cJI!cb$dy*;0nkEwKbaQUhZk;h9c>Rzq_g^mh=JL234tKpzZ4OgW>FEpvo<;zkpo7lm(ZiW$WRWhRm1)s=#L{IK-3qh2-imsiW8T?S+y`b=uvIkJ;o8D(`cs7sJHa#+x)NgmM_%!a9H7SZ&imLw%)!5eLlR;Xzf7z>icyIA@2S9x`v(varTLZu9kDupp@Lgud{n+O789ynsG>m4!r^Ku6oV5DawC0L%L_xOtgHU|DJQAr@HOkUkSQ9eBbHQ*#s@VnU_!U%u^2!s5sXvKnQQhElE=iMW~jq!=VA&V<%rs>p<)X~MCKH-wD<7w)hANC8zJ#J;Non%*)jfJJWHp08#j`E_UgK-w5SzlSbptP)BQzcR)-*2d?kZ_WA$f2rKt7jZN80sc&09EAk8=oYdyr}TAAMu+E~P=Lf1!SD@T&T{p!mLlPO!JSSOlIG&8uXbsN~rv*JuOpD|_Qd2GWt$*3^hxq~0s7AwHx7o|a(0i;}+;9@eeirC0hGg=-$(*B3Hwx^ph#UJ5cJmFw0BM2gpD|v5eH7xQla3uWVWIN;R(-*fWG?cvYVrNs>iJn3ky4S_J_T{-)P;H!56U7PyPK23k88;@307qSNT|?=0^u=!y0uNOtx{;&7%lR-Mk+-olpQZXhW_o(wE=RKwgLUBKVumuuKW3>L?*1W@;g4eDr5@@vg=7Qg>t-zon|i)RZC*@@ycVvPzwD|}YC#yfq-}-)R-?!cyM9RW+3$O%opq0g^+}6G>nbEB2yp_lzC$cW@?b8%ST0d<g`L7998nQH6W_QbtPlwEb4ZIv@JA=&f3&3!H4rqg1r;G{#R6a+}<8V%f92?6^b(CfY)U^u<>e5r!F25t&EXeru>UzJha5BN1(V7^)`NG>j@d_{v~hQj;2R7gxgxInUj9rtI7l}`YqGHVe@I;Onw2-)}l0G@}i@vARM$U+zE@eJeTkonEAk9G)_6J}e0VD6bTB*!Ox&$>~zrYh#b8W;+|xZsr97Z;->_zujTjbKj0fjLhtA>iV112RswXbHv`UbM!7P}T|*jNkow6Nk2QCxCoS)7JMJWMvcUmbD?V!ZZo>uo84(Kof;eF+Qd=|dwA03p4+?(dRSZ&RD(>1XsqA{VaVjlAyF)a30Cb<%FDz?+|Ixx>i4h%wpXbF8RW5hi_>7ue%4SQHY7(fHzGB(?F3?`X{`a^p}j9NVQ;9?NGJxsO!Z?H8yDPxn%!@v4T$>PbP0y>I_wQQn24%lJTkKp6H+vJV2)M(dWf|iC+LErDy6C-de=uB&MoUjeL2XO0L4bZ`4vP3uVbytPNN6NDg3zLUHYX0^Imij2Fi`d4?$(7ZeUBYF)3&s@2KjOP#}RQZe4)MIFxaVQFK{L>46-9O=&!>KAT8xgqRcWNyl5AMK}a-gotG4Yw%ipxT>>fwY>M{_0HI24D0&89=IB>v=yT$EouK#X%GY1)R#2^jpx;T>jArIc}U_o!D=Lc5JG%h>x*#2(S_Nj|z+&GkRdR0=1{xCpa6q?iLhBx_p_rNll4IDEnS?txCSXARF_VWH&Stv~W#|GA|7hz==#>cl_M9Pf5P;ENrVnk{>6|VxH^Io)I@t4R;#|Q~F9wC!Njq=*t#XiqxluDz4oXZxJ7>d&jC4rMo9O(V^~NiI2C&WG*^$
v2SJrm1&q`^f^8f-FtExv+~yOP%ztp7gVF#Y<1vIQ>Weq(Cu`8C7!_pfd^?*6~p3Sqh2Y;F3m^0T~_#91U*km55Yl1i5WS*j80m_v97fn4_021v!PoUb#Q4KyZ1F>W5)DsPJJGNI%F*O`r#kDje4y{AK2wiQ#&n2ZgoDQI({BL_tk(4UhEsjLJoy!BgCpjdkRgNUm^RGFx1)24FU8IipRaK0R22R+3g?hqUF6X(1e@kDQK{Kg#T#^=@mpxgviq*rYRp{+rsoPWV{vu4L)BrxS9R3Gwfi0xw_5dpRSTZKPb5_W`ZiSbHD*cGsY#00LLUDiQjkNdlpcsN|4PnStEUy)CGE~0gJbosWzUsfz~4cWVtIU}aR<=o`m*cj(CRpK6&tP?YOs1@_Iu5=vkpT&*jHk+w%={xIz7yZ`hUr&%HidU^Z1hV0S0nJ(qhgbS6sGduZ%(hy#Qkvl9wn*dLUEnjkRO=LcHBf@=Op*+^t%XWBkjy}LT8UgdT;{OZE^$e`0{itTiLxmgkjvhT8;Q;H+=u&r;uoa&-z*q8rF#mZjv1l+Ia{Y$sXMYuUbG;OHUJRLQVo1RnQw4T7`fQNp!8j4kyw>GXZKQ7K_@Je3UvVCvf%Pm#7jo0GSefG2kZE%1GmGHy0xJ96ine<;RH52z6dtU4TI0TajFmTtVOnwM9edIowW{vpoKit$&0;$HEwx4Jh>D^=c!z#iYzDy9KxT*9EP#s?=nbEc(tZRElIQUh!#K-LLI7Zu4U=pMG-T6_dkx>e6%G?(S5)>$LTdldWtd9ukX%KmG7Za&f&rjka&__V@X;pJld?TFyTfux3Bd82v_oTi@h%?{s06D!U&-R6V__BBu>$4+Tb*Q(Hu+yjnkAjU7B8F$%l5Bs#6o&%*tDRN5jGE_FgMn%F6pysWzvxmW(W-Ag67LA0+LNQwVp#iJFkwFpjXxf>wnGh+Wvjpgf?#%YQ?c=!g`%m(30L8=Y@GiP9ImTjUXkmmw7og|VY#2Kh_lUpQR#h@r5RxQ^>02z^UDEh>!3I~wWNJ%mqmX#j@w-iU4$iv4Iyr}t`yb69?Iq7TEH_duNVi1Z&6SN+q!mV3fcRKeLpLnuIjmx#Hj2MaHzd598UtC~}jw04N)JUc?Gg@yx{N_ma@9z8=PviXXTAdst0azc!s*{EI{Su%6yK#;93;L1`#OQG>n9XZr{+Z>yr`d6<}AvejxjLF{%iY~J5Ig30PcW07&yR?9M(*l#uLM&dIEwnZQfqlZ>6PH)AMdo@Ymn!F)kbVOe*tkmNaHmcn{xt307BAzpI;1{6+O}qhC0#`-p0Rm{yRX|j*`T+O0iQ_#I9Cc@dF1z6AIBvk8luW&SH6z*FwgHWe?tv5e3mn=md;e9DKE6HkR^Jz9;kJ_00"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/FINAL_RESULTS.md b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/FINAL_RESULTS.md new file mode 100644 index 0000000000..0c462e1040 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/FINAL_RESULTS.md @@ -0,0 +1,22 @@ +# Final Results Summary + +## Seed table + +- seed `314`: ppm_mixer_bpb=`0.92920269`, quantized_bpb=`1.09947942`, sliding_bpb=`1.08274405`, size_bytes=`15919203`, legal_size=`True` +- 
seed `42`: ppm_mixer_bpb=`0.92982823`, quantized_bpb=`1.10012668`, sliding_bpb=`1.08344881`, size_bytes=`15923629`, legal_size=`True` +- seed `999`: ppm_mixer_bpb=`0.92999574`, quantized_bpb=`1.10061544`, sliding_bpb=`1.08384331`, size_bytes=`15920803`, legal_size=`True` + +## Aggregate + +- `mean_ppm_bpb`: `0.9296755533333333` +- `stdev_ppm_bpb`: `0.00041798887429371435` +- `min_ppm_bpb`: `0.92920269` +- `max_ppm_bpb`: `0.92999574` +- `n_ppm_seeds`: `3` +- `max_size_bytes`: `15923629` +- `size_legal_all`: `True` +- `smallest_size_margin_bytes`: `76371` +- `vs_your_PR1750_1_08089556`: `-0.1512200066666668` +- `vs_target_1_05`: `-0.12032444666666675` +- `vs_pure_PR1991_reported_0_94290`: `-0.013224446666666667` +- `vs_0_90`: `0.02967555333333327` diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/README.md b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/README.md new file mode 100644 index 0000000000..862e60ab24 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/README.md @@ -0,0 +1,24 @@ +# SP8192 Byte-PPM O=5 + V6 Privacy-Web-Filtering Micro Final + +Base: +- PR1991 SP8192 Byte-PPM Mixer O=5. + +Modification: +- A tiny V6 train-only sparse micro-injection is applied to the first FineWeb train shard. +- FineWeb validation files are untouched official symlinks. +- V6 is not used as validation or evaluation data. +- No validation leakage is intended. + +Risk: +- This is a final-time ablation. If it underperforms pure PR1991, pure PR1991 remains the safer baseline. + +## V6 Dataset Modification Disclosure + +This run uses a tiny train-only V6 sparse micro-injection. +See `V6_DATASET_DISCLOSURE.md`. + +Important: +- FineWeb validation files are untouched official `fineweb_val_*.bin`. +- V6 is not validation data. +- V6 is not hidden eval data. +- `rebuild_and_run_v6_micro_8xh100.sh` documents the rebuild and run procedure. 
diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/V6_DATASET_DISCLOSURE.md b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/V6_DATASET_DISCLOSURE.md new file mode 100644 index 0000000000..1ed06f2fd7 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/V6_DATASET_DISCLOSURE.md @@ -0,0 +1,12 @@ +# V6 Dataset Modification Disclosure + +This record uses V6 Privacy-Web-Filtering only as a tiny train-only sparse micro-injection. + +- V6 repo: 8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering +- Injected amount: 8192 SP8192 tokens +- Injection target: first FineWeb train shard only +- FineWeb validation files are untouched +- Tokenizer remains SP8192 +- V6 is not used as validation or hidden eval data + +The final BPB is computed on official FineWeb validation. diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/final_comparison_summary.json b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/final_comparison_summary.json new file mode 100644 index 0000000000..d8a4e2d2a2 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/final_comparison_summary.json @@ -0,0 +1,43 @@ +{ + "rows": [ + { + "seed": "314", + "log": "records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed314.log", + "ppm_mixer_bpb": 0.92920269, + "quantized_bpb": 1.09947942, + "sliding_bpb": 1.08274405, + "size_bytes": 15919203, + "legal_size": true + }, + { + "seed": "42", + "log": "records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed42.log", + "ppm_mixer_bpb": 0.92982823, + "quantized_bpb": 1.10012668, + "sliding_bpb": 1.08344881, + "size_bytes": 15923629, + "legal_size": true + }, + { + "seed": "999", + "log": "records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed999.log", + "ppm_mixer_bpb": 0.92999574, 
+ "quantized_bpb": 1.10061544, + "sliding_bpb": 1.08384331, + "size_bytes": 15920803, + "legal_size": true + } + ], + "mean_ppm_bpb": 0.9296755533333333, + "stdev_ppm_bpb": 0.00041798887429371435, + "min_ppm_bpb": 0.92920269, + "max_ppm_bpb": 0.92999574, + "n_ppm_seeds": 3, + "max_size_bytes": 15923629, + "size_legal_all": true, + "smallest_size_margin_bytes": 76371, + "vs_your_PR1750_1_08089556": -0.1512200066666668, + "vs_target_1_05": -0.12032444666666675, + "vs_pure_PR1991_reported_0_94290": -0.013224446666666667, + "vs_0_90": 0.02967555333333327 +} \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/rebuild_and_run_v6_micro_8xh100.sh b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/rebuild_and_run_v6_micro_8xh100.sh new file mode 100755 index 0000000000..a7e71785e5 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/rebuild_and_run_v6_micro_8xh100.sh @@ -0,0 +1,318 @@ +#!/usr/bin/env bash +set -euo pipefail + +export HF_HOME=/workspace/.cache/huggingface +export HF_HUB_ENABLE_HF_TRANSFER=1 +export TOKENIZERS_PARALLELISM=false +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export OMP_NUM_THREADS="${OMP_NUM_THREADS:-16}" + +ROOT="/workspace/final_pr1991_v6" +REPO="$ROOT/parameter-golf" +ART="$ROOT/artifacts" + +REC_SRC="records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_TunedGate" +REC_FINAL="records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed" + +mkdir -p "$ROOT" "$ART" + +echo "=== FINAL 8xH100 PR1991 + V6 MICRO ===" +date +nvidia-smi || true + +# Kill accidental old runs +pkill -TERM -f "torchrun|train_gpt" 2>/dev/null || true +sleep 3 +pkill -KILL -f "torchrun|train_gpt" 2>/dev/null || true + +# Clone PR1991 +if [ ! 
-d "$REPO/.git" ]; then + git clone https://github.com/openai/parameter-golf.git "$REPO" +fi + +cd "$REPO" +git fetch origin main +git reset --hard origin/main +git fetch origin pull/1991/head:pr1991_ppm_o5 +git checkout pr1991_ppm_o5 + +# Dependencies +python3 -m pip install -U pip +python3 -m pip install -U brotli sentencepiece datasets huggingface_hub hf_transfer tqdm numpy + +python3 -m pip install flash_attn_3 --no-deps \ + --find-links https://windreamer.github.io/flash-attention3-wheels/cu128_torch291/ || true + +test -f "$REC_SRC/train_gpt.py" || { + echo "ERROR: missing PR1991 train_gpt.py: $REC_SRC/train_gpt.py" + find records/track_10min_16mb -name train_gpt.py | tail -50 + exit 1 +} + +# Official FineWeb SP8192 +if [ ! -f data/tokenizers/fineweb_8192_bpe.model ] || ! ls data/datasets/fineweb10B_sp8192/fineweb_train_*.bin >/dev/null 2>&1; then + MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf \ + python3 data/cached_challenge_fineweb.py --variant sp8192 --train-shards 80 +fi + +test -f data/tokenizers/fineweb_8192_bpe.model +ls data/datasets/fineweb10B_sp8192/fineweb_train_*.bin >/dev/null +ls data/datasets/fineweb10B_sp8192/fineweb_val_*.bin >/dev/null + +# Build V6-only micro mix and FORCE hardcoded FineWeb path to point to V6-micro dataset. +python3 - <<'PY' +from pathlib import Path +from huggingface_hub import snapshot_download +import json, re, hashlib, os, shutil +import numpy as np +import sentencepiece as spm + +repo = Path("/workspace/final_pr1991_v6/parameter-golf") +data_root = repo / "data/datasets" +base = data_root / "fineweb10B_sp8192" +orig = data_root / "fineweb10B_sp8192_ORIG_OFFICIAL" +dst = data_root / "fineweb10B_sp8192_V6MICRO" +tok = repo / "data/tokenizers/fineweb_8192_bpe.model" + +assert base.exists(), base +assert tok.exists(), tok + +# Preserve original FineWeb once. 
+if base.is_symlink(): + base.unlink() +if not orig.exists(): + base.rename(orig) + +if dst.exists(): + shutil.rmtree(dst) +dst.mkdir(parents=True, exist_ok=True) + +train_files = sorted(orig.glob("fineweb_train_*.bin")) +val_files = sorted(orig.glob("fineweb_val_*.bin")) +assert train_files and val_files, "missing FineWeb train/val files" + +first = train_files[0] + +# Copy first train shard for tiny sparse injection. Symlink rest and ALL official val files. +for p in train_files + val_files: + target = dst / p.name + if p == first: + shutil.copy2(p, target) + else: + os.symlink(p.resolve(), target) + +# Download V6 only. +v6src = Path("/workspace/final_pr1991_v6/v6_source") +snapshot_download( + repo_id="8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering", + repo_type="dataset", + local_dir=str(v6src), +) + +def clean(x): + x = "" if x is None else str(x) + return re.sub(r"\s+", " ", x).strip() + +texts, seen = [], set() + +for fp in v6src.rglob("*"): + if not fp.is_file() or fp.suffix.lower() not in [".jsonl", ".json", ".txt", ".md"]: + continue + + raw = fp.read_text("utf-8", errors="replace") + rows = [] + + if fp.suffix.lower() == ".jsonl": + for line in raw.splitlines(): + if line.strip(): + try: + obj = json.loads(line) + if isinstance(obj, dict): + rows.append(obj) + except Exception: + pass + elif fp.suffix.lower() == ".json": + try: + obj = json.loads(raw) + if isinstance(obj, list): + rows += [x for x in obj if isinstance(x, dict)] + elif isinstance(obj, dict): + rows.append(obj) + except Exception: + pass + else: + rows.append({"text": raw}) + + for obj in rows: + candidates = [] + for k in [ + "plain_text_micro_mix_only", "training_text", "clean_payload", + "payload", "text", "content", "target", "summary" + ]: + v = obj.get(k) + if isinstance(v, str): + candidates.append(v) + + if not candidates: + for v in obj.values(): + if isinstance(v, str) and 60 <= len(v) <= 1200: + candidates.append(v) + + for t in candidates: + t = clean(t) + if 
len(t) < 50: + continue + if len(t) > 700: + t = t[:700] + + # Avoid chat/instruction style contamination. + bad = ["User:", "Assistant:", "Question:", "Answer:", "Do not reveal", "Audit this"] + if any(b in t for b in bad): + continue + + h = hashlib.sha256(t.encode()).hexdigest() + if h not in seen: + seen.add(h) + texts.append(t) + +# Very small: we do not want to wreck PR1991. This is one-card final risk. +texts = sorted(texts, key=lambda x: (len(x), x))[:384] +plain = "\n".join(texts) + "\n" + +sp = spm.SentencePieceProcessor() +sp.Load(str(tok)) +ids = [int(x) for x in sp.EncodeAsIds(plain) if 0 <= int(x) < 8192] +assert ids, "V6 tokenization produced zero ids" + +# Tiny sparse train-only injection. No validation touched. +inject = int(os.environ.get("V6_INJECT_TOKENS", "8192")) +inject = max(2048, min(inject, 16384)) +rep = np.resize(np.array(ids, dtype=np.uint16), inject) + +mm = np.memmap(dst / first.name, dtype=np.uint16, mode="r+") + +# Sparse blocks after warmup. Not prefix injection. +start = 3_000_000 +block = 512 +gap = 500_000 +pos = start +written = 0 + +while written < inject and pos + block < len(mm): + n = min(block, inject - written) + mm[pos:pos+n] = rep[written:written+n] + written += n + pos += gap + +mm.flush() + +manifest = { + "dataset": "V6 Privacy-Web-Filtering only", + "source": "8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering", + "method": "tiny sparse train-only micro injection", + "base_fineweb": str(orig), + "output_dataset": str(dst), + "fineweb_validation": "untouched official fineweb_val symlinks", + "texts": len(texts), + "inject_tokens_requested": inject, + "inject_tokens_written": written, + "sha256_plain": hashlib.sha256(plain.encode()).hexdigest(), + "legal_boundary": "V6 is train-only; not used for validation; not hidden eval data" +} + +(dst / "v6_micro_manifest.json").write_text(json.dumps(manifest, indent=2), "utf-8") + +# Force hardcoded path. 
+if base.exists() or base.is_symlink(): + if base.is_symlink(): + base.unlink() + else: + shutil.rmtree(base) +base.symlink_to(dst.resolve()) + +print(json.dumps(manifest, indent=2)) +print("[OK] fineweb10B_sp8192 now points to:", base.resolve()) +PY + +mkdir -p "$REC_FINAL" +cp "$REC_SRC/train_gpt.py" "$REC_FINAL/train_gpt.py" +cp data/datasets/fineweb10B_sp8192/v6_micro_manifest.json "$REC_FINAL/v6_micro_manifest.json" + +cat > "$REC_FINAL/README.md" <<'MD' +# SP8192 Byte-PPM O=5 + V6 Privacy-Web-Filtering Micro Final + +Base: +- PR1991 SP8192 Byte-PPM Mixer O=5. + +Modification: +- A tiny V6 train-only sparse micro-injection is applied to the first FineWeb train shard. +- FineWeb validation files are untouched official symlinks. +- V6 is not used as validation or evaluation data. +- No validation leakage is intended. + +Risk: +- This is a final-time ablation. If it underperforms pure PR1991, pure PR1991 remains the safer baseline. +MD + +cat > "$REC_FINAL/submission.json" <<'JSON' +{ + "name": "SP8192 Byte-PPM O=5 + V6 Privacy-Web-Filtering Micro", + "author": "Sebastian Laskowski", + "github_id": "Terraforming-Planet", + "claiming_record_candidate": true, + "val_bpb": null, + "notes": "Final 8xH100 3-seed run. V6 is train-only sparse micro-injection; FineWeb validation untouched." +} +JSON + +# Preflight legal boundary. +echo "=== PREFLIGHT DATASET PATHS ===" +readlink -f data/datasets/fineweb10B_sp8192 +ls -lh data/datasets/fineweb10B_sp8192 | head +cat "$REC_FINAL/v6_micro_manifest.json" + +# Run 3 seeds on 8xH100. 
+for S in 42 314 999; do + LOG="$REC_FINAL/train_seed${S}.log" + CONSOLE="$ROOT/final_v6_seed${S}.console.log" + + echo + echo "=== FINAL SEED $S / 8xH100 ===" + date + nvidia-smi || true + + DATA_DIR="./data" \ + DATA_PATH="./data/datasets/fineweb10B_sp8192" \ + TOKENIZER_PATH="./data/tokenizers/fineweb_8192_bpe.model" \ + VOCAB_SIZE=8192 \ + TRAIN_SHARDS_OVERRIDE=80 \ + TRAIN_BATCH_TOKENS=786432 \ + MAX_WALLCLOCK_SECONDS=600 \ + VAL_LOSS_EVERY=0 \ + TRAIN_LOG_EVERY=500 \ + MLP_MULT=3.96875 \ + V6_INJECT_TOKENS=8192 \ + RUN_ID="final_pr1991_v6_8xh100_seed${S}" \ + SEED="$S" \ + torchrun --standalone --nproc_per_node=8 "$REC_FINAL/train_gpt.py" \ + 2>&1 | tee "$LOG" | tee "$CONSOLE" + + echo "=== SEED $S SUMMARY ===" + grep -E "train_files|val_files|ppm_mixer val_bpb|Total submission size|quantized val_bpb|FAILED|ERROR|Root Cause" "$LOG" | tail -120 || true + + tar -czf "$ART/final_pr1991_v6_seed${S}_$(date -u +%Y%m%d_%H%M%S).tar.gz" \ + "$REC_FINAL" "$CONSOLE" 2>/dev/null || true + + ls -lh "$ART" | tail -10 +done + +echo +echo "=== FINAL 3-SEED SUMMARY ===" +grep -R -E "ppm_mixer val_bpb|Total submission size|quantized val_bpb|FAILED|ERROR|Root Cause" "$REC_FINAL" | tee "$ROOT/final_summary.txt" || true + +tar -czf "$ART/final_pr1991_v6_ALL_$(date -u +%Y%m%d_%H%M%S).tar.gz" \ + "$REC_FINAL" "$ROOT/final_summary.txt" 2>/dev/null || true + +echo +echo "=== DONE. 
DO NOT DELETE POD BEFORE COPYING ARTIFACTS ===" +ls -lh "$ART" | tail -20 diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/submission.json b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/submission.json new file mode 100644 index 0000000000..99680333ae --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/submission.json @@ -0,0 +1,19 @@ +{ + "name": "SP8192 Byte-PPM O=5 + V6 Privacy-Web-Filtering Micro", + "author": "Sebastian Laskowski", + "github_id": "Terraforming-Planet", + "claiming_record_candidate": true, + "val_bpb": 0.9296755533333333, + "notes": "Final 8xH100 3-seed run. V6 is train-only sparse micro-injection; FineWeb validation untouched.", + "score_type": "ppm_mixer val_bpb", + "size_legal_all": true, + "dataset_modification": { + "uses_v6": true, + "v6_repo": "8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering", + "use": "train-only sparse micro-injection", + "injected_tokens": 8192, + "fineweb_validation": "untouched official fineweb_val files", + "disclosure": "V6_DATASET_DISCLOSURE.md", + "rebuild_script": "rebuild_and_run_v6_micro_8xh100.sh" + } +} \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_gpt.py b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_gpt.py new file mode 100644 index 0000000000..a11e9e8c1d --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_gpt.py @@ -0,0 +1,2 @@ +import lzma as L,base64 as B 
+exec(L.decompress(B.b85decode(";Rdrj++6@Pn@VT6Qap3bt~@<3h>ok~)Km_aAcM1$ZA=RNsrI&uUw)pb_nMj0LFYCMl-ULtvz!0lTlkwZNfQb9u;zP;lKC6%NM(=|8~7kIg$g6~+qlxGpltspif;>W9Ih1cC**hctLEJ6B&YTfwKJ0OWvU@z19^O+nlrHB%v1xZ$=#-ixbtl-nGL3Bf>&GfOs&4;pcOyIFP?wG$UVPX6|As{dP^I!I=aV9kZdH0NY)ou6M?t6-PDORtT!lLw7%gieUg#!cfdnmHlMw?#1wwQbZ+UaC?0Ql(8+`VS2RY8r~lNW#U;o181JS((eVIlcUk=>>Cz!SU{6t4aOVLm-&7Vg7%!C+B*G18id(l;{JxXPc#$;Vaz&LysYqPZEe8Kfgm$Jt!fbc8=mImg^sreO9+G`ONI7qyP0(r^Rf`1qn3NDP<6cS&r))lJ6DWG+G#}t4I1f=pHgNCiRdyiv3_-wn?M`SOdWqdx}iQpdAD_D}@Cr#sdoUm>K*G4UbBlgvm1C6`zNWuO06kDy>LqeD2LR8v(_cJQ*e_rAYx44H-c8Fuei~ks^8KDG!W6>9k*OZUp;<4k7RLFjriS=xfi9#4^y37LsGz25jid>#8WSZm_gpBl|r38*$xez08MM{Mu$|Ac|tX$_|&Wl7R#sz|3IM&ZNj@1@{hthpg2^5~_41&q0WFP@}6f2Zx(l%{avlWR=0TN+iyuYKb{tSot`=!_jyIcDBk+DeAcn{kKDH*I~*o-1gKEVKD@T6b8AQ;i3~MixEX&3Yg_iDJQ<<=$1|Mc+f(D?iHO{&^fa22L;kGBZ*UQp4Vi(HI;m588wtc9Mgm5oo7Re-}G(8s^SL3`h-wl|3-S3<}I`~HjURu6I%E8-T6^E3*UAM=!AvFn!8;Jhv7L$+1j~VBPy(V+`@$)5by3j^N@h_UvSi`(rL;O&ngu2>So9mp^$Ti&`IPk9m(|2Z;vQPh5N=g7e8YYzJ4daY1gM)F@Y8cyx<8%iZ5>QnhB27#e_ApNHMX8M6&abxN$~7LhR5n_7p{#MlwowjMqdeQfvdaNp=JK)?>kBztJnUd5%4>zznvFOzYE+HRXwr(HW0EGv%*_14RA>REN8;u48>PhrLH4;<~}rEeh!24XJT+mp`u+i@h}7DjO}?$zPxC%{foevuE1grUdLpC%EI}&m}BEw9dL)9b)S$V+tig*_k3<-7B5E=p``iK7(P%!M*{}i;hCkd8T|HiB)xuD3GhBOZX4<5;zV>i>0)-7|?zbBKnvIdb2lBrx&F$8b*WZdy{kBw*!{Ev)(=$yDEdNrRHW2DOc|35N!kZ(L7c6rz2%EujMuuhO~R}l&8>l*vVTW5n6%I)onbBZ4)FF!gBz*<$Jw-M`$@B=J=*AUA9AAJtY0lKwO2%!V%)YtfHEb42BlFJ++wU{MN&3NT}guR|o|gTDMmUjOUVQ!2O$B=iQRzZJ1+X`K9QPy&JvUx8zzcXKmyZP>$kzj_cRalG{z1nF{{fL5^P!SGqZ22s)7h+*NozRevo)iA8P#O^&3EFUwa|M@2cI1QbjK$+A$|ae8g!v>dXdX=hGRur2Hvhs93SG3?^v*TX8~%V_7@UZWnlXfD6wX5h#_7To4svsfS$JrOyxFno+RkH3QL>^s^)1D5#fGFu@#C)$iMOSjSIOw9sX^63+mR>R_-{>_nXQ-Jv?n3xqW41Vyh7G|%;oJ+F3kYuzjpoq83faL6w1#b=zjvJKh(Rn%S_~biaIWU{7fD1?)O%Lj|LqHkIs7)jBjjsjnv$ONFe^jXzJ8S_QM`dkSO3aL2ttv9NOoMt4Rn&=~$vKes?&ZIj3)597xQ<%M8q}-0g>7F#B&Z7Ai_3RL%=Q0A75A8t3~AD-Ym~TO8PEy;dOd_@iXVB}1gVv{hNxYZeHVh@HB8Nj0_hJ(viGc3OW9=NmSWoGz`zn#`G%Ee!D$wp+5J@UFS{KXg9Z|k~Ppq^L!fPWLkC~11cAgV#xNwl?}ZYi?FZes
)xf!HPf8H+IWIc^~!v_Z8I8dvTxF6J!%J`?Wzy`@{w_)2ta_Sk`bH3Bst21dLL9m|EGx#~st2_>S3(Mf+Qb%a-eE>`iZ?!U2pR)695!vrjDqr1zRyp*bq})cQZl-0hLSj&p0czBD|9)AE*0yQm-IAV2K;5WpjkbJUY_!jA2n?*`s@(8C!gMwN+9=@279iHR%rFr((8hn-oeY`c88IfIY>=>m9*fFhGNS5#wfo^~EvHalc$BSTnzRlJGLn9rmc;0Bq$44RjCcFOdz&Up5)XPL3XP)E`u-(KDk)80}x%kWH^@7}_z91}=E}vujl=KD?R{kDT`oenBmL@E5^9j;iY_VZ!zW`KZm0WG9HwrwNu9SAE@uBl;i{y-yJ%m;E``%J)XZu7*uw#=jqn?F$`L2uK0I+^=b_pv_bqz#RrF!f8uk;o|mk+ai&)-NgA>%>M&~(=5Vl+?OmvmLghQsH-+ioZ*W4nn?Zjztjj{1h5TF_lb^bWwSidNOk)0Hz}JB;p)~o^6dx(%CCup?iJyk=Be9K-B!g`Ep<)1K2=LsG!B@30B=sJ7SCPk6(h=p9}1bcRS<#QFAm72B$4u0c9T+fiI+_6-PbT`og;O$6||5xF8rW?v}5T3t2{}sR)nZaxKI@HTFhr_v_LQmlmueyMaevoeIAMfw(2~gC)J$IVoNuFYia>0So8TG?os_rex&A3$jwq+TNf@D5tah_AuRZ09`JPT1Y8;dl@$y_L(Km*^TI#}r3jY~FM1P0ih`c&wu{*)y26T)@R&~kFp5Z}A-ilJhrOs0mFy`OB^n{|kFE~fOW@t;w5iYm)!IBgjP{@{|EhJFvX(g^m}>pzpM?iKuNUL)2}ooG*J&197UUcTu_&E;@MWJZktB34NCu-VBMB}#)hv1^YkfKFifmJjfe~Thh$4ws&-BkN6IKqCaG9}VqtLh<$3IFt~F0#;Nc{od9W9(|A3yJ8h<5+PANituHhHNvrLO6c;qx;X9AWMO~GFEj*|wx#QpP`%F!1kz|a60sf4IY&F;Pl85+)^%gWX7pC2q8d3%i0L9N~R+dKG01)z^;ei)~ez~sPbj1P*S*?R;MgSlsu_n#37>Xc@KqZn7Y|M6Ps`y|Qg8cjeq^qX0P1~MGRqU~s_K!@IwPsD!PavNXCZU`Ycq_6=aXT*b(}hrYn^a_{wN-_`g9M>akTf|gWg_sr6p<}maGO$?mg<}%F1hm5Ss;YvKDhbLF-9vsjjRjhRZ9cUMf=Q+T3uO93-1YWZ91G2kB}h9+?5P4!@1KpMJDE=-g@oI8zFP{UZ<-$nNBIfL3i*CKF$%wG8bX8&;hle)kj`-A#oET{`-q6J@tw8$%ImJKu^l6#v-ZCIBbuqKmH+5!2n)x1W+8=pHq&#}o6VZSFHeL%#*dOi2^HowhI0g2s?PVIU`Y#N*8)$O7A`a-r5N{u~SKy&T38s=~;p(iJhWHU3;|4wjKLJpSd_P6bgw1`JF*GW^dGVK^DYGWdv?Yaa~NLvf^x#$SJ4)~~$vo8)>#Av4wHMC>*oi9$84o~8)C^Zd!&2wdVJ5lbVdiVfVDn89VIlun%EpgRl{#w|1F=YJAT6LZF!+-j77d_6ZQN8pGdG9&N?3H=5OUvOWZ*8&OtVr;5IKX%uwl-#Tb4v%+Nn`h+wCAFD!b^PWIE#C?zI~K?AIxenOu{J@2^CD!V^be-spyWUv5)lW1G)DD|8Kp`LVL}9SAURpoLl}Vb&Q@s)48mOLyFbI+bGo#w9kM#b;nK^ADl{kx@<(`JvqFSWxXtkfScqThmwYT=Ctn=Kl|8pj(G$*k{*4$y4~2krLcqJ0jJj+PaL&gHWmG7B+dEo+-9KlB)=e=!BZly7X#egD~@DOUZ3?b=tq#2-NyHnWP%>iET@99w?J!Y>gHePa&$yy3WP|_W`L{fU9y8gyx=V0x0c#kQ1^;z>g-=IST^*0FFe1FQjpgQXUWa_I}m(tiV9bpNl;ZfgP={F?)AgMFk+RD(oJakJQvbW6iNtwO|Jw1nO>BinJ)QdByGjQ
je%t_IU49R!6b=sA`ZW608u~n(f`g~pbn7lMhak28uJbbimR**{+zWtMyR83$r+yCX8irDxORUs1$)!y{(K_kQoKbX2$Of)ix`WqIJ9@C#Dkk*Lh8FU#9J$Iy-`fvv^RrGozYgaG&;~To**zgl!_HBgv`UT_>rL1>V&RTbSP~n?nBdSMZo&J@rsbGMK~xM@!WVG3v-#H!RNq_$B#g1c;8&RfU%FEU)z&B3>pF~22rzIdy{8R^RNwta4~~ONPeuKD`I+LMpF5NAn5F~}j0B$+{;%tn<8??Wd6I5=>Cg2njAopufL2)Z!0U0H0M(WptBaUj+I*$zvW2a61&9W}qXb}C@abBM*wD(=<<0S89`$*b<~zER0R(5?LQOSDIyGiH2g)h9msM@8(HrwBWeQZN87q797M-2H%OB6OTN0qmC^)!;;FK4h0Mbr%zdL_^%e)HG)h&#rP+Ny%su0HDf#yha9?1aq}nEsz=Q>tl!FGA$(7khVibWxTMmU>-HnR&ALZstXo+npDHIVRy5n@E_QDOFGdf^$2T$pjPS_gfzTb{TuJwBK4GXP8jsBtL-d_Qlk--Gq4B;&RPLTqk%r@dL-mJkL^y1#$MRWT!sgwV`?aIW7uJ5B#BGLpAm3=6JIo8m3QfUU@Ah?@ghUscG_=?1%9NvIM;NTI3OCV7}E(;=yu9_++wG_j${W%jYK(!@mu|mR7Iv3VoI@3GrK=-)XZok|exWHNq0>$tVRfz{**3WNik6)PRqtwt;+nl#2In7@vwe9Qmkc%>gX8oL_UmB;Mb@tVjdBstjhl>Ms<}f{#-Av#eEJV7=&Djg;n3U26nin_0M}|8!Q=)#}I%}RqBjJ6`HR+_)#3aaUJv2=7Aa{co@l+K`eN2o~FJ@M7XrHhi5BJ6-=$H7k_G73&4Evy~V3NKI5@Vldz5KU@+kt)^+$k>UD5?Csoa$ene+S_49x>9G{YXMcsLcc!E#_ix8=aln5RubF>}}cVltu*6B@r7jnY-#E3esR(&qqu^fsXgyCCZOG{~k;}S;zAAAEU;{6?R2Jvki)`d44@7D5ECU0&sd4HFXqRY-+u`qO?PgAMCE!&^6>&sR<&4w+-IiJcOR--tFif#`j2q9&I2JU6g)MsM1!W9(zbUv0=EpKZZC!K)4n&IBwt=xSs>;Q*G=p?TU?6{ncvh%Bxn9$ThW72Z7s$WmV3NlPre}4W}|PdXqIM#Jy2hMGP`I9gI%*odxvW(A7ZrORis>|KW=Wg8k6tL}2{7-fxT~ukmxf{`h(j%fm8#1k?6vgLz7Iu=&b!NmZfs+BNQ%2^0(sAy$luM7RbZu6{Rk{Sniu;Ge|VVtw9Z1_&U-Bezmf(D5M9=q3{%pbLc>cJ=*5SeHU{9}`(1Y2IsK8RNvM^TjSeNniw#iefaJlDCV2;$@x~>da57(kn_RkAZ@sXRR)?O&f{zqLm7t0w%c+vO%qe}F*S^QgV->89u<=8U|SFSH6pNOgPKcM!Vf!#DuT7i^_<~Fr#JMpo;(2RTX@s^tLr|_lH9b1KVzSPAg{F|?_X&2`Tga0Un^Bgkk$G<%J<6KT9dg%k#g)6P46QAWNqX089fMkSDI?M>#uZV7{vb1=j+UKY*%%}u3)b=gz>=6={@Y?@9L*Ce^oBWWq7?$=vIeRS9%Z1MxkG#?Kl4(R?v1G@IEZH6{K4awa>EwdA%t@-wn<%GDf;iICz3WRPO331SpGyz=;7fAb}cFSqwz(A64|zU1>x`K)nWf+%Vba{SjndLw%qV)g4h-L)UaM8sn%t79z4n>wC2m+v^M%cj4L@Lo5?HxSLWq{;({nvKEs7VmcGBAfd~En>J8)^M%f9L8Ahwc>_aWdFiR}0^!Z|An_ToBsshL^C40;PgIYo@Z06zwE3kwBVM+t_W;pvAWdgI@4in%|U1JhAh#C>U^vopCwRZPE#KDp&>2Oy~O*=$?UnGzn7^-8N(C}XG~g!nM-wgwa&)q9!$=d4lgZgKbWSC7NkYARK^-BPB2TBo!(IT;R0Em|X|{oOM4R7
JlfI&{>~%i@iF|a*<_Gq!13yAg82E*T)>k6W3yd-p$FtVx3U&fq@fJ(fMcLdww$cdse?S5XR-ozTSm4Ctl}-pJK8`?tXm+gXt`e%hEVUT0TGx4GFL7ohz!2lSIG6Z8_}K46&|A^2-o%_146E~M+@uvA}D<2j4REpAtRqefa+Wg-FGnJBT}TSvRx34!MOr&B865M>fU(PM&4vu#3Ksu=ssJNeeO^=;L9=NE`&qrV4CU+&3@_H34|LGtP*0KX3LLq&2C$d}gOKxvQC-(s<5B%G~+(6cNe$%Ecqx`{i$y2_kV*p+V>GlNZL&F1wv?k%llRCqzSHyx~#-mKl&f1k10lc`6%*(R=m@=)gim5%71s19eteMFDMjXBJa$OM~YozbPj7${ioIal4BdU$h8L4C9M@Yx`pPFPNlF1g6njRpt$zjkOnE*BSaZ-I!hO5HTZ=A3rseVljkowBI`#@q(qp9^oCJ85hU_Tx170;j8prOuQg_-P3oNcTFxy0*~k(@ciJF=HE0)b1L{QYiqHE$?r6w$dH6>r%5kW_2)UXX#rg;mr6rgG80SHzZ|3Mrz6^5Cj}qFS*%kS=Xn{kNih0Gfr>qW}Db3Fkl(hN>s%5?fBojmAC!Wiv4iimu?Z!hnse+DPzVSYMn589ZsY%DVgleZ0o9$7c^eFr$U8LUbOSul}hfug7BH;~wkDu{)IhU?&1w=LqO%dlD)t(z$Pw86n?C#do*W(!x^IjE{KfbR`mMWcsxCGr!3OSYhYfadzTaoSyoVEAOs|=>8o^N~2On=mSr*7;}=79roJIsCRa@FoKlp~GrjgZ`lG{|PM*N|i;I$RqklX!hH%t@vf6?2n|aux{VRy3^kpZD#&?k4Av@g;mP#v`WXqqwEQFeIoD5r7DGjbqn*oq?XDH10|p%#|LfgsSE2S{6Fh(Mk@oIi!!n@a+7g@PUUj2m$erqXN=XL^j;_-QXQuI^pQOC6Dw*SN4FQ#5rX!QqcZa}+u5?8ubF&Z0DT6Yc@w7tc_j5e5EtZ6(lmh8dJ`-9OTH%F1UHpyAfw0ew$3=m!JH+5osW0;z2!nDF+PuI4iUmY{|toeA_p|`Itpcda(yi;cwFCcJ>hz5vp{7paZdH$78-GWy(zntW;(p|?UEUoQV#QS@+45N@;mg_aEl~Yz{%mMpgLNuA;{(B2P6q6&``Mz7%c++=>{&Kfx=|qV9tc-vh0=w(xe>=ML^Q9|J_zg}<)R3sjeL4N@9A9Y>Qz&qOvw`>GS4GJsb4eQxj#Y9HvuIz?CaI*?oW^r-jHf(cwDPMD%FT4g6b=m%~!T_JN-v*3Ml7>^W*Xhm+L{+QRVOEmA{Zx8zsW4BDnVV8Og_UzWZn!@FY~m7mjH9&)b0I_lAkB738=lOg)mAVy>&azk`%ep(Ux9J5t<9-48ZfJn2><}?pS%b&>AZoJ5ncW_PxpzeAe)6EE;0#(3twqY5+z1RL3+Q3F&z1p*zDLs%FIzJY;@Yp695yf%~m)pRzZ4jNGkUV>bqZ3#ecL-_x5)h2-3K6_JvkqESq4Vijq67hH8y+C^+T|frN8{Zn@((f#jY?UKWDp3`d@Nh{vkI^MNr=DcRfRnThDrl_fBz6h#PP^PlTk_;ws_L)c8oD=`vNtNu_;+m^jH{8EYp4o{`Ivx7c3vaF7R#5Z!m>k)Qujg`O61+`;pF=3vurF|o>lwd-gpAW>8gsh=4IJx^Ir9t8C6|LB%8G+T|E9iKVKc^IB}4~4ZlDRh_l;%v<>R04T_SqDeb@k+fr#z?cio=!`|!P+TBDr?4dK;gK(k-do)4V=NXQ390Ilu!r%iG6-MRQgmKDZTX#Y9tcC3QcRnwWnx_C9a(!ST&D@*nG2a8uC^eF$>5`b*SxI2Rf``VxHwR#YM9fN3E}pAIrQT0oo=J=|%l`gdNGdE0Ij&$ady;)!lJ9rcNC3a7FXSH@eTr7gjEO3ktM*Z7~AEC^3DM}x^GyRKOS!-9^t50S%wMJkp+%|BQ0Z!+!|^UZU1rWLU^~djk%W$pkew=#?8w
2QMkZiK8xBjK^`I8Rd_q9z&Bdbx#{45M-@4LHWNZ0ns`dZH05Mfwcg0-+(srwd%LnFhE;pqRm6u)-lRNR^I3cMbZ8wPkOv!KM~{dQ^C;_q|krLM1G$CbH$!EGaH%qYG^HG+lMm1`1TDycEY~JJ^sn`HvWI!Zr-Ne~xy}30Bot+BWoNKKTVcY<{sK5wBuL|YTHLL_oT8Aj`c?*vTgBm-;JeLqoNoR903zzx@Q@^ocA0TV^FS>*Q)a<=j{H=B#yF1kFji}WJEMB1QkSPStI%Mf5O`(i)LJ=N(Kfdc}}@0Z7aSZlV%2F{ff9PS{&=(az=zCsB!t|wyKHS%AJlmL=j$kA*jPuHC5Ye~_OH~YU>x=V+yS4ipJ=SoVQ)$kHK_^`tt(7uy)*md`Fz6i4bJ`4g>Eg;)i9MEgOI6M}DbJFZ4V}0Pd^m&Si%NlZg?B_TQ0y9whBH%ikM|x8EA^aqz~G$L{foywfO9U*|iQ`21De!W%K=B)9by|b=)N=$}W(xAWd6T1DxM~?Jple6yD-zWC`ZId=3;A0D{x96P6*J`ihl`g>orR7bZmauDcERFwRckWeR$)HU{=AQ%D#LqX{ZsH{unA)NxYCUTQQ1$S__6F_JDF%I7xUwcBAZ>(uo*LnX*{uuius`mrC;ov7RLy&F4`S<6qh&E4B!>VkK{nx7hh$P4Ns(6Lq+B3=G9}XerP2%;5dAuqLlZ1#r0X9DI(&TvN}YmVc9ruU%o5074@MSu|3S5l^n_%nZgKIYHK@8TVA)4Q#rcz}UbB3o*O$K9_=1xh|RDnmF)u6;giwsTDDIM6^CDHi=r7pAb;1ns5&Y)rqLI#$*jN)wx?ot0+3K$Azh|Ih^^kpM+HkXAyvuGUYN!n=Wie?VaA;^h5lWRhuAv&s#7t&zdpG;2FlX?O6QH$OhH*kPMEXAt`l~|*_SUcwzbw47)ZysVT%I80=SVaY4UIt~>kovfVifTNAgw83kpT%9Lk@y}#O)0IrlijYSv(*3Fk{!sjcn;~J`@y0Aw|NUDvZ$sgkxS!FS~#emkJBdN{rB{qabvKNRz2uFR@joc)NBU5zNAB3Ag=oyls&J0Ex}gaJozh;}6SA18hsMXK+89^(t`-Copg0X4=|O*slkO@$EmjM%>aOt#?WnZFWR*)}g^jfztFg;-p(S;rFr+*x>4O9ss|f#?xi^uNTt#;`G<5S;IX>78dAVh8m5+qA*A??{j5V*$qwHsiUfzO_q^Le)!^Qx(4{`w8i`r^D3A<8pvHBEf0zi~4|eGi)?A=A0>r*>*n}cQAWdBlcWI5bHAD|hu;a%k2a*Es6Z3^EXQzcZC2=a^xhxAO@AdaWtc2uNM5x*2~z}eaj2N)ySqKH7s<^;CxJ?95_0>c?if5UiMq;+7XL(jaPwWF)Q#Yl2O!jD$p#`I`Vn7GiRqyxgdq;mj`ck%^h$5FM#?pA?feFh+4-QfvUZ$DZys5_f3J*lh@L-m*c-w{+}vbfZ(`1lS#~<0YN()En(x^8eUpc8*Dc*i`$o&p4gDGI(pb@s#m)95Xm&a`Qr~yo{liFO4-UBb+n)l0EB!`B+KN`IP1+RVo%2)(tH5(361_Hdtu|V1X<&>`o=D+AY%VjH*PdfW)dkDHf8!Ziyz8Mou>^`HLre!DbG`y?!Aw>|<|&39oKWa#45r*x{(W$S8J2ezubc<4$82Q+V+o9HOPbOK9`MSCpP%xHcsd%zsl<1)f*UJa{p4h&;`%(TNc1+lk+7QsyUe{9I^>H)&8S56@TTftoVh-XFmz>d*e;SF;#PcLGu>b7St3`YG$YlZ%f{lo?MF8}>}s)hek8Bu(8Yt>aRz*(Z~#;uA=bhLi{9j<;`ZHopA?|+DK2p%f1v%qPfpv;byR5@vjJsfypV70Fu{D(Sqn;Zde;w#+Y{TzjI^)s|Vl#*DjgzGX{!nE^e!qg|CK?+nt4u;<06jXmVp@;hj*#~}8e`iQoC`Jcmr0_?qCQ*r0XgJr2KaggMsFrzx=4DfJK8;7c-uZW*!?a}&5YF(x&2Ou
w)vQ*EIqDd$0oUvAs2E}!SYS@9#7A_Z){$fRzPbmaliZ>Oq7cBQ0}Fl*&PecE>%1IU4-@DYd!805kyKNGd?Bmzwlz$)1>}puV)3Fca-#G{Qv3tUywWvR5dFbzk#Goa0ht#D`~Vhi|PEc5SS%9QKj0QjaModZ}c_X{`GPkVV1m5e}rR^d3Sf1^#krD@|p3*BAE}pNcm)G-FP62ITFNG@+yoEXOiLoobR4M_p`XG@fJLd`?op@PZw4jGzQ5(_tMH7<|%v~EYPCj`>CCN*3L9}b%zZX9dCRcl_LG0ho?YL%^i8cA#W+6t#ZOWEAdZT)gZw=bPfYp*g&!#+JiXH*!#0NK5UDi?xBi?xXo6q!=-@i5(BMoCwC3Z##2gVfi#DV0v$&^>tJ>*rB9SQc*2P=e_vPfw7Uia&Kj&=F}@##wv8_5^mtEC_l|B2~}eA3IKP0P0y_rSBreYpRAh4}t`~F6|-PMqW;Bjx^1FGdBbM3&Khl9piNdrlJk~G{1Y3eR~o$ujNb)o3`+D~^@%Ua|00dyDw86YNm(k!tgVKbQ5uoK-V#5*sUn`UWZm=;--q<^BV#YtGoTNB2E8P6LCSB^Z#vh)AhcbTy1`5k&csBx%b4$pgnm?88A67Y6Gp|xv_)$tSz^e||8p4Ibax<2SOG;}wx!}l=qu8+8{E{a0j?jzK25`|u%djgE2OYuP`sEgoLR90<*5gtJ>Q(36bB&YtdIF`x@4jIM&GKC}sA@guGHp;ytfL~s9zLm9no?t#c{>(cN*cSWVMdF*?RN6`Ua9^h*zMGOUc~z2ORaxDmhuIZzWr`gHcX4nQISOcrIm20J6L@?Wt=lGQ1M5#$Is)fp`iIY6kwoNn0e1GaDb2Vmu(%zebTo)cGI#b-a!lRhx)Q%CnDSx=&%?+%Q1Xo}CNVH+~$JR4JDIsUl0P|31@0P(^JF3Pch{`88v7S7>Kau7vcG_#IJ_jDue$fGZNtGc(luPK-@mGV6JRVI~HJL$z2)rgRt`UKTB-c@g{RR>aeHtjzj~ob=qd~VlIR%3Fy`YOjtfb+>{62IW%7lHFh8}Ds>C-%n@Mp!1-uvy9sdemOVK?##yf|t7bE5j+@RenZh-eK$`+R-vZtko7PpLdybyKKp5mjDgH@H`QyBCI^&m>5uxwK5jFNY8Cq+kHS$=>7ui?CQfe~T(n}t_297<>Nefm_Xkh&U&eP_Hn`0l(#V24$3^{jL*ClnH?!noi+5P=~iv}rnwCw&kRv0Vq(Ij$XFi5Fxa7>7U@3}#UL$LX#bbXaUb?`4H3I=lJ|Ae;F97mSB-YSLq5foy0ArpFL$39(Lc=%C65GzDx)+EMZiF#t4y{VGM=V_IjZxSdc{5)@lv@`*6fzG*9n%Klep1vMckm7N?SmTgp0QKbh%TddApY(7WtI%|nQn_VyIS*Yp<(L87iAt5}6?N@FpeJR7^7Ot-~Conlmubim^kcP`#_2l&h@l$C)l!dP>6Bl!4I^ywDqy7x!=?=4>B1ZUVG{!~KhAgwh}TZ1I$&V9u2HWH-1lVGnm>8|CyH?idq)wO+#lx+_|H5bv>OW?h|KwL~JX#!x3cVe{$7Z>KLpI5X&pIwXC^lJi{2I$#%%ke`;$O#C3Yns}JUkZhDLmzJ5t~IVQn!v!^h19ZnA;{{ZFi%r+6QczdjywFst0$>P=30@6Wi;c(K)neq+}YS7pHd-s8wq~7cMN>u#dNtch=Ij7!$yr(mJAa$;yPFs;=u@ZF9(|!SAoue%hx6w>qG0e~}8E^!^&d6BfY-}h9ildZ^(FEgj27c&;*nWf+X-ja&7>H^FzFd0eu20Yy9<40$iG&z8?qPVa0VIv+yub7w(!k$KU9kz`x7j_{_3zj@$Cy4p$PMasbBk;DuAF1*j1Nnkav%vQryh{cI1S~PzF&tOKs@nCMK?AG<<CfKc)|L*2*zX3*!g(E5Zf%jq2Mu=yc4(-Vb-?KB(VHJ-ad~|Nn-GAZsv%wRcB4N0-SdUKRSfsB_5<@(|xnP~P{)Z*@AwgJUT}m_alkg(
Rsfu=YlN!fd(Mlyou|1rjS`I9j?LQqhXE83C97RMt7d{~6l(J?J(`S;)0W_E7JuO{6E;4R0%f^ZCD;{DvoGCYIGCaAvlejRgRawoSYmPh&H{cK+zIB^t7Rqcz>8u3_3at6Px(Bc?&ebDMlfILIY7xDH=F=B{1mdcG-YK)XAHtj_ueA0*DZsDoC3*b|+HrugQD0!#EwsjhUQz_eGoJxKUBfBJ%!EVcJp7*&M(pssZ5ri6vVasR%-XiP=wSg%>4wT;A7X~kYTw~Y-7s8_iGf~R*0jXMR(;1wAE6dxyu$zh"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed314.log b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed314.log new file mode 100644 index 0000000000..a7e810da58 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed314.log @@ -0,0 +1,157 @@ +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + dump_ppm_inputs: False + dump_ppm_path: ppm_inputs.npz + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/final_pr1991_v6_8xh100_seed314.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 3.96875 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + ppm_h: 0.99 + 
ppm_l: 0.2 + ppm_mixer_enabled: True + ppm_order: 5 + ppm_t: 0.8 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: final_pr1991_v6_8xh100_seed314 + scalar_lr: 0.02 + seed: 314 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: False + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 0 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35764312 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +1/20000 train_loss: 9.0107 train_time: 0.0m tok/s: 4605666 +2/20000 train_loss: 12.3610 train_time: 0.0m tok/s: 5822040 +3/20000 train_loss: 11.0208 train_time: 0.0m tok/s: 6362748 +4/20000 train_loss: 9.5010 train_time: 0.0m tok/s: 6669661 +5/20000 train_loss: 8.3749 train_time: 0.0m tok/s: 6871203 +500/20000 train_loss: 3.3837 train_time: 0.9m tok/s: 7697016 +1000/20000 train_loss: 3.2831 train_time: 1.7m tok/s: 7696124 +1500/20000 train_loss: 3.1845 train_time: 2.6m tok/s: 7698976 +2000/20000 train_loss: 3.0708 train_time: 3.4m tok/s: 
7701230 +layer_loop:enabled step:2016 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1184 train_time: 4.7m tok/s: 7009665 +3000/20000 train_loss: 2.8981 train_time: 5.9m tok/s: 6638408 +3500/20000 train_loss: 2.9420 train_time: 7.2m tok/s: 6396861 +4000/20000 train_loss: 2.8187 train_time: 8.4m tok/s: 6227300 +4500/20000 train_loss: 2.8390 train_time: 9.7m tok/s: 6100782 +4553/20000 val_loss: 2.8127 val_bpb: 1.0889 +stopping_early: wallclock_cap train_time: 588017ms step: 4553/20000 +peak memory allocated: 39032 MiB reserved: 39060 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80965521 val_bpb:1.08770509 eval_time:6393ms +Serialized model: 134710137 bytes +Code size: 19602 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15900417 bytes +Total submission size quantized+brotli: 15920019 bytes +quantized val_loss:2.84058314 val_bpb:1.09967825 eval_time:9107ms +ppm_mixer val_bpb:0.92917762 eval_time:542203ms order=5 H=0.99 L=0.2 T=0.8 N_bytes=40540160 +quantized_sliding_window val_loss:2.79712804 val_bpb:1.08285543 eval_time:646057ms +[W501 02:43:02.414139702 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.414381503 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.416212804 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) 
+[W501 02:43:02.428627536 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.439898992 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.450654505 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.464633685 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:02.477933291 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 02:43:05.016947195 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed42.log b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed42.log new file mode 100644 index 0000000000..30f9091390 --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed42.log @@ -0,0 +1,157 @@ +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + dump_ppm_inputs: False + dump_ppm_path: ppm_inputs.npz + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: 
logs/final_pr1991_v6_8xh100_seed42.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 3.96875 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + ppm_h: 0.99 + ppm_l: 0.2 + ppm_mixer_enabled: True + ppm_order: 5 + ppm_t: 0.8 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: final_pr1991_v6_8xh100_seed42 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: False + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 0 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35764312 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 
+loop_warmup_step: 20/20 +1/20000 train_loss: 9.0105 train_time: 0.0m tok/s: 4599967 +2/20000 train_loss: 12.2579 train_time: 0.0m tok/s: 5822675 +3/20000 train_loss: 10.9206 train_time: 0.0m tok/s: 6355592 +4/20000 train_loss: 9.4500 train_time: 0.0m tok/s: 6668462 +5/20000 train_loss: 8.3394 train_time: 0.0m tok/s: 6872390 +500/20000 train_loss: 3.3844 train_time: 0.9m tok/s: 7687054 +1000/20000 train_loss: 3.2938 train_time: 1.7m tok/s: 7674764 +1500/20000 train_loss: 3.1901 train_time: 2.6m tok/s: 7672359 +2000/20000 train_loss: 3.0723 train_time: 3.4m tok/s: 7675645 +layer_loop:enabled step:2009 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1258 train_time: 4.7m tok/s: 7027701 +3000/20000 train_loss: 2.9014 train_time: 5.9m tok/s: 6621443 +3500/20000 train_loss: 2.9413 train_time: 7.2m tok/s: 6380760 +4000/20000 train_loss: 2.8236 train_time: 8.4m tok/s: 6211389 +4500/20000 train_loss: 2.8408 train_time: 9.7m tok/s: 6085636 +4544/20000 val_loss: 2.8157 val_bpb: 1.0900 +stopping_early: wallclock_cap train_time: 588138ms step: 4544/20000 +peak memory allocated: 39032 MiB reserved: 39060 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81258828 val_bpb:1.08884057 eval_time:7100ms +Serialized model: 134710137 bytes +Code size: 19602 bytes +GPTQ:collecting Hessians from calibration data... 
+GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15904027 bytes +Total submission size quantized+brotli: 15923629 bytes +quantized val_loss:2.84174146 val_bpb:1.10012668 eval_time:26443ms +ppm_mixer val_bpb:0.92982823 eval_time:550024ms order=5 H=0.99 L=0.2 T=0.8 N_bytes=40540160 +quantized_sliding_window val_loss:2.79866081 val_bpb:1.08344881 eval_time:681318ms +[W430 23:34:06.245861495 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:06.482748167 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:06.483040770 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:06.495277661 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:06.562387519 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:07.588518619 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:07.629514637 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:07.715614722 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W430 23:34:09.068238667 AllocatorConfig.cpp:28] Warning: 
PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed999.log b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed999.log new file mode 100644 index 0000000000..abb324e5eb --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/train_seed999.log @@ -0,0 +1,157 @@ +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + dump_ppm_inputs: False + dump_ppm_path: ppm_inputs.npz + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/final_pr1991_v6_8xh100_seed999.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 3.96875 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + ppm_h: 0.99 + ppm_l: 0.2 + ppm_mixer_enabled: True + ppm_order: 5 + ppm_t: 0.8 + qk_gain_init: 5.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: final_pr1991_v6_8xh100_seed999 + scalar_lr: 0.02 + 
seed: 999 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: False + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 0 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40540160 +model_params:35764312 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +1/20000 train_loss: 9.0111 train_time: 0.0m tok/s: 8221544 +2/20000 train_loss: 12.3016 train_time: 0.0m tok/s: 8128846 +3/20000 train_loss: 10.9530 train_time: 0.0m tok/s: 8028829 +4/20000 train_loss: 9.4453 train_time: 0.0m tok/s: 7972721 +5/20000 train_loss: 8.2898 train_time: 0.0m tok/s: 7942050 +500/20000 train_loss: 3.3832 train_time: 0.9m tok/s: 7702813 +1000/20000 train_loss: 3.2905 train_time: 1.7m tok/s: 7694181 +1500/20000 train_loss: 3.1905 train_time: 2.6m tok/s: 7695725 +2000/20000 train_loss: 3.0710 train_time: 3.4m tok/s: 7697429 +layer_loop:enabled step:2015 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.1246 train_time: 4.7m tok/s: 6987193 +3000/20000 train_loss: 2.9047 train_time: 5.9m tok/s: 6620802 +3500/20000 
train_loss: 2.9401 train_time: 7.2m tok/s: 6380800 +4000/20000 train_loss: 2.8176 train_time: 8.4m tok/s: 6213661 +4500/20000 train_loss: 2.8403 train_time: 9.7m tok/s: 6089751 +4546/20000 val_loss: 2.8162 val_bpb: 1.0903 +stopping_early: wallclock_cap train_time: 588019ms step: 4546/20000 +peak memory allocated: 39032 MiB reserved: 39060 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81304504 val_bpb:1.08901740 eval_time:6425ms +Serialized model: 134710137 bytes +Code size: 19602 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15899692 bytes +Total submission size quantized+brotli: 15919294 bytes +quantized val_loss:2.84265687 val_bpb:1.10048106 eval_time:8984ms +ppm_mixer val_bpb:0.92987519 eval_time:553513ms order=5 H=0.99 L=0.2 T=0.8 N_bytes=40540160 +quantized_sliding_window val_loss:2.79938121 val_bpb:1.08372770 eval_time:657172ms +[W501 03:06:44.198722685 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.200406634 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.237823533 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.283049847 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.283911064 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use 
PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.288133892 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.381828355 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:44.383487983 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) +[W501 03:06:47.768174592 AllocatorConfig.cpp:28] Warning: PYTORCH_CUDA_ALLOC_CONF is deprecated, use PYTORCH_ALLOC_CONF instead (function operator()) diff --git a/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/v6_micro_manifest.json b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/v6_micro_manifest.json new file mode 100644 index 0000000000..eaef419c7e --- /dev/null +++ b/records/track_10min_16mb/2026-04-30_SP8192_PPMMixer_O5_V6Micro_Final3Seed/v6_micro_manifest.json @@ -0,0 +1,13 @@ +{ + "dataset": "V6 Privacy-Web-Filtering only", + "source": "8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering", + "method": "tiny sparse train-only micro injection", + "base_fineweb": "/workspace/final_pr1991_v6/parameter-golf/data/datasets/fineweb10B_sp8192_ORIG_OFFICIAL", + "output_dataset": "/workspace/final_pr1991_v6/parameter-golf/data/datasets/fineweb10B_sp8192_V6MICRO", + "fineweb_validation": "untouched official fineweb_val symlinks", + "texts": 3, + "inject_tokens_requested": 8192, + "inject_tokens_written": 8192, + "sha256_plain": "9e9ef5026372cac74f1557c5fdec2efa24b32178e93a4d463f17811d9b59805f", + "legal_boundary": "V6 is train-only; not used for validation; not hidden eval data" +} \ No newline at end of file diff --git a/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/README.md b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/README.md new file mode 100644 
index 0000000000..b043fe455a --- /dev/null +++ b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/README.md @@ -0,0 +1,94 @@ +# Non-Record Submission: V5 SP1024 + Seq4096 (1xH100) + +## Overview + +This directory contains a **non-record** 16MB submission for OpenAI Parameter Golf. +The run uses the official FineWeb SentencePiece-1024 tokenization path with +`TRAIN_SEQ_LEN=4096` on a single H100 GPU. + +- Run name / ID: `v5_sp1024_top10_a` +- Track: `non-record-16mb` +- Hardware: `1xH100` +- Stop step: `6000` +- Wallclock (recorded): `3218` seconds + +## Why this is non-record + +This run is explicitly submitted under **non-record-16mb** and does not claim +record-track status. It was not executed under the official 10-minute / 8xH100 +record constraint. + +## Exact metrics + +Final post-quantized round-trip validation metrics (exact values): + +- `val_loss`: **2.05029752** +- `val_bpb`: **1.21430168** + +Submission size accounting: + +- int8+zlib model size: **15793702** bytes +- code size: **47686** bytes +- total submission size: **15841388** bytes + +Additional run values: + +- `step_stop`: **6000** +- In-log pre-roundtrip evaluation at step 6000: + - `val_loss`: `2.0411` + - `val_bpb`: `1.2089` + +## Training configuration + +From the run log and metadata: + +- Dataset path pattern: `./data/datasets/fineweb10B_sp1024/fineweb_train_*.bin` +- Validation path pattern: `./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin` +- Tokenizer: `./data/tokenizers/fineweb_1024_bpe.model` +- `TRAIN_SEQ_LEN=4096` +- `TRAIN_BATCH_TOKENS=524288` +- `ITERATIONS=6000` +- `WARMUP_STEPS=30` +- Seed: `1337` + +## Model architecture + +The training code used for this run defines a GPT-style model with: + +- Vocabulary size: `1024` +- Transformer layers: `9` +- Model dimension: `512` +- Attention heads: `8` +- KV heads (GQA): `4` +- MLP multiplier: `2` +- Tied embeddings: enabled +- Reported parameter count in run log: `17059912` + +## Quantization / 
serialization + +The final model artifact is stored as an int8 + zlib-compressed payload. + +- Serialized FP model size (from log): `67224983` bytes +- Serialized int8+zlib model size: `15793702` bytes +- Total submission size int8+zlib: `15841388` bytes + +## Included files + +- `README.md` — submission documentation +- `submission.json` — metadata for leaderboard ingestion +- `results.tsv` — tabular metrics row +- `train_gpt.py` — run-local training script snapshot +- `train.log` — recovered run log (script + execution output) +- `final_model.int8.ptz` — final quantized artifact + +## Limitations / next steps + +- This is intentionally a non-record run and should not be compared as a + record-track attempt. +- The recovered `train.log` includes both script text and execution output in + one file. +- Potential follow-up work: + - add a cleaner separated stdout log artifact format, + - add repeated-seed non-record runs for variance estimates, + - continue tuning for lower post-quant `val_bpb` while staying under the + 16MB cap. 
diff --git a/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/final_model.int8.ptz b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/final_model.int8.ptz new file mode 100644 index 0000000000..7947417519 Binary files /dev/null and b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/final_model.int8.ptz differ diff --git a/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/results.tsv b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/results.tsv new file mode 100644 index 0000000000..69824df486 --- /dev/null +++ b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/results.tsv @@ -0,0 +1,2 @@ +run_id track gpu tokenizer train_seq_len iterations warmup_steps val_loss_exact val_bpb_exact bytes_total +v5_sp1024_top10_a non-record-16mb 1xH100 sp1024 4096 6000 30 2.05029752 1.21430168 15841388 diff --git a/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/submission.json b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/submission.json new file mode 100644 index 0000000000..0f65808d8c --- /dev/null +++ b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/submission.json @@ -0,0 +1,18 @@ +{ + "author": "Sebastian Laskowski", + "github_id": "Terraforming-Planet", + "name": "V5 SP1024 + Seq4096 (1xH100)", + "blurb": "Non-record 1xH100 submission on the official FineWeb sp1024 path. 
Uses TRAIN_SEQ_LEN=4096 and V5 code-side improvements to reach post-quant val_bpb 1.21430168 under the 16MB artifact cap.", + "date": "2026-04-17T00:00:00Z", + "track": "non-record-16mb", + "val_loss": 2.05029752, + "val_bpb": 1.21430168, + "pre_quant_val_loss": null, + "pre_quant_val_bpb": null, + "step_stop": 6000, + "wallclock_seconds": 3218, + "bytes_total": 15841388, + "bytes_model_int8_zlib": 15793702, + "bytes_code": 47686, + "gpu": "1xH100" +} diff --git a/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/train.log b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/train.log new file mode 100644 index 0000000000..b1258e76e6 --- /dev/null +++ b/records/track_non_record_16mb/2026-04-17_V5_SP1024_SEQ4096_1xH100_121430168/train.log @@ -0,0 +1,1174 @@ +""" +The `train_gpt.py` and `train_gpt_mlx.py` scripts are intended as good launching-off points for new participants, not SOTA configs. We'll accept PRs that tune, improve, or simplify these scripts without significantly increasing complexity, but competitive submissions should stay in the `/records` folder. + +Hard stop: To keep readable for newcomers, let's make sure `train_gpt.py` and `train_gpt_mlx.py` never are longer than 1500 lines. 
"""

from __future__ import annotations

import copy
import glob
import io
import math
import os
import random
import subprocess
import sys
import time
import uuid
import zlib
from pathlib import Path

import numpy as np
import sentencepiece as spm
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import Tensor, nn
from torch.nn.parallel import DistributedDataParallel as DDP

# -----------------------------
# HYPERPARAMETERS
# -----------------------------
# Default Simple Baseline run:
# - 9 transformer blocks at width 512
# - 8 attention heads with 4 KV heads (GQA) and 2x MLP expansion
# - vocab size 1024, sequence length 1024, tied embeddings
# - 524,288 train tokens per step for 20,000 iterations with a ~10 minute cap

class Hyperparameters:
    """Run configuration. Every knob is overridable via an environment variable
    so sweep scripts can reconfigure a run without editing this file."""

    # Data paths are shard globs produced by the existing preprocessing pipeline.
    data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024")
    train_files = os.path.join(data_path, "fineweb_train_*.bin")
    val_files = os.path.join(data_path, "fineweb_val_*.bin")
    tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model")
    run_id = os.environ.get("RUN_ID", str(uuid.uuid4()))
    seed = int(os.environ.get("SEED", 1337))

    # Validation cadence and batch size. Validation always uses the full fineweb_val split.
    val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288))
    val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 1000))
    train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 200))

    # Training length.
    iterations = int(os.environ.get("ITERATIONS", 20000))
    warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 1200))
    warmup_steps = int(os.environ.get("WARMUP_STEPS", 20))
    train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 524_288))
    train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 1024))
    # Hard wallclock budget; training stops early once this is exceeded.
    max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0))
    qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5))

    # Model shape.
    vocab_size = int(os.environ.get("VOCAB_SIZE", 1024))
    num_layers = int(os.environ.get("NUM_LAYERS", 9))
    num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4))
    model_dim = int(os.environ.get("MODEL_DIM", 512))
    num_heads = int(os.environ.get("NUM_HEADS", 8))
    mlp_mult = int(os.environ.get("MLP_MULT", 2))
    tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1")))
    rope_base = float(os.environ.get("ROPE_BASE", 10000.0))
    logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0))

    # Optimizer hyperparameters.
    embed_lr = float(os.environ.get("EMBED_LR", 0.6))
    head_lr = float(os.environ.get("HEAD_LR", 0.008))
    tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.05))
    tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005))
    matrix_lr = float(os.environ.get("MATRIX_LR", 0.04))
    scalar_lr = float(os.environ.get("SCALAR_LR", 0.04))
    muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.95))
    muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5))
    muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.85))
    muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 500))
    beta1 = float(os.environ.get("BETA1", 0.9))
    beta2 = float(os.environ.get("BETA2", 0.95))
    adam_eps = float(os.environ.get("ADAM_EPS", 1e-8))
    grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.0))

# -----------------------------
# MUON OPTIMIZER
# -----------------------------
#
# As borrowed from modded-nanogpt
# Background on Muon: https://kellerjordan.github.io/posts/muon/

def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor:
    # Orthogonalize a 2D update matrix with a fast Newton-Schulz iteration.
    # Muon uses this to normalize matrix-shaped gradients before applying them.
    # The iteration runs in bf16 for speed; the (a, b, c) quintic coefficients
    # are the standard tuned values from the Muon reference implementation.
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= X.norm() + eps
    # Work with the wide orientation so A = X @ X.T is the smaller Gram matrix.
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A
        X = a * X + B @ X
    return X.T if transposed else X


class Muon(torch.optim.Optimizer):
    """Momentum + Newton-Schulz orthogonalization optimizer for matrix params.

    Work is sharded round-robin across ranks (param i is processed by rank
    i % world_size); the per-param updates are gathered with one all_reduce.
    """

    def __init__(self, params, lr: float, momentum: float, backend_steps: int, nesterov: bool = True):
        super().__init__(
            params,
            dict(lr=lr, momentum=momentum, backend_steps=backend_steps, nesterov=nesterov),
        )

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        distributed = dist.is_available() and dist.is_initialized()
        world_size = dist.get_world_size() if distributed else 1
        rank = dist.get_rank() if distributed else 0

        for group in self.param_groups:
            params = group["params"]
            if not params:
                continue
            lr = group["lr"]
            momentum = group["momentum"]
            backend_steps = group["backend_steps"]
            nesterov = group["nesterov"]

            # Flat bf16 buffer holding every param's update; ranks that did not
            # compute a given slice leave zeros, so the SUM all_reduce below
            # reassembles the full set of updates on every rank.
            total_params = sum(int(p.numel()) for p in params)
            updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16)

            curr = 0
            for i, p in enumerate(params):
                if i % world_size == rank and p.grad is not None:
                    g = p.grad
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf = state["momentum_buffer"]
                    buf.mul_(momentum).add_(g)
                    if nesterov:
                        g = g.add(buf, alpha=momentum)
                    g = zeropower_via_newtonschulz5(g, steps=backend_steps)
                    # Scale correction from Muon reference implementations.
                    g *= max(1, g.size(0) / g.size(1)) ** 0.5
                    updates_flat[curr : curr + p.numel()] = g.reshape(-1)
                # Offset advances for every param so slice layout matches the
                # read-back loop below, regardless of which rank owned it.
                curr += p.numel()

            if distributed:
                dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            curr = 0
            for p in params:
                g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype)
                p.add_(g, alpha=-lr)
                curr += p.numel()

        return loss


# -----------------------------
# TOKENIZER-AGNOSTIC EVALUATION SETUP
# -----------------------------
#
# It's common for small models to have a large fraction of their parameters be embeddings, since the 2 * d_model * d_vocab vectors can be gigantic.
# Instead of locking the tokenizer, we let you bring your own and calculate our validation metrics on the average compression of the validation set.
# We calculate BPB (bits-per-byte) instead of validation loss, so we need methods to count the number of bytes per token in the tokenizer.
# Note: Submissions that edit the tokenizer will be examined more carefully, since screwing this up might unjustly improve your score.

def build_sentencepiece_luts(
    sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device
) -> tuple[Tensor, Tensor, Tensor]:
    """Precompute per-token lookup tables for the bits-per-byte metric.

    Returns three aligned tensors indexed by token id:
    - base UTF-8 byte count of each piece,
    - whether the piece starts with the SentencePiece leading-space marker,
    - whether the id is a boundary token (control/unknown/unused).
    """
    sp_size = int(sp.vocab_size())
    n_rows = max(sp_size, vocab_size)
    byte_counts = np.zeros((n_rows,), dtype=np.int16)
    leading_space = np.zeros((n_rows,), dtype=np.bool_)
    boundary = np.ones((n_rows,), dtype=np.bool_)
    for tid in range(sp_size):
        # Control/unknown/unused ids stay flagged as boundaries with 0 bytes.
        if sp.is_control(tid) or sp.is_unknown(tid) or sp.is_unused(tid):
            continue
        boundary[tid] = False
        if sp.is_byte(tid):
            byte_counts[tid] = 1
            continue
        piece = sp.id_to_piece(tid)
        if piece.startswith("▁"):
            leading_space[tid] = True
            piece = piece[1:]
        byte_counts[tid] = len(piece.encode("utf-8"))

    def as_device_tensor(arr, dtype):
        # Single place to move the numpy LUTs onto the eval device.
        return torch.tensor(arr, dtype=dtype, device=device)

    return (
        as_device_tensor(byte_counts, torch.int16),
        as_device_tensor(leading_space, torch.bool),
        as_device_tensor(boundary, torch.bool),
    )


def load_validation_tokens(pattern: str, seq_len: int) -> Tensor:
    """Concatenate validation shards and trim to whole sequences (plus one shift token)."""
    shard_paths = [Path(p) for p in sorted(glob.glob(pattern))]
    if not shard_paths:
        raise FileNotFoundError(f"No files found for pattern: {pattern}")
    # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*.
    stream = torch.cat([load_data_shard(shard) for shard in shard_paths]).contiguous()
    usable = ((stream.numel() - 1) // seq_len) * seq_len
    if usable <= 0:
        raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}")
    return stream[: usable + 1]


def eval_val(
    args: Hyperparameters,
    model: nn.Module,
    rank: int,
    world_size: int,
    device: torch.device,
    grad_accum_steps: int,
    val_tokens: Tensor,
    base_bytes_lut: Tensor,
    has_leading_space_lut: Tensor,
    is_boundary_token_lut: Tensor,
) -> tuple[float, float]:
    """Evaluate on the full validation stream; return (val_loss, val_bpb).

    - val_loss: token-level cross-entropy (natural log)
    - val_bpb: tokenizer-agnostic bits-per-byte compression metric
    Sequences are split contiguously across ranks and results are all_reduced.
    """
    tokens_per_rank_batch = args.val_batch_size // (world_size * grad_accum_steps)
    if tokens_per_rank_batch < args.train_seq_len:
        raise ValueError(
            "VAL_BATCH_SIZE must provide at least one sequence per rank; "
            f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, "
            f"GRAD_ACCUM_STEPS={grad_accum_steps}, TRAIN_SEQ_LEN={args.train_seq_len}"
        )
    seqs_per_rank_batch = tokens_per_rank_batch // args.train_seq_len
    n_seqs = (val_tokens.numel() - 1) // args.train_seq_len
    # Contiguous per-rank shard of the sequence index space.
    first_seq = (n_seqs * rank) // world_size
    last_seq = (n_seqs * (rank + 1)) // world_size
    loss_sum = torch.zeros((), device=device, dtype=torch.float64)
    token_count = torch.zeros((), device=device, dtype=torch.float64)
    byte_count = torch.zeros((), device=device, dtype=torch.float64)

    model.eval()
    with torch.inference_mode():
        for seq_lo in range(first_seq, last_seq, seqs_per_rank_batch):
            seq_hi = min(seq_lo + seqs_per_rank_batch, last_seq)
            tok_lo = seq_lo * args.train_seq_len
            tok_hi = seq_hi * args.train_seq_len + 1  # +1 token for the target shift
            window = val_tokens[tok_lo:tok_hi].to(device=device, dtype=torch.int64, non_blocking=True)
            x = window[:-1].reshape(-1, args.train_seq_len)
            y = window[1:].reshape(-1, args.train_seq_len)
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                mean_loss = model(x, y).detach()
            n_tokens = float(y.numel())
            loss_sum += mean_loss.to(torch.float64) * n_tokens
            token_count += n_tokens
            # Byte accounting: each target contributes its base bytes, plus one
            # byte for a leading space unless the previous token is a boundary.
            flat_prev = x.reshape(-1)
            flat_tgt = y.reshape(-1)
            n_bytes = base_bytes_lut[flat_tgt].to(dtype=torch.int16)
            n_bytes += (has_leading_space_lut[flat_tgt] & ~is_boundary_token_lut[flat_prev]).to(dtype=torch.int16)
            byte_count += n_bytes.to(torch.float64).sum()

    if dist.is_available() and dist.is_initialized():
        for accumulator in (loss_sum, token_count, byte_count):
            dist.all_reduce(accumulator, op=dist.ReduceOp.SUM)

    mean_val_loss = loss_sum / token_count
    bits_per_token = mean_val_loss.item() / math.log(2.0)
    tokens_per_byte = token_count.item() / byte_count.item()
    model.train()
    return float(mean_val_loss.item()), float(bits_per_token * tokens_per_byte)

# -----------------------------
# POST-TRAINING QUANTIZATION
# -----------------------------
#
# It's silly to export our model, which is trained in bf16 and fp32, at that same precision.
# Instead, we get approximately the same model (with a small hit) by quantizing the model to int8 & zlib compressing.
# We can then decompress the model and run in higher precision for evaluation, after closing in under the size limit.

# Substring patterns naming small "control" tensors (gains, scales, mixes) that
# must stay in float; both lists are overridable via environment variables.
CONTROL_TENSOR_NAME_PATTERNS = tuple(
    pattern
    for pattern in os.environ.get(
        "CONTROL_TENSOR_NAME_PATTERNS",
        "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights",
    ).split(",")
    if pattern
)
INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple(
    pattern
    for pattern in os.environ.get(
        "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS",
        ",".join(CONTROL_TENSOR_NAME_PATTERNS),
    ).split(",")
    if pattern
)
# Float tensors at or below this element count bypass int8 quantization entirely.
INT8_KEEP_FLOAT_MAX_NUMEL = 65_536
INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16
INT8_PER_ROW_SCALE_DTYPE = torch.float16
# Clip outliers at this percentile of |w| before computing the int8 scale.
INT8_CLIP_PERCENTILE = 99.99984
INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0

def tensor_nbytes(t: Tensor) -> int:
    # Raw storage size of a tensor in bytes (numel * element size).
    return int(t.numel()) * int(t.element_size())

def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor:
    # Decide how a small float tensor is stored without quantization:
    # control tensors stay fp32; other fp32/bf16 tensors are downcast to fp16
    # (recording the original dtype so dequantization can restore it).
    if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS):
        return t.float().contiguous()
    if t.dtype in {torch.float32, torch.bfloat16}:
        passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.")
        return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous()
    return t

def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]:
    # Quantize a float tensor to symmetric int8 in [-127, 127].
    # Returns (int8 tensor, scale); scale is per-row for matrices, per-tensor otherwise.
    t32 = t.float()
    if t32.ndim == 2:
        # Matrices get one scale per row, which usually tracks output-channel
        # ranges much better than a single tensor-wide scale.
        clip_abs = (
            torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1)
            if t32.numel()
            else torch.empty((t32.shape[0],), dtype=torch.float32)
        )
        clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None])
        # clamp_min keeps near-zero rows from producing a degenerate scale.
        scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0)
        q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous()
        return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous()

    # Vectors / scalars use a simpler per-tensor scale.
    clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0
    scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32)
    q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous()
    return q, scale

def quantize_state_dict_int8(state_dict: dict[str, Tensor]):
    # Single supported clean-script export format:
    # - per-row int8 for 2D float tensors
    # - per-tensor int8 for other float tensors
    # - exact passthrough for non-floats
    # - passthrough for small float tensors, stored as fp16 to save bytes
    # Returns (serializable payload dict, byte/tensor-count stats dict).
    quantized: dict[str, Tensor] = {}
    scales: dict[str, Tensor] = {}
    dtypes: dict[str, str] = {}
    passthrough: dict[str, Tensor] = {}
    passthrough_orig_dtypes: dict[str, str] = {}
    qmeta: dict[str, dict[str, object]] = {}
    stats = dict.fromkeys(
        ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"),
        0,
    )

    for name, tensor in state_dict.items():
        t = tensor.detach().to("cpu").contiguous()
        stats["param_count"] += int(t.numel())
        stats["num_tensors"] += 1
        stats["baseline_tensor_bytes"] += tensor_nbytes(t)

        # Integer/bool tensors are stored exactly as-is.
        if not t.is_floating_point():
            stats["num_nonfloat_tensors"] += 1
            passthrough[name] = t
            stats["int8_payload_bytes"] += tensor_nbytes(t)
            continue

        # Small float tensors are cheap enough to keep directly. We still downcast
        # fp32/bf16 passthrough tensors to fp16 so metadata does not dominate size.
        if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL:
            kept = keep_float_tensor(name, t, passthrough_orig_dtypes)
            passthrough[name] = kept
            stats["int8_payload_bytes"] += tensor_nbytes(kept)
            continue

        stats["num_float_tensors"] += 1
        q, s = quantize_float_tensor(t)
        # A non-scalar scale means the per-row scheme was used; record it so
        # dequantization knows which broadcast to apply.
        if s.ndim > 0:
            qmeta[name] = {"scheme": "per_row", "axis": 0}
        quantized[name] = q
        scales[name] = s
        dtypes[name] = str(t.dtype).removeprefix("torch.")
        stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s)

    obj: dict[str, object] = {
        "__quant_format__": "int8_clean_per_row_v1",
        "quantized": quantized,
        "scales": scales,
        "dtypes": dtypes,
        "passthrough": passthrough,
    }
    if qmeta:
        obj["qmeta"] = qmeta
    if passthrough_orig_dtypes:
        obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes
    return obj, stats

def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]:
    # Inverse of quantize_state_dict_int8: rebuild a float state_dict from the
    # int8 payload produced above.
    out: dict[str, Tensor] = {}
    qmeta = obj.get("qmeta", {})
    passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {})
    for name, q in obj["quantized"].items():
        dtype = getattr(torch, obj["dtypes"][name])
        s = obj["scales"][name]
        if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0:
            s = s.to(dtype=torch.float32)
            # Broadcast the saved row scale back across trailing dimensions.
            out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous()
        else:
            scale = float(s.item())
            out[name] = (q.float() * scale).to(dtype=dtype).contiguous()
    for name, t in obj["passthrough"].items():
        # Restore small tensors, undoing the temporary fp16 storage cast if needed.
+ out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out + + +# ----------------------------- +# DATA LOADING +# ----------------------------- + +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) + + +class DistributedTokenLoader: + # Each call consumes a contiguous chunk from the shared token stream, then slices out + # one disjoint span per rank. The extra "+1" token lets us build (x, y) by shifting. 
    def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device):
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.stream = TokenStream(pattern)

    def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]:
        # Consume one contiguous chunk from the shared stream and slice out this
        # rank's disjoint span; the extra "+1" token builds (x, y) by shifting.
        local_tokens = global_tokens // (self.world_size * grad_accum_steps)
        per_rank_span = local_tokens + 1
        chunk = self.stream.take(per_rank_span * self.world_size)
        start = self.rank * per_rank_span
        local = chunk[start : start + per_rank_span].to(dtype=torch.int64)
        x = local[:-1].reshape(-1, seq_len)
        y = local[1:].reshape(-1, seq_len)
        return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True)

# -----------------------------
# TRANSFORMER MODULES
# -----------------------------

class RMSNorm(nn.Module):
    # Parameter-free RMS normalization (no learned gain).
    def __init__(self, eps: float | None = None):
        super().__init__()
        self.eps = eps

    def forward(self, x: Tensor) -> Tensor:
        return F.rms_norm(x, (x.size(-1),), eps=self.eps)


class CastedLinear(nn.Linear):
    # Keep weights in fp32 for optimizer/state quality, cast at matmul time for bf16 compute.
    def forward(self, x: Tensor) -> Tensor:
        bias = self.bias.to(x.dtype) if self.bias is not None else None
        return F.linear(x, self.weight.to(x.dtype), bias)


def restore_low_dim_params_to_fp32(module: nn.Module) -> None:
    # Keep small/control parameters in fp32 even when the model body runs in bf16.
    with torch.no_grad():
        for name, param in module.named_parameters():
            if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32:
                param.data = param.data.float()


class Rotary(nn.Module):
    # Caches cos/sin tables per sequence length on the current device.
    def __init__(self, dim: int, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._seq_len_cached = 0
        self._cos_cached: Tensor | None = None
        self._sin_cached: Tensor | None = None

    def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]:
        # Rebuild the cache only when the sequence length or device changes.
        if (
            self._cos_cached is None
            or self._sin_cached is None
            or self._seq_len_cached != seq_len
            or self._cos_cached.device != device
        ):
            t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device))
            self._cos_cached = freqs.cos()[None, None, :, :]
            self._sin_cached = freqs.sin()[None, None, :, :]
            self._seq_len_cached = seq_len
        return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype)


def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor) -> Tensor:
    # Rotate the two halves of the head dimension by the cached angles.
    half = x.size(-1) // 2
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1)


class CausalSelfAttention(nn.Module):
    # Causal attention with GQA, QK RMS-norm, RoPE, and a learned per-head query gain.
    def __init__(
        self,
        dim: int,
        num_heads: int,
        num_kv_heads: int,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        if num_heads % num_kv_heads != 0:
            raise ValueError("num_heads must be divisible by num_kv_heads")
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = dim // num_heads
        if self.head_dim % 2 != 0:
            raise ValueError("head_dim must be even for RoPE")
        kv_dim = self.num_kv_heads * self.head_dim
        self.c_q = CastedLinear(dim, dim, bias=False)
        self.c_k = CastedLinear(dim, kv_dim, bias=False)
        self.c_v = CastedLinear(dim, kv_dim, bias=False)
        self.proj = CastedLinear(dim, dim, bias=False)
        # Zero-init the output projection so each block starts as an identity residual.
        self.proj._zero_init = True
        self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32))
        self.rotary = Rotary(self.head_dim, base=rope_base)

    def forward(self, x: Tensor) -> Tensor:
        bsz, seqlen, dim = x.shape
        q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.c_v(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2)
        # QK-norm before RoPE stabilizes attention logits at high learning rates.
        q = F.rms_norm(q, (q.size(-1),))
        k = F.rms_norm(k, (k.size(-1),))
        cos, sin = self.rotary(seqlen, x.device, q.dtype)
        q = apply_rotary_emb(q, cos, sin)
        k = apply_rotary_emb(k, cos, sin)
        # Learned per-head gain on normalized queries (the "QK-Gain" knob).
        q = q * self.q_gain.to(dtype=q.dtype)[None, :, None, None]
        y = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            is_causal=True,
            enable_gqa=(self.num_kv_heads != self.num_heads),
        )
        y = y.transpose(1, 2).contiguous().reshape(bsz, seqlen, dim)
        return self.proj(y)


class MLP(nn.Module):
    # relu^2 MLP from the original modded-nanogpt setup
    def __init__(self, dim: int, mlp_mult: int):
        super().__init__()
        hidden = mlp_mult * dim
        self.fc = CastedLinear(dim, hidden, bias=False)
        self.proj = CastedLinear(hidden, dim, bias=False)
        self.proj._zero_init = True

    def forward(self, x: Tensor) -> Tensor:
        x = torch.relu(self.fc(x))
        return self.proj(x.square())


class Block(nn.Module):
    # Pre-norm transformer block with learned per-channel residual scales and a
    # learned mix between the running residual and the embedding stream x0.
    def __init__(
        self,
        dim: int,
        num_heads: int,
        num_kv_heads: int,
        mlp_mult: int,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        self.attn_norm = RMSNorm()
        self.mlp_norm = RMSNorm()
        self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init)
        self.mlp = MLP(dim, mlp_mult)
        self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        # resid_mix[0] weights the residual stream, resid_mix[1] the embedding stream.
        self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float())

    def forward(self, x: Tensor, x0: Tensor) -> Tensor:
        mix = self.resid_mix.to(dtype=x.dtype)
        x = mix[0][None, None, :] * x + mix[1][None, None, :] * x0
        attn_out = self.attn(self.attn_norm(x))
        x = x + self.attn_scale.to(dtype=x.dtype)[None, None, :] * attn_out
        x = x + self.mlp_scale.to(dtype=x.dtype)[None, None, :] * self.mlp(self.mlp_norm(x))
        return x


class GPT(nn.Module):
    # U-Net-style GPT: the first half of the blocks records activations that the
    # second half re-adds in reverse order through learned skip weights.
    def __init__(
        self,
        vocab_size: int,
        num_layers: int,
        model_dim: int,
        num_heads: int,
        num_kv_heads: int,
        mlp_mult: int,
        tie_embeddings: bool,
        tied_embed_init_std: float,
        logit_softcap: float,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        if logit_softcap <= 0.0:
            raise ValueError(f"logit_softcap must be positive, got {logit_softcap}")
        self.tie_embeddings = tie_embeddings
        self.tied_embed_init_std = tied_embed_init_std
        self.logit_softcap = logit_softcap
        self.tok_emb = nn.Embedding(vocab_size, model_dim)
        self.num_encoder_layers = num_layers // 2
        self.num_decoder_layers = num_layers - self.num_encoder_layers
        self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers)
        self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32))
        self.blocks = nn.ModuleList(
            [
                Block(
                    model_dim,
                    num_heads,
                    num_kv_heads,
                    mlp_mult,
                    rope_base,
                    qk_gain_init,
                )
                for i in range(num_layers)
            ]
        )
        self.final_norm = RMSNorm()
        # With tied embeddings the token embedding doubles as the output head.
        self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False)
        if self.lm_head is not None:
            self.lm_head._zero_init = True
        self._init_weights()

    def _init_weights(self) -> None:
        if self.tie_embeddings:
            nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std)
        for module in self.modules():
            if isinstance(module, nn.Linear) and getattr(module, "_zero_init", False):
                nn.init.zeros_(module.weight)

    def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor:
        # Returns the mean cross-entropy loss over all target tokens.
        x = self.tok_emb(input_ids)
        x = F.rms_norm(x,
(x.size(-1),)) + x0 = x + skips: list[Tensor] = [] + + # First half stores skips; second half reuses them in reverse order. + for i in range(self.num_encoder_layers): + x = self.blocks[i](x, x0) + skips.append(x) + for i in range(self.num_decoder_layers): + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + x = self.blocks[self.num_encoder_layers + i](x, x0) + + x = self.final_norm(x).reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x) + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + return F.cross_entropy(logits.float(), targets, reduction="mean") + + +# ----------------------------- +# TRAINING +# ----------------------------- + +def main() -> None: + global zeropower_via_newtonschulz5 + + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + + # ----------------------------- + # DISTRIBUTED + CUDA SETUP + # ----------------------------- + + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + 
master_process = rank == 0 + + # Fast math knobs + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + + # ----------------------------- + # TOKENIZER + VALIDATION METRIC SETUP + # ----------------------------- + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + val_tokens = load_validation_tokens(args.val_files, args.train_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + 
log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + + # ----------------------------- + # MODEL + OPTIMIZER SETUP + # ----------------------------- + + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + + # Optimizer split: + # - token embedding (Adam) uses EMBED_LR + # - untied lm_head (Adam) uses HEAD_LR + # - matrix params in transformer blocks use MATRIX_LR via Muon + # - vectors/scalars use SCALAR_LR via Adam + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + optimizer_tok = torch.optim.Adam( + [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}], + betas=(args.beta1, 
args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.Adam( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + + n_params = sum(p.numel() for p in base_model.parameters()) + log0(f"model_params:{n_params}") + log0(f"world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0("sdp_backends:cudnn=False flash=True mem_efficient=False math=False") + log0(f"attention_mode:gqa num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads}") + log0( + f"tie_embeddings:{args.tie_embeddings} embed_lr:{token_lr} " + f"head_lr:{args.head_lr if base_model.lm_head is not None else 0.0} " + f"matrix_lr:{args.matrix_lr} scalar_lr:{args.scalar_lr}" + ) + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"seed:{args.seed}") + + # ----------------------------- + # DATA LOADER & MODEL WARMUP + # ----------------------------- + + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 
0 else None + + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + + # Warmup primes the compiled forward/backward/optimizer paths, then we restore the + # initial weights/optimizer state so measured training starts from the true init. + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + + # 
----------------------------- + # MAIN TRAINING LOOP + # ----------------------------- + + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + (loss * grad_scale).backward() + train_loss /= grad_accum_steps + + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in 
optimizer_muon.param_groups: + group["momentum"] = muon_momentum + + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + + # Needed to sync whether we've reached the wallclock cap. + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + + # ----------------------------- + # SERIALIZATION + ROUNDTRIP VALIDATION + # ----------------------------- + # Save the raw state (useful for debugging/loading in PyTorch directly), then always produce + # the compressed int8+zlib artifact and validate the round-tripped weights. 
+ + if master_process: + torch.save(base_model.state_dict(), "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + log0(f"Total submission size: {model_bytes + code_bytes} bytes") + + quant_obj, quant_stats = quantize_state_dict_int8(base_model.state_dict()) + quant_buf = io.BytesIO() + torch.save(quant_obj, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zlib.compress(quant_raw, level=9) + quant_raw_bytes = len(quant_raw) + if master_process: + with open("final_model.int8.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = os.path.getsize("final_model.int8.ptz") + code_bytes = len(code.encode("utf-8")) + ratio = quant_stats["baseline_tensor_bytes"] / max(quant_stats["int8_payload_bytes"], 1) + log0( + f"Serialized model int8+zlib: {quant_file_bytes} bytes " + f"(payload:{quant_stats['int8_payload_bytes']} raw_torch:{quant_raw_bytes} payload_ratio:{ratio:.2f}x)" + ) + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + + if distributed: + dist.barrier() + with open("final_model.int8.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load(io.BytesIO(zlib.decompress(quant_blob_disk)), map_location="cpu") + base_model.load_state_dict(dequantize_state_dict_int8(quant_state), strict=True) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_int8_zlib_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int8_zlib_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + + if distributed: + dist.destroy_process_group() 
+ + +if __name__ == "__main__": + main() + +==================================================================================================== +Running Python 3.12.3 (main, Nov 6 2025, 13:44:16) [GCC 13.3.0] +Running PyTorch 2.9.1+cu128 +Fri Apr 17 13:15:52 2026 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 43C P0 99W / 700W | 1185MiB / 81559MiB | 9% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +model_params:17059912 +world_size:1 grad_accum_steps:8 +sdp_backends:cudnn=False flash=True mem_efficient=False math=False +attention_mode:gqa num_heads:8 num_kv_heads:4 +tie_embeddings:True embed_lr:0.05 head_lr:0.0 matrix_lr:0.04 scalar_lr:0.04 
Hard stop: To keep things readable for newcomers, let's make sure `train_gpt.py` and `train_gpt_mlx.py` are never longer than 1500 lines.
"""

from __future__ import annotations

import copy
import glob
import io
import math
import os
import random
import subprocess
import sys
import time
import uuid
import zlib
from pathlib import Path

import numpy as np
import sentencepiece as spm
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import Tensor, nn
from torch.nn.parallel import DistributedDataParallel as DDP

# -----------------------------
# HYPERPARAMETERS
# -----------------------------
# Default Simple Baseline run:
# - 9 transformer blocks at width 512
# - 8 attention heads with 4 KV heads (GQA) and 2x MLP expansion
# - vocab size 1024, sequence length 1024, tied embeddings
# - 524,288 train tokens per step for 20,000 iterations with a ~10 minute cap

class Hyperparameters:
    # All values are class attributes resolved once at import time from the
    # environment, so a run is fully configured via env vars (or the defaults).

    # Data paths are shard globs produced by the existing preprocessing pipeline.
    data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024")
    train_files = os.path.join(data_path, "fineweb_train_*.bin")
    val_files = os.path.join(data_path, "fineweb_val_*.bin")
    tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model")
    run_id = os.environ.get("RUN_ID", str(uuid.uuid4()))  # also names the log file
    seed = int(os.environ.get("SEED", 1337))

    # Validation cadence and batch size. Validation always uses the full fineweb_val split.
    val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288))
    val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 1000))
    train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 200))

    # Training length. MAX_WALLCLOCK_SECONDS <= 0 disables the wallclock cap
    # and the run is bounded only by ITERATIONS.
    iterations = int(os.environ.get("ITERATIONS", 20000))
    warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 1200))
    warmup_steps = int(os.environ.get("WARMUP_STEPS", 20))
    train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 524_288))
    train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 1024))
    max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0))
    qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5))

    # Model shape.
    vocab_size = int(os.environ.get("VOCAB_SIZE", 1024))
    num_layers = int(os.environ.get("NUM_LAYERS", 9))
    num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4))
    model_dim = int(os.environ.get("MODEL_DIM", 512))
    num_heads = int(os.environ.get("NUM_HEADS", 8))
    mlp_mult = int(os.environ.get("MLP_MULT", 2))
    tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1")))
    rope_base = float(os.environ.get("ROPE_BASE", 10000.0))
    logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0))

    # Optimizer hyperparameters. Tied vs. untied embeddings use different
    # learning rates (tied_embed_lr vs. embed_lr/head_lr).
    embed_lr = float(os.environ.get("EMBED_LR", 0.6))
    head_lr = float(os.environ.get("HEAD_LR", 0.008))
    tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.05))
    tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005))
    matrix_lr = float(os.environ.get("MATRIX_LR", 0.04))
    scalar_lr = float(os.environ.get("SCALAR_LR", 0.04))
    muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.95))
    muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5))
    muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.85))
    muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 500))
    beta1 = float(os.environ.get("BETA1", 0.9))
    beta2 = float(os.environ.get("BETA2", 0.95))
    adam_eps = float(os.environ.get("ADAM_EPS", 1e-8))
    grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.0))  # <= 0 disables clipping

# -----------------------------
# MUON OPTIMIZER
# -----------------------------
#
# As borrowed from modded-nanogpt
# Background on Muon: https://kellerjordan.github.io/posts/muon/

def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor:
    # Orthogonalize a 2D update matrix with a fast Newton-Schulz iteration.
    # Muon uses this to normalize matrix-shaped gradients before applying them.
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    X /= X.norm() + eps
    # The iteration is run on the wide orientation; tall inputs are transposed
    # first and transposed back at the end.
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A
        X = a * X + B @ X
    return X.T if transposed else X


class Muon(torch.optim.Optimizer):
    """Momentum SGD whose 2D updates are orthogonalized via Newton-Schulz.

    Args:
        params: parameters to optimize (expected to be 2D matrices).
        lr: learning rate.
        momentum: momentum coefficient for the per-param buffer.
        backend_steps: number of Newton-Schulz iterations per update.
        nesterov: apply the momentum lookahead before orthogonalization.
    """

    def __init__(self, params, lr: float, momentum: float, backend_steps: int, nesterov: bool = True):
        super().__init__(
            params,
            dict(lr=lr, momentum=momentum, backend_steps=backend_steps, nesterov=nesterov),
        )

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimization step.

        When torch.distributed is initialized, the expensive orthogonalization
        is sharded: rank r processes params with index i % world_size == r,
        writes them into disjoint slices of one flat bf16 buffer, and a SUM
        all_reduce gives every rank the full set of updates to apply.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        distributed = dist.is_available() and dist.is_initialized()
        world_size = dist.get_world_size() if distributed else 1
        rank = dist.get_rank() if distributed else 0

        for group in self.param_groups:
            params = group["params"]
            if not params:
                continue
            lr = group["lr"]
            momentum = group["momentum"]
            backend_steps = group["backend_steps"]
            nesterov = group["nesterov"]

            # Flat buffer sized for every param in the group; slices not owned
            # by this rank stay zero and are filled in by the all_reduce.
            total_params = sum(int(p.numel()) for p in params)
            updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16)

            curr = 0
            for i, p in enumerate(params):
                if i % world_size == rank and p.grad is not None:
                    g = p.grad
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf = state["momentum_buffer"]
                    buf.mul_(momentum).add_(g)
                    if nesterov:
                        g = g.add(buf, alpha=momentum)
                    g = zeropower_via_newtonschulz5(g, steps=backend_steps)
                    # Scale correction from Muon reference implementations.
                    g *= max(1, g.size(0) / g.size(1)) ** 0.5
                    updates_flat[curr : curr + p.numel()] = g.reshape(-1)
                # Offset advances for every param so all ranks agree on the layout.
                curr += p.numel()

            if distributed:
                dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            # Every rank now applies the identical flat update to all params.
            curr = 0
            for p in params:
                g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype)
                p.add_(g, alpha=-lr)
                curr += p.numel()

        return loss


# -----------------------------
# TOKENIZER-AGNOSTIC EVALUATION SETUP
# -----------------------------
#
# It's common for small models to have a large fraction of their parameters be embeddings, since the 2 * d_model * d_vocab vectors can be gigantic.
# Instead of locking the tokenizer, we let you bring your own and calculate our validation metrics on the average compression of the validation set.
# We calculate BPB (bits-per-byte) instead of validation loss, so we need methods to count the number of bits per token in the tokenizer.
# Note: Submissions that edit the tokenizer will be examined more carefully, since screwing this up might unjustly improve your score.

def build_sentencepiece_luts(
    sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device
) -> tuple[Tensor, Tensor, Tensor]:
    """Build per-token lookup tables for the bits-per-byte metric.

    Returns three device tensors of length max(sp.vocab_size(), vocab_size):
      - base UTF-8 byte count of each token's piece (int16)
      - whether the piece carries the SentencePiece leading-space marker
      - whether the token is a control/unknown/unused "boundary" token
    Rows past the SentencePiece vocab keep the defaults (0 bytes, boundary).
    """
    n_sp = int(sp.vocab_size())
    n_rows = max(n_sp, vocab_size)
    byte_counts = np.zeros((n_rows,), dtype=np.int16)
    leading_space = np.zeros((n_rows,), dtype=np.bool_)
    boundary = np.ones((n_rows,), dtype=np.bool_)
    for tid in range(n_sp):
        # Control/unknown/unused ids stay marked as boundaries with zero bytes.
        if sp.is_control(tid) or sp.is_unknown(tid) or sp.is_unused(tid):
            continue
        boundary[tid] = False
        if sp.is_byte(tid):
            # Byte-fallback tokens always decode to exactly one byte.
            byte_counts[tid] = 1
        else:
            piece = sp.id_to_piece(tid)
            if piece.startswith("▁"):
                leading_space[tid] = True
                piece = piece[1:]
            byte_counts[tid] = len(piece.encode("utf-8"))
    return (
        torch.tensor(byte_counts, dtype=torch.int16, device=device),
        torch.tensor(leading_space, dtype=torch.bool, device=device),
        torch.tensor(boundary, dtype=torch.bool, device=device),
    )


def load_validation_tokens(pattern: str, seq_len: int) -> Tensor:
    files = [Path(p) for p in sorted(glob.glob(pattern))]
    if not files:
        raise FileNotFoundError(f"No files found for pattern: {pattern}")
    # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*.
    tokens = torch.cat([load_data_shard(file) for file in files]).contiguous()
    # Trim to a whole number of seq_len windows, plus one extra token so (x, y)
    # pairs can be built by shifting.
    usable = ((tokens.numel() - 1) // seq_len) * seq_len
    if usable <= 0:
        raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}")
    return tokens[: usable + 1]


def eval_val(
    args: Hyperparameters,
    model: nn.Module,
    rank: int,
    world_size: int,
    device: torch.device,
    grad_accum_steps: int,
    val_tokens: Tensor,
    base_bytes_lut: Tensor,
    has_leading_space_lut: Tensor,
    is_boundary_token_lut: Tensor,
) -> tuple[float, float]:
    """Run a full pass over the validation tokens and return (val_loss, val_bpb).

    The sequence range is partitioned across ranks; per-rank sums are combined
    with all_reduce when torch.distributed is initialized. The model is put in
    eval mode for the pass and restored to train mode before returning.
    """
    # Validation computes two metrics:
    # - val_loss: token cross-entropy (natural log)
    # - val_bpb: tokenizer-agnostic compression metric used by the challenge
    local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps)
    if local_batch_tokens < args.train_seq_len:
        raise ValueError(
            "VAL_BATCH_SIZE must provide at least one sequence per rank; "
            f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, "
            f"GRAD_ACCUM_STEPS={grad_accum_steps}, TRAIN_SEQ_LEN={args.train_seq_len}"
        )
    local_batch_seqs = local_batch_tokens // args.train_seq_len
    total_seqs = (val_tokens.numel() - 1) // args.train_seq_len
    # Contiguous, near-equal slice of sequences for this rank.
    seq_start = (total_seqs * rank) // world_size
    seq_end = (total_seqs * (rank + 1)) // world_size
    # float64 accumulators keep the long sums numerically stable.
    val_loss_sum = torch.zeros((), device=device, dtype=torch.float64)
    val_token_count = torch.zeros((), device=device, dtype=torch.float64)
    val_byte_count = torch.zeros((), device=device, dtype=torch.float64)

    model.eval()
    with torch.inference_mode():
        for batch_seq_start in range(seq_start, seq_end, local_batch_seqs):
            batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end)
            raw_start = batch_seq_start * args.train_seq_len
            # "+1" so targets can be built from the shifted window.
            raw_end = batch_seq_end * args.train_seq_len + 1
            local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True)
            x = local[:-1].reshape(-1, args.train_seq_len)
            y = local[1:].reshape(-1, args.train_seq_len)
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                batch_loss = model(x, y).detach()
            batch_token_count = float(y.numel())
            # model() returns a mean loss, so weight it back by token count.
            val_loss_sum += batch_loss.to(torch.float64) * batch_token_count
            val_token_count += batch_token_count
            # Byte accounting: each target token contributes its base bytes,
            # plus one byte for a leading space unless the previous token is a
            # boundary (control/unknown/unused) token.
            prev_ids = x.reshape(-1)
            tgt_ids = y.reshape(-1)
            token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16)
            token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16)
            val_byte_count += token_bytes.to(torch.float64).sum()

    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM)

    val_loss = val_loss_sum / val_token_count
    # bpb = (nats/token / ln 2) * (tokens / bytes) = bits per byte.
    bits_per_token = val_loss.item() / math.log(2.0)
    tokens_per_byte = val_token_count.item() / val_byte_count.item()
    model.train()
    return float(val_loss.item()), float(bits_per_token * tokens_per_byte)

# -----------------------------
# POST-TRAINING QUANTIZATION
# -----------------------------
#
# It's silly to export our model, which is trained in bf16 and fp32, at that same precision.
# Instead, we get approximately the same model (with a small hit) by quantizing the model to int8 & zlib compressing.
# We can then decompress the model and run in higher precision for evaluation, after closing in under the size limit.

def _env_patterns(var: str, default: str) -> tuple[str, ...]:
    # Comma-separated substring patterns from the environment; empty entries dropped.
    return tuple(p for p in os.environ.get(var, default).split(",") if p)

# Substring patterns naming small "control" tensors (gains/scales/mixes) that
# must stay in fp32 through training and export.
CONTROL_TENSOR_NAME_PATTERNS = _env_patterns(
    "CONTROL_TENSOR_NAME_PATTERNS",
    "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights",
)
# Tensors kept at full fp32 in the int8 export; defaults to the control tensors.
INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = _env_patterns(
    "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS",
    ",".join(CONTROL_TENSOR_NAME_PATTERNS),
)
INT8_KEEP_FLOAT_MAX_NUMEL = 65_536
INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16
INT8_PER_ROW_SCALE_DTYPE = torch.float16
INT8_CLIP_PERCENTILE = 99.99984
INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0

def tensor_nbytes(t: Tensor) -> int:
    """Size of a tensor's data in bytes (numel * element size)."""
    return int(t.numel()) * int(t.element_size())

def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor:
    """Choose the storage form for a small float tensor kept un-quantized.

    Control tensors (by name pattern) are stored as fp32. Other fp32/bf16
    tensors are downcast to fp16 for storage and their original dtype is
    recorded in `passthrough_orig_dtypes` so it can be restored on load.
    Everything else passes through unchanged.
    """
    if any(p in name for p in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS):
        return t.float().contiguous()
    if t.dtype not in {torch.float32, torch.bfloat16}:
        return t
    passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.")
    return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous()

def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]:
    t32 = t.float()
    if t32.ndim == 2:
        # Matrices get one scale per row, which usually tracks output-channel
        # ranges much better than a single tensor-wide scale.
        clip_abs = (
            torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1)
            if t32.numel()
            else torch.empty((t32.shape[0],), dtype=torch.float32)
        )
        clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None])
        scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0)
        q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous()
        return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous()

    # Vectors / scalars use a simpler per-tensor scale.
    clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0
    scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32)
    q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous()
    return q, scale

def quantize_state_dict_int8(state_dict: dict[str, Tensor]):
    """Quantize a model state dict to the int8 export format.

    Returns (obj, stats): `obj` is the serializable container described below,
    `stats` counts tensors and byte sizes (baseline vs. int8 payload).
    """
    # Single supported clean-script export format:
    # - per-row int8 for 2D float tensors
    # - per-tensor int8 for other float tensors
    # - exact passthrough for non-floats
    # - passthrough for small float tensors, stored as fp16 to save bytes
    quantized: dict[str, Tensor] = {}
    scales: dict[str, Tensor] = {}
    dtypes: dict[str, str] = {}
    passthrough: dict[str, Tensor] = {}
    passthrough_orig_dtypes: dict[str, str] = {}
    qmeta: dict[str, dict[str, object]] = {}
    stats = dict.fromkeys(
        ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"),
        0,
    )

    for name, tensor in state_dict.items():
        t = tensor.detach().to("cpu").contiguous()
        stats["param_count"] += int(t.numel())
        stats["num_tensors"] += 1
        stats["baseline_tensor_bytes"] += tensor_nbytes(t)

        # Non-float tensors (e.g. integer buffers) are stored exactly.
        if not t.is_floating_point():
            stats["num_nonfloat_tensors"] += 1
            passthrough[name] = t
            stats["int8_payload_bytes"] += tensor_nbytes(t)
            continue

        # Small float tensors are cheap enough to keep directly. We still downcast
        # fp32/bf16 passthrough tensors to fp16 so metadata does not dominate size.
        if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL:
            kept = keep_float_tensor(name, t, passthrough_orig_dtypes)
            passthrough[name] = kept
            stats["int8_payload_bytes"] += tensor_nbytes(kept)
            continue

        stats["num_float_tensors"] += 1
        q, s = quantize_float_tensor(t)
        # A non-scalar scale tensor means the per-row scheme was used.
        if s.ndim > 0:
            qmeta[name] = {"scheme": "per_row", "axis": 0}
        quantized[name] = q
        scales[name] = s
        dtypes[name] = str(t.dtype).removeprefix("torch.")
        stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s)

    obj: dict[str, object] = {
        "__quant_format__": "int8_clean_per_row_v1",
        "quantized": quantized,
        "scales": scales,
        "dtypes": dtypes,
        "passthrough": passthrough,
    }
    # Optional sections are only emitted when non-empty to keep the file small.
    if qmeta:
        obj["qmeta"] = qmeta
    if passthrough_orig_dtypes:
        obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes
    return obj, stats

def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]:
    """Invert quantize_state_dict_int8, returning a loadable fp state dict."""
    out: dict[str, Tensor] = {}
    qmeta = obj.get("qmeta", {})
    passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {})
    for name, q in obj["quantized"].items():
        dtype = getattr(torch, obj["dtypes"][name])
        s = obj["scales"][name]
        if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0:
            s = s.to(dtype=torch.float32)
            # Broadcast the saved row scale back across trailing dimensions.
            out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous()
        else:
            scale = float(s.item())
            out[name] = (q.float() * scale).to(dtype=dtype).contiguous()
    for name, t in obj["passthrough"].items():
        # Restore small tensors, undoing the temporary fp16 storage cast if needed.
+ out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out + + +# ----------------------------- +# DATA LOADING +# ----------------------------- + +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) + + +class DistributedTokenLoader: + # Each call consumes a contiguous chunk from the shared token stream, then slices out + # one disjoint span per rank. The extra "+1" token lets us build (x, y) by shifting. 
+ def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) + +# ----------------------------- +# TRANSFORMER MODULES +# ----------------------------- + +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) + + +class CastedLinear(nn.Linear): + # Keep weights in fp32 for optimizer/state quality, cast at matmul time for bf16 compute. + def forward(self, x: Tensor) -> Tensor: + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, self.weight.to(x.dtype), bias) + + +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + # Keep small/control parameters in fp32 even when the model body runs in bf16. + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() + + +class Rotary(nn.Module): + # Caches cos/sin tables per sequence length on the current device. 
    def __init__(self, dim: int, base: float = 10000.0):
        # Standard RoPE inverse-frequency table; not persisted in checkpoints.
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._seq_len_cached = 0
        self._cos_cached: Tensor | None = None
        self._sin_cached: Tensor | None = None

    def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]:
        """Return (cos, sin) tables broadcastable over (batch, heads, seq, dim/2).

        The cache is rebuilt only when seq_len or device changes; dtype changes
        are handled by the cheap ``.to(dtype=...)`` on return, so the cached
        tables stay in fp32-equivalent precision.
        """
        if (
            self._cos_cached is None
            or self._sin_cached is None
            or self._seq_len_cached != seq_len
            or self._cos_cached.device != device
        ):
            t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device))
            # Shape (1, 1, seq_len, dim // 2): broadcasts over batch and heads.
            self._cos_cached = freqs.cos()[None, None, :, :]
            self._sin_cached = freqs.sin()[None, None, :, :]
            self._seq_len_cached = seq_len
        return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype)


def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor) -> Tensor:
    """Apply rotary position embedding using the split-halves formulation."""
    half = x.size(-1) // 2
    x1, x2 = x[..., :half], x[..., half:]
    # Rotation of each (x1, x2) pair by the position-dependent angle.
    return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1)


class CausalSelfAttention(nn.Module):
    """Causal self-attention with GQA, QK RMS-norm, RoPE and a learned per-head query gain."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        num_kv_heads: int,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        # Fail fast on shape misconfiguration instead of at matmul time.
        if dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        if num_heads % num_kv_heads != 0:
            raise ValueError("num_heads must be divisible by num_kv_heads")
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = dim // num_heads
        if self.head_dim % 2 != 0:
            raise ValueError("head_dim must be even for RoPE")
        kv_dim = self.num_kv_heads * self.head_dim
        self.c_q = CastedLinear(dim, dim, bias=False)
        self.c_k = CastedLinear(dim, kv_dim, bias=False)
        self.c_v = CastedLinear(dim, kv_dim, bias=False)
        self.proj = CastedLinear(dim, dim, bias=False)
        # Flag read by GPT._init_weights: output projection starts at zero.
        self.proj._zero_init = True
        self.q_gain = 
nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32))
        self.rotary = Rotary(self.head_dim, base=rope_base)

    def forward(self, x: Tensor) -> Tensor:
        bsz, seqlen, dim = x.shape
        # (B, S, D) -> (B, H, S, head_dim); kv heads may be fewer (GQA).
        q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.c_v(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2)
        # QK-norm before RoPE stabilizes attention logits.
        q = F.rms_norm(q, (q.size(-1),))
        k = F.rms_norm(k, (k.size(-1),))
        cos, sin = self.rotary(seqlen, x.device, q.dtype)
        q = apply_rotary_emb(q, cos, sin)
        k = apply_rotary_emb(k, cos, sin)
        # Learned per-head query gain (the "QK-Gain" knob), broadcast over batch/seq/dim.
        q = q * self.q_gain.to(dtype=q.dtype)[None, :, None, None]
        y = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            is_causal=True,
            # Let SDPA expand kv heads internally when running grouped-query attention.
            enable_gqa=(self.num_kv_heads != self.num_heads),
        )
        y = y.transpose(1, 2).contiguous().reshape(bsz, seqlen, dim)
        return self.proj(y)


class MLP(nn.Module):
    # relu^2 MLP from the original modded-nanogpt setup
    def __init__(self, dim: int, mlp_mult: int):
        super().__init__()
        hidden = mlp_mult * dim
        self.fc = CastedLinear(dim, hidden, bias=False)
        self.proj = CastedLinear(hidden, dim, bias=False)
        # Flag read by GPT._init_weights: output projection starts at zero.
        self.proj._zero_init = True

    def forward(self, x: Tensor) -> Tensor:
        # relu(x)^2 activation: squaring a non-negative value keeps it smooth at 0.
        x = torch.relu(self.fc(x))
        return self.proj(x.square())


class Block(nn.Module):
    """One transformer block: learned residual mix with x0, gated attention and MLP branches."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        num_kv_heads: int,
        mlp_mult: int,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        self.attn_norm = RMSNorm()
        self.mlp_norm = RMSNorm()
        self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init)
        self.mlp = MLP(dim, mlp_mult)
        # Per-channel learned scales on each residual branch.
        self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        # Row 0 weights the running stream x, row 1 the embedding stream x0;
        # init (1, 0) makes the block start as a plain residual block.
        self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float())

    def forward(self, x: Tensor, x0: 
Tensor) -> Tensor:
        # Blend the running stream with the initial embedding stream x0 (learned per channel).
        mix = self.resid_mix.to(dtype=x.dtype)
        x = mix[0][None, None, :] * x + mix[1][None, None, :] * x0
        attn_out = self.attn(self.attn_norm(x))
        x = x + self.attn_scale.to(dtype=x.dtype)[None, None, :] * attn_out
        x = x + self.mlp_scale.to(dtype=x.dtype)[None, None, :] * self.mlp(self.mlp_norm(x))
        return x


class GPT(nn.Module):
    """Decoder-only transformer with U-net style skip connections between the
    first and second halves of the layer stack, optional tied embeddings, and
    tanh logit soft-capping. forward() returns the mean cross-entropy loss."""

    def __init__(
        self,
        vocab_size: int,
        num_layers: int,
        model_dim: int,
        num_heads: int,
        num_kv_heads: int,
        mlp_mult: int,
        tie_embeddings: bool,
        tied_embed_init_std: float,
        logit_softcap: float,
        rope_base: float,
        qk_gain_init: float,
    ):
        super().__init__()
        if logit_softcap <= 0.0:
            raise ValueError(f"logit_softcap must be positive, got {logit_softcap}")
        self.tie_embeddings = tie_embeddings
        self.tied_embed_init_std = tied_embed_init_std
        self.logit_softcap = logit_softcap
        self.tok_emb = nn.Embedding(vocab_size, model_dim)
        # First half "encodes" (stores activations), second half "decodes" (consumes them).
        self.num_encoder_layers = num_layers // 2
        self.num_decoder_layers = num_layers - self.num_encoder_layers
        self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers)
        # One learned per-channel weight per skip connection.
        self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32))
        self.blocks = nn.ModuleList(
            [
                Block(
                    model_dim,
                    num_heads,
                    num_kv_heads,
                    mlp_mult,
                    rope_base,
                    qk_gain_init,
                )
                for i in range(num_layers)
            ]
        )
        self.final_norm = RMSNorm()
        # Tied mode projects logits through tok_emb.weight instead of a separate head.
        self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False)
        if self.lm_head is not None:
            self.lm_head._zero_init = True
        self._init_weights()

    def _init_weights(self) -> None:
        # Tied embedding doubles as the output head, so it needs a real init;
        # all modules flagged _zero_init (output projections) start at zero.
        if self.tie_embeddings:
            nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std)
        for module in self.modules():
            if isinstance(module, nn.Linear) and getattr(module, "_zero_init", False):
                nn.init.zeros_(module.weight)

    def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor:
        x = self.tok_emb(input_ids)
        x = F.rms_norm(x, 
(x.size(-1),))
        x0 = x
        skips: list[Tensor] = []

        # First half stores skips; second half reuses them in reverse order.
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, x0)
            skips.append(x)
        for i in range(self.num_decoder_layers):
            if skips:
                x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop()
            x = self.blocks[self.num_encoder_layers + i](x, x0)

        # Flatten (batch, seq) for the projection + cross-entropy.
        x = self.final_norm(x).reshape(-1, x.size(-1))
        targets = target_ids.reshape(-1)
        if self.tie_embeddings:
            logits_proj = F.linear(x, self.tok_emb.weight)
        else:
            if self.lm_head is None:
                raise RuntimeError("lm_head is required when tie_embeddings=False")
            logits_proj = self.lm_head(x)
        # tanh soft-cap bounds logits to (-logit_softcap, logit_softcap).
        logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap)
        # Loss computed in fp32 for numerical stability under bf16 compute.
        return F.cross_entropy(logits.float(), targets, reduction="mean")


# -----------------------------
# TRAINING
# -----------------------------

def main() -> None:
    """Entry point: distributed setup, training, and int8+zlib serialization."""
    global zeropower_via_newtonschulz5

    # Own source is logged and counted toward the submission size budget below.
    code = Path(__file__).read_text(encoding="utf-8")
    args = Hyperparameters()
    zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5)

    # -----------------------------
    # DISTRIBUTED + CUDA SETUP
    # -----------------------------

    # torchrun sets RANK/WORLD_SIZE; their absence means single-process mode.
    distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if world_size <= 0:
        raise ValueError(f"WORLD_SIZE must be positive, got {world_size}")
    # The global batch is defined for 8 micro-steps total; world_size must divide it.
    if 8 % world_size != 0:
        raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral")
    grad_accum_steps = 8 // world_size
    grad_scale = 1.0 / grad_accum_steps
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required")
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
    if distributed:
        dist.init_process_group(backend="nccl", device_id=device)
        dist.barrier()
    master_process = rank == 0

    # Fast math knobs
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp

    # Pin SDPA to the flash backend only, for deterministic kernel selection.
    enable_cudnn_sdp(False)
    enable_flash_sdp(True)
    enable_mem_efficient_sdp(False)
    enable_math_sdp(False)

    logfile = None
    if master_process:
        os.makedirs("logs", exist_ok=True)
        logfile = f"logs/{args.run_id}.txt"
        print(logfile)

    def log0(msg: str, console: bool = True) -> None:
        # Rank-0-only logger; console=False writes to the logfile only.
        if not master_process:
            return
        if console:
            print(msg)
        if logfile is not None:
            with open(logfile, "a", encoding="utf-8") as f:
                print(msg, file=f)

    # Log full source + environment for exact reproducibility of the run.
    log0(code, console=False)
    log0("=" * 100, console=False)
    log0(f"Running Python {sys.version}", console=False)
    log0(f"Running PyTorch {torch.__version__}", console=False)
    log0(
        subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout,
        console=False,
    )
    log0("=" * 100, console=False)

    # -----------------------------
    # TOKENIZER + VALIDATION METRIC SETUP
    # -----------------------------

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if not args.tokenizer_path.endswith(".model"):
        raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}")
    sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path)
    if int(sp.vocab_size()) != args.vocab_size:
        raise ValueError(
            f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}"
        )
    dataset_dir = Path(args.data_path).resolve()
    actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin")))
    val_tokens = load_validation_tokens(args.val_files, args.train_seq_len)
    # Lookup tables used by the bits-per-byte (bpb) validation metric.
    base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts(
        sp, args.vocab_size, device
    )
    log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}")
    log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}")
    log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}")

    # -----------------------------
    # MODEL + OPTIMIZER SETUP
    # -----------------------------

    base_model = GPT(
        vocab_size=args.vocab_size,
        num_layers=args.num_layers,
        model_dim=args.model_dim,
        num_heads=args.num_heads,
        num_kv_heads=args.num_kv_heads,
        mlp_mult=args.mlp_mult,
        tie_embeddings=args.tie_embeddings,
        tied_embed_init_std=args.tied_embed_init_std,
        logit_softcap=args.logit_softcap,
        rope_base=args.rope_base,
        qk_gain_init=args.qk_gain_init,
    ).to(device).bfloat16()
    # Undo the blanket bf16 cast for CastedLinear weights: they stay fp32 and
    # are cast at matmul time (see CastedLinear.forward).
    for module in base_model.modules():
        if isinstance(module, CastedLinear):
            module.float()
    restore_low_dim_params_to_fp32(base_model)
    compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True)
    model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model

    # Optimizer split:
    # - token embedding (Adam) uses EMBED_LR
    # - untied lm_head (Adam) uses HEAD_LR
    # - matrix params in transformer blocks use MATRIX_LR via Muon
    # - vectors/scalars use SCALAR_LR via Adam
    block_named_params = list(base_model.blocks.named_parameters())
    matrix_params = [
        p
        for name, p in block_named_params
        if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)
    ]
    scalar_params = [
        p
        for name, p in block_named_params
        if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)
    ]
    if base_model.skip_weights.numel() > 0:
        scalar_params.append(base_model.skip_weights)
    token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr
    # "base_lr" is stashed per group so the schedule can rescale "lr" each step.
    optimizer_tok = torch.optim.Adam(
        [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}],
        betas=(args.beta1, 
args.beta2),
        eps=args.adam_eps,
        fused=True,
    )
    optimizer_muon = Muon(
        matrix_params,
        lr=args.matrix_lr,
        momentum=args.muon_momentum,
        backend_steps=args.muon_backend_steps,
    )
    # Muon's constructor doesn't take base_lr; stash it for the LR schedule.
    for group in optimizer_muon.param_groups:
        group["base_lr"] = args.matrix_lr
    optimizer_scalar = torch.optim.Adam(
        [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}],
        betas=(args.beta1, args.beta2),
        eps=args.adam_eps,
        fused=True,
    )
    optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar]
    if base_model.lm_head is not None:
        optimizer_head = torch.optim.Adam(
            [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}],
            betas=(args.beta1, args.beta2),
            eps=args.adam_eps,
            fused=True,
        )
        # Insert after the embedding optimizer so list order stays stable.
        optimizers.insert(1, optimizer_head)

    n_params = sum(p.numel() for p in base_model.parameters())
    log0(f"model_params:{n_params}")
    log0(f"world_size:{world_size} grad_accum_steps:{grad_accum_steps}")
    log0("sdp_backends:cudnn=False flash=True mem_efficient=False math=False")
    log0(f"attention_mode:gqa num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads}")
    log0(
        f"tie_embeddings:{args.tie_embeddings} embed_lr:{token_lr} "
        f"head_lr:{args.head_lr if base_model.lm_head is not None else 0.0} "
        f"matrix_lr:{args.matrix_lr} scalar_lr:{args.scalar_lr}"
    )
    log0(
        f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} "
        f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} "
        f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}"
    )
    log0(f"seed:{args.seed}")

    # -----------------------------
    # DATA LOADER & MODEL WARMUP
    # -----------------------------

    train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device)

    def zero_grad_all() -> None:
        for opt in optimizers:
            opt.zero_grad(set_to_none=True)

    # None disables the wallclock cap entirely.
    max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 
0 else None

    def lr_mul(step: int, elapsed_ms: float) -> float:
        """LR multiplier: 1.0 until warmdown, then a linear ramp to 0.

        Iteration mode (no wallclock cap): ramp over the last warmdown_iters steps.
        Wallclock mode: estimate per-step time from the elapsed average and start
        ramping when the remaining budget fits inside the projected warmdown span.
        """
        if args.warmdown_iters <= 0:
            return 1.0
        if max_wallclock_ms is None:
            warmdown_start = max(args.iterations - args.warmdown_iters, 0)
            return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0
        step_ms = elapsed_ms / max(step, 1)
        warmdown_ms = args.warmdown_iters * step_ms
        remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0)
        return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0

    # Warmup primes the compiled forward/backward/optimizer paths, then we restore the
    # initial weights/optimizer state so measured training starts from the true init.
    if args.warmup_steps > 0:
        initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()}
        initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers]
        model.train()
        for warmup_step in range(args.warmup_steps):
            zero_grad_all()
            for micro_step in range(grad_accum_steps):
                if distributed:
                    # Only sync gradients on the last micro-step of each accumulation window.
                    model.require_backward_grad_sync = micro_step == grad_accum_steps - 1
                x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps)
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                    warmup_loss = model(x, y)
                (warmup_loss * grad_scale).backward()
            for opt in optimizers:
                opt.step()
            zero_grad_all()
            if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps:
                log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}")
        # Roll everything back: warmup must not leak into the measured run.
        base_model.load_state_dict(initial_model_state, strict=True)
        for opt, state in zip(optimizers, initial_optimizer_states, strict=True):
            opt.load_state_dict(state)
        zero_grad_all()
        if distributed:
            model.require_backward_grad_sync = True
        # Fresh loader so measured training sees the data stream from the start.
        train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device)

    # 
-----------------------------
    # MAIN TRAINING LOOP
    # -----------------------------

    training_time_ms = 0.0
    stop_after_step: int | None = None
    torch.cuda.synchronize()
    t0 = time.perf_counter()

    step = 0
    while True:
        last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step)

        should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)
        if should_validate:
            # Pause the training clock: validation time is excluded from train_time.
            torch.cuda.synchronize()
            training_time_ms += 1000.0 * (time.perf_counter() - t0)
            val_loss, val_bpb = eval_val(
                args,
                model,
                rank,
                world_size,
                device,
                grad_accum_steps,
                val_tokens,
                base_bytes_lut,
                has_leading_space_lut,
                is_boundary_token_lut,
            )
            log0(
                f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} "
                f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms"
            )
            torch.cuda.synchronize()
            t0 = time.perf_counter()

        if last_step:
            if stop_after_step is not None and step < args.iterations:
                log0(
                    f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms "
                    f"step:{step}/{args.iterations}"
                )
            break

        elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0)
        scale = lr_mul(step, elapsed_ms)
        zero_grad_all()
        train_loss = torch.zeros((), device=device)
        for micro_step in range(grad_accum_steps):
            if distributed:
                # Sync gradients only on the final micro-step of the window.
                model.require_backward_grad_sync = micro_step == grad_accum_steps - 1
            x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps)
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                loss = model(x, y)
            train_loss += loss.detach()
            (loss * grad_scale).backward()
        train_loss /= grad_accum_steps

        # Linear Muon momentum warmup from warmup_start to the final momentum.
        frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0
        muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum
        for group in 
optimizer_muon.param_groups:
            group["momentum"] = muon_momentum

        # Apply the schedule multiplier on top of each group's stashed base_lr.
        for opt in optimizers:
            for group in opt.param_groups:
                group["lr"] = group["base_lr"] * scale

        if args.grad_clip_norm > 0:
            torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm)
        for opt in optimizers:
            opt.step()
        zero_grad_all()

        step += 1
        approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0)
        should_log_train = (
            args.train_log_every > 0
            and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None)
        )
        if should_log_train:
            log0(
                f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} "
                f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms"
            )

        # Needed to sync whether we've reached the wallclock cap.
        reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms
        if distributed and max_wallclock_ms is not None:
            # MAX across ranks: if any rank hit the cap, all ranks stop together.
            reached_cap_tensor = torch.tensor(int(reached_cap), device=device)
            dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX)
            reached_cap = bool(reached_cap_tensor.item())
        if stop_after_step is None and reached_cap:
            stop_after_step = step

    log0(
        f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
        f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB"
    )

    # -----------------------------
    # SERIALIZATION + ROUNDTRIP VALIDATION
    # -----------------------------
    # Save the raw state (useful for debugging/loading in PyTorch directly), then always produce
    # the compressed int8+zlib artifact and validate the round-tripped weights.

    if master_process:
        torch.save(base_model.state_dict(), "final_model.pt")
        model_bytes = os.path.getsize("final_model.pt")
        code_bytes = len(code.encode("utf-8"))
        log0(f"Serialized model: {model_bytes} bytes")
        log0(f"Code size: {code_bytes} bytes")
        log0(f"Total submission size: {model_bytes + code_bytes} bytes")

    # Quantization runs on every rank (state dicts are identical across ranks),
    # but only rank 0 writes the artifact to disk.
    quant_obj, quant_stats = quantize_state_dict_int8(base_model.state_dict())
    quant_buf = io.BytesIO()
    torch.save(quant_obj, quant_buf)
    quant_raw = quant_buf.getvalue()
    quant_blob = zlib.compress(quant_raw, level=9)
    quant_raw_bytes = len(quant_raw)
    if master_process:
        with open("final_model.int8.ptz", "wb") as f:
            f.write(quant_blob)
        quant_file_bytes = os.path.getsize("final_model.int8.ptz")
        code_bytes = len(code.encode("utf-8"))
        ratio = quant_stats["baseline_tensor_bytes"] / max(quant_stats["int8_payload_bytes"], 1)
        log0(
            f"Serialized model int8+zlib: {quant_file_bytes} bytes "
            f"(payload:{quant_stats['int8_payload_bytes']} raw_torch:{quant_raw_bytes} payload_ratio:{ratio:.2f}x)"
        )
        log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes")

    if distributed:
        dist.barrier()
    # NOTE(review): every rank re-reads the file that only rank 0 wrote. This
    # assumes a shared filesystem / single node — would fail multi-node unless
    # the blob is broadcast instead. Confirm deployment topology.
    with open("final_model.int8.ptz", "rb") as f:
        quant_blob_disk = f.read()
    quant_state = torch.load(io.BytesIO(zlib.decompress(quant_blob_disk)), map_location="cpu")
    # Load the dequantized weights back so the final eval measures the artifact,
    # not the fp32 training weights.
    base_model.load_state_dict(dequantize_state_dict_int8(quant_state), strict=True)
    torch.cuda.synchronize()
    t_qeval = time.perf_counter()
    q_val_loss, q_val_bpb = eval_val(
        args,
        model,
        rank,
        world_size,
        device,
        grad_accum_steps,
        val_tokens,
        base_bytes_lut,
        has_leading_space_lut,
        is_boundary_token_lut,
    )
    torch.cuda.synchronize()
    log0(
        f"final_int8_zlib_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} "
        f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms"
    )
    log0(f"final_int8_zlib_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}")

    if distributed:
        dist.destroy_process_group()
+ + +if __name__ == "__main__": + main() diff --git a/run_aux_v6_probe.sh b/run_aux_v6_probe.sh new file mode 100755 index 0000000000..5b3542d8e0 --- /dev/null +++ b/run_aux_v6_probe.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +set -euo pipefail + +VENV_PATH="/workspace/venv" +PREFERRED_REPO="/workspace/parameter-golf-openai" +FALLBACK_REPO="/workspace/parameter-golf" +AUX_DATASET="8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering" + +if [[ ! -d "$VENV_PATH" ]]; then + python3 -m venv "$VENV_PATH" --system-site-packages +fi + +# shellcheck disable=SC1091 +source "$VENV_PATH/bin/activate" + +if [[ -d "$PREFERRED_REPO" ]]; then + REPO_PATH="$PREFERRED_REPO" +elif [[ -d "$FALLBACK_REPO" ]]; then + REPO_PATH="$FALLBACK_REPO" + echo "WARNING: $PREFERRED_REPO not found; using $FALLBACK_REPO" +else + echo "ERROR: Neither $PREFERRED_REPO nor $FALLBACK_REPO exists." + exit 1 +fi + +cd "$REPO_PATH" + +if [[ ! -f "data/cached_challenge_fineweb.py" ]]; then + echo "ERROR: Missing helper script data/cached_challenge_fineweb.py" + exit 1 +fi + +echo "Installing/validating dependencies in $VENV_PATH ..." +pip install huggingface_hub datasets || echo "WARNING: pip install failed (likely network/proxy); relying on already-available packages if present." +python3 - <<'PY' +import huggingface_hub +import datasets +print('huggingface_hub import OK:', huggingface_hub.__version__) +print('datasets import OK:', datasets.__version__) +PY + +echo "Caching official FineWeb challenge data (variant=sp8192) ..." +python3 data/cached_challenge_fineweb.py --variant sp8192 || echo "WARNING: FineWeb caching failed (check HF/network access)." + +echo "Downloading auxiliary dataset: $AUX_DATASET ..." 
+python3 - <<'PY' +import os +from datasets import load_dataset_builder +from huggingface_hub import snapshot_download + +name = "8Planetterraforming/Parameter-Golf-V6-Privacy-Web-Filtering" +path = "" +errors = [] + +try: + path = snapshot_download(repo_id=name, repo_type="dataset") +except Exception as e: + errors.append(f"snapshot_download failed: {e}") + +if not path: + try: + builder = load_dataset_builder(name) + path = builder.cache_dir + except Exception as e: + errors.append(f"load_dataset_builder failed: {e}") + +if path: + print(f"Aux dataset local path: {os.path.abspath(path)}") +else: + print("Aux dataset local path: ") + for err in errors: + print("WARNING:", err) +PY + +echo "WARNING: FineWeb remains the main training corpus for Parameter Golf; auxiliary V6 should be mixed conservatively (start at 1%)." +echo "Probe setup complete. Full multi-seed record training was NOT started." diff --git a/runpod_record_attempt.sh b/runpod_record_attempt.sh new file mode 100755 index 0000000000..1c149ef076 --- /dev/null +++ b/runpod_record_attempt.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +set -euo pipefail + +log() { + echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] $*" +} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORK_DIR="${WORK_DIR:-$SCRIPT_DIR/runpod_workdir}" +REPO_URL="https://github.com/openai/parameter-golf.git" +REPO_DIR="$WORK_DIR/parameter-golf" +TARGET_REL_PATH="records/track_10min_16mb/2026-04-09_SP8192_3LayerRecur_ParResid_QK525_LegalTTT" +TARGET_DIR="$REPO_DIR/$TARGET_REL_PATH" +TRAIN_SCRIPT="$TARGET_DIR/train_gpt.py" + +SEEDS=(42 314 999) +NUM_GPUS="${NUM_GPUS:-8}" + +log "Starting RunPod record attempt workflow" +log "Using work directory: $WORK_DIR" +mkdir -p "$WORK_DIR" + +log "Cloning/syncing latest $REPO_URL" +if [[ -d "$REPO_DIR/.git" ]]; then + git -C "$REPO_DIR" fetch --all --prune + git -C "$REPO_DIR" checkout main + git -C "$REPO_DIR" pull --ff-only origin main +else + git clone "$REPO_URL" "$REPO_DIR" + 
git -C "$REPO_DIR" checkout main +fi + +log "Installing required Python dependencies" +python -m pip install --upgrade pip setuptools wheel +python -m pip install --upgrade torch sentencepiece + +log "Attempting to install flash_attn (non-fatal if this fails)" +if python -m pip install --no-build-isolation flash_attn; then + log "flash_attn installed successfully" +else + log "WARNING: flash_attn installation failed; continuing without it" +fi + +log "Preparing environment for multi-GPU training (${NUM_GPUS}x H100 expected)" +export CUDA_DEVICE_ORDER=PCI_BUS_ID +export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" +export NCCL_DEBUG="${NCCL_DEBUG:-WARN}" +export NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}" +export TOKENIZERS_PARALLELISM=false + +GPU_COUNT="$(python - <<'PY' +import torch +print(torch.cuda.device_count()) +PY +)" +log "Detected CUDA devices: ${GPU_COUNT}" + +if [[ ! -d "$TARGET_DIR" ]]; then + log "ERROR: target directory not found: $TARGET_DIR" + log "Available recent record directories:" + find "$REPO_DIR/records/track_10min_16mb" -maxdepth 1 -type d | sort | tail -n 30 + exit 1 +fi + +if [[ ! -f "$TRAIN_SCRIPT" ]]; then + log "ERROR: training script not found: $TRAIN_SCRIPT" + exit 1 +fi + +log "Running official SOTA training script: $TARGET_REL_PATH/train_gpt.py" +cd "$TARGET_DIR" + +for seed in "${SEEDS[@]}"; do + log "========================================" + log "Launching training with seed=${seed}" + log "Command: torchrun --standalone --nproc_per_node=${NUM_GPUS} train_gpt.py --seed ${seed}" + torchrun --standalone --nproc_per_node="$NUM_GPUS" train_gpt.py --seed "$seed" 2>&1 | tee "runpod_seed${seed}.log" + log "Completed seed=${seed}" +done + +log "All requested seeds completed successfully"