diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt new file mode 100644 index 0000000000..dd845cb739 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt @@ -0,0 +1 @@ +fused-softcap-ce @ git+https://github.com/anthony-maio/fused-softcap-ce.git@25e7ad6292cd1e837eef592f50e4d9f5990b6c84 diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py new file mode 100644 index 0000000000..512734abb7 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py @@ -0,0 +1,2 @@ +import lzma as L,base64 as B +exec(L.decompress(B.b85decode(";KB4X6I}o_n@VT6Qap3bt~@<3h>ok~)Km^%05+gpp)?!;9yG)0b0NuzL516!6&Ns7P~^TZ@KIzs7wEfONM{W@?nrrt*yA+xh!3WNK;Vd6J5RKTP{GUPI>FaKtsd3?vRpKGVSn&nGmA@ooJ4y}H(((9i+TIvSYWj`rZLHlguXl|qk`oC$N)T{PB)|Z!Hy~=6P&vm!2V~EZ{!+Jtk*7JZCC&6aP_yxnmci)j*dRQrwvH>M6gVzb?F7`@cMBA;iP6ud2RF`q6uAy|H@qPG9rE|2A25aLQxWg3I<)-$WW2Od0Jp3-AUwWlol1aY>E{a;^2DjJ;X!x4lS|F*qG!OUJ1RQH+-6yA=6&VwFwVeTWra}4ZYs+a4VRT)l=6}#f8|DpCiUOq%S7LTTi&*z}330%LS?}B~HN8HJjyLrZx;F7ITeD9xP-~tzXz8vIO7t@*dg#1*`)yk;o~1L%!VrYlF@2XoUqICglq)J;&nc9eijv2uv?OI{e(;;4BsYPvA$uW}iA6CnZX&#oM*q&6;VTvOB`vzr#SnWy3?0MZ@O!B54LYrZR(UJ5XrCkVjyPXNIuV_{*H)rNOIGM8>>Az8Nrgm$=Eq#hq|~Uq3ku&;VWo?@YBt%|oSP5kt&sL<=F2R|1qZ>Vn83H0V<9&87pZ5AuN(HD(e7xB%H<)(iMN<2cUrXaVM{@ZNT@}7x&Fv=pt_&1#yxFkFrk(Na~b_We!Toyxo}T)(lx6(0m`e&tp9bOsE^n)rEMZw(KMZVbe;N8Wfpr(Ewn(&(r2AtMZWH-(w|>Q2Wrsdgg`63KwV7-CT|Y+*;m}C3B_G(8PX#UZ5HC2|-4l^|M8_HArm|Q4!&vdz~qB9Pt{!JP0-ssk^FnYKWdBL=_qQosti38*OL(zMN(S@~tDnpv*nP;WSz^zkNY}T4!M6qbW$H7~SIb!8qgNj(7pSa_;3OrxhiVW)*+^^??gB27o!Atv7aO>@@`~EI`7h4OvfoQ6562d1UHe8lX@((BZjUb1YZR%;2nultmJ$w_)MzCcJ-0pI%6R<%-c7G>rsm0SQw^$Ezq%$z<9tTkS^)`2PuGZd<_i(mI=>r+wQ`qBCb)838i3w&mAsY$*VCVD$HfDKn_;&hwDo7()!co_{-hHOjrTKX{v}mt~8h&7}YA^UMT7En6{BPrNk-`QN$2o?_3#D7Q_6GoG;?zv(r)7R}WFGdD4RrvU%tpW{HjLSizGW4_m`0=!f6+E;+F67>+=zgkHh%gXqFzk7m6>%-1%;_JkQy)D%6!f5>3f^!Z{?t_me(L;1~i)l_tVGzT}9VwZwKSnIKa;aZug#5$HP#NoW8H^h=0bf+vM2+^Q36$6P#mr>sYr=xdM$RVxSf1M%>LVEI+wWk0d7^LO*sVOMuL{JEy4W5>ic@|s)QZ$Gl(KP5Q3NqNzAwKXvdoL!x*gn%-?8%Q_5l%E*R;pA^H&1f9`?~HUz05j8i5Yi@$#`x?v|6gvrQi=p>`W$r5fLzwrWbFH~?ES+Vsf?Ce-SlqtCfk`W?gUM0y_rXY*=r7vWG&k7Jd+-XS11`KJ#GTR!VB7|?mp7HN(j-dI2vUe7NjWR#!vai0}Xr4ZR>4O+cF)sqAW$)MQ_ms<58pFYJ{SuF~WS9K1-G=t+O>={Om=(xvP0^YJJ)RQe{0=l{P9$n8q}aaA*lF6%Mwtz0D8N+$z4v7ExAeBXPlK$*O&-JteZI5#o)g>Mf|93n?3Bgr(+=c(=j$+b$cmh->V!E8yp2ELzGkRi}bRwWd#Tpkt(IXJn!xff?~qfV{Pm~(wMcgIfA?HxORStCzcd>AR-bDyQODulMTOZ2$Ibqd-v6kItwD4URKEd3|NP9S4L(l+&o~w8xR52TyFR*zm#E_-ikNjaZSP%Y283;85`;gy^J2*5Cwt1sUlWLX(U-=EDuL=tVvqK5OeXNTYY~%T!Th+^(INiew=pG4|>>>RLzDmcrf62f;df_!%}Ir@!bt{9pIRRytM|lExcY}g93+YnhJ-4KBfyeGUp>{-pyZ%ZCo9b5SXZGR%ZVj9qm^46l;ODk&Vr@UF9)b?s%bcT89udYruhG>lC7mchVDbG#Ju=v}Z;S8njoXHq<$hHD2;sUhFwOH>4DXfy|S!3C^7><8Qy}6SAk|@kv$!y)vM1Iv|=kZFAY?GbbL6hCGqbaU#ITVj6bsZpEA}@NxpTvT9vGNJ*e_syDj=@McyQh)Ps~dQ2=)5wCVfT`eI)```4ZWANrp(%b?6g0RVza8w*Cuy2KRKnOlYTU{~(lt-?5KO`@Ut$wmT--YT{eKI$IXg}AKG9?dR|)e_yZLN{4vf{c#_K~bzt#w?jBGI@wilGUW;SVX8nw1p)BP(%e6Lb=l)H-^lhhPv5#TDp{pHB2;=*RsmC2=%|b!la3{YPKwVceONH@!a}?1|yj&9LhKTeIfl~z{KAc!8TI%vjVDiWzBHu*sOS!a1cQo9f5htY60wEd8x}v{X@}s4)R{KQZ*+n3KpuJI=So7ne&g_10i8vr5IU;d;A42zLyosV!=_d-a)-`HagLkN#a_(}9ET9(0x9nDM{r`h2&pJod^0uJcH4sd-|ig2T6*B;FWXNeRG43$)Z8oux!0-G4Vc6?F#H4bhQDqtwDOOBe(N6mdcOT(JwvbA{+E|pC$s_~GuPN}3M&{#|hW1dFb~_z!NEy>ib_u+*s=BHhOEj+T9NYokjh`;tyPYOGCdpew|IbVJ%kAL{op3>_ISyr9<@>2Y;hC@u)K99aYl?u57eRY$`fgj}nEpMcixiM5@dE5+2M#~qW@Wc`j_G8QS-XT_1%RD)55jQgRWC*7~Y)|ycBxb3Il2+m~d!5_DQ8>?JY6-lmzf59dgRe2mEuSz%g*&$ATN#t5@kuUc$n?j;Y1uJZm}Iz<(Vj3CGnNmmK3Xh8t_L(in6xLvKmIO(Vug2ang*R&{7q0Q6e@q45@7d8o$Xla%5xp6H5vLs%%tKFa8;cM(H2st{w`Zjep-E?F8p{6%Kbt|L5fhFsgvFDw(SQ&>$sMjb5a^pZYzO`fNzM1G%Ryp$5O*6KLQd4E7edPuZLgQkq#Yo7-So@R~CV#${zUPB9@1XS?7C{O07jaH<9GzMKmO+Y>mv9i#1&i6s_d}?SWLGYt4@cx=n=RE%)nw;|NfcN7wHkNj(VvBvzh1B@D;fSU!xQ@COGo4^AP0Xw=)ansgiqVv3A_gwB9Hsmt(p51~XeGGCQm>YhHnUGID~8vN?zH084AW}VG}G)z)~TZ}In2;R-0n}wDRl<>f45D!T!p6t^88HX){gX75BKq8bsLiEChVQW*2dN8ITj$Vw-k!(ECnDU~KNGW&VB7ge2C;e$+!DwJ(;4f3Jckrs^veB5t=d#s(Zb5y9ER|lKQv+$9MX}K{INVQIDuz)fFaQe!9?<_vRh{}sbEX%er49niQPcyK#k4Tvlg{}Pm|SbPW+p{`cS6fW%p@09xi+aM@wOhIDzk@0(p)W)tq1c_9f}6d-S;*K3EY!__^3}ypJb14Q|r4gH!5cJ5fCDcE^XBq`)0P$f$Ssx34`jcsotJGF~oBr`lri0J*zLt5WU^K(d6i%`jOpKZ>RWD4CvokOGA$Dbc6X)EZ&U2+qj&B;DDSwE~~dqKaM45jc-$m&cN0l=ZCC+*KXHo@6=X?Mq}oK+1?sydMp6qE8d@bT?PpWf=P9d-ewGu%l(m0hnx-*&F|E|ZP1t~WFV7}sme$3;284bFB^`oiMziX(jn12$@n6Wq5F>~F=<^Vd(Y2ZxI$WZ`!^sN?$e6Mcd_n&}a7Fd2>4&;rTdYbpE4gIMGEE?Yo8;XFGN^86aA*a!`^N5TKg9v?w!6C>v3RxQOdK(RD_RINN6HudnNvXbne_km8IUrBKvQhhxF}~ffM#6myHpwVI2-57(vj)YK6xJA_1}DD_J(+Sga>Qzf&~aXt;Q&$cd%`#RJzL{(NOcum0*qpX`6|Vz_F$Qahpk`jS(dMNxbQ|NTTkmG4#z^x@79(FW8mQ5~3@8xDsRma!JS(OxMn?xb@YmN-gbYZtYP%60E*W=cl~-uHE(U&rf>@k8&JO$zBa};L#;c)@QC`H*YP@CT?!rIy0~bXG?Z^$!1-aRIin1NMBtT^hX{7DsK%ceN7bB}PHAY-i%v0sKREqJGFO|B2GCsrh5_`x9cxPPo9}jZp<7r=oU11*Q^|?Q-z1?s_aWOPU8H>}hI1z^A2hwxQY1q!H!`=x^qABR(GK6hswE6=J%xie^BRI{vSdxBTKOPPoT*-@N4?t(8;6F*6J?sHqt=SoOQYhhfy%s?cNFZyb%a<1Krb?WaJnp&9MH?5#4VsL9yhAHYOMjS*dz2e-NA5aDnJtuBB2LsAkq+t672P6fX+zZumsD))>E5OFQ}R(Q*@jPYz2SAQ3&=?D8+=(52T8-y^aEl721|>-f3>O!9DG-BcX8*4iQ$9-w{6sd>sp=TN{&kEWywT!?*<@sz`yx$=v?}t|3V;R%FPMjqS-$U+Oy=~deg)6kv{SoEP3-#ilM56ir1QJoiDfKdoGjy!6q?}U}Oifa~dl6h*C8E{IEx9;K$K{*oD_MZPwRB3+MEaEomnphN>gW%&p_5xEsqn#(62m+H8;joX417@TygnW}c`!u{{K>^cwq$Q+KCtkJEPyNnuN^W?FN9y=ze{p{JSHh8N=aJY_v((01-%VZQ{n0#^dk?tF`qco*<%8%8^e**O5QZEcQE}=FC+J>;*dG6h;|hDOMV-5K2LkR4sU!J=ZVTP!KOU1C@TgFBrXTTyoO}aZwkE4=9^+gVnpCICAv3DKtcq!dV#oHsj(VX8rM6o-DO3lCQ|z67$eFQ)a{oBGNj~RoqA%G6K7?Jy+|AMS8iNY?J=hR|#^qpk8cWu5~F<;EeW8&!&tfQ30!RGj-nY~TNTa5H{ih3L=?v}@3eQ6>X%d2_dA3Frufw#DTYK!h3kRLvnXJvWu{<4SGYNFPfp`FSGIjlw(f?58`P+a1mfS@oeU5=SviDWY*>#S%}&%rC|C?O+NUzjD0DhCL}&vpO{e?L9{vka7|k%0D|>;}wB>&-_??H>n5>FIErG_eme@yF_DCIXSSxlLeyR>G+r%Xm|f3J%zm#$+NENIkVB)I9O$ZK!%cfaNp^>rNi`a5vQXYPI2zWXWFLKgMg-OSl=y!K?=0KweG^TtZ0Q$x$sN!-p3LL5>bDEBvW2{9yrAf&f^2RJLxY1d_j^L9rx|*oxW5P5S$30Rc?RX{g9HU}?MC?Mb&Hb|+DS)g!>03x2pg{>>n^1o^A_HMypsyRrj)+mfwVr+7&j~?ijWt6MJYBivl!Q)J_`Zv-Zfp~~W1dEdd8d}B8oen?&Hw`1L>lR4k;fQ($6#j12^)VwWB8=p$=#b%n&AK_sU^<``z$+qnLY*OJx6kJ`}Q}?cKsSHE{c%|)8OSN%1FFURH?S1}e+39sKVlk;G_#xbB82pB(m8JNDXrL^2hkKzz*b|PegI6lAsD;tobQJ>nSy|`-$`4DlK14H;&HtbtFeNK^SeiJ@@zr`y*Hq$ZgC6Y`at!r3XO*Zo_!Fp#IHxJ{`eEsF(dKMAbSmuvVk=$wYc*K6OL^9}GERi;yGFPP>l5L+`Z`V7OQ?AO!_*v#2%jhZq6af6Ejo#525%Kq2d8Zf>;IvdeUV*s_^-{r-iO&yxvnX==n*=$_tKQ@@$B}6K|Ks4acPI$J#VFc<_@hr}$0WvxAdhFeK{Op-6?R5yLbPLT5+!u)BQM<6%UW;XbQL-oy+!-9`ijAIxv%91cN~y^W*L%1?sw;+KZjJq;=@tWDi+Q@C*F4sG7HJBPCfqzMHnAK`-~IG-z_nV!jDfRkIF%&UMu3c-=IN&r}{T%t^d?}2X8H83g3!{@}X>Z|IQ)bCyCkA^5mN1m4a92#M30N2O0K_+TZeSV0Ca=E@Gjjf#z*{eDE&U^=CV|)pzH(;PP*NrK`AOxvJ<2|V%j=5RKUrbp&7yd6gpDTGf*a4?mzp%9w4iYW{d;ug%8!xn!d69DeI^CFd+)G=sOw&q{6x#*9Fxd-HLxBFJAjjdH%=o{pTMViR3gn44{hPQ#<2UrW78nX54-=@Fs7C#CF{Px`9OX9eWI(Dd4MV)OFm6GxTPIocB)^WN-ugVxX4U|*Nt}dIt?LVUpC9`yVgJ(<=k#mr*+dSXWE0QSi?8~Shij2qmZVv_o1gc56QT&B||!@p?rAHW~Yvg)Xfo&T+|byhzRclP%vP(YUS@4c{2=Nq|^mZ;dx4kAAIW}3PKBhB7AO%x87?vYgI

g$!SAg0K41tt&VXtc^jkz~l^#UWHj@xhX*?y+^)wggi%V2}dp>XeQ?f|*tEYq01vt;AF&wWV20Zl$?1pH8E$Lm__hhWp$8#Rq$8ImK&~3Z~iuWSb8qWju{XLO~nqYKs?7!VBMqv0gNwMfrx{WpU{FC{uiVzAo~?z29U+6vIiB>Q#HkGARp0t*1e3zi|*?cJ1Ie0OnDnxO(Y2hGs-@A*;+py!fZZ8m7~ukA4=W_|4gC%PCYe)%H*2A2V~F%}z&AoM~bzs*zx?M>jnS{ReMlDP9H*pK_U^aZ?BxEAzE3^|eu{*(De+@d9d=T?4Th`S9(LmZ=NCp+)cRmG{4;{`45P8=?v;Tw2Ah>1KwIQ)6Ys~_O?Wv7;})7Em7eIgFKXl+SMwU|GfRYnQzA0U6}Jeh)XH0bZl=)uHnNp^ugw53U!-0LRqeT!ULQQLYcLDuylAezoa&5K9fi4sA81EZBMXdjglZbS|-jQV}p_1oFg6rorM^Yak-;?J@tDuAx1i>S>J_%r@plgQDMTdi{ztwLoM&t$Ijo0FJNYq4G7w4QSya`M9e6y90uQV++77Wl?AP6TJsA|45Nh*(5n*xvf5fI1-+W&l?i@_11>Nw*eteB)K82_P8R$@-C{)nRYm*5aO2ZJ489;g?EPqXR2PiW>KRc5g@VKx}U3j;K0O9?^!Hgg9LI91oA$?%U10OHrNH*@km?p;mXl!SIm`6qYs8mXdaalxI;dNfO@O%_S8+RgXBf@toOSRqiw~P7ICy5Sq45OcsxI0SHs~drygWSM_4vzi75yCOT~nXEV;?d^Fcq?3S$n88CM5Yp0FhME{oBzs`}K4zxnb~JqhXngh5nP>MUxzF&b#$raun0b9lNUeD6*^WWy=~tmX?U1F(pu=VChLJKU9hNk3S6?)~446I4YPx$iuk%wEE{ON46F(!a$^I`k7@=ln@x+Ve2@zZmd?3YD8c;nbWgkjDFra9`ydA0QWH!#~wcHZb7DV0q#3X+i$%3DCXrKc%;IAzlcL22Vn@E0ceEpAV(`fGIhC*OgbX=ovr?tpV&0TPeQ4=xD(3U-mJu9>8z!eN4o!Uuioil%x@^lN(lgbQzj6*+!5SF5GXhTrGWpBqmJJBM{9=T`8n8!gOcQr9mFd&yA-Fr?EK;ndbFKZb+s&CheLS-edVp7l0rlCHlp-ecNIni`N~s}Ht7A4yu?1|bHNItW*OI+of-T3O|QAi?hnUE4|7H2uSMTYbD<{K&ezOf$f1a6>}Mn|f?M_taeV3zmg+LdF#_i`AJ?%R7_rXAES+4N6FOg{+6`XV;rtthIk8e&ctvJmc8={(=}eiV2ztm4+0noK-+K0lm1J+gNky_4V*TH;Y0?^mUJpeMaLI?zhTv#CD|h^Pp=n1V;4mo^I0&iUanN_Ak45EBL;*eVrezekEtru`??Tx-`h=~vxl!EcJ)B_m{xGnV6w5-f*xS9{K#Ed<^q>L#&H(AbkOU&?SMne>LP=N4c!a$GbvSLAES4;Lnx_m@n~2OgL^zp(y^})x8*?BX!FC#BZVV=_j`oJBiFr4m28X7MMXgS7*N1kJ~&5W_7kQdn(FiWX^4rbG^B0|CaYS;_@QK=lWnl9kh8HqTQdqW4eM-y%2$xqD^HPePlEEPJ(6Q_9y&Pvnyk2>6n^r7Z0hWuk~LeD_?QldLmq*6mZ*)&YR}Q(^u+n4wMMF^a<3hsaZ++V#$P}pWrF^b3&~?gP>2?gFL?*b;Atxvv6OZLvPu?-M&|UKiT5j}I9D=FqY)T^=@=W|)I!%QQu^*Gr+-d(fMbiDdi!4=ECEs=7R*jRZ1rs>)eWPX@FmpqJg9W8bY}to4_#54l0dum({u0NIENGYYs5nEp>WKaKLs$i>86t0~Qsbb;T>aBK>s-{8A&nNky&R~l1clsLyr8O5l_%%4Yv>UMR@-1ugMi0nXlSH*R!NmG*FQcGzq-%D)!&L`P?VoQtjI0i+b#Mm~^7Y;1?nvUe^joa8vl@EYTk<9q4du-8PaUf}o^u}-$KXq~$$_KY=wv>9-J5qn26IV*b2UGec@yCe4T^nLuY2`Pwx!ELAn@UIvlDConV9!Bm=)6atE|eW1LJq9#HeAZ^`OwTpf@)$`%1zSdqm)_xi~tB-1v>{LKP+93jHNJ*$?ySS%NqHB7VMU}%UoYNjk$N?9$F$o!yWS<*OsGg6B@CL`!oMbtI;%aIue8XLtNHELrOfC|JDq~!BHCGN*%1+S=Pe-SX$jCI=xvR`q3O!k^NHRvS^;Q{Y3J3={jeN@NPn#i?uW8O%~6S&s}bx>e}1DFFPzp{lO@Uwud2O&0EgOd%ikqQ$))=K-R)xJm4wJ?5NLjw@+Qm!C>nDV)a*6eWcjWhOed7!FT7`4W1z0&2BuAo8ZhiD5+3=IhjaYR+CY4151y*dP>h;J$)ro#~gt25%|D24%$vCC2B*lw#<#JrPw>)YO{NBscftGq*adJ_2A|+JhJ}0GO6qVmH1_28iK%#5?An}3U3xtkYmN+20mjC(;cTs&MCS384C22z@hxz#lz+}+W)p)F=wYDMoXUBZSJ`y@oa=)`Hn2%Jn$Eh-%c@(U9)H4X=Ib5~skf*VONrx4pa4lchE^aXZrHm=!)~oEjIzy@GGb}ameJeh%r#TA~yB9{UbbJ-HtQ~J&1+{eg-Ka2^hzJ>@)!a6kqkmEQIg)COAbArB-n}7h50!un;8Xb09DxjbpWHg!)oxBFji}6Z5ZPjnK;GDIisSVoK-<|Ou-@)?=!r01{XxOrZFO*%^wf{tH`|lY+)l9-=r0|aDF%vFn>`})MnW9-10W|qQda|c>^Nov-hkILG4{n5a0)Qce-?6Uj9*uj)8E~H#7T0VtJQ&x8x&kSXRlXg>f10|h-9LqBrjN$N0DajiyiM(zio1A?pbDn2fC=j&paG-^{62^~fDS)wiiOY{B058Q($aFXQrMh)`kM?C4%u%f5->M}ydh-yS4sNbzf!%uH0%m5=MaIUX4Y{S*R!Yx5{o;YNaBIt0VZGY45+9Fv2z?1aO^-PP_HB%8;-JnRvPDE^o^UN@BLRS5C)GV1>&q`g?$An1<|5#hf*Z5)XtWP?wbEL#C(~P&7#XxdoB5>;&d&)EEbNt`A1KUw?0Lr9(;av>2yeQU1g%3(9Ybo(VMn0aV6N?w<{FTLW*KJuS=W72H^sV3sPF(&{HSjNFgdYVmZG(F**rmn9h$#6{cx6_B5&+AOh9LbI@J&h^Bq47n~sz|>sa$<6P=F&j2i$rx#qE!k>$Gncz&)mbk{#i|7rAun?iu^jfDJgf>sH=b7POJuqy{CdmV7l3upi"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py new file mode 100644 index 0000000000..fb3f0ac8fd --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py @@ -0,0 +1,470 @@ +import collections,copy,glob,io,lzma,math,os +from pathlib import Path +import random,re,subprocess,sys,time,uuid,numpy as np,sentencepiece as spm,torch,torch.distributed as dist,torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import Tensor,nn +from flash_attn_interface import flash_attn_func as flash_attn_3_func +class Hyperparameters:data_dir=os.environ.get('DATA_DIR','./data/');seed=int(os.environ.get('SEED',1337));run_id=os.environ.get('RUN_ID',str(uuid.uuid4()));iterations=int(os.environ.get('ITERATIONS',20000));warmdown_frac=float(os.environ.get('WARMDOWN_FRAC',.72));warmup_steps=int(os.environ.get('WARMUP_STEPS',20));train_batch_tokens=int(os.environ.get('TRAIN_BATCH_TOKENS',786432));train_seq_len=int(os.environ.get('TRAIN_SEQ_LEN',2048));train_log_every=int(os.environ.get('TRAIN_LOG_EVERY',500));max_wallclock_seconds=float(os.environ.get('MAX_WALLCLOCK_SECONDS',6e2));val_batch_tokens=int(os.environ.get('VAL_BATCH_TOKENS',524288));eval_seq_len=int(os.environ.get('EVAL_SEQ_LEN',2048));val_loss_every=int(os.environ.get('VAL_LOSS_EVERY',4000));sliding_window_enabled=bool(int(os.environ.get('SLIDING_WINDOW_ENABLED','1')));vocab_size=int(os.environ.get('VOCAB_SIZE',8192));num_layers=int(os.environ.get('NUM_LAYERS',11));xsa_last_n=int(os.environ.get('XSA_LAST_N',11));model_dim=int(os.environ.get('MODEL_DIM',512));embedding_dim=int(os.environ.get('EMBEDDING_DIM',512));num_kv_heads=int(os.environ.get('NUM_KV_HEADS',4));num_heads=int(os.environ.get('NUM_HEADS',8));mlp_mult=float(os.environ.get('MLP_MULT',4.));skip_gates_enabled=bool(int(os.environ.get('SKIP_GATES_ENABLED','1')));tie_embeddings=bool(int(os.environ.get('TIE_EMBEDDINGS','1')));logit_softcap=float(os.environ.get('LOGIT_SOFTCAP',3e1));rope_base=float(os.environ.get('ROPE_BASE',1e4));rope_dims=int(os.environ.get('ROPE_DIMS',16));rope_train_seq_len=int(os.environ.get('ROPE_TRAIN_SEQ_LEN',2048));ln_scale=bool(int(os.environ.get('LN_SCALE','1')));qk_gain_init=float(os.environ.get('QK_GAIN_INIT',5.));num_loops=int(os.environ.get('NUM_LOOPS',2));loop_start=int(os.environ.get('LOOP_START',3));loop_end=int(os.environ.get('LOOP_END',5));enable_looping_at=float(os.environ.get('ENABLE_LOOPING_AT',.35));parallel_residual_start=int(os.environ.get('PARALLEL_RESIDUAL_START',7));min_lr=float(os.environ.get('MIN_LR',.0));embed_lr=float(os.environ.get('EMBED_LR',.6));head_lr=float(os.environ.get('HEAD_LR',.008));tied_embed_lr=float(os.environ.get('TIED_EMBED_LR',.03));tied_embed_init_std=float(os.environ.get('TIED_EMBED_INIT_STD',.005));matrix_lr=float(os.environ.get('MATRIX_LR',.022));scalar_lr=float(os.environ.get('SCALAR_LR',.02));muon_momentum=float(os.environ.get('MUON_MOMENTUM',.99));muon_backend_steps=int(os.environ.get('MUON_BACKEND_STEPS',5));muon_momentum_warmup_start=float(os.environ.get('MUON_MOMENTUM_WARMUP_START',.92));muon_momentum_warmup_steps=int(os.environ.get('MUON_MOMENTUM_WARMUP_STEPS',1500));muon_row_normalize=bool(int(os.environ.get('MUON_ROW_NORMALIZE','1')));beta1=float(os.environ.get('BETA1',.9));beta2=float(os.environ.get('BETA2',.95));adam_eps=float(os.environ.get('ADAM_EPS',1e-08));grad_clip_norm=float(os.environ.get('GRAD_CLIP_NORM',.3));eval_stride=int(os.environ.get('EVAL_STRIDE',64));muon_beta2=float(os.environ.get('MUON_BETA2',.95));adam_wd=float(os.environ.get('ADAM_WD',.02));muon_wd=float(os.environ.get('MUON_WD',.095));embed_wd=float(os.environ.get('EMBED_WD',.085));ema_decay=float(os.environ.get('EMA_DECAY',.9965));ttt_enabled=bool(int(os.environ.get('TTT_ENABLED','0')));ttt_lr=float(os.environ.get('TTT_LR',.005));ttt_epochs=int(os.environ.get('TTT_EPOCHS',3));ttt_momentum=float(os.environ.get('TTT_MOMENTUM',.9));ttt_chunk_tokens=int(os.environ.get('TTT_CHUNK_TOKENS',32768));etlb_enabled=bool(int(os.environ.get('ETLB_ENABLED','0')));etlb_lr=float(os.environ.get('ETLB_LR',.05));etlb_steps=int(os.environ.get('ETLB_STEPS',5));etlb_clip=float(os.environ.get('ETLB_CLIP',3.));compressor=os.environ.get('COMPRESSOR','brotli');gptq_calibration_batches=int(os.environ.get('GPTQ_CALIBRATION_BATCHES',64));gptq_reserve_seconds=float(os.environ.get('GPTQ_RESERVE_SECONDS',12.));matrix_bits=int(os.environ.get('MATRIX_BITS',6));embed_bits=int(os.environ.get('EMBED_BITS',8));matrix_clip_sigmas=float(os.environ.get('MATRIX_CLIP_SIGMAS',12.85));embed_clip_sigmas=float(os.environ.get('EMBED_CLIP_SIGMAS',2e1));distributed='RANK'in os.environ and'WORLD_SIZE'in os.environ;rank=int(os.environ.get('RANK','0'));world_size=int(os.environ.get('WORLD_SIZE','1'));local_rank=int(os.environ.get('LOCAL_RANK','0'));is_main_process=rank==0;grad_accum_steps=8//world_size;datasets_dir=os.path.join(data_dir,'datasets',f"fineweb10B_sp{vocab_size}");train_files=os.path.join(datasets_dir,'fineweb_train_*.bin');val_files=os.path.join(datasets_dir,'fineweb_val_*.bin');tokenizer_path=os.path.join(data_dir,'tokenizers',f"fineweb_{vocab_size}_bpe.model");logfile=f"logs/{run_id}.txt";model_path='final_model.pt';quantized_model_path='final_model.int6.ptz' +_logger_hparams=None +def set_logging_hparams(h):global _logger_hparams;_logger_hparams=h +def log(msg,console=True): + if _logger_hparams is None:print(msg);return + if _logger_hparams.is_main_process: + if console:print(msg) + if _logger_hparams.logfile is not None: + with open(_logger_hparams.logfile,'a',encoding='utf-8')as f:print(msg,file=f) +class ValidationData: + def __init__(self,h,device): + self.sp=spm.SentencePieceProcessor(model_file=h.tokenizer_path) + if int(self.sp.vocab_size())!=h.vocab_size:raise ValueError(f"VOCAB_SIZE={h.vocab_size} does not match tokenizer vocab_size={int(self.sp.vocab_size())}") + self.val_tokens=load_validation_tokens(h.val_files,h.eval_seq_len);self.base_bytes_lut,self.has_leading_space_lut,self.is_boundary_token_lut=build_sentencepiece_luts(self.sp,h.vocab_size,device) +def build_sentencepiece_luts(sp,vocab_size,device): + sp_vocab_size=int(sp.vocab_size());assert sp.piece_to_id('▁')!=sp.unk_id(),"Tokenizer must have '▁' (space) as its own token for correct BPB byte counting";table_size=max(sp_vocab_size,vocab_size);base_bytes_np=np.zeros((table_size,),dtype=np.int16);has_leading_space_np=np.zeros((table_size,),dtype=np.bool_);is_boundary_token_np=np.ones((table_size,),dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id)or sp.is_unknown(token_id)or sp.is_unused(token_id):continue + is_boundary_token_np[token_id]=False + if sp.is_byte(token_id):base_bytes_np[token_id]=1;continue + piece=sp.id_to_piece(token_id) + if piece.startswith('▁'):has_leading_space_np[token_id]=True;piece=piece[1:] + base_bytes_np[token_id]=len(piece.encode('utf-8')) + return torch.tensor(base_bytes_np,dtype=torch.int16,device=device),torch.tensor(has_leading_space_np,dtype=torch.bool,device=device),torch.tensor(is_boundary_token_np,dtype=torch.bool,device=device) +def load_validation_tokens(pattern,seq_len): + files=[Path(p)for p in sorted(glob.glob(pattern))] + if not files:raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens=torch.cat([load_data_shard(file)for file in files]).contiguous();usable=(tokens.numel()-1)//seq_len*seq_len + if usable<=0:raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[:usable+1] +def load_data_shard(file): + header_bytes=256*np.dtype('0 else 0;num_sequences=(self.num_tokens[si]-1-phase)//self.seq_len;sequence_order=self.rng.permutation(num_sequences);self.start_inds[si]=(phase+sequence_order*self.seq_len).tolist() + def next_batch(self,global_tokens,grad_accum_steps): + device_tokens=global_tokens//(self.world_size*grad_accum_steps);device_batch_size=device_tokens//self.seq_len;remaining=np.array([len(s)for s in self.start_inds],dtype=np.float64);x=torch.empty((device_batch_size,self.seq_len),dtype=torch.int64);y=torch.empty((device_batch_size,self.seq_len),dtype=torch.int64) + for bi in range(device_batch_size): + total=remaining.sum() + if total<=0: + for si in range(len(self.files)):self._reset_shard(si) + remaining=np.array([len(s)for s in self.start_inds],dtype=np.float64);total=remaining.sum() + probs=remaining/total;si=int(self.rng.choice(len(self.files),p=probs));start_ind=self.start_inds[si].pop();remaining[si]-=1;mm=_get_shard_memmap(self.files[si]);window=torch.as_tensor(np.array(mm[start_ind:start_ind+self.seq_len+1],dtype=np.int64));x[bi]=window[:-1];y[bi]=window[1:] + return x.to(self.device,non_blocking=True),y.to(self.device,non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self,eps=None):super().__init__();self.eps=eps + def forward(self,x):return F.rms_norm(x,(x.size(-1),),eps=self.eps) +class CastedLinear(nn.Linear): + def forward(self,x):w=self.weight.to(x.dtype);bias=self.bias.to(x.dtype)if self.bias is not None else None;return F.linear(x,w,bias) +class Rotary(nn.Module): + def __init__(self,dim,base=1e4,train_seq_len=1024,rope_dims=0):super().__init__();self.dim=dim;self.base=base;self.train_seq_len=train_seq_len;self.rope_dims=rope_dims if rope_dims>0 else dim;inv_freq=1./base**(torch.arange(0,self.rope_dims,2,dtype=torch.float32)/self.rope_dims);self.register_buffer('inv_freq',inv_freq,persistent=False);self._seq_len_cached=0;self._cos_cached=None;self._sin_cached=None + def forward(self,seq_len,device,dtype): + if self._cos_cached is None or self._sin_cached is None or self._seq_len_cached!=seq_len or self._cos_cached.device!=device: + rd=self.rope_dims + if seq_len>self.train_seq_len:scale=seq_len/self.train_seq_len;new_base=self.base*scale**(rd/(rd-2));inv_freq=1./new_base**(torch.arange(0,rd,2,dtype=torch.float32,device=device)/rd) + else:inv_freq=self.inv_freq.to(device) + t=torch.arange(seq_len,device=device,dtype=inv_freq.dtype);freqs=torch.outer(t,inv_freq);self._cos_cached=freqs.cos()[None,:,None,:];self._sin_cached=freqs.sin()[None,:,None,:];self._seq_len_cached=seq_len + return self._cos_cached.to(dtype=dtype),self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x,cos,sin,rope_dims=0): + if rope_dims>0 and rope_dims0: + head_dim=h.model_dim//h.num_heads + for block in self.blocks:block.attn.rope_dims=h.rope_dims;block.attn.rotary=Rotary(head_dim,base=h.rope_base,train_seq_len=h.train_seq_len,rope_dims=h.rope_dims) + self.final_norm=RMSNorm();self.lm_head=None if h.tie_embeddings else CastedLinear(h.embedding_dim,h.vocab_size,bias=False) + if self.lm_head is not None:self.lm_head._zero_init=True + if h.xsa_last_n>0: + for i in range(max(0,h.num_layers-h.xsa_last_n),h.num_layers):self.blocks[i].attn.use_xsa=True + if h.parallel_residual_start>=0: + for i in range(h.parallel_residual_start,h.num_layers):self.blocks[i].parallel=True + self.looping_active=False + if h.num_loops>0: + loop_seg=list(range(h.loop_start,h.loop_end+1));all_indices=list(range(h.loop_start)) + for _ in range(h.num_loops+1):all_indices.extend(loop_seg) + all_indices.extend(range(h.loop_end+1,h.num_layers));num_enc=len(all_indices)//2;self.encoder_indices=all_indices[:num_enc];self.decoder_indices=all_indices[num_enc:] + else:self.encoder_indices=list(range(self.num_encoder_layers));self.decoder_indices=list(range(self.num_encoder_layers,h.num_layers)) + self.num_skip_weights=min(len(self.encoder_indices),len(self.decoder_indices));self.skip_weights=nn.Parameter(torch.ones(self.num_skip_weights,h.model_dim,dtype=torch.float32));self.skip_gates=nn.Parameter(torch.zeros(self.num_skip_weights,h.model_dim,dtype=torch.float32))if h.skip_gates_enabled else None;self._init_weights() + def _init_weights(self): + if self.tie_embeddings:nn.init.normal_(self.tok_emb.weight,mean=.0,std=self.tied_embed_init_std) + for(name,module)in self.named_modules(): + if isinstance(module,nn.Linear): + if getattr(module,'_zero_init',False):nn.init.zeros_(module.weight) + elif module.weight.ndim==2 and module.weight.shape[0]>=64 and module.weight.shape[1]>=64:nn.init.orthogonal_(module.weight,gain=1.) + def forward_logits(self,input_ids): + x=self.tok_emb(input_ids);x=F.rms_norm(x,(x.size(-1),)) + if self.embed_proj is not None:x=self.embed_proj(x) + x0=x;skips=[];enc_iter=self.encoder_indices if self.looping_active else range(self.num_encoder_layers);dec_iter=self.decoder_indices if self.looping_active else range(self.num_encoder_layers,self.num_encoder_layers+self.num_decoder_layers) + for i in enc_iter:x=self.blocks[i](x,x0);skips.append(x) + for(skip_idx,i)in enumerate(dec_iter): + if skip_idxG.size(1) + if transposed:X=X.T + for _ in range(steps):A=X@X.T;B=b*A+c*A@A;X=a*X+B@X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self,params,lr,momentum,backend_steps,nesterov=True,weight_decay=.0,row_normalize=False):super().__init__(params,dict(lr=lr,momentum=momentum,backend_steps=backend_steps,nesterov=nesterov,weight_decay=weight_decay,row_normalize=row_normalize)) + @torch.no_grad() + def step(self,closure=None): + loss=None + if closure is not None: + with torch.enable_grad():loss=closure() + distributed=dist.is_available()and dist.is_initialized();world_size=dist.get_world_size()if distributed else 1;rank=dist.get_rank()if distributed else 0 + for group in self.param_groups: + params=group['params'] + if not params:continue + lr=group['lr'];momentum=group['momentum'];backend_steps=group['backend_steps'];nesterov=group['nesterov'];total_params=sum(int(p.numel())for p in params);updates_flat=torch.zeros(total_params,device=params[0].device,dtype=torch.bfloat16);curr=0 + for(i,p)in enumerate(params): + if i%world_size==rank and p.grad is not None: + g=p.grad;state=self.state[p] + if'momentum_buffer'not in state:state['momentum_buffer']=torch.zeros_like(g) + buf=state['momentum_buffer'];buf.mul_(momentum).add_(g) + if nesterov:g=g.add(buf,alpha=momentum) + if group.get('row_normalize',False):row_norms=g.float().norm(dim=-1,keepdim=True).clamp_min(1e-07);g=g/row_norms.to(g.dtype) + g=zeropower_via_newtonschulz5(g,steps=backend_steps);g*=max(1,g.size(0)/g.size(1))**.5;updates_flat[curr:curr+p.numel()]=g.reshape(-1) + curr+=p.numel() + if distributed:dist.all_reduce(updates_flat,op=dist.ReduceOp.SUM) + wd=group.get('weight_decay',.0);curr=0 + for p in params: + if wd>.0:p.data.mul_(1.-lr*wd) + g=updates_flat[curr:curr+p.numel()].view_as(p).to(dtype=p.dtype);p.add_(g,alpha=-lr);curr+=p.numel() + return loss +CONTROL_TENSOR_NAME_PATTERNS=tuple(pattern for pattern in os.environ.get('CONTROL_TENSOR_NAME_PATTERNS','attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,skip_gates').split(',')if pattern) +class Optimizers: + def __init__(self,h,base_model): + block_named_params=list(base_model.blocks.named_parameters());matrix_params=[p for(name,p)in block_named_params if p.ndim==2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)];scalar_params=[p for(name,p)in block_named_params if p.ndim<2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)] + if base_model.skip_weights.numel()>0:scalar_params.append(base_model.skip_weights) + if base_model.skip_gates is not None and base_model.skip_gates.numel()>0:scalar_params.append(base_model.skip_gates) + token_lr=h.tied_embed_lr if h.tie_embeddings else h.embed_lr;tok_params=[{'params':[base_model.tok_emb.weight],'lr':token_lr,'base_lr':token_lr}];self.optimizer_tok=torch.optim.AdamW(tok_params,betas=(h.beta1,h.beta2),eps=h.adam_eps,weight_decay=h.embed_wd,fused=True);self.optimizer_muon=Muon(matrix_params,lr=h.matrix_lr,momentum=h.muon_momentum,backend_steps=h.muon_backend_steps,weight_decay=h.muon_wd,row_normalize=h.muon_row_normalize) + for group in self.optimizer_muon.param_groups:group['base_lr']=h.matrix_lr + self.optimizer_scalar=torch.optim.AdamW([{'params':scalar_params,'lr':h.scalar_lr,'base_lr':h.scalar_lr}],betas=(h.beta1,h.beta2),eps=h.adam_eps,weight_decay=h.adam_wd,fused=True);self.optimizers=[self.optimizer_tok,self.optimizer_muon,self.optimizer_scalar] + if base_model.lm_head is not None:self.optimizer_head=torch.optim.Adam([{'params':[base_model.lm_head.weight],'lr':h.head_lr,'base_lr':h.head_lr}],betas=(h.beta1,h.beta2),eps=h.adam_eps,fused=True);self.optimizers.insert(1,self.optimizer_head) + else:self.optimizer_head=None + def __iter__(self):return iter(self.optimizers) + def zero_grad_all(self): + for opt in self.optimizers:opt.zero_grad(set_to_none=True) + def step(self): + for opt in self.optimizers:opt.step() + self.zero_grad_all() +def restore_fp32_params(model): + for module in model.modules(): + if isinstance(module,CastedLinear):module.float() + for(name,param)in model.named_parameters(): + if(param.ndim<2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS))and param.dtype!=torch.float32:param.data=param.data.float() +def collect_hessians(model,train_loader,h,device,n_calibration_batches=64): + hessians={};hooks=[] + def make_hook(name): + def hook_fn(module,inp,out): + x=inp[0].detach().float() + if x.ndim==3:x=x.reshape(-1,x.shape[-1]) + if name not in hessians:hessians[name]=torch.zeros(x.shape[1],x.shape[1],dtype=torch.float32,device=device) + hessians[name].addmm_(x.T,x) + return hook_fn + for(name,module)in model.named_modules(): + if isinstance(module,CastedLinear)and module.weight.numel()>65536: + cat=classify_param(name+'.weight') + if cat in('mlp','attn'):hooks.append(module.register_forward_hook(make_hook(name+'.weight'))) + if model.tie_embeddings: + hook_module=model.head_proj if model.head_proj is not None else model.final_norm + def make_output_hook(name): + def hook_fn(module,inp,out): + x=out.detach().float() + if x.ndim==3:x=x.reshape(-1,x.shape[-1]) + if name not in hessians:hessians[name]=torch.zeros(x.shape[1],x.shape[1],dtype=torch.float32,device=device) + hessians[name].addmm_(x.T,x) + return hook_fn + hooks.append(hook_module.register_forward_hook(make_output_hook('tok_emb.weight'))) + model.eval() + with torch.no_grad(): + for _ in range(n_calibration_batches):x,_=train_loader.next_batch(h.train_batch_tokens,h.grad_accum_steps);model.forward_logits(x) + for hook in hooks:hook.remove() + for name in hessians:hessians[name]=hessians[name].cpu()/n_calibration_batches + return hessians +def gptq_quantize_weight(w,H,clip_sigmas=3.,clip_range=63,block_size=128): + W_orig=w.float().clone();rows,cols=W_orig.shape;H=H.float().clone();dead=torch.diag(H)==0;H[dead,dead]=1;damp=.01*H.diag().mean();H.diagonal().add_(damp);perm=torch.argsort(H.diag(),descending=True);invperm=torch.argsort(perm);W_perm=W_orig[:,perm].clone();W_perm[:,dead[perm]]=0;H=H[perm][:,perm];Hinv=torch.cholesky_inverse(torch.linalg.cholesky(H));Hinv=torch.linalg.cholesky(Hinv,upper=True);row_std=W_orig.std(dim=1);s=(clip_sigmas*row_std/clip_range).clamp_min(1e-10).to(torch.float16);sf=s.float();Q=torch.zeros(rows,cols,dtype=torch.int8);W_work=W_perm.clone() + for i1 in range(0,cols,block_size): + i2=min(i1+block_size,cols);W_block=W_work[:,i1:i2].clone();Hinv_block=Hinv[i1:i2,i1:i2];Err=torch.zeros(rows,i2-i1) + for j in range(i2-i1):w_col=W_block[:,j];d=Hinv_block[j,j];q_col=torch.clamp(torch.round(w_col/sf),-clip_range,clip_range);Q[:,i1+j]=q_col.to(torch.int8);err=(w_col-q_col.float()*sf)/d;Err[:,j]=err;W_block[:,j:]-=err.unsqueeze(1)*Hinv_block[j,j:].unsqueeze(0) + if i20:out[name]=(q.float()*s.float().view(q.shape[0],*[1]*(q.ndim-1))).to(orig_dtype) + else:out[name]=(q.float()*float(s.item())).to(orig_dtype) + return out +_BSHF_MAGIC=b'BSHF' +def _byte_shuffle(data,stride=2): + if stride<=1 or len(data)0: + base_model.train();chunk_seqs=(chunk_end-chunk_start)//seq_len + if chunk_seqs>0: + cos_lr=h.ttt_lr*.5*(1.+math.cos(math.pi*ci/max(num_chunks-1,1))) + for pg in optimizer.param_groups:pg['lr']=cos_lr + my_seq_s=chunk_seqs*rank//world_size;my_seq_e=chunk_seqs*(rank+1)//world_size;my_chunk_seqs=my_seq_e-my_seq_s + for _ep in range(h.ttt_epochs): + for bs in range(0,my_chunk_seqs,batch_seqs): + be=min(bs+batch_seqs,my_chunk_seqs);actual_bs=my_seq_s+bs;start_tok=chunk_start+actual_bs*seq_len;end_tok=chunk_start+(my_seq_s+be)*seq_len+1 + if end_tok>val_data.val_tokens.numel():continue + local=val_data.val_tokens[start_tok:end_tok].to(device=device,dtype=torch.int64);x=local[:-1].reshape(-1,seq_len);y=local[1:].reshape(-1,seq_len);optimizer.zero_grad(set_to_none=True) + with torch.autocast(device_type='cuda',dtype=torch.bfloat16):loss=base_model(x,y) + loss.backward() + if world_size>1: + for p in ttt_params: + if p.grad is not None:dist.all_reduce(p.grad,op=dist.ReduceOp.AVG) + torch.nn.utils.clip_grad_norm_(ttt_params,1.);optimizer.step() + if dist.is_available()and dist.is_initialized():dist.all_reduce(loss_sum,op=dist.ReduceOp.SUM);dist.all_reduce(token_count,op=dist.ReduceOp.SUM);dist.all_reduce(byte_count,op=dist.ReduceOp.SUM) + for p in base_model.parameters():p.requires_grad_(True) + base_model.eval();return _loss_bpb(loss_sum,token_count,byte_count) +def timed_eval(label,fn,*args,**kwargs):torch.cuda.synchronize();t0=time.perf_counter();val_loss,val_bpb=fn(*args,**kwargs);torch.cuda.synchronize();elapsed_ms=1e3*(time.perf_counter()-t0);log(f"{label} val_loss:{val_loss:.8f} val_bpb:{val_bpb:.8f} eval_time:{elapsed_ms:.0f}ms");return val_loss,val_bpb +def train_model(h,device,val_data): + base_model=GPT(h).to(device).bfloat16();restore_fp32_params(base_model);compiled_model=torch.compile(base_model,dynamic=False,fullgraph=True) + if h.distributed:model=DDP(compiled_model,device_ids=[h.local_rank],broadcast_buffers=False) + else:model=compiled_model + log(f"model_params:{sum(p.numel()for p in base_model.parameters())}");optimizers=Optimizers(h,base_model);train_loader=ShuffledSequenceLoader(h,device);max_wallclock_ms=1e3*h.max_wallclock_seconds if h.max_wallclock_seconds>0 else None + if max_wallclock_ms is not None:max_wallclock_ms-=h.gptq_reserve_seconds*1e3;log(f"gptq:reserving {h.gptq_reserve_seconds:.0f}s, effective={max_wallclock_ms:.0f}ms") + def training_frac(step,elapsed_ms): + if max_wallclock_ms is None:return step/max(h.iterations,1) + return elapsed_ms/max(max_wallclock_ms,1e-09) + def lr_mul(frac): + if h.warmdown_frac<=0:return 1. + if frac>=1.-h.warmdown_frac:return max((1.-frac)/h.warmdown_frac,h.min_lr) + return 1. + def step_fn(step,lr_scale): + optimizers.zero_grad_all();train_loss=torch.zeros((),device=device) + for micro_step in range(h.grad_accum_steps): + if h.distributed:model.require_backward_grad_sync=micro_step==h.grad_accum_steps-1 + x,y=train_loader.next_batch(h.train_batch_tokens,h.grad_accum_steps) + with torch.autocast(device_type='cuda',dtype=torch.bfloat16,enabled=True):loss=model(x,y) + train_loss+=loss.detach();(loss/h.grad_accum_steps).backward() + train_loss/=h.grad_accum_steps;frac=min(step/h.muon_momentum_warmup_steps,1.)if h.muon_momentum_warmup_steps>0 else 1.;muon_momentum=(1-frac)*h.muon_momentum_warmup_start+frac*h.muon_momentum + for group in optimizers.optimizer_muon.param_groups:group['momentum']=muon_momentum + for opt in optimizers: + for group in opt.param_groups:group['lr']=group['base_lr']*lr_scale + if h.grad_clip_norm>0:torch.nn.utils.clip_grad_norm_(base_model.parameters(),h.grad_clip_norm) + optimizers.step();return train_loss + if h.warmup_steps>0: + initial_model_state={name:tensor.detach().cpu().clone()for(name,tensor)in base_model.state_dict().items()};initial_optimizer_states=[copy.deepcopy(opt.state_dict())for opt in optimizers];model.train() + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step,1.) + if warmup_step<=5 or(warmup_step+1)%10==0 or warmup_step+1==h.warmup_steps:log(f"warmup_step: {warmup_step+1}/{h.warmup_steps}") + if h.num_loops>0: + base_model.looping_active=True;log(f"loop_warmup:enabled encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step,1.) + if warmup_step<=5 or(warmup_step+1)%10==0 or warmup_step+1==h.warmup_steps:log(f"loop_warmup_step: {warmup_step+1}/{h.warmup_steps}") + base_model.looping_active=False + base_model.load_state_dict(initial_model_state,strict=True) + for(opt,state)in zip(optimizers,initial_optimizer_states,strict=True):opt.load_state_dict(state) + optimizers.zero_grad_all() + if h.distributed:model.require_backward_grad_sync=True + train_loader=ShuffledSequenceLoader(h,device) + ema_state={name:t.detach().float().clone()for(name,t)in base_model.state_dict().items()};ema_decay=h.ema_decay;training_time_ms=.0;stop_after_step=None;torch.cuda.synchronize();t0=time.perf_counter();step=0 + while True: + last_step=step==h.iterations or stop_after_step is not None and step>=stop_after_step;should_validate=last_step or h.val_loss_every>0 and step%h.val_loss_every==0 + if should_validate:torch.cuda.synchronize();training_time_ms+=1e3*(time.perf_counter()-t0);val_loss,val_bpb=eval_val(h,device,val_data,model);log(f"{step}/{h.iterations} val_loss: {val_loss:.4f} val_bpb: {val_bpb:.4f}");torch.cuda.synchronize();t0=time.perf_counter() + if last_step: + if stop_after_step is not None and step0 and not base_model.looping_active and frac>=h.enable_looping_at:base_model.looping_active=True;log(f"layer_loop:enabled step:{step} frac:{frac:.3f} encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + train_loss=step_fn(step,scale) + with torch.no_grad(): + for(name,t)in base_model.state_dict().items():ema_state[name].mul_(ema_decay).add_(t.detach().float(),alpha=1.-ema_decay) + step+=1;approx_training_time_ms=training_time_ms+1e3*(time.perf_counter()-t0);should_log_train=h.train_log_every>0 and(step<=5 or step%h.train_log_every==0 or stop_after_step is not None) + if should_log_train:tok_per_sec=step*h.train_batch_tokens/(approx_training_time_ms/1e3);log(f"{step}/{h.iterations} train_loss: {train_loss.item():.4f} train_time: {approx_training_time_ms/60000:.1f}m tok/s: {tok_per_sec:.0f}") + reached_cap=max_wallclock_ms is not None and approx_training_time_ms>=max_wallclock_ms + if h.distributed and max_wallclock_ms is not None:reached_cap_tensor=torch.tensor(int(reached_cap),device=device);dist.all_reduce(reached_cap_tensor,op=dist.ReduceOp.MAX);reached_cap=bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap:stop_after_step=step + log(f"peak memory allocated: {torch.cuda.max_memory_allocated()//1024//1024} MiB reserved: {torch.cuda.max_memory_reserved()//1024//1024} MiB");log('ema:applying EMA weights');current_state=base_model.state_dict();avg_state={name:t.to(dtype=current_state[name].dtype)for(name,t)in ema_state.items()};base_model.load_state_dict(avg_state,strict=True);return base_model,compiled_model +def train_and_eval(h,device): + random.seed(h.seed);np.random.seed(h.seed);torch.manual_seed(h.seed);torch.cuda.manual_seed_all(h.seed);val_data=ValidationData(h,device);log(f"train_shards: {len(list(Path(h.datasets_dir).resolve().glob("fineweb_train_*.bin")))}");log(f"val_tokens: {val_data.val_tokens.numel()-1}");base_model,compiled_model=train_model(h,device,val_data);torch._dynamo.reset();timed_eval('pre-quantization post-ema',eval_val,h,device,val_data,compiled_model);serialize(h,base_model,Path(__file__).read_text(encoding='utf-8')) + if h.distributed:dist.barrier() + eval_model=deserialize(h,device) + if h.num_loops>0:eval_model.looping_active=True + compiled_model=torch.compile(eval_model,dynamic=False,fullgraph=True);timed_eval('quantized',eval_val,h,device,val_data,compiled_model) + if h.sliding_window_enabled:timed_eval('quantized_sliding_window',eval_val_sliding,h,device,val_data,eval_model) + if h.ttt_enabled and h.sliding_window_enabled: + del eval_model,compiled_model;torch._dynamo.reset();torch.cuda.empty_cache();ttt_model=deserialize(h,device) + if h.num_loops>0:ttt_model.looping_active=True + timed_eval('quantized_ttt',eval_val_ttt,h,device,val_data,ttt_model);del ttt_model + if h.etlb_enabled and h.sliding_window_enabled: + if'eval_model'not in dir(): + eval_model=deserialize(h,device) + if h.num_loops>0:eval_model.looping_active=True + timed_eval('quantized_sliding_etlb',eval_val_sliding_etlb,h,device,val_data,eval_model) +def main(): + world_size=int(os.environ.get('WORLD_SIZE','1'));local_rank=int(os.environ.get('LOCAL_RANK','0'));distributed='RANK'in os.environ and'WORLD_SIZE'in os.environ + if not torch.cuda.is_available():raise RuntimeError('CUDA is required') + if world_size<=0:raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8%world_size!=0:raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + device=torch.device('cuda',local_rank);torch.cuda.set_device(device) + if distributed:dist.init_process_group(backend='nccl',device_id=device);dist.barrier() + torch.backends.cuda.matmul.allow_tf32=True;torch.backends.cudnn.allow_tf32=True;torch.set_float32_matmul_precision('high');from torch.backends.cuda import enable_cudnn_sdp,enable_flash_sdp,enable_math_sdp,enable_mem_efficient_sdp;enable_cudnn_sdp(False);enable_flash_sdp(True);enable_mem_efficient_sdp(False);enable_math_sdp(False);torch._dynamo.config.optimize_ddp=False;h=Hyperparameters();set_logging_hparams(h) + if h.is_main_process: + os.makedirs('logs',exist_ok=True);log(100*'=',console=False);log('Hyperparameters:',console=True) + for(k,v)in sorted(vars(type(h)).items()): + if not k.startswith('_'):log(f" {k}: {v}",console=True) + log('='*100,console=False);log(f"Running Python {sys.version}",console=False);log(f"Running PyTorch {torch.__version__}",console=False);log(subprocess.run(['nvidia-smi'],stdout=subprocess.PIPE,stderr=subprocess.PIPE,text=True,check=False).stdout,console=False);log('='*100,console=False) + train_and_eval(h,device) + if distributed:dist.destroy_process_group() +if __name__=='__main__':main() \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log new file mode 100644 index 0000000000..efff7bbc84 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log @@ -0,0 +1,137 @@ +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] ***************************************** +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed1337.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed1337 + scalar_lr: 0.02 + seed: 1337 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0047 val_bpb: 3.4867 +1/20000 train_loss: 9.0080 train_time: 0.0m tok/s: 8089272 +2/20000 train_loss: 12.3015 train_time: 0.0m tok/s: 8022559 +3/20000 train_loss: 11.0711 train_time: 0.0m tok/s: 7954927 +4/20000 train_loss: 9.4520 train_time: 0.0m tok/s: 7918173 +5/20000 train_loss: 8.3679 train_time: 0.0m tok/s: 7892396 +500/20000 train_loss: 3.3349 train_time: 0.9m tok/s: 7690797 +1000/20000 train_loss: 3.2063 train_time: 1.7m tok/s: 7685016 +1500/20000 train_loss: 3.0906 train_time: 2.6m tok/s: 7688746 +2000/20000 train_loss: 3.0213 train_time: 3.4m tok/s: 7689501 +2500/20000 train_loss: 3.0327 train_time: 4.3m tok/s: 7692100 +layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0867 train_time: 5.2m tok/s: 7563624 +3500/20000 train_loss: 2.9550 train_time: 6.3m tok/s: 7265791 +4000/20000 train_loss: 2.9969 train_time: 7.5m tok/s: 7031651 +4000/20000 val_loss: 2.9178 val_bpb: 1.1298 +4500/20000 train_loss: 2.8096 train_time: 8.6m tok/s: 6882955 +5000/20000 train_loss: 2.7590 train_time: 9.7m tok/s: 6766993 +5052/20000 val_loss: 2.8139 val_bpb: 1.0896 +stopping_early: wallclock_cap train_time: 588041ms step: 5052/20000 +peak memory allocated: 35373 MiB reserved: 35478 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81131292 val_bpb:1.08857004 eval_time:6825ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15970240 bytes +Total submission size quantized+brotli: 16028607 bytes +quantized val_loss:2.84129693 val_bpb:1.10018017 eval_time:22233ms +quantized_sliding_window val_loss:2.79834517 val_bpb:1.08354879 eval_time:83683ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log new file mode 100644 index 0000000000..b3f4831c8e --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log @@ -0,0 +1,148 @@ +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] ***************************************** +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed1337.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed1337 + scalar_lr: 0.02 + seed: 1337 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0047 val_bpb: 3.4867 +1/20000 train_loss: 9.0080 train_time: 0.0m tok/s: 8336072 +2/20000 train_loss: 12.2992 train_time: 0.0m tok/s: 8184327 +3/20000 train_loss: 11.0456 train_time: 0.0m tok/s: 8084574 +4/20000 train_loss: 9.4139 train_time: 0.0m tok/s: 8030457 +5/20000 train_loss: 8.3296 train_time: 0.0m tok/s: 7997738 +500/20000 train_loss: 3.3332 train_time: 0.8m tok/s: 7731821 +1000/20000 train_loss: 3.2115 train_time: 1.7m tok/s: 7728010 +1500/20000 train_loss: 3.0985 train_time: 2.5m tok/s: 7736121 +2000/20000 train_loss: 3.0193 train_time: 3.4m tok/s: 7741721 +layer_loop:enabled step:2026 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 2.9987 train_time: 4.6m tok/s: 7114884 +3000/20000 train_loss: 3.0367 train_time: 5.8m tok/s: 6727898 +3500/20000 train_loss: 2.9188 train_time: 7.1m tok/s: 6476757 +4000/20000 train_loss: 2.9547 train_time: 8.3m tok/s: 6299690 +4000/20000 val_loss: 2.8728 val_bpb: 1.1124 +4500/20000 train_loss: 2.7579 train_time: 9.6m tok/s: 6170374 +4598/20000 val_loss: 2.8075 val_bpb: 1.0871 +stopping_early: wallclock_cap train_time: 588092ms step: 4598/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80424019 val_bpb:1.08583141 eval_time:6825ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15975659 bytes +Total submission size quantized+brotli: 15992450 bytes +quantized val_loss:2.83421669 val_bpb:1.09743862 eval_time:8477ms +quantized_sliding_window val_loss:2.79040941 val_bpb:1.08047598 eval_time:88503ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78678937 val_bpb:1.07907426 eval_time:334602ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log new file mode 100644 index 0000000000..12344ae02d --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log @@ -0,0 +1,137 @@ +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] ***************************************** +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed2024.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed2024 + scalar_lr: 0.02 + seed: 2024 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0072 val_bpb: 3.4877 +1/20000 train_loss: 9.0094 train_time: 0.0m tok/s: 2135125 +2/20000 train_loss: 12.2873 train_time: 0.0m tok/s: 3370519 +3/20000 train_loss: 11.1022 train_time: 0.0m tok/s: 4161091 +4/20000 train_loss: 9.4986 train_time: 0.0m tok/s: 4708808 +5/20000 train_loss: 8.4185 train_time: 0.0m tok/s: 5113931 +500/20000 train_loss: 3.3297 train_time: 0.9m tok/s: 7577257 +1000/20000 train_loss: 3.2011 train_time: 1.7m tok/s: 7627264 +1500/20000 train_loss: 3.0892 train_time: 2.6m tok/s: 7647347 +2000/20000 train_loss: 3.0172 train_time: 3.4m tok/s: 7660145 +2500/20000 train_loss: 3.0322 train_time: 4.3m tok/s: 7668167 +layer_loop:enabled step:2869 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0873 train_time: 5.2m tok/s: 7571205 +3500/20000 train_loss: 2.9600 train_time: 6.3m tok/s: 7240254 +4000/20000 train_loss: 2.9966 train_time: 7.5m tok/s: 7010853 +4000/20000 val_loss: 2.9162 val_bpb: 1.1292 +4500/20000 train_loss: 2.8106 train_time: 8.6m tok/s: 6865615 +5000/20000 train_loss: 2.7611 train_time: 9.7m tok/s: 6750831 +5042/20000 val_loss: 2.8137 val_bpb: 1.0895 +stopping_early: wallclock_cap train_time: 588097ms step: 5042/20000 +peak memory allocated: 35373 MiB reserved: 35476 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81121527 val_bpb:1.08853223 eval_time:6640ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15967559 bytes +Total submission size quantized+brotli: 16025926 bytes +quantized val_loss:2.84118514 val_bpb:1.10013688 eval_time:22696ms +quantized_sliding_window val_loss:2.79862166 val_bpb:1.08365585 eval_time:108261ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log new file mode 100644 index 0000000000..005879a74f --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log @@ -0,0 +1,148 @@ +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] ***************************************** +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed2024.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed2024 + scalar_lr: 0.02 + seed: 2024 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0072 val_bpb: 3.4877 +1/20000 train_loss: 9.0094 train_time: 0.0m tok/s: 8317541 +2/20000 train_loss: 12.2867 train_time: 0.0m tok/s: 8171456 +3/20000 train_loss: 11.0810 train_time: 0.0m tok/s: 8077182 +4/20000 train_loss: 9.4616 train_time: 0.0m tok/s: 8025433 +5/20000 train_loss: 8.3776 train_time: 0.0m tok/s: 7991921 +500/20000 train_loss: 3.3317 train_time: 0.8m tok/s: 7750924 +1000/20000 train_loss: 3.2122 train_time: 1.7m tok/s: 7739976 +1500/20000 train_loss: 3.0989 train_time: 2.5m tok/s: 7739565 +2000/20000 train_loss: 3.0174 train_time: 3.4m tok/s: 7741289 +layer_loop:enabled step:2026 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 2.9978 train_time: 4.6m tok/s: 7114500 +3000/20000 train_loss: 3.0376 train_time: 5.8m tok/s: 6727692 +3500/20000 train_loss: 2.9237 train_time: 7.1m tok/s: 6476233 +4000/20000 train_loss: 2.9570 train_time: 8.3m tok/s: 6300234 +4000/20000 val_loss: 2.8752 val_bpb: 1.1133 +4500/20000 train_loss: 2.7565 train_time: 9.6m tok/s: 6170620 +4598/20000 val_loss: 2.8097 val_bpb: 1.0879 +stopping_early: wallclock_cap train_time: 588079ms step: 4598/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80642814 val_bpb:1.08667860 eval_time:6803ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.8s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15972913 bytes +Total submission size quantized+brotli: 15989704 bytes +quantized val_loss:2.83555862 val_bpb:1.09795824 eval_time:8518ms +quantized_sliding_window val_loss:2.79237236 val_bpb:1.08123606 eval_time:88486ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78921781 val_bpb:1.08001458 eval_time:333698ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log new file mode 100644 index 0000000000..1ba76d085a --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log @@ -0,0 +1,137 @@ +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] ***************************************** +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed42.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed42 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0091 val_bpb: 3.4884 +1/20000 train_loss: 9.0116 train_time: 0.0m tok/s: 8131746 +2/20000 train_loss: 12.3391 train_time: 0.0m tok/s: 8043815 +3/20000 train_loss: 11.1222 train_time: 0.0m tok/s: 7969544 +4/20000 train_loss: 9.4225 train_time: 0.0m tok/s: 7926698 +5/20000 train_loss: 8.3224 train_time: 0.0m tok/s: 7908161 +500/20000 train_loss: 3.3361 train_time: 0.9m tok/s: 7698191 +1000/20000 train_loss: 3.2032 train_time: 1.7m tok/s: 7690409 +1500/20000 train_loss: 3.0970 train_time: 2.6m tok/s: 7691199 +2000/20000 train_loss: 3.0195 train_time: 3.4m tok/s: 7692530 +2500/20000 train_loss: 3.0329 train_time: 4.3m tok/s: 7694479 +layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0914 train_time: 5.2m tok/s: 7585577 +3500/20000 train_loss: 2.9592 train_time: 6.3m tok/s: 7267839 +4000/20000 train_loss: 3.0006 train_time: 7.5m tok/s: 7031051 +4000/20000 val_loss: 2.9219 val_bpb: 1.1314 +4500/20000 train_loss: 2.8134 train_time: 8.6m tok/s: 6882962 +5000/20000 train_loss: 2.7602 train_time: 9.7m tok/s: 6767536 +5053/20000 val_loss: 2.8183 val_bpb: 1.0913 +stopping_early: wallclock_cap train_time: 588126ms step: 5053/20000 +peak memory allocated: 35373 MiB reserved: 35476 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81580054 val_bpb:1.09030770 eval_time:6794ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15966716 bytes +Total submission size quantized+brotli: 16025083 bytes +quantized val_loss:2.84662455 val_bpb:1.10224308 eval_time:22261ms +quantized_sliding_window val_loss:2.80368562 val_bpb:1.08561667 eval_time:108586ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log new file mode 100644 index 0000000000..fe0e26d97d --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log @@ -0,0 +1,148 @@ +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] ***************************************** +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed42.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed42 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0091 val_bpb: 3.4884 +1/20000 train_loss: 9.0116 train_time: 0.0m tok/s: 8266749 +2/20000 train_loss: 12.3389 train_time: 0.0m tok/s: 8207894 +3/20000 train_loss: 11.1044 train_time: 0.0m tok/s: 8094344 +4/20000 train_loss: 9.3899 train_time: 0.0m tok/s: 8023875 +5/20000 train_loss: 8.2859 train_time: 0.0m tok/s: 8003812 +500/20000 train_loss: 3.3346 train_time: 0.8m tok/s: 7756714 +1000/20000 train_loss: 3.2165 train_time: 1.7m tok/s: 7748534 +1500/20000 train_loss: 3.1056 train_time: 2.5m tok/s: 7753442 +2000/20000 train_loss: 3.0221 train_time: 3.4m tok/s: 7757329 +layer_loop:enabled step:2031 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.0017 train_time: 4.6m tok/s: 7134514 +3000/20000 train_loss: 3.0386 train_time: 5.8m tok/s: 6743066 +3500/20000 train_loss: 2.9250 train_time: 7.1m tok/s: 6490513 +4000/20000 train_loss: 2.9582 train_time: 8.3m tok/s: 6313257 +4000/20000 val_loss: 2.8764 val_bpb: 1.1138 +4500/20000 train_loss: 2.7575 train_time: 9.5m tok/s: 6182555 +4606/20000 val_loss: 2.8099 val_bpb: 1.0880 +stopping_early: wallclock_cap train_time: 588138ms step: 4606/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.80669912 val_bpb:1.08678353 eval_time:6833ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15973010 bytes +Total submission size quantized+brotli: 15989801 bytes +quantized val_loss:2.83668775 val_bpb:1.09839545 eval_time:8495ms +quantized_sliding_window val_loss:2.79328837 val_bpb:1.08159075 eval_time:88058ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78953660 val_bpb:1.08013802 eval_time:323884ms