From 6f75a8bf21e79baeab0ca58b90142c974a4502f7 Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 12 Apr 2026 10:04:08 -0400 Subject: [PATCH 1/5] SP8192 frontier: adapted from PR #1394 (depth recurrence, GPTQ, SDClip) --- .../2026-04-12_SP8192_Frontier/train_gpt.py | 1408 +++++++++++++++++ 1 file changed, 1408 insertions(+) create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py new file mode 100644 index 0000000000..b024660f31 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py @@ -0,0 +1,1408 @@ +import collections +import copy +import glob +import io +import lzma +import math +import os +from pathlib import Path +import random +import re +import subprocess +import sys +import time +import uuid + +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import Tensor, nn + +from flash_attn_interface import flash_attn_func as flash_attn_3_func + +# ---------------------------------------- +# Hyperparameters +# ---------------------------------------- + +class Hyperparameters(): + # Experiment settings + data_dir = os.environ.get('DATA_DIR', './data/') + seed = int(os.environ.get('SEED', 1337)) + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + + # Training length + iterations = int(os.environ.get('ITERATIONS', 20000)) + warmdown_frac = float(os.environ.get('WARMDOWN_FRAC', 0.667)) + warmup_steps = int(os.environ.get('WARMUP_STEPS', 20)) + train_batch_tokens = int(os.environ.get('TRAIN_BATCH_TOKENS', 2048 * 48 * 8)) + train_seq_len = int(os.environ.get('TRAIN_SEQ_LEN', 2048)) + train_log_every = int(os.environ.get('TRAIN_LOG_EVERY', 500)) + max_wallclock_seconds = float(os.environ.get('MAX_WALLCLOCK_SECONDS', 
600.0)) + + # Validation/Evals + val_batch_tokens = int(os.environ.get('VAL_BATCH_TOKENS', 2048 * 32 * 8)) + eval_seq_len = int(os.environ.get('EVAL_SEQ_LEN', 2048)) + val_loss_every = int(os.environ.get('VAL_LOSS_EVERY', 4000)) + sliding_window_enabled = bool(int(os.environ.get('SLIDING_WINDOW_ENABLED', '1'))) + + # Model architecture + vocab_size = int(os.environ.get('VOCAB_SIZE', 8192)) + num_layers = int(os.environ.get('NUM_LAYERS', 11)) + xsa_last_n = int(os.environ.get('XSA_LAST_N', 11)) + model_dim = int(os.environ.get('MODEL_DIM', 512)) + embedding_dim = int(os.environ.get('EMBEDDING_DIM', 512)) + num_kv_heads = int(os.environ.get('NUM_KV_HEADS', 4)) + num_heads = int(os.environ.get('NUM_HEADS', 8)) + mlp_mult = float(os.environ.get('MLP_MULT', 4.0)) + skip_gates_enabled = bool(int(os.environ.get('SKIP_GATES_ENABLED', '1'))) + tie_embeddings = bool(int(os.environ.get('TIE_EMBEDDINGS', '1'))) + logit_softcap = float(os.environ.get('LOGIT_SOFTCAP', 30.0)) + rope_base = float(os.environ.get('ROPE_BASE', 10000.0)) + rope_dims = int(os.environ.get('ROPE_DIMS', 16)) + rope_train_seq_len = int(os.environ.get('ROPE_TRAIN_SEQ_LEN', 2048)) + ln_scale = bool(int(os.environ.get('LN_SCALE', '1'))) + qk_gain_init = float(os.environ.get('QK_GAIN_INIT', 4.0)) + + # Layer looping + num_loops = int(os.environ.get('NUM_LOOPS', 2)) + loop_start = int(os.environ.get('LOOP_START', 4)) + loop_end = int(os.environ.get('LOOP_END', 5)) + enable_looping_at = float(os.environ.get('ENABLE_LOOPING_AT', 0.5)) + + # Optimizer + min_lr = float(os.environ.get('MIN_LR', 0.0)) + embed_lr = float(os.environ.get('EMBED_LR', 0.6)) + head_lr = float(os.environ.get('HEAD_LR', 0.008)) + tied_embed_lr = float(os.environ.get('TIED_EMBED_LR', 0.03)) + tied_embed_init_std = float(os.environ.get('TIED_EMBED_INIT_STD', 0.005)) + matrix_lr = float(os.environ.get('MATRIX_LR', 0.02)) + scalar_lr = float(os.environ.get('SCALAR_LR', 0.02)) + muon_momentum = float(os.environ.get('MUON_MOMENTUM', 0.99)) + 
muon_backend_steps = int(os.environ.get('MUON_BACKEND_STEPS', 5)) + muon_momentum_warmup_start = float(os.environ.get('MUON_MOMENTUM_WARMUP_START', 0.92)) + muon_momentum_warmup_steps = int(os.environ.get('MUON_MOMENTUM_WARMUP_STEPS', 1500)) + muon_row_normalize = bool(int(os.environ.get('MUON_ROW_NORMALIZE', '1'))) + beta1 = float(os.environ.get('BETA1', 0.9)) + beta2 = float(os.environ.get('BETA2', 0.95)) + adam_eps = float(os.environ.get('ADAM_EPS', 1e-8)) + grad_clip_norm = float(os.environ.get('GRAD_CLIP_NORM', 0.3)) + eval_stride = int(os.environ.get('EVAL_STRIDE', 64)) + muon_beta2 = float(os.environ.get('MUON_BETA2', 0.95)) + adam_wd = float(os.environ.get('ADAM_WD', 0.02)) + muon_wd = float(os.environ.get('MUON_WD', 0.085)) + embed_wd = float(os.environ.get('EMBED_WD', 0.085)) + ema_decay = float(os.environ.get('EMA_DECAY', 0.997)) + + # Quantization & Compression + compressor = os.environ.get('COMPRESSOR', 'brotli') + gptq_calibration_batches = int(os.environ.get('GPTQ_CALIBRATION_BATCHES', 64)) + gptq_reserve_seconds = float(os.environ.get('GPTQ_RESERVE_SECONDS', 12.0)) + matrix_bits = int(os.environ.get('MATRIX_BITS', 6)) + embed_bits = int(os.environ.get('EMBED_BITS', 8)) + matrix_clip_sigmas = float(os.environ.get('MATRIX_CLIP_SIGMAS', 12.85)) + embed_clip_sigmas = float(os.environ.get('EMBED_CLIP_SIGMAS', 20.0)) + + # Distributed setup + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + is_main_process = rank == 0 + grad_accum_steps = 8 // world_size + + # Data paths + datasets_dir = os.path.join(data_dir, 'datasets', f'fineweb10B_sp{vocab_size}') + train_files = os.path.join(datasets_dir, 'fineweb_train_*.bin') + val_files = os.path.join(datasets_dir, 'fineweb_val_*.bin') + tokenizer_path = os.path.join(data_dir, 'tokenizers', f'fineweb_{vocab_size}_bpe.model') + + # Experiment files 
def log(msg, console: bool = True) -> None:
    """Emit ``msg`` to stdout and/or the run logfile.

    Until ``set_logging_hparams`` has stored a hyperparameter object in the
    module-level ``_logger_hparams``, the message is simply printed by every
    caller. Afterwards only the main process emits anything.

    Args:
        msg: Object to print.
        console: When true, the main process also prints to stdout. The
            logfile is written regardless of this flag.
    """
    hparams = _logger_hparams
    if hparams is None:
        # Logging is not configured yet: fall back to a plain print.
        print(msg)
        return
    if not hparams.is_main_process:
        return
    if console:
        print(msg)
    logfile = hparams.logfile
    if logfile is None:
        return
    # Re-open in append mode on every call so each message is flushed to disk.
    with open(logfile, "a", encoding="utf-8") as sink:
        print(msg, file=sink)
def load_validation_tokens(pattern: str, seq_len: int) -> Tensor:
    """Load and concatenate every validation shard matching ``pattern``.

    Shards are read in sorted filename order with ``load_data_shard`` and
    concatenated. The result is trimmed to a whole number of ``seq_len``-token
    sequences plus one trailing token (presumably so shifted next-token
    targets exist for the last sequence).

    Args:
        pattern: Glob pattern selecting the shard files.
        seq_len: Sequence length the token stream is trimmed to a multiple of.

    Returns:
        A 1-D tensor holding ``k * seq_len + 1`` tokens for the largest
        ``k >= 1`` that fits.

    Raises:
        FileNotFoundError: If no file matches ``pattern``.
        ValueError: If ``seq_len`` is not positive, or the shards do not hold
            at least ``seq_len + 1`` tokens.
    """
    files = [Path(p) for p in sorted(glob.glob(pattern))]
    if not files:
        raise FileNotFoundError(f"No files found for pattern: {pattern}")
    if seq_len <= 0:
        # Would otherwise surface as ZeroDivisionError or a misleading message.
        raise ValueError(f"seq_len must be positive, got {seq_len}")
    # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*.
    tokens = torch.cat([load_data_shard(file) for file in files]).contiguous()
    usable = ((tokens.numel() - 1) // seq_len) * seq_len
    if usable <= 0:
        # The caller decides which setting feeds seq_len (ValidationData passes
        # the eval sequence length), so no environment variable is named here.
        raise ValueError(
            f"Validation split is too short for seq_len={seq_len}: "
            f"got {tokens.numel()} tokens"
        )
    return tokens[: usable + 1]
class CastedLinear(nn.Linear):
    """``nn.Linear`` whose parameters are cast to the input dtype per call.

    The stored weight (and bias, if any) keep their own dtype; only the
    copies used in the matmul follow ``x.dtype``.
    """

    def forward(self, x: Tensor) -> Tensor:
        target = x.dtype
        weight = self.weight.to(target)
        if self.bias is None:
            return F.linear(x, weight, None)
        return F.linear(x, weight, self.bias.to(target))
def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor:
    """Rotate the two halves of ``x``'s last dimension by ``cos``/``sin``.

    When ``0 < rope_dims < x.size(-1)`` only the first ``rope_dims`` features
    are rotated and the remaining features are passed through unchanged.
    Otherwise the whole last dimension is rotated.

    Args:
        x: Tensor whose last dimension holds the features to rotate.
        cos: Cosine table, broadcastable against one half of the rotated span.
        sin: Sine table, same shape requirements as ``cos``.
        rope_dims: Number of leading features to rotate; ``0`` means all.

    Returns:
        Tensor with the same shape as ``x``.
    """

    def rotate_halves(span: Tensor) -> Tensor:
        # Pair feature i of the lower half with feature i of the upper half.
        mid = span.size(-1) // 2
        lower, upper = span[..., :mid], span[..., mid:]
        return torch.cat(
            (lower * cos + upper * sin, lower * (-sin) + upper * cos), dim=-1)

    is_partial = 0 < rope_dims < x.size(-1)
    if not is_partial:
        return rotate_halves(x)
    rotated = rotate_halves(x[..., :rope_dims])
    untouched = x[..., rope_dims:]
    return torch.cat((rotated, untouched), dim=-1)
class MLP(nn.Module):
    """Two-layer feed-forward block with a squared leaky-ReLU activation.

    ``fc`` expands ``dim`` to ``int(mlp_mult * dim)`` features and ``proj``
    maps back to ``dim``; neither layer has a bias. ``proj`` carries the
    ``_zero_init`` flag.
    """

    def __init__(self, dim: int, mlp_mult: int):
        super().__init__()
        width = int(mlp_mult * dim)
        self.fc = CastedLinear(dim, width, bias=False)
        down = CastedLinear(width, dim, bias=False)
        down._zero_init = True
        self.proj = down

    def forward(self, x: Tensor) -> Tensor:
        pre_activation = self.fc(x)
        activated = F.leaky_relu(pre_activation, negative_slope=0.5)
        return self.proj(activated.square())
self.attn(self.attn_norm(x_in) * self.ln_scale_factor) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp( + self.mlp_norm(x_out) * self.ln_scale_factor) + return x_out + + +class GPT(nn.Module): + def __init__(self, h: Hyperparameters): + super().__init__() + if h.logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {h.logit_softcap}") + self.tie_embeddings = h.tie_embeddings + self.tied_embed_init_std = h.tied_embed_init_std + self.logit_softcap = h.logit_softcap + self.tok_emb = nn.Embedding(h.vocab_size, h.embedding_dim) + if h.embedding_dim != h.model_dim: + self.embed_proj = CastedLinear(h.embedding_dim, h.model_dim, bias=False) + self.head_proj = CastedLinear(h.model_dim, h.embedding_dim, bias=False) + else: + self.embed_proj = None + self.head_proj = None + self.num_encoder_layers = h.num_layers // 2 + self.num_decoder_layers = h.num_layers - self.num_encoder_layers + self.blocks = nn.ModuleList([ + Block(h.model_dim, h.num_heads, h.num_kv_heads, h.mlp_mult, h.rope_base, + h.qk_gain_init, h.train_seq_len, layer_idx=i, ln_scale=h.ln_scale) + for i in range(h.num_layers) + ]) + if h.rope_dims > 0: + head_dim = h.model_dim // h.num_heads + for block in self.blocks: + block.attn.rope_dims = h.rope_dims + block.attn.rotary = Rotary(head_dim, base=h.rope_base, train_seq_len=h.train_seq_len, rope_dims=h.rope_dims) + self.final_norm = RMSNorm() + self.lm_head = None if h.tie_embeddings else CastedLinear(h.embedding_dim, h.vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + if h.xsa_last_n > 0: + for i in range(max(0, h.num_layers - h.xsa_last_n), h.num_layers): + self.blocks[i].attn.use_xsa = True + + # Layer looping + self.looping_active: bool = False + if h.num_loops > 0: + loop_seg = list(range(h.loop_start, h.loop_end + 1)) + all_indices = list(range(h.loop_start)) + for _ in 
range(h.num_loops + 1): + all_indices.extend(loop_seg) + all_indices.extend(range(h.loop_end + 1, h.num_layers)) + num_enc = len(all_indices) // 2 + self.encoder_indices: list[int] = all_indices[:num_enc] + self.decoder_indices: list[int] = all_indices[num_enc:] + else: + self.encoder_indices = list(range(self.num_encoder_layers)) + self.decoder_indices = list(range(self.num_encoder_layers, h.num_layers)) + self.num_skip_weights = min(len(self.encoder_indices), len(self.decoder_indices)) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, h.model_dim, dtype=torch.float32)) + self.skip_gates = nn.Parameter(torch.zeros(self.num_skip_weights, h.model_dim, dtype=torch.float32)) if h.skip_gates_enabled else None + + self._init_weights() + + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif (module.weight.ndim == 2 and module.weight.shape[0] >= 64 and + module.weight.shape[1] >= 64): + nn.init.orthogonal_(module.weight, gain=1.0) + + def forward_logits(self, input_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + if self.embed_proj is not None: + x = self.embed_proj(x) + x0 = x + skips: list[Tensor] = [] + enc_iter = self.encoder_indices if self.looping_active else range(self.num_encoder_layers) + dec_iter = self.decoder_indices if self.looping_active else range(self.num_encoder_layers, self.num_encoder_layers + self.num_decoder_layers) + for i in enc_iter: + x = self.blocks[i](x, x0) + skips.append(x) + for skip_idx, i in enumerate(dec_iter): + if skip_idx < self.num_skip_weights and skips: + scaled_skip = self.skip_weights[skip_idx].to(dtype=x.dtype)[None, None, :] * skips.pop() + if self.skip_gates is not None: + g = 
def classify_param(name: str) -> str:
    """Map a parameter name to ``"embed"``, ``"mlp"``, ``"attn"`` or ``"other"``.

    Rules are checked in order and the first one with a matching substring
    wins, so a name containing both an embedding marker and ``".mlp."`` is
    classified as ``"embed"``.
    """
    ordered_rules = (
        ("embed", ("tok_emb", "lm_head")),
        ("mlp", (".mlp.",)),
        # Names containing ".mlp." never reach this rule, so ".proj." on its
        # own is enough to mean "a projection outside the MLP".
        ("attn", (".attn.", ".proj.")),
    )
    for category, markers in ordered_rules:
        if any(marker in name for marker in markers):
            return category
    return "other"
class Optimizers:
    """Build and drive the optimizers that together cover every model parameter.

    Parameter routing:
      * ``tok_emb.weight``: AdamW with ``embed_wd``; the learning rate is
        ``tied_embed_lr`` when embeddings are tied, else ``embed_lr``.
      * 2-D block parameters whose name matches no entry of
        ``CONTROL_TENSOR_NAME_PATTERNS``, plus ``embed_proj`` / ``head_proj``
        weights when the model has them: Muon.
      * Block parameters with fewer than two dims or a control-pattern name,
        plus ``skip_weights`` and ``skip_gates``: AdamW with ``adam_wd``.
      * ``lm_head.weight`` (untied models only): Adam, inserted at position 1
        of ``self.optimizers``.

    Every param group also stores ``base_lr`` next to ``lr`` (presumably the
    reference value an external LR schedule scales; nothing in this class
    reads it).
    """

    def __init__(self, h: Hyperparameters, base_model: GPT):
        block_named_params = list(base_model.blocks.named_parameters())

        def is_control(name: str) -> bool:
            return any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)

        matrix_params = [
            p for name, p in block_named_params
            if p.ndim == 2 and not is_control(name)
        ]
        # embed_proj / head_proj exist only when embedding_dim != model_dim.
        # They live outside `blocks`, so without this they would belong to no
        # optimizer: never updated and their gradients never cleared.
        for projection in (getattr(base_model, "embed_proj", None),
                           getattr(base_model, "head_proj", None)):
            if projection is not None:
                matrix_params.append(projection.weight)

        scalar_params = [
            p for name, p in block_named_params
            if p.ndim < 2 or is_control(name)
        ]
        if base_model.skip_weights.numel() > 0:
            scalar_params.append(base_model.skip_weights)
        if base_model.skip_gates is not None and base_model.skip_gates.numel() > 0:
            scalar_params.append(base_model.skip_gates)

        token_lr = h.tied_embed_lr if h.tie_embeddings else h.embed_lr
        tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}]
        self.optimizer_tok = torch.optim.AdamW(
            tok_params,
            betas=(h.beta1, h.beta2),
            eps=h.adam_eps,
            weight_decay=h.embed_wd,
            fused=True,
        )
        self.optimizer_muon = Muon(
            matrix_params,
            lr=h.matrix_lr,
            momentum=h.muon_momentum,
            backend_steps=h.muon_backend_steps,
            weight_decay=h.muon_wd,
            row_normalize=h.muon_row_normalize,
        )
        for group in self.optimizer_muon.param_groups:
            group["base_lr"] = h.matrix_lr
        self.optimizer_scalar = torch.optim.AdamW(
            [{"params": scalar_params, "lr": h.scalar_lr, "base_lr": h.scalar_lr}],
            betas=(h.beta1, h.beta2),
            eps=h.adam_eps,
            weight_decay=h.adam_wd,
            fused=True,
        )
        self.optimizers = [self.optimizer_tok, self.optimizer_muon, self.optimizer_scalar]
        if base_model.lm_head is not None:
            self.optimizer_head = torch.optim.Adam(
                [{"params": [base_model.lm_head.weight], "lr": h.head_lr, "base_lr": h.head_lr}],
                betas=(h.beta1, h.beta2),
                eps=h.adam_eps,
                fused=True,
            )
            self.optimizers.insert(1, self.optimizer_head)
        else:
            self.optimizer_head = None

    def __iter__(self):
        """Iterate over the underlying optimizers in stepping order."""
        return iter(self.optimizers)

    def zero_grad_all(self) -> None:
        """Drop the gradients of every managed parameter (set to ``None``)."""
        for opt in self.optimizers:
            opt.zero_grad(set_to_none=True)

    def step(self):
        """Step every optimizer once, then clear all gradients."""
        for opt in self.optimizers:
            opt.step()
        self.zero_grad_all()
def collect_hessians(
    model: nn.Module,
    train_loader: ShuffledSequenceLoader,
    h: Hyperparameters,
    device: torch.device,
    n_calibration_batches: int = 64,
) -> dict[str, Tensor]:
    """Accumulate activation Gram matrices (``X^T X``) for GPTQ calibration.

    Forward hooks record the *input* of every ``CastedLinear`` that has more
    than 65536 weights and is classified as ``"mlp"`` or ``"attn"``. For
    models with tied embeddings, the *output* of ``head_proj`` (or of
    ``final_norm`` when there is no ``head_proj``) is recorded under the key
    ``"tok_emb.weight"``.

    Args:
        model: Model to calibrate. It is switched to eval mode and left there.
        train_loader: Source of calibration batches; only the inputs are used.
        h: Supplies ``train_batch_tokens`` and ``grad_accum_steps`` for
            ``next_batch``.
        device: Device on which the accumulators are allocated.
        n_calibration_batches: Number of forward passes to run.

    Returns:
        Mapping from parameter name to its accumulated matrix, moved to CPU
        and divided by ``n_calibration_batches``.
    """
    gram: dict[str, Tensor] = {}
    handles = []

    def accumulate(key: str, activations: Tensor) -> None:
        flat = activations.detach().float()
        if flat.ndim == 3:
            # Collapse the two leading dims so rows are individual positions.
            flat = flat.reshape(-1, flat.shape[-1])
        if key not in gram:
            gram[key] = torch.zeros(
                flat.shape[1], flat.shape[1], dtype=torch.float32, device=device
            )
        gram[key].addmm_(flat.T, flat)

    # Both hook flavours must return None so the module output is untouched.
    def input_hook(key: str):
        def hook_fn(module, inp, out):
            accumulate(key, inp[0])
        return hook_fn

    def output_hook(key: str):
        def hook_fn(module, inp, out):
            accumulate(key, out)
        return hook_fn

    for module_name, module in model.named_modules():
        if not isinstance(module, CastedLinear) or module.weight.numel() <= 65536:
            continue
        key = module_name + ".weight"
        if classify_param(key) in ("mlp", "attn"):
            handles.append(module.register_forward_hook(input_hook(key)))

    if model.tie_embeddings:
        # The tied embedding matrix multiplies whatever feeds the logits.
        tap = model.final_norm if model.head_proj is None else model.head_proj
        handles.append(tap.register_forward_hook(output_hook("tok_emb.weight")))

    model.eval()
    with torch.no_grad():
        for _step in range(n_calibration_batches):
            inputs, _targets = train_loader.next_batch(
                h.train_batch_tokens, h.grad_accum_steps)
            model.forward_logits(inputs)

    for handle in handles:
        handle.remove()

    return {key: total.cpu() / n_calibration_batches for key, total in gram.items()}
def gptq_mixed_quantize(
    state_dict: dict[str, Tensor],
    hessians: dict[str, Tensor],
    h: Hyperparameters,
) -> tuple[dict[str, Tensor], dict[str, object]]:
    """Quantize a state dict, using GPTQ wherever a Hessian is available.

    Per tensor:
      * Non-float tensors and float tensors with at most 65536 elements are
        stored as-is (floats cast to float16) and labelled
        ``"passthrough (float16)"``.
      * Larger float tensors are quantized with ``gptq_quantize_weight`` and
        stored as ``<name>.q`` / ``<name>.scale``. Names containing
        ``"tok_emb"`` use ``embed_bits`` / ``embed_clip_sigmas``; all others
        use ``matrix_bits`` / ``matrix_clip_sigmas``.
      * A large float tensor with no entry in ``hessians`` used to raise
        ``KeyError`` (e.g. untied ``tok_emb`` / ``lm_head``, or
        ``embed_proj`` / ``head_proj``). It is now quantized against an
        identity Hessian, which has no off-diagonal terms to propagate error
        through and so amounts to round-to-nearest with the same per-row
        scale; it is labelled ``"rtn (int<bits>)"``. If such a tensor is not
        2-D it falls back to float16 passthrough.

    Args:
        state_dict: Parameter name to tensor.
        hessians: Parameter name to calibration Hessian.
        h: Supplies the bit widths and clip sigmas.

    Returns:
        ``(result, meta)``: the stored tensors, and a per-parameter label
        describing how each was stored.
    """
    result: dict[str, Tensor] = {}
    meta: dict[str, object] = {}

    for name, tensor in state_dict.items():
        t = tensor.detach().cpu().contiguous()
        if not t.is_floating_point() or t.numel() <= 65536:
            result[name] = t.to(torch.float16) if t.is_floating_point() else t
            meta[name] = "passthrough (float16)"
            continue

        hessian = hessians.get(name)
        if hessian is None and t.ndim != 2:
            # gptq_quantize_weight only handles (rows, cols) matrices.
            log(f"GPTQ:no Hessian for non-matrix tensor {name}; keeping float16")
            result[name] = t.to(torch.float16)
            meta[name] = "passthrough (float16)"
            continue

        is_embedding = "tok_emb" in name
        cs = h.embed_clip_sigmas if is_embedding else h.matrix_clip_sigmas
        bits = h.embed_bits if is_embedding else h.matrix_bits
        if hessian is None:
            log(f"GPTQ:no Hessian for {name}; falling back to round-to-nearest")
            hessian = torch.eye(t.shape[1], dtype=torch.float32)
            method = "rtn"
        else:
            method = "gptq"
        q, s = gptq_quantize_weight(
            t, hessian, clip_sigmas=cs, clip_range=2 ** (bits - 1) - 1)
        result[name + ".q"] = q
        result[name + ".scale"] = s
        meta[name] = f"{method} (int{bits})"

    # Summarise by storage label, folding per-layer indices into one entry.
    categories = collections.defaultdict(set)
    for name, cat in meta.items():
        short = re.sub(r'\.\d+$', '', re.sub(r'blocks\.\d+', 'blocks', name))
        categories[cat].add(short)
    log("Quantized weights:")
    for cat in sorted(categories):
        log(f" {cat}: {', '.join(sorted(categories[cat]))}")

    return result, meta
out[pos::stride][:chunk_len] = payload[src_off:src_off + chunk_len] + src_off += chunk_len + return out.tobytes() + + +def _compress(data: bytes, compressor: str) -> bytes: + data = _byte_shuffle(data) + if compressor == "lzma": + return lzma.compress(data, preset=6) + elif compressor == "brotli": + import brotli + return brotli.compress(data, quality=11) + raise ValueError(f"Unknown compressor: {compressor!r}") + + +def _decompress(data: bytes, compressor: str) -> bytes: + if compressor == "lzma": + raw = lzma.decompress(data) + elif compressor == "brotli": + import brotli + raw = brotli.decompress(data) + else: + raise ValueError(f"Unknown compressor: {compressor!r}") + raw = _byte_unshuffle(raw) + return raw + + +def serialize(h: Hyperparameters, base_model: torch.nn.Module, code: str) -> tuple[int, int]: + code_bytes = len(code.encode("utf-8")) + if h.is_main_process: + torch.save(base_model.state_dict(), h.model_path) + model_bytes = os.path.getsize(h.model_path) + log(f"Serialized model: {model_bytes} bytes") + log(f"Code size: {code_bytes} bytes") + + sd_cpu = {k: v.detach().cpu() for k, v in base_model.state_dict().items()} + device = torch.device("cuda", h.local_rank) + log("GPTQ:collecting Hessians from calibration data...") + t0 = time.perf_counter() + calib_loader = ShuffledSequenceLoader(h, device) + hessians = collect_hessians( + base_model, calib_loader, h, device, + n_calibration_batches=h.gptq_calibration_batches, + ) + log(f"GPTQ:collected {len(hessians)} Hessians in {time.perf_counter() - t0:.1f}s") + quant_result, quant_meta = gptq_mixed_quantize(sd_cpu, hessians, h) + + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = _compress(quant_raw, h.compressor) + quant_file_bytes = len(quant_blob) + bytes_total = quant_file_bytes + code_bytes + if h.is_main_process: + with open(h.quantized_model_path, "wb") as f: + f.write(quant_blob) + log(f"Serialized model 
quantized+{h.compressor}: {quant_file_bytes} bytes") + log(f"Total submission size quantized+{h.compressor}: {bytes_total} bytes") + return bytes_total, quant_file_bytes + + +def deserialize(h: Hyperparameters, device: torch.device) -> GPT: + eval_model = GPT(h).to(device).bfloat16() + restore_fp32_params(eval_model) + sd_cpu = {k: v.detach().cpu() for k, v in eval_model.state_dict().items()} + + with open(h.quantized_model_path, "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(_decompress(quant_blob_disk, h.compressor)), + map_location="cpu", + ) + deq_state = dequantize_mixed(quant_state["w"], quant_state["m"], sd_cpu) + eval_model.load_state_dict(deq_state, strict=True) + + return eval_model + +# ---------------------------------------- +# Evaluation +# ---------------------------------------- + +def _loss_bpb(loss_sum, token_count, byte_count) -> tuple[float, float]: + val_loss = (loss_sum / token_count).item() + val_bpb = val_loss / math.log(2.0) * (token_count.item() / byte_count.item()) + return val_loss, val_bpb + + +def eval_val( + h: Hyperparameters, + device: torch.device, + val_data: ValidationData, + model: nn.Module +) -> tuple[float, float]: + seq_len = h.eval_seq_len + local_batch_tokens = h.val_batch_tokens // (h.world_size * h.grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={h.val_batch_tokens}, WORLD_SIZE={h.world_size}, " + f"GRAD_ACCUM_STEPS={h.grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_data.val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * h.rank) // h.world_size + seq_end = (total_seqs * (h.rank + 1)) // h.world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, 
dtype=torch.float64) + + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_data.val_tokens[raw_start:raw_end].to( + device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = val_data.base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (val_data.has_leading_space_lut[tgt_ids] & + ~val_data.is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + + model.train() + return _loss_bpb(val_loss_sum, val_token_count, val_byte_count) + + +def eval_val_sliding( + h: Hyperparameters, + device: torch.device, + val_data: ValidationData, + base_model: nn.Module, + batch_seqs: int = 32 +) -> tuple[float, float]: + base_model.eval() + logits_fn = torch.compile(base_model.forward_logits, dynamic=False, fullgraph=True) + + seq_len = h.eval_seq_len + context_size = seq_len - h.eval_stride + total_tokens = val_data.val_tokens.numel() - 1 + + window_starts = [ws for ws in range(0, total_tokens, h.eval_stride) + if ws + context_size < total_tokens] + + total_windows = len(window_starts) + my_s = (total_windows * h.rank) // h.world_size + my_e = (total_windows * (h.rank + 1)) // h.world_size + 
my_windows = window_starts[my_s:my_e] + + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + + for i, ws in enumerate(batch_ws): + we = min(ws + seq_len, total_tokens) + wlen = we - ws + wlens.append(wlen) + chunk = val_data.val_tokens[ws:we + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = logits_fn(x_batch) + + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else context_size + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = val_data.base_bytes_lut[tgt].to(torch.float64) + tb += (val_data.has_leading_space_lut[tgt] & + ~val_data.is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + + base_model.train() + return _loss_bpb(loss_sum, token_count, byte_count) + + +def timed_eval(label: str, fn, *args, **kwargs) -> tuple[float, float]: + torch.cuda.synchronize() + t0 = time.perf_counter() + val_loss, val_bpb = fn(*args, **kwargs) + torch.cuda.synchronize() + 
elapsed_ms = 1000.0 * (time.perf_counter() - t0) + log(f"{label} val_loss:{val_loss:.8f} val_bpb:{val_bpb:.8f} eval_time:{elapsed_ms:.0f}ms") + return val_loss, val_bpb + + +# ----------------------------- +# Training +# ----------------------------- + +def train_model(h: Hyperparameters, device: torch.device, val_data: ValidationData): + # Set up model + base_model = GPT(h).to(device).bfloat16() + restore_fp32_params(base_model) + compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True) + if h.distributed: + model = DDP(compiled_model, device_ids=[h.local_rank], broadcast_buffers=False) + else: + model = compiled_model + log(f"model_params:{sum(p.numel() for p in base_model.parameters())}") + + # Set up optimizer and load train data + optimizers = Optimizers(h, base_model) + train_loader = ShuffledSequenceLoader(h, device) + + # Helper functions for training + max_wallclock_ms = 1000.0 * h.max_wallclock_seconds if h.max_wallclock_seconds > 0 else None + if max_wallclock_ms is not None: + max_wallclock_ms -= h.gptq_reserve_seconds * 1000.0 + log(f"gptq:reserving {h.gptq_reserve_seconds:.0f}s, effective={max_wallclock_ms:.0f}ms") + + def training_frac(step: int, elapsed_ms: float) -> float: + if max_wallclock_ms is None: + return step / max(h.iterations, 1) + return elapsed_ms / max(max_wallclock_ms, 1e-9) + + def lr_mul(frac: float) -> float: + if h.warmdown_frac <= 0: + return 1.0 + if frac >= 1.0 - h.warmdown_frac: + return max((1.0 - frac) / h.warmdown_frac, h.min_lr) + return 1.0 + + def step_fn(step, lr_scale): + optimizers.zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(h.grad_accum_steps): + if h.distributed: + model.require_backward_grad_sync = micro_step == h.grad_accum_steps - 1 + x, y = train_loader.next_batch(h.train_batch_tokens, h.grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + (loss / 
h.grad_accum_steps).backward() + train_loss /= h.grad_accum_steps + + frac = min(step / h.muon_momentum_warmup_steps, 1.0) if h.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * h.muon_momentum_warmup_start + frac * h.muon_momentum + for group in optimizers.optimizer_muon.param_groups: + group["momentum"] = muon_momentum + + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * lr_scale + + if h.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), h.grad_clip_norm) + + optimizers.step() + return train_loss + + # Model warmup + if h.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() + for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step, 1.0) + if warmup_step <= 5 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == h.warmup_steps: + log(f"warmup_step: {warmup_step + 1}/{h.warmup_steps}") + if h.num_loops > 0: + base_model.looping_active = True + log(f"loop_warmup:enabled encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step, 1.0) + if warmup_step <= 5 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == h.warmup_steps: + log(f"loop_warmup_step: {warmup_step + 1}/{h.warmup_steps}") + base_model.looping_active = False + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + optimizers.zero_grad_all() + if h.distributed: + model.require_backward_grad_sync = True + train_loader = ShuffledSequenceLoader(h, device) + + # Training loop + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = h.ema_decay + + training_time_ms = 0.0 + stop_after_step: 
int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + + step = 0 + while True: + last_step = step == h.iterations or (stop_after_step is not None and step >= stop_after_step) + + should_validate = last_step or (h.val_loss_every > 0 and step % h.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val(h, device, val_data, model) + log(f"{step}/{h.iterations} val_loss: {val_loss:.4f} val_bpb: {val_bpb:.4f}") + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if stop_after_step is not None and step < h.iterations: + log( + f"stopping_early: wallclock_cap train_time: {training_time_ms:.0f}ms " + f"step: {step}/{h.iterations}" + ) + break + + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + frac = training_frac(step, elapsed_ms) + scale = lr_mul(frac) + if h.num_loops > 0 and not base_model.looping_active and frac >= h.enable_looping_at: + base_model.looping_active = True + log(f"layer_loop:enabled step:{step} frac:{frac:.3f} encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + train_loss = step_fn(step, scale) + + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + + should_log_train = ( + h.train_log_every > 0 + and (step <= 5 or step % h.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + tok_per_sec = step * h.train_batch_tokens / (approx_training_time_ms / 1000.0) + log( + f"{step}/{h.iterations} train_loss: {train_loss.item():.4f} " + f"train_time: {approx_training_time_ms / 60000:.1f}m tok/s: {tok_per_sec:.0f}" + ) + + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if h.distributed and max_wallclock_ms 
is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + + log( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + + # Weight averaging + log("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + + return base_model, compiled_model + + +def train_and_eval(h: Hyperparameters, device: torch.device) -> None: + random.seed(h.seed) + np.random.seed(h.seed) + torch.manual_seed(h.seed) + torch.cuda.manual_seed_all(h.seed) + + val_data = ValidationData(h, device) + log(f"train_shards: {len(list(Path(h.datasets_dir).resolve().glob('fineweb_train_*.bin')))}") + log(f"val_tokens: {val_data.val_tokens.numel() - 1}") + + base_model, compiled_model = train_model(h, device, val_data) + torch._dynamo.reset() + timed_eval("pre-quantization post-ema", eval_val, h, device, val_data, compiled_model) + + serialize(h, base_model, Path(__file__).read_text(encoding="utf-8")) + if h.distributed: + dist.barrier() + eval_model = deserialize(h, device) + if h.num_loops > 0: + eval_model.looping_active = True + + compiled_model = torch.compile(eval_model, dynamic=False, fullgraph=True) + timed_eval("quantized", eval_val, h, device, val_data, compiled_model) + if h.sliding_window_enabled: + timed_eval("quantized_sliding_window", eval_val_sliding, h, device, val_data, eval_model) + + +def main(): + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + + if not torch.cuda.is_available(): + raise 
RuntimeError("CUDA is required") + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.set_float32_matmul_precision("high") + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + torch._dynamo.config.optimize_ddp = False + + h = Hyperparameters() + set_logging_hparams(h) + if h.is_main_process: + os.makedirs("logs", exist_ok=True) + log(100 * "=", console=False) + log("Hyperparameters:", console=True) + for k, v in sorted(vars(type(h)).items()): + if not k.startswith("_"): + log(f" {k}: {v}", console=True) + log("=" * 100, console=False) + log(f"Running Python {sys.version}", console=False) + log(f"Running PyTorch {torch.__version__}", console=False) + log( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, check=False).stdout, + console=False, + ) + log("=" * 100, console=False) + + train_and_eval(h, device) + + if distributed: + dist.destroy_process_group() + + +if __name__ == "__main__": + main() From bd3015a19a780bfb313efdfe23942d8f16c6d733 Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 12 Apr 2026 11:44:35 -0400 Subject: [PATCH 2/5] SP8192 SOTA: compressed #1493 script + 3-seed logs from #1394 baseline --- .../2026-04-12_SP8192_Frontier/train_gpt.py | 1410 +---------------- .../train_gpt_sota.py | 470 ++++++ .../train_seed1337.log | 137 ++ .../train_seed2024.log | 137 ++ .../train_seed42.log | 137 ++ 5 files changed, 883 
insertions(+), 1408 deletions(-) create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py index b024660f31..e2634caf8e 100644 --- a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py @@ -1,1408 +1,2 @@ -import collections -import copy -import glob -import io -import lzma -import math -import os -from pathlib import Path -import random -import re -import subprocess -import sys -import time -import uuid - -import numpy as np -import sentencepiece as spm -import torch -import torch.distributed as dist -import torch.nn.functional as F -from torch.nn.parallel import DistributedDataParallel as DDP -from torch import Tensor, nn - -from flash_attn_interface import flash_attn_func as flash_attn_3_func - -# ---------------------------------------- -# Hyperparameters -# ---------------------------------------- - -class Hyperparameters(): - # Experiment settings - data_dir = os.environ.get('DATA_DIR', './data/') - seed = int(os.environ.get('SEED', 1337)) - run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) - - # Training length - iterations = int(os.environ.get('ITERATIONS', 20000)) - warmdown_frac = float(os.environ.get('WARMDOWN_FRAC', 0.667)) - warmup_steps = int(os.environ.get('WARMUP_STEPS', 20)) - train_batch_tokens = int(os.environ.get('TRAIN_BATCH_TOKENS', 2048 * 48 * 8)) - train_seq_len = int(os.environ.get('TRAIN_SEQ_LEN', 2048)) - train_log_every = int(os.environ.get('TRAIN_LOG_EVERY', 500)) - max_wallclock_seconds = 
float(os.environ.get('MAX_WALLCLOCK_SECONDS', 600.0)) - - # Validation/Evals - val_batch_tokens = int(os.environ.get('VAL_BATCH_TOKENS', 2048 * 32 * 8)) - eval_seq_len = int(os.environ.get('EVAL_SEQ_LEN', 2048)) - val_loss_every = int(os.environ.get('VAL_LOSS_EVERY', 4000)) - sliding_window_enabled = bool(int(os.environ.get('SLIDING_WINDOW_ENABLED', '1'))) - - # Model architecture - vocab_size = int(os.environ.get('VOCAB_SIZE', 8192)) - num_layers = int(os.environ.get('NUM_LAYERS', 11)) - xsa_last_n = int(os.environ.get('XSA_LAST_N', 11)) - model_dim = int(os.environ.get('MODEL_DIM', 512)) - embedding_dim = int(os.environ.get('EMBEDDING_DIM', 512)) - num_kv_heads = int(os.environ.get('NUM_KV_HEADS', 4)) - num_heads = int(os.environ.get('NUM_HEADS', 8)) - mlp_mult = float(os.environ.get('MLP_MULT', 4.0)) - skip_gates_enabled = bool(int(os.environ.get('SKIP_GATES_ENABLED', '1'))) - tie_embeddings = bool(int(os.environ.get('TIE_EMBEDDINGS', '1'))) - logit_softcap = float(os.environ.get('LOGIT_SOFTCAP', 30.0)) - rope_base = float(os.environ.get('ROPE_BASE', 10000.0)) - rope_dims = int(os.environ.get('ROPE_DIMS', 16)) - rope_train_seq_len = int(os.environ.get('ROPE_TRAIN_SEQ_LEN', 2048)) - ln_scale = bool(int(os.environ.get('LN_SCALE', '1'))) - qk_gain_init = float(os.environ.get('QK_GAIN_INIT', 4.0)) - - # Layer looping - num_loops = int(os.environ.get('NUM_LOOPS', 2)) - loop_start = int(os.environ.get('LOOP_START', 4)) - loop_end = int(os.environ.get('LOOP_END', 5)) - enable_looping_at = float(os.environ.get('ENABLE_LOOPING_AT', 0.5)) - - # Optimizer - min_lr = float(os.environ.get('MIN_LR', 0.0)) - embed_lr = float(os.environ.get('EMBED_LR', 0.6)) - head_lr = float(os.environ.get('HEAD_LR', 0.008)) - tied_embed_lr = float(os.environ.get('TIED_EMBED_LR', 0.03)) - tied_embed_init_std = float(os.environ.get('TIED_EMBED_INIT_STD', 0.005)) - matrix_lr = float(os.environ.get('MATRIX_LR', 0.02)) - scalar_lr = float(os.environ.get('SCALAR_LR', 0.02)) - muon_momentum = 
float(os.environ.get('MUON_MOMENTUM', 0.99)) - muon_backend_steps = int(os.environ.get('MUON_BACKEND_STEPS', 5)) - muon_momentum_warmup_start = float(os.environ.get('MUON_MOMENTUM_WARMUP_START', 0.92)) - muon_momentum_warmup_steps = int(os.environ.get('MUON_MOMENTUM_WARMUP_STEPS', 1500)) - muon_row_normalize = bool(int(os.environ.get('MUON_ROW_NORMALIZE', '1'))) - beta1 = float(os.environ.get('BETA1', 0.9)) - beta2 = float(os.environ.get('BETA2', 0.95)) - adam_eps = float(os.environ.get('ADAM_EPS', 1e-8)) - grad_clip_norm = float(os.environ.get('GRAD_CLIP_NORM', 0.3)) - eval_stride = int(os.environ.get('EVAL_STRIDE', 64)) - muon_beta2 = float(os.environ.get('MUON_BETA2', 0.95)) - adam_wd = float(os.environ.get('ADAM_WD', 0.02)) - muon_wd = float(os.environ.get('MUON_WD', 0.085)) - embed_wd = float(os.environ.get('EMBED_WD', 0.085)) - ema_decay = float(os.environ.get('EMA_DECAY', 0.997)) - - # Quantization & Compression - compressor = os.environ.get('COMPRESSOR', 'brotli') - gptq_calibration_batches = int(os.environ.get('GPTQ_CALIBRATION_BATCHES', 64)) - gptq_reserve_seconds = float(os.environ.get('GPTQ_RESERVE_SECONDS', 12.0)) - matrix_bits = int(os.environ.get('MATRIX_BITS', 6)) - embed_bits = int(os.environ.get('EMBED_BITS', 8)) - matrix_clip_sigmas = float(os.environ.get('MATRIX_CLIP_SIGMAS', 12.85)) - embed_clip_sigmas = float(os.environ.get('EMBED_CLIP_SIGMAS', 20.0)) - - # Distributed setup - distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ - rank = int(os.environ.get("RANK", "0")) - world_size = int(os.environ.get("WORLD_SIZE", "1")) - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - is_main_process = rank == 0 - grad_accum_steps = 8 // world_size - - # Data paths - datasets_dir = os.path.join(data_dir, 'datasets', f'fineweb10B_sp{vocab_size}') - train_files = os.path.join(datasets_dir, 'fineweb_train_*.bin') - val_files = os.path.join(datasets_dir, 'fineweb_val_*.bin') - tokenizer_path = os.path.join(data_dir, 'tokenizers', 
f'fineweb_{vocab_size}_bpe.model') - - # Experiment files - logfile = f"logs/{run_id}.txt" - model_path = "final_model.pt" - quantized_model_path = "final_model.int6.ptz" - -# ---------------------------------------- -# Global Logging Function -# ---------------------------------------- - -_logger_hparams = None - - -def set_logging_hparams(h: Hyperparameters) -> None: - global _logger_hparams - _logger_hparams = h - - -def log(msg, console: bool = True) -> None: - if _logger_hparams is None: - print(msg) - return - if _logger_hparams.is_main_process: - if console: - print(msg) - if _logger_hparams.logfile is not None: - with open(_logger_hparams.logfile, "a", encoding="utf-8") as f: - print(msg, file=f) - -# ---------------------------------------- -# Data Loading -# ---------------------------------------- - -class ValidationData: - def __init__(self, h: Hyperparameters, device: torch.device): - self.sp = spm.SentencePieceProcessor(model_file=h.tokenizer_path) - if int(self.sp.vocab_size()) != h.vocab_size: - raise ValueError( - f"VOCAB_SIZE={h.vocab_size} does not match tokenizer vocab_size={int(self.sp.vocab_size())}" - ) - self.val_tokens = load_validation_tokens(h.val_files, h.eval_seq_len) - self.base_bytes_lut, self.has_leading_space_lut, self.is_boundary_token_lut = ( - build_sentencepiece_luts(self.sp, h.vocab_size, device)) - - -def build_sentencepiece_luts( - sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device -) -> tuple[Tensor, Tensor, Tensor]: - sp_vocab_size = int(sp.vocab_size()) - # The BPB calculation assumes "▁" is its own token so that leading-space bytes - # are counted correctly. 
See https://github.com/openai/parameter-golf/issues/897 - assert sp.piece_to_id("\u2581") != sp.unk_id(), \ - "Tokenizer must have '▁' (space) as its own token for correct BPB byte counting" - table_size = max(sp_vocab_size, vocab_size) - base_bytes_np = np.zeros((table_size,), dtype=np.int16) - has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) - is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) - for token_id in range(sp_vocab_size): - if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): - continue - is_boundary_token_np[token_id] = False - if sp.is_byte(token_id): - base_bytes_np[token_id] = 1 - continue - piece = sp.id_to_piece(token_id) - if piece.startswith("\u2581"): - has_leading_space_np[token_id] = True - piece = piece[1:] - base_bytes_np[token_id] = len(piece.encode("utf-8")) - return ( - torch.tensor(base_bytes_np, dtype=torch.int16, device=device), - torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), - torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), - ) - - -def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: - files = [Path(p) for p in sorted(glob.glob(pattern))] - if not files: - raise FileNotFoundError(f"No files found for pattern: {pattern}") - # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*. 
- tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() - usable = ((tokens.numel() - 1) // seq_len) * seq_len - if usable <= 0: - raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") - return tokens[: usable + 1] - - -def load_data_shard(file: Path) -> Tensor: - header_bytes = 256 * np.dtype(" int: - key = str(file) - cached = _SHARD_NTOKENS_CACHE.get(key) - if cached is not None: - return cached - header = np.fromfile(file, dtype=" np.memmap: - key = str(file) - mm = _MMAP_CACHE.get(key) - if mm is not None: - return mm - n = _read_num_tokens(file) - mm = np.memmap(file, mode="r", dtype=" None: - max_phase = min(self.seq_len - 1, max(0, self.num_tokens[si] - self.seq_len - 1)) - phase = int(self.rng.integers(max_phase + 1)) if max_phase > 0 else 0 - num_sequences = (self.num_tokens[si] - 1 - phase) // self.seq_len - sequence_order = self.rng.permutation(num_sequences) - self.start_inds[si] = (phase + sequence_order * self.seq_len).tolist() - - def next_batch(self, global_tokens: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: - device_tokens = global_tokens // (self.world_size * grad_accum_steps) - device_batch_size = device_tokens // self.seq_len - remaining = np.array([len(s) for s in self.start_inds], dtype=np.float64) - x = torch.empty((device_batch_size, self.seq_len), dtype=torch.int64) - y = torch.empty((device_batch_size, self.seq_len), dtype=torch.int64) - for bi in range(device_batch_size): - total = remaining.sum() - if total <= 0: - for si in range(len(self.files)): - self._reset_shard(si) - remaining = np.array([len(s) for s in self.start_inds], dtype=np.float64) - total = remaining.sum() - probs = remaining / total - si = int(self.rng.choice(len(self.files), p=probs)) - start_ind = self.start_inds[si].pop() - remaining[si] -= 1 - mm = _get_shard_memmap(self.files[si]) - window = torch.as_tensor( - np.array(mm[start_ind:start_ind + self.seq_len + 1], dtype=np.int64)) - x[bi] = window[:-1] - y[bi] = 
window[1:] - return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) - -# ---------------------------------------- -# Model Architecture -# ---------------------------------------- - -class RMSNorm(nn.Module): - def __init__(self, eps: float | None = None): - super().__init__() - self.eps = eps - - def forward(self, x: Tensor) -> Tensor: - return F.rms_norm(x, (x.size(-1),), eps=self.eps) - - -class CastedLinear(nn.Linear): - def forward(self, x: Tensor) -> Tensor: - w = self.weight.to(x.dtype) - bias = self.bias.to(x.dtype) if self.bias is not None else None - return F.linear(x, w, bias) - - -class Rotary(nn.Module): - def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): - super().__init__() - self.dim = dim - self.base = base - self.train_seq_len = train_seq_len - self.rope_dims = rope_dims if rope_dims > 0 else dim - inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._cos_cached: Tensor | None = None - self._sin_cached: Tensor | None = None - - def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: - if ( - self._cos_cached is None - or self._sin_cached is None - or self._seq_len_cached != seq_len - or self._cos_cached.device != device - ): - rd = self.rope_dims - if seq_len > self.train_seq_len: - scale = seq_len / self.train_seq_len - new_base = self.base * (scale ** (rd / (rd - 2))) - inv_freq = 1.0 / (new_base ** (torch.arange( - 0, rd, 2, dtype=torch.float32, device=device) / rd)) - else: - inv_freq = self.inv_freq.to(device) - t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) - freqs = torch.outer(t, inv_freq) - self._cos_cached = freqs.cos()[None, :, None, :] - self._sin_cached = freqs.sin()[None, :, None, :] - self._seq_len_cached = seq_len - return 
self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) - - -def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: - if rope_dims > 0 and rope_dims < x.size(-1): - x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] - half = rope_dims // 2 - x1, x2 = x_rope[..., :half], x_rope[..., half:] - x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) - return torch.cat((x_rope, x_pass), dim=-1) - half = x.size(-1) // 2 - x1, x2 = x[..., :half], x[..., half:] - return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) - - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, num_heads: int, num_kv_heads: int, - rope_base: float, qk_gain_init: float, train_seq_len: int): - super().__init__() - if dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - if num_heads % num_kv_heads != 0: - raise ValueError("num_heads must be divisible by num_kv_heads") - self.num_heads = num_heads - self.num_kv_heads = num_kv_heads - self.head_dim = dim // num_heads - if self.head_dim % 2 != 0: - raise ValueError("head_dim must be even for RoPE") - kv_dim = self.num_kv_heads * self.head_dim - self.c_q = CastedLinear(dim, dim, bias=False) - self.c_k = CastedLinear(dim, kv_dim, bias=False) - self.c_v = CastedLinear(dim, kv_dim, bias=False) - self.proj = CastedLinear(dim, dim, bias=False) - self.proj._zero_init = True - self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) - self.rope_dims = 0 - self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=train_seq_len) - self.use_xsa = False - - def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: - B, T, H, D = y.shape - Hkv = v.size(-2) - group = H // Hkv - y_g = y.reshape(B, T, Hkv, group, D) - vn = F.normalize(v, dim=-1).unsqueeze(-2) - proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn - return (y_g - proj).reshape(B, T, H, D) - - def forward(self, x: Tensor) -> Tensor: 
class MLP(nn.Module):
    """Two-layer feed-forward block with a squared leaky-ReLU activation."""

    def __init__(self, dim: int, mlp_mult: int):
        """Build the up- and down-projections.

        The hidden width is ``int(mlp_mult * dim)``, so a fractional
        multiplier is truncated. The output projection is flagged with
        ``_zero_init`` for the model's weight initialiser to pick up.
        """
        super().__init__()
        hidden_dim = int(mlp_mult * dim)
        self.fc = CastedLinear(dim, hidden_dim, bias=False)
        self.proj = CastedLinear(hidden_dim, dim, bias=False)
        self.proj._zero_init = True

    def forward(self, x: Tensor) -> Tensor:
        """Return ``proj(leaky_relu(fc(x), 0.5) ** 2)``."""
        activated = F.leaky_relu(self.fc(x), negative_slope=0.5)
        return self.proj(activated.square())
class GPT(nn.Module):
    """Language model built from a stack of ``Block`` layers with skip links.

    The block stack is executed in two phases: an "encoder" phase whose
    outputs are pushed onto a stack, and a "decoder" phase that pops those
    outputs (last in, first out) and mixes them back into the residual
    stream through learned per-channel weights and optional gates. A
    contiguous segment of blocks can additionally be executed several times
    in a row ("layer looping") once ``looping_active`` is switched on.
    Output logits are soft-capped with ``tanh``.
    """

    def __init__(self, h: Hyperparameters):
        """Create all submodules and parameters from the hyperparameters ``h``.

        Raises:
            ValueError: if ``h.logit_softcap`` is not strictly positive.
        """
        super().__init__()
        if h.logit_softcap <= 0.0:
            raise ValueError(f"logit_softcap must be positive, got {h.logit_softcap}")
        self.tie_embeddings = h.tie_embeddings
        self.tied_embed_init_std = h.tied_embed_init_std
        self.logit_softcap = h.logit_softcap
        self.tok_emb = nn.Embedding(h.vocab_size, h.embedding_dim)
        # Projections between embedding width and model width exist only when
        # the two widths differ.
        if h.embedding_dim != h.model_dim:
            self.embed_proj = CastedLinear(h.embedding_dim, h.model_dim, bias=False)
            self.head_proj = CastedLinear(h.model_dim, h.embedding_dim, bias=False)
        else:
            self.embed_proj = None
            self.head_proj = None
        # Default (non-looping) split of the stack: first half encoder,
        # remainder decoder.
        self.num_encoder_layers = h.num_layers // 2
        self.num_decoder_layers = h.num_layers - self.num_encoder_layers
        self.blocks = nn.ModuleList([
            Block(h.model_dim, h.num_heads, h.num_kv_heads, h.mlp_mult, h.rope_base,
                  h.qk_gain_init, h.train_seq_len, layer_idx=i, ln_scale=h.ln_scale)
            for i in range(h.num_layers)
        ])
        # With partial RoPE enabled, replace each block's rotary module by one
        # constructed for ``rope_dims`` channels.
        if h.rope_dims > 0:
            head_dim = h.model_dim // h.num_heads
            for block in self.blocks:
                block.attn.rope_dims = h.rope_dims
                block.attn.rotary = Rotary(head_dim, base=h.rope_base, train_seq_len=h.train_seq_len, rope_dims=h.rope_dims)
        self.final_norm = RMSNorm()
        # An untied output head is created only when embeddings are not tied;
        # it is flagged for zero initialisation in _init_weights.
        self.lm_head = None if h.tie_embeddings else CastedLinear(h.embedding_dim, h.vocab_size, bias=False)
        if self.lm_head is not None:
            self.lm_head._zero_init = True
        # Turn on the attention ``use_xsa`` flag for the last ``xsa_last_n`` blocks.
        if h.xsa_last_n > 0:
            for i in range(max(0, h.num_layers - h.xsa_last_n), h.num_layers):
                self.blocks[i].attn.use_xsa = True

        # Layer looping
        # The looped execution order is: blocks before loop_start, then the
        # segment loop_start..loop_end repeated (num_loops + 1) times, then the
        # blocks after loop_end. That order is split in half into encoder and
        # decoder index lists. The lists are only used by forward_logits while
        # ``looping_active`` is True; nothing in this class sets it to True.
        self.looping_active: bool = False
        if h.num_loops > 0:
            loop_seg = list(range(h.loop_start, h.loop_end + 1))
            all_indices = list(range(h.loop_start))
            for _ in range(h.num_loops + 1):
                all_indices.extend(loop_seg)
            all_indices.extend(range(h.loop_end + 1, h.num_layers))
            num_enc = len(all_indices) // 2
            self.encoder_indices: list[int] = all_indices[:num_enc]
            self.decoder_indices: list[int] = all_indices[num_enc:]
        else:
            self.encoder_indices = list(range(self.num_encoder_layers))
            self.decoder_indices = list(range(self.num_encoder_layers, h.num_layers))
        # One skip weight (and optional gate) row per encoder/decoder pair of
        # the index lists above.
        self.num_skip_weights = min(len(self.encoder_indices), len(self.decoder_indices))
        self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, h.model_dim, dtype=torch.float32))
        self.skip_gates = nn.Parameter(torch.zeros(self.num_skip_weights, h.model_dim, dtype=torch.float32)) if h.skip_gates_enabled else None

        self._init_weights()

    def _init_weights(self) -> None:
        """Initialise the tied embedding and every ``nn.Linear`` weight.

        Tied token embeddings are drawn from a normal distribution with
        standard deviation ``tied_embed_init_std``. Linear modules flagged
        with ``_zero_init`` are zeroed; other linear weights with both
        dimensions >= 64 get an orthogonal init; smaller ones keep their
        default initialisation.
        """
        if self.tie_embeddings:
            nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std)
        for name, module in self.named_modules():
            if isinstance(module, nn.Linear):
                if getattr(module, "_zero_init", False):
                    nn.init.zeros_(module.weight)
                elif (module.weight.ndim == 2 and module.weight.shape[0] >= 64 and
                      module.weight.shape[1] >= 64):
                    nn.init.orthogonal_(module.weight, gain=1.0)

    def forward_logits(self, input_ids: Tensor) -> Tensor:
        """Return soft-capped vocabulary logits for ``input_ids``."""
        x = self.tok_emb(input_ids)
        x = F.rms_norm(x, (x.size(-1),))
        if self.embed_proj is not None:
            x = self.embed_proj(x)
        # x0 (the normalised input embedding) is handed to every block.
        x0 = x
        skips: list[Tensor] = []
        # Choose the looped or the plain block order depending on the flag.
        enc_iter = self.encoder_indices if self.looping_active else range(self.num_encoder_layers)
        dec_iter = self.decoder_indices if self.looping_active else range(self.num_encoder_layers, self.num_encoder_layers + self.num_decoder_layers)
        for i in enc_iter:
            x = self.blocks[i](x, x0)
            skips.append(x)
        for skip_idx, i in enumerate(dec_iter):
            # Decoder steps beyond the number of stored skips run without one.
            if skip_idx < self.num_skip_weights and skips:
                scaled_skip = self.skip_weights[skip_idx].to(dtype=x.dtype)[None, None, :] * skips.pop()
                if self.skip_gates is not None:
                    # Gate g in (0, 1): result is (1 - g) * scaled_skip + g * x.
                    g = torch.sigmoid(self.skip_gates[skip_idx].to(dtype=x.dtype))[None, None, :]
                    x = torch.lerp(scaled_skip, x, g)
                else:
                    x = x + scaled_skip
            x = self.blocks[i](x, x0)
        x = self.final_norm(x)
        if self.head_proj is not None:
            x = self.head_proj(x)
        if self.tie_embeddings:
            # Tied head: reuse the embedding matrix as the output projection.
            logits_proj = F.linear(x, self.tok_emb.weight)
        else:
            logits_proj = self.lm_head(x)
        # Soft cap: bounds logits to (-logit_softcap, +logit_softcap).
        return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap)

    def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor:
        """Return the mean cross-entropy of the logits against ``target_ids``.

        The logits are cast to float32 before the loss is computed.
        """
        logits = self.forward_logits(input_ids)
        return F.cross_entropy(
            logits.reshape(-1, logits.size(-1)).float(), target_ids.reshape(-1), reduction="mean")
class Optimizers():
    """Bundle of the per-parameter-group optimizers used for training.

    Parameters are routed as follows:

    * token embedding            -> AdamW (``optimizer_tok``)
    * 2-D block weights          -> Muon  (``optimizer_muon``)
    * embed/head projections     -> Muon  (``optimizer_muon``)
    * control / <2-D parameters  -> AdamW (``optimizer_scalar``)
    * untied ``lm_head``         -> Adam  (``optimizer_head``, optional)

    Every param group carries a ``base_lr`` entry next to ``lr``.
    """

    def __init__(self, h: Hyperparameters, base_model: GPT):
        """Partition ``base_model``'s parameters and build the optimizers."""
        block_named_params = list(base_model.blocks.named_parameters())

        def is_control(name: str) -> bool:
            # Control tensors are matched by substring against the name list.
            return any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)

        matrix_params = [
            p for name, p in block_named_params
            if p.ndim == 2 and not is_control(name)
        ]
        scalar_params = [
            p for name, p in block_named_params
            if p.ndim < 2 or is_control(name)
        ]
        # The embedding<->model projections live outside ``blocks``. Without
        # this they would belong to no optimizer and stay at their initial
        # values whenever embedding_dim != model_dim.
        for proj in (base_model.embed_proj, base_model.head_proj):
            if proj is not None:
                matrix_params.append(proj.weight)
        if base_model.skip_weights.numel() > 0:
            scalar_params.append(base_model.skip_weights)
        if base_model.skip_gates is not None and base_model.skip_gates.numel() > 0:
            scalar_params.append(base_model.skip_gates)

        # Tied embeddings double as the output head and use their own LR.
        token_lr = h.tied_embed_lr if h.tie_embeddings else h.embed_lr
        tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}]
        self.optimizer_tok = torch.optim.AdamW(
            tok_params,
            betas=(h.beta1, h.beta2),
            eps=h.adam_eps,
            weight_decay=h.embed_wd,
            fused=True,
        )
        self.optimizer_muon = Muon(
            matrix_params,
            lr=h.matrix_lr,
            momentum=h.muon_momentum,
            backend_steps=h.muon_backend_steps,
            weight_decay=h.muon_wd,
            row_normalize=h.muon_row_normalize,
        )
        for group in self.optimizer_muon.param_groups:
            group["base_lr"] = h.matrix_lr
        self.optimizer_scalar = torch.optim.AdamW(
            [{"params": scalar_params, "lr": h.scalar_lr, "base_lr": h.scalar_lr}],
            betas=(h.beta1, h.beta2),
            eps=h.adam_eps,
            weight_decay=h.adam_wd,
            fused=True,
        )
        self.optimizers = [self.optimizer_tok, self.optimizer_muon, self.optimizer_scalar]
        if base_model.lm_head is not None:
            self.optimizer_head = torch.optim.Adam(
                [{"params": [base_model.lm_head.weight], "lr": h.head_lr, "base_lr": h.head_lr}],
                betas=(h.beta1, h.beta2),
                eps=h.adam_eps,
                fused=True,
            )
            # Keep the head optimizer right after the embedding optimizer.
            self.optimizers.insert(1, self.optimizer_head)
        else:
            self.optimizer_head = None

    def __iter__(self):
        """Iterate over the underlying optimizers in step order."""
        return iter(self.optimizers)

    def zero_grad_all(self) -> None:
        """Reset the gradients of every optimizer to ``None``."""
        for opt in self.optimizers:
            opt.zero_grad(set_to_none=True)

    def step(self):
        """Step every optimizer, then clear all gradients."""
        for opt in self.optimizers:
            opt.step()
        self.zero_grad_all()
def collect_hessians(
    model: nn.Module,
    train_loader: ShuffledSequenceLoader,
    h: Hyperparameters,
    device: torch.device,
    n_calibration_batches: int = 64,
) -> dict[str, Tensor]:
    """Accumulate ``X^T X`` activation statistics for GPTQ calibration.

    Forward hooks are attached to every ``CastedLinear`` with more than
    65536 weights that ``classify_param`` labels "mlp" or "attn"; each hook
    accumulates the Gram matrix of the layer's *input*. When embeddings are
    tied, the *output* of ``head_proj`` (or of ``final_norm`` when there is
    no ``head_proj``) is accumulated under the key ``"tok_emb.weight"``.

    The model is put into eval mode, ``n_calibration_batches`` batches from
    ``train_loader`` are run through ``model.forward_logits`` without
    gradients, the hooks are removed, and the sums are returned on the CPU
    divided by ``n_calibration_batches``. Keys are parameter names.
    """
    hessians: dict[str, Tensor] = {}
    handles = []

    def accumulate(key: str, activations: Tensor) -> None:
        # Flatten (batch, seq, features) to (tokens, features) before X^T X.
        flat = activations.detach().float()
        if flat.ndim == 3:
            flat = flat.reshape(-1, flat.shape[-1])
        if key not in hessians:
            width = flat.shape[1]
            hessians[key] = torch.zeros(
                width, width, dtype=torch.float32, device=device
            )
        hessians[key].addmm_(flat.T, flat)

    def input_hook(key: str):
        def hook_fn(module, inp, out):
            accumulate(key, inp[0])
        return hook_fn

    def output_hook(key: str):
        def hook_fn(module, inp, out):
            accumulate(key, out)
        return hook_fn

    for name, module in model.named_modules():
        if not isinstance(module, CastedLinear) or module.weight.numel() <= 65536:
            continue
        key = name + ".weight"
        if classify_param(key) in ("mlp", "attn"):
            handles.append(module.register_forward_hook(input_hook(key)))

    if model.tie_embeddings:
        # The tied embedding matrix acts as the output head, so its statistics
        # come from whatever feeds the final logits projection.
        tap = model.final_norm if model.head_proj is None else model.head_proj
        handles.append(tap.register_forward_hook(output_hook("tok_emb.weight")))

    model.eval()
    with torch.no_grad():
        for _ in range(n_calibration_batches):
            batch, _ = train_loader.next_batch(h.train_batch_tokens, h.grad_accum_steps)
            model.forward_logits(batch)

    for handle in handles:
        handle.remove()

    return {key: total.cpu() / n_calibration_batches for key, total in hessians.items()}
def gptq_mixed_quantize(
    state_dict: dict[str, Tensor],
    hessians: dict[str, Tensor],
    h: Hyperparameters,
) -> tuple[dict[str, Tensor], dict[str, object]]:
    """Quantize a state dict, using GPTQ for large matrices.

    Tensors that are not floating point, have at most 65536 elements, or are
    not 2-D are stored as-is (floating-point ones cast to float16) and
    tagged ``"passthrough (float16)"``. Every other tensor is quantized with
    ``gptq_quantize_weight`` and stored as ``<name>.q`` (int8 codes) plus
    ``<name>.scale`` (per-row scales). Names containing ``"tok_emb"`` use the
    embedding bit width / clip sigmas, all others the matrix settings.

    A large matrix with no entry in ``hessians`` (for example an untied
    embedding or output head, for which no calibration hook is registered)
    is quantized with an identity Hessian instead of raising ``KeyError``.

    Returns:
        ``(result, meta)`` where ``meta`` maps each original parameter name
        to a short description of how it was stored.
    """
    result: dict[str, Tensor] = {}
    meta: dict[str, object] = {}

    for name, tensor in state_dict.items():
        t = tensor.detach().cpu().contiguous()
        # gptq_quantize_weight only handles matrices, so anything that is not
        # a large 2-D float tensor is passed through.
        if not t.is_floating_point() or t.numel() <= 65536 or t.ndim != 2:
            result[name] = t.to(torch.float16) if t.is_floating_point() else t
            meta[name] = "passthrough (float16)"
            continue
        is_embedding = "tok_emb" in name
        cs = h.embed_clip_sigmas if is_embedding else h.matrix_clip_sigmas
        bits = h.embed_bits if is_embedding else h.matrix_bits
        hessian = hessians.get(name)
        if hessian is None:
            # With an identity Hessian the cross-column terms are all zero, so
            # no quantization error is propagated between columns.
            log(f"GPTQ:no Hessian for {name}; using identity Hessian")
            hessian = torch.eye(t.shape[1], dtype=torch.float32)
        q, s = gptq_quantize_weight(
            t, hessian, clip_sigmas=cs, clip_range=2**(bits - 1) - 1)
        result[name + ".q"] = q
        result[name + ".scale"] = s
        meta[name] = f"gptq (int{bits})"

    # Summarise which parameter families ended up in which storage category,
    # collapsing per-block indices so each family is listed once.
    categories = collections.defaultdict(set)
    for name, cat in meta.items():
        short = re.sub(r'\.\d+$', '', re.sub(r'blocks\.\d+', 'blocks', name))
        categories[cat].add(short)
    log("Quantized weights:")
    for cat in sorted(categories):
        log(f"  {cat}: {', '.join(sorted(categories[cat]))}")

    return result, meta
def _compress(data: bytes, compressor: str) -> bytes:
    """Byte-shuffle ``data`` and compress it with the named codec.

    Supported values of ``compressor`` are ``"lzma"`` (preset 6) and
    ``"brotli"`` (quality 11; the ``brotli`` module is imported on demand).

    Raises:
        ValueError: for any other ``compressor`` value.
    """
    shuffled = _byte_shuffle(data)
    if compressor == "lzma":
        return lzma.compress(shuffled, preset=6)
    if compressor == "brotli":
        import brotli
        return brotli.compress(shuffled, quality=11)
    raise ValueError(f"Unknown compressor: {compressor!r}")
def eval_val(
    h: Hyperparameters,
    device: torch.device,
    val_data: ValidationData,
    model: nn.Module
) -> tuple[float, float]:
    """Evaluate ``model`` on non-overlapping validation sequences.

    The validation tokens are cut into ``h.eval_seq_len``-long sequences,
    this rank's contiguous share of them is run through ``model(x, y)`` in
    batches under bfloat16 autocast, and the loss, token and byte totals are
    summed across ranks when ``torch.distributed`` is initialised.

    The model is switched to eval mode for the pass and to train mode
    afterwards.

    Returns:
        ``(val_loss, val_bpb)`` as computed by ``_loss_bpb``.

    Raises:
        ValueError: if the per-rank share of ``h.val_batch_tokens`` is
            smaller than one sequence.
    """
    seq_len = h.eval_seq_len
    local_batch_tokens = h.val_batch_tokens // (h.world_size * h.grad_accum_steps)
    if local_batch_tokens < seq_len:
        # The setting is read from the VAL_BATCH_TOKENS environment variable,
        # so that is the name reported to the user.
        raise ValueError(
            "VAL_BATCH_TOKENS must provide at least one sequence per rank; "
            f"got VAL_BATCH_TOKENS={h.val_batch_tokens}, WORLD_SIZE={h.world_size}, "
            f"GRAD_ACCUM_STEPS={h.grad_accum_steps}, seq_len={seq_len}"
        )
    local_batch_seqs = local_batch_tokens // seq_len
    total_seqs = (val_data.val_tokens.numel() - 1) // seq_len
    # Contiguous, near-equal slice of sequences for this rank.
    seq_start = (total_seqs * h.rank) // h.world_size
    seq_end = (total_seqs * (h.rank + 1)) // h.world_size
    val_loss_sum = torch.zeros((), device=device, dtype=torch.float64)
    val_token_count = torch.zeros((), device=device, dtype=torch.float64)
    val_byte_count = torch.zeros((), device=device, dtype=torch.float64)

    model.eval()
    with torch.inference_mode():
        for batch_seq_start in range(seq_start, seq_end, local_batch_seqs):
            batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end)
            raw_start = batch_seq_start * seq_len
            # One extra token so that targets can be the inputs shifted by one.
            raw_end = batch_seq_end * seq_len + 1
            local = val_data.val_tokens[raw_start:raw_end].to(
                device=device, dtype=torch.int64, non_blocking=True)
            x = local[:-1].reshape(-1, seq_len)
            y = local[1:].reshape(-1, seq_len)
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                batch_loss = model(x, y).detach()
            # The model returns a mean loss; weight it by the token count so
            # batches of different size are combined correctly.
            batch_token_count = float(y.numel())
            val_loss_sum += batch_loss.to(torch.float64) * batch_token_count
            val_token_count += batch_token_count
            prev_ids = x.reshape(-1)
            tgt_ids = y.reshape(-1)
            # Bytes per target token: its base byte length, plus one when it
            # has a leading space and the previous token is not a boundary.
            token_bytes = val_data.base_bytes_lut[tgt_ids].to(dtype=torch.int16)
            token_bytes += (val_data.has_leading_space_lut[tgt_ids] &
                            ~val_data.is_boundary_token_lut[prev_ids]).to(dtype=torch.int16)
            val_byte_count += token_bytes.to(torch.float64).sum()

    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM)
        dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM)

    model.train()
    return _loss_bpb(val_loss_sum, val_token_count, val_byte_count)
def timed_eval(label: str, fn, *args, **kwargs) -> tuple[float, float]:
    """Run an evaluation function, log its metrics and wall-clock time.

    ``fn(*args, **kwargs)`` must return ``(val_loss, val_bpb)``. CUDA is
    synchronised before and after the call so the measured interval covers
    the queued GPU work. The pair returned by ``fn`` is passed through.
    """
    torch.cuda.synchronize()
    started = time.perf_counter()
    val_loss, val_bpb = fn(*args, **kwargs)
    torch.cuda.synchronize()
    finished = time.perf_counter()
    elapsed_ms = 1000.0 * (finished - started)
    log(f"{label} val_loss:{val_loss:.8f} val_bpb:{val_bpb:.8f} eval_time:{elapsed_ms:.0f}ms")
    return val_loss, val_bpb
h.grad_accum_steps).backward() - train_loss /= h.grad_accum_steps - - frac = min(step / h.muon_momentum_warmup_steps, 1.0) if h.muon_momentum_warmup_steps > 0 else 1.0 - muon_momentum = (1 - frac) * h.muon_momentum_warmup_start + frac * h.muon_momentum - for group in optimizers.optimizer_muon.param_groups: - group["momentum"] = muon_momentum - - for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["base_lr"] * lr_scale - - if h.grad_clip_norm > 0: - torch.nn.utils.clip_grad_norm_(base_model.parameters(), h.grad_clip_norm) - - optimizers.step() - return train_loss - - # Model warmup - if h.warmup_steps > 0: - initial_model_state = {name: tensor.detach().cpu().clone() - for name, tensor in base_model.state_dict().items()} - initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] - model.train() - for warmup_step in range(h.warmup_steps): - step_fn(warmup_step, 1.0) - if warmup_step <= 5 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == h.warmup_steps: - log(f"warmup_step: {warmup_step + 1}/{h.warmup_steps}") - if h.num_loops > 0: - base_model.looping_active = True - log(f"loop_warmup:enabled encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") - for warmup_step in range(h.warmup_steps): - step_fn(warmup_step, 1.0) - if warmup_step <= 5 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == h.warmup_steps: - log(f"loop_warmup_step: {warmup_step + 1}/{h.warmup_steps}") - base_model.looping_active = False - base_model.load_state_dict(initial_model_state, strict=True) - for opt, state in zip(optimizers, initial_optimizer_states, strict=True): - opt.load_state_dict(state) - optimizers.zero_grad_all() - if h.distributed: - model.require_backward_grad_sync = True - train_loader = ShuffledSequenceLoader(h, device) - - # Training loop - ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} - ema_decay = h.ema_decay - - training_time_ms = 0.0 - stop_after_step: 
int | None = None - torch.cuda.synchronize() - t0 = time.perf_counter() - - step = 0 - while True: - last_step = step == h.iterations or (stop_after_step is not None and step >= stop_after_step) - - should_validate = last_step or (h.val_loss_every > 0 and step % h.val_loss_every == 0) - if should_validate: - torch.cuda.synchronize() - training_time_ms += 1000.0 * (time.perf_counter() - t0) - val_loss, val_bpb = eval_val(h, device, val_data, model) - log(f"{step}/{h.iterations} val_loss: {val_loss:.4f} val_bpb: {val_bpb:.4f}") - torch.cuda.synchronize() - t0 = time.perf_counter() - - if last_step: - if stop_after_step is not None and step < h.iterations: - log( - f"stopping_early: wallclock_cap train_time: {training_time_ms:.0f}ms " - f"step: {step}/{h.iterations}" - ) - break - - elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) - frac = training_frac(step, elapsed_ms) - scale = lr_mul(frac) - if h.num_loops > 0 and not base_model.looping_active and frac >= h.enable_looping_at: - base_model.looping_active = True - log(f"layer_loop:enabled step:{step} frac:{frac:.3f} encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") - train_loss = step_fn(step, scale) - - with torch.no_grad(): - for name, t in base_model.state_dict().items(): - ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) - - step += 1 - approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) - - should_log_train = ( - h.train_log_every > 0 - and (step <= 5 or step % h.train_log_every == 0 or stop_after_step is not None) - ) - if should_log_train: - tok_per_sec = step * h.train_batch_tokens / (approx_training_time_ms / 1000.0) - log( - f"{step}/{h.iterations} train_loss: {train_loss.item():.4f} " - f"train_time: {approx_training_time_ms / 60000:.1f}m tok/s: {tok_per_sec:.0f}" - ) - - reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms - if h.distributed and max_wallclock_ms 
is not None: - reached_cap_tensor = torch.tensor(int(reached_cap), device=device) - dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) - reached_cap = bool(reached_cap_tensor.item()) - if stop_after_step is None and reached_cap: - stop_after_step = step - - log( - f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " - f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" - ) - - # Weight averaging - log("ema:applying EMA weights") - current_state = base_model.state_dict() - avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} - base_model.load_state_dict(avg_state, strict=True) - - return base_model, compiled_model - - -def train_and_eval(h: Hyperparameters, device: torch.device) -> None: - random.seed(h.seed) - np.random.seed(h.seed) - torch.manual_seed(h.seed) - torch.cuda.manual_seed_all(h.seed) - - val_data = ValidationData(h, device) - log(f"train_shards: {len(list(Path(h.datasets_dir).resolve().glob('fineweb_train_*.bin')))}") - log(f"val_tokens: {val_data.val_tokens.numel() - 1}") - - base_model, compiled_model = train_model(h, device, val_data) - torch._dynamo.reset() - timed_eval("pre-quantization post-ema", eval_val, h, device, val_data, compiled_model) - - serialize(h, base_model, Path(__file__).read_text(encoding="utf-8")) - if h.distributed: - dist.barrier() - eval_model = deserialize(h, device) - if h.num_loops > 0: - eval_model.looping_active = True - - compiled_model = torch.compile(eval_model, dynamic=False, fullgraph=True) - timed_eval("quantized", eval_val, h, device, val_data, compiled_model) - if h.sliding_window_enabled: - timed_eval("quantized_sliding_window", eval_val_sliding, h, device, val_data, eval_model) - - -def main(): - world_size = int(os.environ.get("WORLD_SIZE", "1")) - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ - - if not torch.cuda.is_available(): - raise 
RuntimeError("CUDA is required") - if world_size <= 0: - raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") - if 8 % world_size != 0: - raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") - - device = torch.device("cuda", local_rank) - torch.cuda.set_device(device) - if distributed: - dist.init_process_group(backend="nccl", device_id=device) - dist.barrier() - - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - torch.set_float32_matmul_precision("high") - from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp - - enable_cudnn_sdp(False) - enable_flash_sdp(True) - enable_mem_efficient_sdp(False) - enable_math_sdp(False) - torch._dynamo.config.optimize_ddp = False - - h = Hyperparameters() - set_logging_hparams(h) - if h.is_main_process: - os.makedirs("logs", exist_ok=True) - log(100 * "=", console=False) - log("Hyperparameters:", console=True) - for k, v in sorted(vars(type(h)).items()): - if not k.startswith("_"): - log(f" {k}: {v}", console=True) - log("=" * 100, console=False) - log(f"Running Python {sys.version}", console=False) - log(f"Running PyTorch {torch.__version__}", console=False) - log( - subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True, check=False).stdout, - console=False, - ) - log("=" * 100, console=False) - - train_and_eval(h, device) - - if distributed: - dist.destroy_process_group() - - -if __name__ == "__main__": - main() +import lzma as L,base64 as B 
+exec(L.decompress(B.b85decode(";J=$Qid_ISn@VT6Qap3bt~@<3h>ok~)Km^%|Fb#-90hdJma3#=K3IPdy2;e1CE^B>w5WwJKer@f4vu8S0NV8~tab7X?zw=)B>>{9}d(HHQPE!C?b?dQ}H%quz_G@tfQBKoZA2;z)N_#D^h|L6c$!b*Mf;SIqp&i?fu)yK*yU(|GzCfiN#b4S8RJ&QFJiRIwB&7d_NR+8JfVmOSq2>pIb!xVk%o9io7zt%JpjiEk@AM4-?@y%6Z^_oN;HEE2#=X>i4Hq$m@!-#1p41yTF(YwFlrz>fPilA4dgALdi&8|y;|cNd>gymwHKv?ZwAVec@w^G#sH4m+Yi`_xq8FCpL2ML0B*wMpCe;l;3&LqjjA3>`Ksa*A{vVd6MnUKr)hFI*6tcV{0io*EK=kmk-*;%W`Qf<2Dzk0)ULpQ&J{9dB{p&l$K;3Th(jLWcY{Y7|r#&81QR_CERe5Q{ogJ?-h+4PPeIaT8$Wrc+(avJ-d`Av(J&G{Sd9iFwRyA8@{e!f5&?Khwq3q@8!zj{#|sOoVMK4HEI*(Ku~K3d&y!{J=vP6WSRQZ>S_GChDYjM6k!HJ-2qu&6u&^;yK*OKFg0m(ZWoUe$t#@&E6)jICM9&+aoA(I3P*rFGKvPYYqg~=vQpAYIA51cD3x)B;GR~1Zncp$-FbyC|o4cR)l~6ktXe)GjzxNC!QB(*mI%FO<#N3Xq+=@8uF@;_?y9K(x~l?edfieS_;Iay`RKf1v48lvub)mrCmPFeOSD9i%yH@$cCG-jRiK%dM=znMVf5)Zf6T&-Pcj+;im=6#Vz?1MD(-7Qy6*?X8$4$8#!0i{3mMjrJ?t4uF;RPPj&(8?gae;m9xm%NkHxQdp9tYJqp1p%r`|?Gf`~1byUgCaMawjUsX&in)gkv-*MZ6itT~%Vk{vNtpocOX9Gb{O@BOiXF(T?0*_T4(=2oN7Y7A_q7is5dJHk1H9_?WA`!-4oqihY{G034U;YdJ{g);AOMG#UR70l$Y`VHn)X}}*n+ys0wwZ{wAfA~CTc%lUaK+6QsgA{)8s5g_zhb$kyeM7MM!HljoONDJPg0%njJ4S&218YUR>XbWO41ZQuI|f^*}S0T^;Af)xa$#sM_xr%gd*t6-PW@&?LVz0Bf~_2H8y%vRfBT=GeL_;a6N^>judvxri*<>xm==p}DxR-ay!-k#tz6v{b*v8?>X>T-q3lM@L6{X5*?%gsv1EodS=P0p+lV*r1C#aA9I-zyp6stvj!oIGAcKC0{4??vuY6}g7+}{K9dL?szV{F?jFW8blE}OOCGv{a&8b;tnvC0=YAAJo8+z-rGwsNB`z$il1;H5T*O(~FxOeuu$m2WV=+EZPfaSrj2a=QSw!}PS&e2ifAOB&s_7URBS2quyYodM#vwb_=8<1EnMa%o&$a`4t5mJ5ufXZ&9N>4$&N5SFfX2V=mh*y-;p{Z$McK8o^kLSDw^0Yh^q>~nfl1p-ot_DMv8EHGjG2y83pDkMMEvohFVr+PDKIoc^6Km81KTYh(gv4KC_rSw-7Qd)7#rD#d<^F$)u%Rr{OJc-Gu{8>c+9x<%$avsND*KE8%!-H8z773c>{OfkM+H20$^1_#H#LGGp@r^U4uVIir+7xt{W)3rA2as1psvEgwNSaiDt^dUSeymFHMOi>gW=o;WqZ*hybkkbR}!pfPBaXyv+Tbdyg2xAAWYoGNe=SsiJNy3HpYOlFDLdfsLL++o9uy&mTrPGZ>afn6T?oiZ;g_yF*2NXBLxxJ`-ae?(Q>k*uX+r>6Mu(Bq^5#O)(0Efhygp%AYb=mMagb2)=q{Q57tP}JNP6fwsay==0cjNhtDx-+2Lm59lh+i5`MHG$Y_bpUT_DoDxBVFB5TRvPseDzmoUK5PM4{GcW|2YzpTh+!=2)!j(tewh*u-LCAHdhHysk_&h@Y2pie*Lu~*B&lOhQx>G)MC2!slNuCvZEbF5gO<0)SeA%P_L#pKGSJ(T_#fI@wyLh)OoGuay
wG?z2*SS)}?Hyq{GH-`XK}5Y~LfRx6Z@^Kx}D;4ID8kua~wb-A9<@!r|3Q~^H%IYfqdbh&Z|N_~8p|vSlt?#h+V}tIR~SA;Yt?WURpp=)of;)pIR7~qAIPV$WZc>GJ)$A4Y>kSLkny7y3Es69b?&(A3Mu)|-27)hm#K@aFx&Bl6>nfK3g+iwvA=GGeDMtC*YNs+e2I8i_+X+3r-n{W{F=J@2NeW2{eE^5zhI(fmvqdCS=G92wcl1|1bC5qj}N7CNRI?w~*0g@a^uLL$Vf{VPaco6+_{~BSKmDTCoW61LB-6+J@yNf@AZ}{%Pb5k77|iLV1K{?YRK#;Br&a1r!e4{ltdt)bZ*wqD%EG{rqT%A2ICJ5I=J!>t-^Ij0iS|EJt2Ksasoa(+~-8erN;x9r(0AMg%o@t22vAH>!S-nk#=zD~1=u-#BJzb)8v#Y=&)C5yVqM+(VH`%zWQ4)ikYcjN0_EL8gpeaW&Kg-EK&a@v{d@j|bfOA)-hgSrpg*@L)Q<$$vL1O-F%A2zDMN#)e+J_iL%&E!jz80*x6?%i)LNbKfp#E>IIZ(kHR<@B}Au~%3?FVpP?9Y1KVQ^bQ=T#GSz<`Aj9QYtY&-Pa`VL~c0d};RdThHR0iF7Ol|2ui0I}h-G-1BRN`fTm^b)pl`m)R@0cJ**LIp){Pjj97YMQ-+@;Ax;%;N7Bxr`0&hTB1z4Q?>o>&LC>76+M`>e=9bUgHFM;k{{wQQin)vQ)hr=Nxs9fxWk#R!UJ&jiGemzW*RZqf^y0f@?D1Q6d1KkEESDZ{qteU@jJ`gkz%hvlDkGYM73Fd*dn1O(nWcMZvk-B|YW2_#_aZh-ut25yydr!nT7nB(K<#UyPs3g0MQDRiUiR$743?BF;Ov1@(gxGwI5gkCv6(^*#`QO-47l+}92ke6?HKNVz0%PanP(BL7gp!$%J8}pE4=nZWsj>_#{Z!;?k4i`7;@@z$3vSq-g@@O`1`w>%FNo_OS#aNhs{m~HT5gP+J)^dFm1QP^{zptjxbW-0B{DEz1PSnq3rCN{y+lR9N$FQzVgO$uWaH`{m!pXLchd|+}Y~v4qOq6@Pvp&j_%Xhs$^j$WaUbntrS#O~LX=g5IO281TIF%ei1!-yDQes~;cV;%P{88-3nyo$Q5WG6PRil{S&*5G_0u{JIsF%Llo~nuK7s7ejFc^O+tv;_!0ZR?E`cpc|%f!gW5@xoMWeev-Yi-ldI^q=`xhE{zOo%;4eM^P6OM9ef4Idp8Si8+Wm=#ewBz4H)y0NPBQ__c+`&W=9+p%8sM&QaHn)SUKcjpG7wI!!kgI_^RQLnQA0MOdv9d_Lb{P*EQCg4q^&4qizZdHq@I-J)nQ>kBq-Pfz;J#FO1Tfx#G870PeBS)y+U_vnOiL=V$G-D}4-=o!!nTOHJm)Q&745aL0*BZiR!Wwuk#Td(eW;jA1o5T)GfH{$u}(5DdI@f&l#-*k8uu6mDj8que!#xZ_xUuFq0^xPSs0#_10TPE}7}ZYRlHpqbn&R9<;2=E2z?-AsIR{Ypy@ttp=Jj+%`Te4Tz|AGB#B;44&vrsf|mmL=_tlT9B&(?rRLLU_a@3F&t0>VM0f4exO6_qnq6LksOe|olcK`VEGeV9l1KJp(XqItpLl7SBrh47Xrx;xRtPP#+TNqhwf^Il||E5s~!j2w0Qg5fwpmmM7V*_}FvfRg!U`gL%4IbkG1{!!mX=iglftXqt@onww?mYz)BiuW^#`{wwumW4jnnxd<3pLKY#bfy?T@Mo9Fva#nVBW4GLr7$e~^zLm+S)ot710l@U8y&7I26&!+)4UMLi7>Bd`^;S#ogDs3Z7pQHtW?lZu3x_zbAd7F>))Q$oUNj?s<_Yh?ZpB!C{cw!;oxMctj&PApdeZOKLApZ)70mv;3|1-@H*h&5o=-GfY|fgC9Omj%q4cj8no*D+99A9iHZ7*IC7rWKH}wiyH=BFb0Y%O1B_8XQfre*d*9xN$q7$vGjgt@;(z6$3`9Pn8E^!!BK%ap2$sR2WA$n_AY>r
HTb<2>AM%|Z^k;FF{H#i9ab`^)e!ESvVk(=rt#w$p>sU0f?)n;Liu{d@pnJ1NKj&T)sQtcTb$Irc6J8sGTaNk^|==_?+3SCcRLoG5mFzD(jVN#3MN!*NI2Wypn7lt`7+KPJ$wj_#?oZGq&yMYJ5Jva!His}Le^uAtBuiVG{fvdsRk{TIlnS@_re{PYg@1Op*}H+xT_5UQ{#nH{?tJw^{q`SVnNe_CQ*LFu4!ywh$mzk?AN5Vleu_4@TQU1z^<0klUxDB$Q0Artvkrl;S`Lk6>h2|+{Pyw_rkPuA^rP8T|0g}EcSP6%s5}5dsNKUg>eh~{x>P3?`X%;qYj0P7)utO0J*6e=#mQa`$tppy3`^l$^8;Yurx@%6POfQ{oJes*yLbut;asee=X;hKaOf`U@_II0o7q2rV0AapCcGL_FGD@uE^}|KrvMfQo5qCT%PCHAS{XvOv*AK%?&hjTbPjwds!P-4N%!8YH5cYNSS7kw78eQVEQ4Co2=7{b^ak{DG9vu>!-lr+Lf%p3H=8)sM&C(*?iQoEL0yu+XeND58VMbDah#Mm@jfwUc+k$nA(!Ydl4sU@bbYclizq^I;=39QPM_5^lZsgjV*rK-3h`f<4ALj4{G@l=%xh9T%@Tgn0a-7HhwdaOF+s~t9iY0HTM0(mE5gI=fD;v6vU1za7a~gjIjmT=5U9-mF|6ADcI{WDoMq@l9fswQXf_=GN(odL==>Yikf4k3Z>Y3t;ZD-WC9UBdy;qADCs+d1kmc+rUq=HY1-)yUdC!}t8rfW)xKQNFN^_Pgf7E~l-Hz~h(rw4PJ`F2v+3Hs1736dP;C&Ey5b!d=H1zZyo`^KcqZPds3lRm|Y68QQSsF?3%OY4vQejD9@wB~G+($&7K-Uy~2Vo@03i#?lLUY2N0lIVvmr@|rcrRhNvgI-lL>3qN46Va9}$+UbuC9FD85Qy=nZjGP&wxSF!NNxh?Apx7c9RG+Xiqa-+S*Y%#AZq!QQsWfIBGlO})nSGj5%#tXfASo|PKQMzsGe#N%UW03jmtUwVF+leetE$&*gJ;@+isAu$Jnum_Z5%`u-PGVby~6}zqO%s%6*RrtA%z8L{3x;h6_{W_26@k#HBrM1USZk8Rb8N%X;P+@Yudf3XeV*5Estlgt>X{TEapXTB#R}`V;bFiaJ%tLqP1{zL1fI6Sh&~GTfEvttAP)F+IhM%BoOB6v|exvSfbJaW-L+12<#>|%gni+Alm$x-g#6|g(WZk{qUU+O|ksxJePe|$o)f6aOyr9t;E<+wLM{V;~rd3fS&Eo_X#>ad3kXT3_OpEga(t4oNyf`|EiCN3*MJUjI)PTC&oWPsbkd&ear~umT6a7zd|K^})hLk6bc{C)aXJoQBIcslRIC1fXcT+M6P{%Y<4<+hssZ{scI_ML=zbKM{DLrdu4BiRt-p=RF9JN)o`oWXbe1G*{*1bd_?zH8nntC87JyAef>Zh{PgI0Fla<(~cMmPi1u!h5n77K36cGI)#t$yxu++W&q;1wvL_{h2y=j3os!=H#RLwPAX?zt>Ds4+Du!i_s$b}kPPJMb+tuWvx%=?WMvy&)@Q4`UEsONE)YX*5$IWde;&dS9&1cV-jhXU4g|i(9AafyGw4)Q9S~P*g!tG*8p1E&_M0&w&?*Z$sHJu6Ivq3^)>Rp!j;z=tAME?+l!f+C1e_K0B%Kv%#C7Ht^5`GUvbGf8U^q#F)tUomUMWx1K?kA$#xEeTOS7=-8X^B!ro7iEOn$}u3oNOGJbFB_7dCTs8RA?y5h@H@{y8{pkveB4%FLGq)wG;c1r_Io+2nJxEN7|V9G-&2E1gUFX-A}QX@L%@--Dbr(u$q9Jk#9~Q6vUcFSLD`(~@i+sxI(#r<5=1?f>d&;uX1D&0Qy;Gb*W-wGm=BY3sbHYIO-Xc-Q+ghth2fCVPI=u_(z*WQj{@W&$kL%+%fY|6Z4K(CmaOSO~6tu9gHX`5O~g1M8~^vTDS6Ljdigeci}Z{3uO4Vi~~IJh3zJZdfVCWO(h8GYjr7C#&
{$K?J+RGEM>ugNCE!q%ozW!ePg6GylwSzfSH6?SRjIZQGCGS&Tyty6*%#T9pLW*;LIR%JjqP+!f>g00d<(=^xvv)iAr$(j3X!+U>OjClRSd>sgK7Qnn4a&tqkyaB79EnOFbWv$@2YjvfpoFB<~rQN=}?a?}5k@v^{JRI;b=(NJO2rP&x#gL}|4%(1cCZcVDKldsCjBf7pWS4dEyeK)x+Ojj$i4kfS*Ax!T*;~Fdl(9+wvX|*9z{ZB66?YW`dDRxn(Snx{sr5rgzPRP3t)-J*;lJ(6JIU?YC};MXd;{!`Pm`&@S!~{x61Oe)kCw_|TdaIn8cPjDMW-`lXm_K+Eb$?(CF$r>(CX~fjmvkEv%#zYH{8puPJJ|xh~iHH5+9Mj@6aUCRM%u5L|&DPmvS^1lijrC@iQfRhVRN2&u%NGEHxgAnKsHwLc#e@XP)Wg+)Xg`W?H~ydvlVE8eP1be2H(793msiIgX&D-=fC#N+#)%%#HYXJB@I)M8fkd$|wFgd~HP%gw0B~rd6Zw!t%1qPI>=TtCtm!LieN~a5?wvA}Eg4PW1NXeuYm@iL8SV%!&Y#*sV>tDRA4GsTp^P!W)xc6QbLiSPX2TeusG1^wC!9N28>>OXmR3C8Zth!2z9DYHHjAdwZn2jg!>h*}oX70p}PlWyWwpqE{+YgpeH&QD~#V5Bjb@V_jUU!c5O}{mIh6alZ(+24Ig0?yvMmeLAfi!vCHC&Y*Se{EH8rFwlASQA~?c({Hg$B4nL$9c(yuqhvnY=@=-OMMJ%@k-A!twi^B$s@D_o0jqa69i03csu@?_pKH`fX{SODRd~DG3UQps3jGASx8(g^k5W1OOa~IQ3XMotC65`9rd8{84>Yer0={8cj?^%I@5WUPHUIJWC@2xh8+Ko^RTPtmzRJHn-E~P~OAv3C-0VCy=a5B3CY#}YA`+t{kattZ)-vv^;1LgtH?ne@%&5$bEJt725C%Q$}PhhpRD|J1lByQR~`Ln%@wioJ33&fx%saIdU8qpY)u!#8!Nd{aDHqW8d@`g@io_u{^dsrz^tjY(tGu*Q?)4_zS(x$i6JRcUTTuxTvZHLtPiMyN#@!?_RERBvu!wX9&b@Oj2oj>dhVQg$wr=JnKv=XH4)0uMYbnca5#Q`%9?=pi^Cv7`U)!d-jr7jHTq0gg8W1JY;Q4)nn6|)#l62WsQeHuBo_c7B_H2qV`e-y0$5&7vYdqTj#0|%Ak9QMJcP->KMvI+SK?eUXfc2~@RGV=+IPl`t;!t$ahygXeKO;;`I0-4UWIeRqO9Vk=hF-onMr#qg^)?1fmU)MgaJ+5Gbn}J%=QIX#OWegDTqas2==T*N&TDW4D?Hv+I${J64wMiN`k6oY=a~?Y-0=LTr4-W;+K04(kvGsTZ%JV+JJPJv|i^_|5*6x7OWq@&#ycng@rVV+M2V$(aV3A$Ui{-|s`TM4g^d#xh<`TwiUayhQeS64VYwlX~P(9my+Ps^mz{ltWZ0@27_LzY^Bk*vkimMs4Sa*5YE3#&|rpY+}EcsyFLSqhFo8(Cs3oj3|NKPEP?ps#r+K=H+eHs7SRhB6^1cp60oz>45tY-6^3r#jPpoCXbf~PC3YpReWzSONu2pGXHek*h9^YgkE*~!ES!Fggi&r-ZQqgXt9Yu8^sgbLKkqztFs;A`-d_InFvISBgDjUEyWpuIYCq$35M_*;{CHErOr%jk9dXUkNS)vSyQt86(yPOsiP=bZo%&XaBBx>yd+eykT|*KggrDVJe>8v~&GXip~hDRu_6HX=ANu|2)@WwnR)ma~U3^nSEb9zohTU~4=3E$PO6SD`s;oKPh%6wC1!sLJip8mj8(23;wH{Ru%}3(>?&oKoonMHmqwdO$M2|gtF4IJoDAkU^oGO5dyXkUbZ!x-v1#qAOQ!N*FXQl1FPJ?8t$AnY`vv`F?gFqqOsb!q%&$19b^$jq%7!8DN~^s6(3$50wx=G6)D{6Z+?rG>!cB@mjY85{w{!No(U{G0g7VaMvMAKlID+9?Y0}3MZAx2w*
s}jf;BvGwuLTxxIPeAZt-ySldQ82`0=0LarJ5%UA@T+{Pv8)7``-E8ejAK{9y3ofMdEa@#8dfM9q(95%Hc^uhCK-g}_fO1HMJ?Ho3nJF?5NUmWw9#-)^q8Q}W;L_eyus*G6Dw^UAAZ0uQs?WQ&O|VnuLiGA3!L?Ct(o8@krbz8S>Z8Gm(yoeUhWbMm?!kGYu6G`>Fn65G1=5XLgj%Zs>xTb>RVBAL%!h;ru%Sy9gzdqWu4I?~iz~Omxe_n&U*T5)A;9}U@vWN?p2-at3}aJAdZ&&;e{SH%j0r(9<#u6sxCY4di0;?zgTpG^3E~QAeGK+2RAx0)8y|uLevrhN48Zt>OCgK&nl6~Ao()x5j@Qq?NGqo{hLM8=TYNHm5Bg_B|0~%2jk#$NE?=}c=Zz=Jucd^7K8aiU4*`ZpTojlV;R2NO_A($HRqap-^J2v$*bY#p41l8*o9B{Wz6n>a^63=GPCR`l-;FN|znZKS<6={mBoN|KGFAduRxea2LWYqaW$dwIlT}=k*bDk+q3$9D!Nt>{0clCfm1D|4tA)y^VqbXjuVqQq4ZYl%xREc_abpZXTV>5)DH=3G$YAK;;!S2=6;MO^AXdr{y=@%Mn4kn`&%MRF?|2~CI)#`mT$IqP1|4E0J#3>2b+msw5bqwg9o7pxlO^o)LcYRq_)hpwk!A~yZ7yX~ShAp+Lfo=O)|gqE7nIfnwAhkzz2L?Mll3f1iz9|t)PLWQ#b5I_Omeu%-cT8Y5NXMyhWa~>sB3ihDpnOsyuOM%RFMQt*xDmEI7iTXw~9hW=ZdCvrLvNp#+=wP)T5xo(tAR}DliaZwfe5^lQv@5?--H6C%biy}{&2Fgn(JQ&R;>B@Vq{QC_jfS!FAcIyqFkFV`JHX0PT>&sU_|g12pO%b)&gp*l0W@NZ2vjDs1jQLgOE{Bo>wI^x|%5LvNGDX;jKc@g)5uM5h(qn*sW(Xh}qBvzhD~u51ns)>K6p5C-*TWy|!h91Ad}2rzAL3n&>0g{ajEvF7eDjF6_bNG16UxZA@}sfEtaN@s|$*OIEaIq80W97&?&NN$9l(_(l&UsDn-`DN?6?4&2yp_VUS%Q1!&{q$e%*IXs?)hr<#y~&5INgSr<4V;W$GpIrxMVU;#uxo1cVby3@10IED9{Kz@);;s{ppo?%F3mNV@HP_NWkuqnIOEmJo0?ARJLfdZ;BgKTy%pKqgt#4aNXyShH==Q#X6SPnODH0lQ&pznRMi8!)L}1)|}=L)_)9SFsd2HTg~;PGV?uttj2zNP-2ImJ;d7UT7zOA_&aJ|ELz*m?KU|4o28YKHG1h$?UFrAv35XX!GLqId!7kxhkD#uG6tlk3ArhAF)HMr3Mk7*987Zm?R#|Hct>ks!6&l0x;19mPF+S-AQ+4R|#SIW0tMP5Yvnoj?4{`r!9|Y&AwqAySymZBF`wKX}!&J^_f^Rrg2ky7;!<^WHn#IA%jL{b0R+3#{#O1olwR|YTL4SalVfQTx&PvU>^4*v|{FkOtC#CxoEL~Q2e4$CRDohHrt8M$MnqxjMDfx4`COOSGH}jK*Izo$6Fr=oo#fivSbl=p>IkGp!#N#MjEDssSAG~I#lY~(;o;kZivpkHYD-scVqbUbWBCnralq|H6YF62+89WjtV**APm5`<)4mO9h4i^$piqI=QO&?NBHGQ+>k#0^lD9N$E-+-S=+pEobwZ>A~&`tTT}=Mllqe2ZpA4C48hYcID9zmH*_H4yab9um=CBmFWY5m{j0>X22oDnm3AZF8_7vIJ394)5Dx8=kL>T(WPzTroTi@l%=iWer#H+<~V`+{@c^2u5W!XFWqvrg0I#M4^6!*-d>yp+Nh6l^YbS=b~9nBK2Ap4$Vme22KImB>k6M{qKfCFSs@A_^+ZEe4jV`a5q++883Dc$}Zk0Z0ats-D1P}mFKMO)>muLN{jX>vhF_#K6!~xR+8+)zh($Ahux5~oQ{p|6nvmNaKZqyU$UQUhH6|wH@2--gdAfgt~>KGIPLf*x%zzW0U|L3N4kVEc*D
bURIYLGYElAI%tU7xY@zQ-G>pt5)s0%Yo0ianMYY8MFFtUTH)Ug%go^yL&p*m__vw=Th0MGh4Wz9wle>TOzTw@u7Of0dBMm%#^b>pH0P-C%hKt5ffD7nX~pBq?s$#Xm*M;I+T6D+`v0r_A4WIGMoQwLyRKX}>}XMaZQzJ8$S8T`BADsQv9}gY+1$a0IB?|+J5#1fqy&o5#=zPn4YQ|Vdor|;-$h0`kJU%(;-$-=LfvUMs9hSt~7Ae&G7^v-W_VeukS*pSMJ9?JU&SQD{#W<`n6@0K5-AJ4Pf@bN`;-z3^ufD;ZCRo?cHOSU9|8S20IS{_Dt&`^v50OotL(GTd)YBNu(Gs+>Q4V}*_uH2uQb@O{@2&Tp*P?aNo(W|NZhRL@xSfLSQb#%ys+;TwZOFa?eTrRw!HA6>>{Y4G#!2bj-3Azj(p>p%$1=fz+q(q`GGKYYUV-BR~sZe7;JGi)^HOC>Q+5LfM|5<{xRDzYRf3YSgG#Lua*2Okq$cuP+ba~e(duv|qM{Q-v9nPB8OMFI;7Ur2pX}o{J&A2gWZf)(U(7qUWROS$jf6`5d7*`tZPwewI~}?V8xa$CKg!r_&aIGy-FzKW9DZFdNfoa)6l^OHx6MWB385$6w6yJ`ws+MLa6q{9-KJAp{R4K`lZT%gk}H19%-U+=XG7JkHsnHKX>@K$l^rZ0zH9qVZnSYwmruD0mpaS=NAtvesiOICp8F$NThpX*%m*l&^Ch2l0t+xy;j9XgCqJ;FHW!G^2eA&HYSnkjMK9@P~@Zw$sZ-W3x9mq>oDi&S5naK>G3d<^0`pf-HZ*QWsJ(nqU2$5hn_j3ktM|{iQCKKjYRHtsAdP#b%S1Q(m|#5cN)qmKleY>?MTaYpMk?`Th$9K!hlIF8!Z({*5nVxPEZ_RHIN5=!FZJKHci()^sT8onqJ@o9RYe=$dXKFfgVy?0JNTYZjAP|t(Et"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py new file mode 100644 index 0000000000..fb3f0ac8fd --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt_sota.py @@ -0,0 +1,470 @@ +import collections,copy,glob,io,lzma,math,os +from pathlib import Path +import random,re,subprocess,sys,time,uuid,numpy as np,sentencepiece as spm,torch,torch.distributed as dist,torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import Tensor,nn +from flash_attn_interface import flash_attn_func as flash_attn_3_func +class 
Hyperparameters:data_dir=os.environ.get('DATA_DIR','./data/');seed=int(os.environ.get('SEED',1337));run_id=os.environ.get('RUN_ID',str(uuid.uuid4()));iterations=int(os.environ.get('ITERATIONS',20000));warmdown_frac=float(os.environ.get('WARMDOWN_FRAC',.72));warmup_steps=int(os.environ.get('WARMUP_STEPS',20));train_batch_tokens=int(os.environ.get('TRAIN_BATCH_TOKENS',786432));train_seq_len=int(os.environ.get('TRAIN_SEQ_LEN',2048));train_log_every=int(os.environ.get('TRAIN_LOG_EVERY',500));max_wallclock_seconds=float(os.environ.get('MAX_WALLCLOCK_SECONDS',6e2));val_batch_tokens=int(os.environ.get('VAL_BATCH_TOKENS',524288));eval_seq_len=int(os.environ.get('EVAL_SEQ_LEN',2048));val_loss_every=int(os.environ.get('VAL_LOSS_EVERY',4000));sliding_window_enabled=bool(int(os.environ.get('SLIDING_WINDOW_ENABLED','1')));vocab_size=int(os.environ.get('VOCAB_SIZE',8192));num_layers=int(os.environ.get('NUM_LAYERS',11));xsa_last_n=int(os.environ.get('XSA_LAST_N',11));model_dim=int(os.environ.get('MODEL_DIM',512));embedding_dim=int(os.environ.get('EMBEDDING_DIM',512));num_kv_heads=int(os.environ.get('NUM_KV_HEADS',4));num_heads=int(os.environ.get('NUM_HEADS',8));mlp_mult=float(os.environ.get('MLP_MULT',4.));skip_gates_enabled=bool(int(os.environ.get('SKIP_GATES_ENABLED','1')));tie_embeddings=bool(int(os.environ.get('TIE_EMBEDDINGS','1')));logit_softcap=float(os.environ.get('LOGIT_SOFTCAP',3e1));rope_base=float(os.environ.get('ROPE_BASE',1e4));rope_dims=int(os.environ.get('ROPE_DIMS',16));rope_train_seq_len=int(os.environ.get('ROPE_TRAIN_SEQ_LEN',2048));ln_scale=bool(int(os.environ.get('LN_SCALE','1')));qk_gain_init=float(os.environ.get('QK_GAIN_INIT',5.));num_loops=int(os.environ.get('NUM_LOOPS',2));loop_start=int(os.environ.get('LOOP_START',3));loop_end=int(os.environ.get('LOOP_END',5));enable_looping_at=float(os.environ.get('ENABLE_LOOPING_AT',.35));parallel_residual_start=int(os.environ.get('PARALLEL_RESIDUAL_START',7));min_lr=float(os.environ.get('MIN_LR',.0));embed_lr=float(os
.environ.get('EMBED_LR',.6));head_lr=float(os.environ.get('HEAD_LR',.008));tied_embed_lr=float(os.environ.get('TIED_EMBED_LR',.03));tied_embed_init_std=float(os.environ.get('TIED_EMBED_INIT_STD',.005));matrix_lr=float(os.environ.get('MATRIX_LR',.022));scalar_lr=float(os.environ.get('SCALAR_LR',.02));muon_momentum=float(os.environ.get('MUON_MOMENTUM',.99));muon_backend_steps=int(os.environ.get('MUON_BACKEND_STEPS',5));muon_momentum_warmup_start=float(os.environ.get('MUON_MOMENTUM_WARMUP_START',.92));muon_momentum_warmup_steps=int(os.environ.get('MUON_MOMENTUM_WARMUP_STEPS',1500));muon_row_normalize=bool(int(os.environ.get('MUON_ROW_NORMALIZE','1')));beta1=float(os.environ.get('BETA1',.9));beta2=float(os.environ.get('BETA2',.95));adam_eps=float(os.environ.get('ADAM_EPS',1e-08));grad_clip_norm=float(os.environ.get('GRAD_CLIP_NORM',.3));eval_stride=int(os.environ.get('EVAL_STRIDE',64));muon_beta2=float(os.environ.get('MUON_BETA2',.95));adam_wd=float(os.environ.get('ADAM_WD',.02));muon_wd=float(os.environ.get('MUON_WD',.095));embed_wd=float(os.environ.get('EMBED_WD',.085));ema_decay=float(os.environ.get('EMA_DECAY',.9965));ttt_enabled=bool(int(os.environ.get('TTT_ENABLED','0')));ttt_lr=float(os.environ.get('TTT_LR',.005));ttt_epochs=int(os.environ.get('TTT_EPOCHS',3));ttt_momentum=float(os.environ.get('TTT_MOMENTUM',.9));ttt_chunk_tokens=int(os.environ.get('TTT_CHUNK_TOKENS',32768));etlb_enabled=bool(int(os.environ.get('ETLB_ENABLED','0')));etlb_lr=float(os.environ.get('ETLB_LR',.05));etlb_steps=int(os.environ.get('ETLB_STEPS',5));etlb_clip=float(os.environ.get('ETLB_CLIP',3.));compressor=os.environ.get('COMPRESSOR','brotli');gptq_calibration_batches=int(os.environ.get('GPTQ_CALIBRATION_BATCHES',64));gptq_reserve_seconds=float(os.environ.get('GPTQ_RESERVE_SECONDS',12.));matrix_bits=int(os.environ.get('MATRIX_BITS',6));embed_bits=int(os.environ.get('EMBED_BITS',8));matrix_clip_sigmas=float(os.environ.get('MATRIX_CLIP_SIGMAS',12.85));embed_clip_sigmas=float(os.environ.get(
'EMBED_CLIP_SIGMAS',2e1));distributed='RANK'in os.environ and'WORLD_SIZE'in os.environ;rank=int(os.environ.get('RANK','0'));world_size=int(os.environ.get('WORLD_SIZE','1'));local_rank=int(os.environ.get('LOCAL_RANK','0'));is_main_process=rank==0;grad_accum_steps=8//world_size;datasets_dir=os.path.join(data_dir,'datasets',f"fineweb10B_sp{vocab_size}");train_files=os.path.join(datasets_dir,'fineweb_train_*.bin');val_files=os.path.join(datasets_dir,'fineweb_val_*.bin');tokenizer_path=os.path.join(data_dir,'tokenizers',f"fineweb_{vocab_size}_bpe.model");logfile=f"logs/{run_id}.txt";model_path='final_model.pt';quantized_model_path='final_model.int6.ptz' +_logger_hparams=None +def set_logging_hparams(h):global _logger_hparams;_logger_hparams=h +def log(msg,console=True): + if _logger_hparams is None:print(msg);return + if _logger_hparams.is_main_process: + if console:print(msg) + if _logger_hparams.logfile is not None: + with open(_logger_hparams.logfile,'a',encoding='utf-8')as f:print(msg,file=f) +class ValidationData: + def __init__(self,h,device): + self.sp=spm.SentencePieceProcessor(model_file=h.tokenizer_path) + if int(self.sp.vocab_size())!=h.vocab_size:raise ValueError(f"VOCAB_SIZE={h.vocab_size} does not match tokenizer vocab_size={int(self.sp.vocab_size())}") + self.val_tokens=load_validation_tokens(h.val_files,h.eval_seq_len);self.base_bytes_lut,self.has_leading_space_lut,self.is_boundary_token_lut=build_sentencepiece_luts(self.sp,h.vocab_size,device) +def build_sentencepiece_luts(sp,vocab_size,device): + sp_vocab_size=int(sp.vocab_size());assert sp.piece_to_id('▁')!=sp.unk_id(),"Tokenizer must have '▁' (space) as its own token for correct BPB byte counting";table_size=max(sp_vocab_size,vocab_size);base_bytes_np=np.zeros((table_size,),dtype=np.int16);has_leading_space_np=np.zeros((table_size,),dtype=np.bool_);is_boundary_token_np=np.ones((table_size,),dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id)or 
sp.is_unknown(token_id)or sp.is_unused(token_id):continue + is_boundary_token_np[token_id]=False + if sp.is_byte(token_id):base_bytes_np[token_id]=1;continue + piece=sp.id_to_piece(token_id) + if piece.startswith('▁'):has_leading_space_np[token_id]=True;piece=piece[1:] + base_bytes_np[token_id]=len(piece.encode('utf-8')) + return torch.tensor(base_bytes_np,dtype=torch.int16,device=device),torch.tensor(has_leading_space_np,dtype=torch.bool,device=device),torch.tensor(is_boundary_token_np,dtype=torch.bool,device=device) +def load_validation_tokens(pattern,seq_len): + files=[Path(p)for p in sorted(glob.glob(pattern))] + if not files:raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens=torch.cat([load_data_shard(file)for file in files]).contiguous();usable=(tokens.numel()-1)//seq_len*seq_len + if usable<=0:raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[:usable+1] +def load_data_shard(file): + header_bytes=256*np.dtype('0 else 0;num_sequences=(self.num_tokens[si]-1-phase)//self.seq_len;sequence_order=self.rng.permutation(num_sequences);self.start_inds[si]=(phase+sequence_order*self.seq_len).tolist() + def next_batch(self,global_tokens,grad_accum_steps): + device_tokens=global_tokens//(self.world_size*grad_accum_steps);device_batch_size=device_tokens//self.seq_len;remaining=np.array([len(s)for s in self.start_inds],dtype=np.float64);x=torch.empty((device_batch_size,self.seq_len),dtype=torch.int64);y=torch.empty((device_batch_size,self.seq_len),dtype=torch.int64) + for bi in range(device_batch_size): + total=remaining.sum() + if total<=0: + for si in range(len(self.files)):self._reset_shard(si) + remaining=np.array([len(s)for s in self.start_inds],dtype=np.float64);total=remaining.sum() + 
probs=remaining/total;si=int(self.rng.choice(len(self.files),p=probs));start_ind=self.start_inds[si].pop();remaining[si]-=1;mm=_get_shard_memmap(self.files[si]);window=torch.as_tensor(np.array(mm[start_ind:start_ind+self.seq_len+1],dtype=np.int64));x[bi]=window[:-1];y[bi]=window[1:] + return x.to(self.device,non_blocking=True),y.to(self.device,non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self,eps=None):super().__init__();self.eps=eps + def forward(self,x):return F.rms_norm(x,(x.size(-1),),eps=self.eps) +class CastedLinear(nn.Linear): + def forward(self,x):w=self.weight.to(x.dtype);bias=self.bias.to(x.dtype)if self.bias is not None else None;return F.linear(x,w,bias) +class Rotary(nn.Module): + def __init__(self,dim,base=1e4,train_seq_len=1024,rope_dims=0):super().__init__();self.dim=dim;self.base=base;self.train_seq_len=train_seq_len;self.rope_dims=rope_dims if rope_dims>0 else dim;inv_freq=1./base**(torch.arange(0,self.rope_dims,2,dtype=torch.float32)/self.rope_dims);self.register_buffer('inv_freq',inv_freq,persistent=False);self._seq_len_cached=0;self._cos_cached=None;self._sin_cached=None + def forward(self,seq_len,device,dtype): + if self._cos_cached is None or self._sin_cached is None or self._seq_len_cached!=seq_len or self._cos_cached.device!=device: + rd=self.rope_dims + if seq_len>self.train_seq_len:scale=seq_len/self.train_seq_len;new_base=self.base*scale**(rd/(rd-2));inv_freq=1./new_base**(torch.arange(0,rd,2,dtype=torch.float32,device=device)/rd) + else:inv_freq=self.inv_freq.to(device) + t=torch.arange(seq_len,device=device,dtype=inv_freq.dtype);freqs=torch.outer(t,inv_freq);self._cos_cached=freqs.cos()[None,:,None,:];self._sin_cached=freqs.sin()[None,:,None,:];self._seq_len_cached=seq_len + return self._cos_cached.to(dtype=dtype),self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x,cos,sin,rope_dims=0): + if rope_dims>0 and rope_dims0: + head_dim=h.model_dim//h.num_heads + for block in 
self.blocks:block.attn.rope_dims=h.rope_dims;block.attn.rotary=Rotary(head_dim,base=h.rope_base,train_seq_len=h.train_seq_len,rope_dims=h.rope_dims) + self.final_norm=RMSNorm();self.lm_head=None if h.tie_embeddings else CastedLinear(h.embedding_dim,h.vocab_size,bias=False) + if self.lm_head is not None:self.lm_head._zero_init=True + if h.xsa_last_n>0: + for i in range(max(0,h.num_layers-h.xsa_last_n),h.num_layers):self.blocks[i].attn.use_xsa=True + if h.parallel_residual_start>=0: + for i in range(h.parallel_residual_start,h.num_layers):self.blocks[i].parallel=True + self.looping_active=False + if h.num_loops>0: + loop_seg=list(range(h.loop_start,h.loop_end+1));all_indices=list(range(h.loop_start)) + for _ in range(h.num_loops+1):all_indices.extend(loop_seg) + all_indices.extend(range(h.loop_end+1,h.num_layers));num_enc=len(all_indices)//2;self.encoder_indices=all_indices[:num_enc];self.decoder_indices=all_indices[num_enc:] + else:self.encoder_indices=list(range(self.num_encoder_layers));self.decoder_indices=list(range(self.num_encoder_layers,h.num_layers)) + self.num_skip_weights=min(len(self.encoder_indices),len(self.decoder_indices));self.skip_weights=nn.Parameter(torch.ones(self.num_skip_weights,h.model_dim,dtype=torch.float32));self.skip_gates=nn.Parameter(torch.zeros(self.num_skip_weights,h.model_dim,dtype=torch.float32))if h.skip_gates_enabled else None;self._init_weights() + def _init_weights(self): + if self.tie_embeddings:nn.init.normal_(self.tok_emb.weight,mean=.0,std=self.tied_embed_init_std) + for(name,module)in self.named_modules(): + if isinstance(module,nn.Linear): + if getattr(module,'_zero_init',False):nn.init.zeros_(module.weight) + elif module.weight.ndim==2 and module.weight.shape[0]>=64 and module.weight.shape[1]>=64:nn.init.orthogonal_(module.weight,gain=1.) 
+ def forward_logits(self,input_ids): + x=self.tok_emb(input_ids);x=F.rms_norm(x,(x.size(-1),)) + if self.embed_proj is not None:x=self.embed_proj(x) + x0=x;skips=[];enc_iter=self.encoder_indices if self.looping_active else range(self.num_encoder_layers);dec_iter=self.decoder_indices if self.looping_active else range(self.num_encoder_layers,self.num_encoder_layers+self.num_decoder_layers) + for i in enc_iter:x=self.blocks[i](x,x0);skips.append(x) + for(skip_idx,i)in enumerate(dec_iter): + if skip_idxG.size(1) + if transposed:X=X.T + for _ in range(steps):A=X@X.T;B=b*A+c*A@A;X=a*X+B@X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self,params,lr,momentum,backend_steps,nesterov=True,weight_decay=.0,row_normalize=False):super().__init__(params,dict(lr=lr,momentum=momentum,backend_steps=backend_steps,nesterov=nesterov,weight_decay=weight_decay,row_normalize=row_normalize)) + @torch.no_grad() + def step(self,closure=None): + loss=None + if closure is not None: + with torch.enable_grad():loss=closure() + distributed=dist.is_available()and dist.is_initialized();world_size=dist.get_world_size()if distributed else 1;rank=dist.get_rank()if distributed else 0 + for group in self.param_groups: + params=group['params'] + if not params:continue + lr=group['lr'];momentum=group['momentum'];backend_steps=group['backend_steps'];nesterov=group['nesterov'];total_params=sum(int(p.numel())for p in params);updates_flat=torch.zeros(total_params,device=params[0].device,dtype=torch.bfloat16);curr=0 + for(i,p)in enumerate(params): + if i%world_size==rank and p.grad is not None: + g=p.grad;state=self.state[p] + if'momentum_buffer'not in state:state['momentum_buffer']=torch.zeros_like(g) + buf=state['momentum_buffer'];buf.mul_(momentum).add_(g) + if nesterov:g=g.add(buf,alpha=momentum) + if group.get('row_normalize',False):row_norms=g.float().norm(dim=-1,keepdim=True).clamp_min(1e-07);g=g/row_norms.to(g.dtype) + 
g=zeropower_via_newtonschulz5(g,steps=backend_steps);g*=max(1,g.size(0)/g.size(1))**.5;updates_flat[curr:curr+p.numel()]=g.reshape(-1) + curr+=p.numel() + if distributed:dist.all_reduce(updates_flat,op=dist.ReduceOp.SUM) + wd=group.get('weight_decay',.0);curr=0 + for p in params: + if wd>.0:p.data.mul_(1.-lr*wd) + g=updates_flat[curr:curr+p.numel()].view_as(p).to(dtype=p.dtype);p.add_(g,alpha=-lr);curr+=p.numel() + return loss +CONTROL_TENSOR_NAME_PATTERNS=tuple(pattern for pattern in os.environ.get('CONTROL_TENSOR_NAME_PATTERNS','attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,skip_gates').split(',')if pattern) +class Optimizers: + def __init__(self,h,base_model): + block_named_params=list(base_model.blocks.named_parameters());matrix_params=[p for(name,p)in block_named_params if p.ndim==2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)];scalar_params=[p for(name,p)in block_named_params if p.ndim<2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)] + if base_model.skip_weights.numel()>0:scalar_params.append(base_model.skip_weights) + if base_model.skip_gates is not None and base_model.skip_gates.numel()>0:scalar_params.append(base_model.skip_gates) + token_lr=h.tied_embed_lr if h.tie_embeddings else h.embed_lr;tok_params=[{'params':[base_model.tok_emb.weight],'lr':token_lr,'base_lr':token_lr}];self.optimizer_tok=torch.optim.AdamW(tok_params,betas=(h.beta1,h.beta2),eps=h.adam_eps,weight_decay=h.embed_wd,fused=True);self.optimizer_muon=Muon(matrix_params,lr=h.matrix_lr,momentum=h.muon_momentum,backend_steps=h.muon_backend_steps,weight_decay=h.muon_wd,row_normalize=h.muon_row_normalize) + for group in self.optimizer_muon.param_groups:group['base_lr']=h.matrix_lr + 
self.optimizer_scalar=torch.optim.AdamW([{'params':scalar_params,'lr':h.scalar_lr,'base_lr':h.scalar_lr}],betas=(h.beta1,h.beta2),eps=h.adam_eps,weight_decay=h.adam_wd,fused=True);self.optimizers=[self.optimizer_tok,self.optimizer_muon,self.optimizer_scalar] + if base_model.lm_head is not None:self.optimizer_head=torch.optim.Adam([{'params':[base_model.lm_head.weight],'lr':h.head_lr,'base_lr':h.head_lr}],betas=(h.beta1,h.beta2),eps=h.adam_eps,fused=True);self.optimizers.insert(1,self.optimizer_head) + else:self.optimizer_head=None + def __iter__(self):return iter(self.optimizers) + def zero_grad_all(self): + for opt in self.optimizers:opt.zero_grad(set_to_none=True) + def step(self): + for opt in self.optimizers:opt.step() + self.zero_grad_all() +def restore_fp32_params(model): + for module in model.modules(): + if isinstance(module,CastedLinear):module.float() + for(name,param)in model.named_parameters(): + if(param.ndim<2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS))and param.dtype!=torch.float32:param.data=param.data.float() +def collect_hessians(model,train_loader,h,device,n_calibration_batches=64): + hessians={};hooks=[] + def make_hook(name): + def hook_fn(module,inp,out): + x=inp[0].detach().float() + if x.ndim==3:x=x.reshape(-1,x.shape[-1]) + if name not in hessians:hessians[name]=torch.zeros(x.shape[1],x.shape[1],dtype=torch.float32,device=device) + hessians[name].addmm_(x.T,x) + return hook_fn + for(name,module)in model.named_modules(): + if isinstance(module,CastedLinear)and module.weight.numel()>65536: + cat=classify_param(name+'.weight') + if cat in('mlp','attn'):hooks.append(module.register_forward_hook(make_hook(name+'.weight'))) + if model.tie_embeddings: + hook_module=model.head_proj if model.head_proj is not None else model.final_norm + def make_output_hook(name): + def hook_fn(module,inp,out): + x=out.detach().float() + if x.ndim==3:x=x.reshape(-1,x.shape[-1]) + if name not in 
hessians:hessians[name]=torch.zeros(x.shape[1],x.shape[1],dtype=torch.float32,device=device) + hessians[name].addmm_(x.T,x) + return hook_fn + hooks.append(hook_module.register_forward_hook(make_output_hook('tok_emb.weight'))) + model.eval() + with torch.no_grad(): + for _ in range(n_calibration_batches):x,_=train_loader.next_batch(h.train_batch_tokens,h.grad_accum_steps);model.forward_logits(x) + for hook in hooks:hook.remove() + for name in hessians:hessians[name]=hessians[name].cpu()/n_calibration_batches + return hessians +def gptq_quantize_weight(w,H,clip_sigmas=3.,clip_range=63,block_size=128): + W_orig=w.float().clone();rows,cols=W_orig.shape;H=H.float().clone();dead=torch.diag(H)==0;H[dead,dead]=1;damp=.01*H.diag().mean();H.diagonal().add_(damp);perm=torch.argsort(H.diag(),descending=True);invperm=torch.argsort(perm);W_perm=W_orig[:,perm].clone();W_perm[:,dead[perm]]=0;H=H[perm][:,perm];Hinv=torch.cholesky_inverse(torch.linalg.cholesky(H));Hinv=torch.linalg.cholesky(Hinv,upper=True);row_std=W_orig.std(dim=1);s=(clip_sigmas*row_std/clip_range).clamp_min(1e-10).to(torch.float16);sf=s.float();Q=torch.zeros(rows,cols,dtype=torch.int8);W_work=W_perm.clone() + for i1 in range(0,cols,block_size): + i2=min(i1+block_size,cols);W_block=W_work[:,i1:i2].clone();Hinv_block=Hinv[i1:i2,i1:i2];Err=torch.zeros(rows,i2-i1) + for j in range(i2-i1):w_col=W_block[:,j];d=Hinv_block[j,j];q_col=torch.clamp(torch.round(w_col/sf),-clip_range,clip_range);Q[:,i1+j]=q_col.to(torch.int8);err=(w_col-q_col.float()*sf)/d;Err[:,j]=err;W_block[:,j:]-=err.unsqueeze(1)*Hinv_block[j,j:].unsqueeze(0) + if i20:out[name]=(q.float()*s.float().view(q.shape[0],*[1]*(q.ndim-1))).to(orig_dtype) + else:out[name]=(q.float()*float(s.item())).to(orig_dtype) + return out +_BSHF_MAGIC=b'BSHF' +def _byte_shuffle(data,stride=2): + if stride<=1 or len(data)0: + base_model.train();chunk_seqs=(chunk_end-chunk_start)//seq_len + if chunk_seqs>0: + cos_lr=h.ttt_lr*.5*(1.+math.cos(math.pi*ci/max(num_chunks-1,1))) + 
for pg in optimizer.param_groups:pg['lr']=cos_lr + my_seq_s=chunk_seqs*rank//world_size;my_seq_e=chunk_seqs*(rank+1)//world_size;my_chunk_seqs=my_seq_e-my_seq_s + for _ep in range(h.ttt_epochs): + for bs in range(0,my_chunk_seqs,batch_seqs): + be=min(bs+batch_seqs,my_chunk_seqs);actual_bs=my_seq_s+bs;start_tok=chunk_start+actual_bs*seq_len;end_tok=chunk_start+(my_seq_s+be)*seq_len+1 + if end_tok>val_data.val_tokens.numel():continue + local=val_data.val_tokens[start_tok:end_tok].to(device=device,dtype=torch.int64);x=local[:-1].reshape(-1,seq_len);y=local[1:].reshape(-1,seq_len);optimizer.zero_grad(set_to_none=True) + with torch.autocast(device_type='cuda',dtype=torch.bfloat16):loss=base_model(x,y) + loss.backward() + if world_size>1: + for p in ttt_params: + if p.grad is not None:dist.all_reduce(p.grad,op=dist.ReduceOp.AVG) + torch.nn.utils.clip_grad_norm_(ttt_params,1.);optimizer.step() + if dist.is_available()and dist.is_initialized():dist.all_reduce(loss_sum,op=dist.ReduceOp.SUM);dist.all_reduce(token_count,op=dist.ReduceOp.SUM);dist.all_reduce(byte_count,op=dist.ReduceOp.SUM) + for p in base_model.parameters():p.requires_grad_(True) + base_model.eval();return _loss_bpb(loss_sum,token_count,byte_count) +def timed_eval(label,fn,*args,**kwargs):torch.cuda.synchronize();t0=time.perf_counter();val_loss,val_bpb=fn(*args,**kwargs);torch.cuda.synchronize();elapsed_ms=1e3*(time.perf_counter()-t0);log(f"{label} val_loss:{val_loss:.8f} val_bpb:{val_bpb:.8f} eval_time:{elapsed_ms:.0f}ms");return val_loss,val_bpb +def train_model(h,device,val_data): + base_model=GPT(h).to(device).bfloat16();restore_fp32_params(base_model);compiled_model=torch.compile(base_model,dynamic=False,fullgraph=True) + if h.distributed:model=DDP(compiled_model,device_ids=[h.local_rank],broadcast_buffers=False) + else:model=compiled_model + log(f"model_params:{sum(p.numel()for p in 
base_model.parameters())}");optimizers=Optimizers(h,base_model);train_loader=ShuffledSequenceLoader(h,device);max_wallclock_ms=1e3*h.max_wallclock_seconds if h.max_wallclock_seconds>0 else None + if max_wallclock_ms is not None:max_wallclock_ms-=h.gptq_reserve_seconds*1e3;log(f"gptq:reserving {h.gptq_reserve_seconds:.0f}s, effective={max_wallclock_ms:.0f}ms") + def training_frac(step,elapsed_ms): + if max_wallclock_ms is None:return step/max(h.iterations,1) + return elapsed_ms/max(max_wallclock_ms,1e-09) + def lr_mul(frac): + if h.warmdown_frac<=0:return 1. + if frac>=1.-h.warmdown_frac:return max((1.-frac)/h.warmdown_frac,h.min_lr) + return 1. + def step_fn(step,lr_scale): + optimizers.zero_grad_all();train_loss=torch.zeros((),device=device) + for micro_step in range(h.grad_accum_steps): + if h.distributed:model.require_backward_grad_sync=micro_step==h.grad_accum_steps-1 + x,y=train_loader.next_batch(h.train_batch_tokens,h.grad_accum_steps) + with torch.autocast(device_type='cuda',dtype=torch.bfloat16,enabled=True):loss=model(x,y) + train_loss+=loss.detach();(loss/h.grad_accum_steps).backward() + train_loss/=h.grad_accum_steps;frac=min(step/h.muon_momentum_warmup_steps,1.)if h.muon_momentum_warmup_steps>0 else 1.;muon_momentum=(1-frac)*h.muon_momentum_warmup_start+frac*h.muon_momentum + for group in optimizers.optimizer_muon.param_groups:group['momentum']=muon_momentum + for opt in optimizers: + for group in opt.param_groups:group['lr']=group['base_lr']*lr_scale + if h.grad_clip_norm>0:torch.nn.utils.clip_grad_norm_(base_model.parameters(),h.grad_clip_norm) + optimizers.step();return train_loss + if h.warmup_steps>0: + initial_model_state={name:tensor.detach().cpu().clone()for(name,tensor)in base_model.state_dict().items()};initial_optimizer_states=[copy.deepcopy(opt.state_dict())for opt in optimizers];model.train() + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step,1.) 
+ if warmup_step<=5 or(warmup_step+1)%10==0 or warmup_step+1==h.warmup_steps:log(f"warmup_step: {warmup_step+1}/{h.warmup_steps}") + if h.num_loops>0: + base_model.looping_active=True;log(f"loop_warmup:enabled encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + for warmup_step in range(h.warmup_steps): + step_fn(warmup_step,1.) + if warmup_step<=5 or(warmup_step+1)%10==0 or warmup_step+1==h.warmup_steps:log(f"loop_warmup_step: {warmup_step+1}/{h.warmup_steps}") + base_model.looping_active=False + base_model.load_state_dict(initial_model_state,strict=True) + for(opt,state)in zip(optimizers,initial_optimizer_states,strict=True):opt.load_state_dict(state) + optimizers.zero_grad_all() + if h.distributed:model.require_backward_grad_sync=True + train_loader=ShuffledSequenceLoader(h,device) + ema_state={name:t.detach().float().clone()for(name,t)in base_model.state_dict().items()};ema_decay=h.ema_decay;training_time_ms=.0;stop_after_step=None;torch.cuda.synchronize();t0=time.perf_counter();step=0 + while True: + last_step=step==h.iterations or stop_after_step is not None and step>=stop_after_step;should_validate=last_step or h.val_loss_every>0 and step%h.val_loss_every==0 + if should_validate:torch.cuda.synchronize();training_time_ms+=1e3*(time.perf_counter()-t0);val_loss,val_bpb=eval_val(h,device,val_data,model);log(f"{step}/{h.iterations} val_loss: {val_loss:.4f} val_bpb: {val_bpb:.4f}");torch.cuda.synchronize();t0=time.perf_counter() + if last_step: + if stop_after_step is not None and step0 and not base_model.looping_active and frac>=h.enable_looping_at:base_model.looping_active=True;log(f"layer_loop:enabled step:{step} frac:{frac:.3f} encoder:{base_model.encoder_indices} decoder:{base_model.decoder_indices}") + train_loss=step_fn(step,scale) + with torch.no_grad(): + for(name,t)in base_model.state_dict().items():ema_state[name].mul_(ema_decay).add_(t.detach().float(),alpha=1.-ema_decay) + 
step+=1;approx_training_time_ms=training_time_ms+1e3*(time.perf_counter()-t0);should_log_train=h.train_log_every>0 and(step<=5 or step%h.train_log_every==0 or stop_after_step is not None) + if should_log_train:tok_per_sec=step*h.train_batch_tokens/(approx_training_time_ms/1e3);log(f"{step}/{h.iterations} train_loss: {train_loss.item():.4f} train_time: {approx_training_time_ms/60000:.1f}m tok/s: {tok_per_sec:.0f}") + reached_cap=max_wallclock_ms is not None and approx_training_time_ms>=max_wallclock_ms + if h.distributed and max_wallclock_ms is not None:reached_cap_tensor=torch.tensor(int(reached_cap),device=device);dist.all_reduce(reached_cap_tensor,op=dist.ReduceOp.MAX);reached_cap=bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap:stop_after_step=step + log(f"peak memory allocated: {torch.cuda.max_memory_allocated()//1024//1024} MiB reserved: {torch.cuda.max_memory_reserved()//1024//1024} MiB");log('ema:applying EMA weights');current_state=base_model.state_dict();avg_state={name:t.to(dtype=current_state[name].dtype)for(name,t)in ema_state.items()};base_model.load_state_dict(avg_state,strict=True);return base_model,compiled_model +def train_and_eval(h,device): + random.seed(h.seed);np.random.seed(h.seed);torch.manual_seed(h.seed);torch.cuda.manual_seed_all(h.seed);val_data=ValidationData(h,device);log(f"train_shards: {len(list(Path(h.datasets_dir).resolve().glob("fineweb_train_*.bin")))}");log(f"val_tokens: {val_data.val_tokens.numel()-1}");base_model,compiled_model=train_model(h,device,val_data);torch._dynamo.reset();timed_eval('pre-quantization post-ema',eval_val,h,device,val_data,compiled_model);serialize(h,base_model,Path(__file__).read_text(encoding='utf-8')) + if h.distributed:dist.barrier() + eval_model=deserialize(h,device) + if h.num_loops>0:eval_model.looping_active=True + compiled_model=torch.compile(eval_model,dynamic=False,fullgraph=True);timed_eval('quantized',eval_val,h,device,val_data,compiled_model) + if 
h.sliding_window_enabled:timed_eval('quantized_sliding_window',eval_val_sliding,h,device,val_data,eval_model) + if h.ttt_enabled and h.sliding_window_enabled: + del eval_model,compiled_model;torch._dynamo.reset();torch.cuda.empty_cache();ttt_model=deserialize(h,device) + if h.num_loops>0:ttt_model.looping_active=True + timed_eval('quantized_ttt',eval_val_ttt,h,device,val_data,ttt_model);del ttt_model + if h.etlb_enabled and h.sliding_window_enabled: + if'eval_model'not in dir(): + eval_model=deserialize(h,device) + if h.num_loops>0:eval_model.looping_active=True + timed_eval('quantized_sliding_etlb',eval_val_sliding_etlb,h,device,val_data,eval_model) +def main(): + world_size=int(os.environ.get('WORLD_SIZE','1'));local_rank=int(os.environ.get('LOCAL_RANK','0'));distributed='RANK'in os.environ and'WORLD_SIZE'in os.environ + if not torch.cuda.is_available():raise RuntimeError('CUDA is required') + if world_size<=0:raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8%world_size!=0:raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + device=torch.device('cuda',local_rank);torch.cuda.set_device(device) + if distributed:dist.init_process_group(backend='nccl',device_id=device);dist.barrier() + torch.backends.cuda.matmul.allow_tf32=True;torch.backends.cudnn.allow_tf32=True;torch.set_float32_matmul_precision('high');from torch.backends.cuda import enable_cudnn_sdp,enable_flash_sdp,enable_math_sdp,enable_mem_efficient_sdp;enable_cudnn_sdp(False);enable_flash_sdp(True);enable_mem_efficient_sdp(False);enable_math_sdp(False);torch._dynamo.config.optimize_ddp=False;h=Hyperparameters();set_logging_hparams(h) + if h.is_main_process: + os.makedirs('logs',exist_ok=True);log(100*'=',console=False);log('Hyperparameters:',console=True) + for(k,v)in sorted(vars(type(h)).items()): + if not k.startswith('_'):log(f" {k}: {v}",console=True) + log('='*100,console=False);log(f"Running Python 
{sys.version}",console=False);log(f"Running PyTorch {torch.__version__}",console=False);log(subprocess.run(['nvidia-smi'],stdout=subprocess.PIPE,stderr=subprocess.PIPE,text=True,check=False).stdout,console=False);log('='*100,console=False) + train_and_eval(h,device) + if distributed:dist.destroy_process_group() +if __name__=='__main__':main() \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log new file mode 100644 index 0000000000..efff7bbc84 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337.log @@ -0,0 +1,137 @@ +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] ***************************************** +W0412 14:14:57.052000 35802 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 14:14:57.052000 35802 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed1337.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed1337 + scalar_lr: 0.02 + seed: 1337 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, 
effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0047 val_bpb: 3.4867 +1/20000 train_loss: 9.0080 train_time: 0.0m tok/s: 8089272 +2/20000 train_loss: 12.3015 train_time: 0.0m tok/s: 8022559 +3/20000 train_loss: 11.0711 train_time: 0.0m tok/s: 7954927 +4/20000 train_loss: 9.4520 train_time: 0.0m tok/s: 7918173 +5/20000 train_loss: 8.3679 train_time: 0.0m tok/s: 7892396 +500/20000 train_loss: 3.3349 train_time: 0.9m tok/s: 7690797 +1000/20000 train_loss: 3.2063 train_time: 1.7m tok/s: 7685016 +1500/20000 train_loss: 3.0906 train_time: 2.6m tok/s: 7688746 +2000/20000 train_loss: 3.0213 train_time: 3.4m tok/s: 7689501 +2500/20000 train_loss: 3.0327 train_time: 4.3m tok/s: 7692100 +layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0867 train_time: 5.2m tok/s: 7563624 +3500/20000 train_loss: 2.9550 train_time: 6.3m tok/s: 7265791 +4000/20000 train_loss: 2.9969 train_time: 7.5m tok/s: 7031651 +4000/20000 val_loss: 2.9178 val_bpb: 1.1298 +4500/20000 train_loss: 2.8096 train_time: 8.6m tok/s: 6882955 +5000/20000 train_loss: 2.7590 train_time: 9.7m tok/s: 6766993 +5052/20000 val_loss: 2.8139 val_bpb: 1.0896 +stopping_early: wallclock_cap train_time: 588041ms step: 5052/20000 +peak memory allocated: 35373 MiB reserved: 35478 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81131292 val_bpb:1.08857004 eval_time:6825ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... 
+GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15970240 bytes +Total submission size quantized+brotli: 16028607 bytes +quantized val_loss:2.84129693 val_bpb:1.10018017 eval_time:22233ms +quantized_sliding_window val_loss:2.79834517 val_bpb:1.08354879 eval_time:83683ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log new file mode 100644 index 0000000000..12344ae02d --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024.log @@ -0,0 +1,137 @@ +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] ***************************************** +W0412 14:47:48.365000 113005 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 14:47:48.365000 113005 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed2024.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed2024 + scalar_lr: 0.02 + seed: 2024 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, 
effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0072 val_bpb: 3.4877 +1/20000 train_loss: 9.0094 train_time: 0.0m tok/s: 2135125 +2/20000 train_loss: 12.2873 train_time: 0.0m tok/s: 3370519 +3/20000 train_loss: 11.1022 train_time: 0.0m tok/s: 4161091 +4/20000 train_loss: 9.4986 train_time: 0.0m tok/s: 4708808 +5/20000 train_loss: 8.4185 train_time: 0.0m tok/s: 5113931 +500/20000 train_loss: 3.3297 train_time: 0.9m tok/s: 7577257 +1000/20000 train_loss: 3.2011 train_time: 1.7m tok/s: 7627264 +1500/20000 train_loss: 3.0892 train_time: 2.6m tok/s: 7647347 +2000/20000 train_loss: 3.0172 train_time: 3.4m tok/s: 7660145 +2500/20000 train_loss: 3.0322 train_time: 4.3m tok/s: 7668167 +layer_loop:enabled step:2869 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0873 train_time: 5.2m tok/s: 7571205 +3500/20000 train_loss: 2.9600 train_time: 6.3m tok/s: 7240254 +4000/20000 train_loss: 2.9966 train_time: 7.5m tok/s: 7010853 +4000/20000 val_loss: 2.9162 val_bpb: 1.1292 +4500/20000 train_loss: 2.8106 train_time: 8.6m tok/s: 6865615 +5000/20000 train_loss: 2.7611 train_time: 9.7m tok/s: 6750831 +5042/20000 val_loss: 2.8137 val_bpb: 1.0895 +stopping_early: wallclock_cap train_time: 588097ms step: 5042/20000 +peak memory allocated: 35373 MiB reserved: 35476 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81121527 val_bpb:1.08853223 eval_time:6640ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... 
+GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15967559 bytes +Total submission size quantized+brotli: 16025926 bytes +quantized val_loss:2.84118514 val_bpb:1.10013688 eval_time:22696ms +quantized_sliding_window val_loss:2.79862166 val_bpb:1.08365585 eval_time:108261ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log new file mode 100644 index 0000000000..1ba76d085a --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42.log @@ -0,0 +1,137 @@ +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] ***************************************** +W0412 14:31:11.762000 74428 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 14:31:11.762000 74428 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: ./data + datasets_dir: ./data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.997 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/sp8192_seed42.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 4 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.02 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.085 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + qk_gain_init: 4.0 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: sp8192_seed42 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: ./data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: ./data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + val_batch_tokens: 524288 + val_files: ./data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.667 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35943512 +gptq:reserving 12s, 
effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0091 val_bpb: 3.4884 +1/20000 train_loss: 9.0116 train_time: 0.0m tok/s: 8131746 +2/20000 train_loss: 12.3391 train_time: 0.0m tok/s: 8043815 +3/20000 train_loss: 11.1222 train_time: 0.0m tok/s: 7969544 +4/20000 train_loss: 9.4225 train_time: 0.0m tok/s: 7926698 +5/20000 train_loss: 8.3224 train_time: 0.0m tok/s: 7908161 +500/20000 train_loss: 3.3361 train_time: 0.9m tok/s: 7698191 +1000/20000 train_loss: 3.2032 train_time: 1.7m tok/s: 7690409 +1500/20000 train_loss: 3.0970 train_time: 2.6m tok/s: 7691199 +2000/20000 train_loss: 3.0195 train_time: 3.4m tok/s: 7692530 +2500/20000 train_loss: 3.0329 train_time: 4.3m tok/s: 7694479 +layer_loop:enabled step:2877 frac:0.500 encoder:[0, 1, 2, 3, 4, 5, 4] decoder:[5, 4, 5, 6, 7, 8, 9, 10] +3000/20000 train_loss: 3.0914 train_time: 5.2m tok/s: 7585577 +3500/20000 train_loss: 2.9592 train_time: 6.3m tok/s: 7267839 +4000/20000 train_loss: 3.0006 train_time: 7.5m tok/s: 7031051 +4000/20000 val_loss: 2.9219 val_bpb: 1.1314 +4500/20000 train_loss: 2.8134 train_time: 8.6m tok/s: 6882962 +5000/20000 train_loss: 2.7602 train_time: 9.7m tok/s: 6767536 +5053/20000 val_loss: 2.8183 val_bpb: 1.0913 +stopping_early: wallclock_cap train_time: 588126ms step: 5053/20000 +peak memory allocated: 35373 MiB reserved: 35476 MiB +ema:applying EMA weights +pre-quantization post-ema val_loss:2.81580054 val_bpb:1.09030770 eval_time:6794ms +Serialized model: 135426937 bytes +Code size: 58367 bytes +GPTQ:collecting Hessians from calibration data... 
+GPTQ:collected 67 Hessians in 11.3s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15966716 bytes +Total submission size quantized+brotli: 16025083 bytes +quantized val_loss:2.84662455 val_bpb:1.10224308 eval_time:22261ms +quantized_sliding_window val_loss:2.80368562 val_bpb:1.08561667 eval_time:108586ms From 78fac924a40e4d31464a5646e7ab914db806c7f6 Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 12 Apr 2026 12:29:48 -0400 Subject: [PATCH 3/5] Integrate fused-softcap-ce kernel (3.63x on H100) into SP8192 SOTA --- .../2026-04-12_SP8192_Frontier/requirements.txt | 1 + .../track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt new file mode 100644 index 0000000000..94f74a9cb8 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt @@ -0,0 +1 @@ +fused-softcap-ce @ git+https://github.com/anthony-maio/fused-softcap-ce.git diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py index e2634caf8e..512734abb7 100644 --- a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_gpt.py @@ -1,2 +1,2 @@ import lzma as L,base64 as B 
-exec(L.decompress(B.b85decode(";J=$Qid_ISn@VT6Qap3bt~@<3h>ok~)Km^%|Fb#-90hdJma3#=K3IPdy2;e1CE^B>w5WwJKer@f4vu8S0NV8~tab7X?zw=)B>>{9}d(HHQPE!C?b?dQ}H%quz_G@tfQBKoZA2;z)N_#D^h|L6c$!b*Mf;SIqp&i?fu)yK*yU(|GzCfiN#b4S8RJ&QFJiRIwB&7d_NR+8JfVmOSq2>pIb!xVk%o9io7zt%JpjiEk@AM4-?@y%6Z^_oN;HEE2#=X>i4Hq$m@!-#1p41yTF(YwFlrz>fPilA4dgALdi&8|y;|cNd>gymwHKv?ZwAVec@w^G#sH4m+Yi`_xq8FCpL2ML0B*wMpCe;l;3&LqjjA3>`Ksa*A{vVd6MnUKr)hFI*6tcV{0io*EK=kmk-*;%W`Qf<2Dzk0)ULpQ&J{9dB{p&l$K;3Th(jLWcY{Y7|r#&81QR_CERe5Q{ogJ?-h+4PPeIaT8$Wrc+(avJ-d`Av(J&G{Sd9iFwRyA8@{e!f5&?Khwq3q@8!zj{#|sOoVMK4HEI*(Ku~K3d&y!{J=vP6WSRQZ>S_GChDYjM6k!HJ-2qu&6u&^;yK*OKFg0m(ZWoUe$t#@&E6)jICM9&+aoA(I3P*rFGKvPYYqg~=vQpAYIA51cD3x)B;GR~1Zncp$-FbyC|o4cR)l~6ktXe)GjzxNC!QB(*mI%FO<#N3Xq+=@8uF@;_?y9K(x~l?edfieS_;Iay`RKf1v48lvub)mrCmPFeOSD9i%yH@$cCG-jRiK%dM=znMVf5)Zf6T&-Pcj+;im=6#Vz?1MD(-7Qy6*?X8$4$8#!0i{3mMjrJ?t4uF;RPPj&(8?gae;m9xm%NkHxQdp9tYJqp1p%r`|?Gf`~1byUgCaMawjUsX&in)gkv-*MZ6itT~%Vk{vNtpocOX9Gb{O@BOiXF(T?0*_T4(=2oN7Y7A_q7is5dJHk1H9_?WA`!-4oqihY{G034U;YdJ{g);AOMG#UR70l$Y`VHn)X}}*n+ys0wwZ{wAfA~CTc%lUaK+6QsgA{)8s5g_zhb$kyeM7MM!HljoONDJPg0%njJ4S&218YUR>XbWO41ZQuI|f^*}S0T^;Af)xa$#sM_xr%gd*t6-PW@&?LVz0Bf~_2H8y%vRfBT=GeL_;a6N^>judvxri*<>xm==p}DxR-ay!-k#tz6v{b*v8?>X>T-q3lM@L6{X5*?%gsv1EodS=P0p+lV*r1C#aA9I-zyp6stvj!oIGAcKC0{4??vuY6}g7+}{K9dL?szV{F?jFW8blE}OOCGv{a&8b;tnvC0=YAAJo8+z-rGwsNB`z$il1;H5T*O(~FxOeuu$m2WV=+EZPfaSrj2a=QSw!}PS&e2ifAOB&s_7URBS2quyYodM#vwb_=8<1EnMa%o&$a`4t5mJ5ufXZ&9N>4$&N5SFfX2V=mh*y-;p{Z$McK8o^kLSDw^0Yh^q>~nfl1p-ot_DMv8EHGjG2y83pDkMMEvohFVr+PDKIoc^6Km81KTYh(gv4KC_rSw-7Qd)7#rD#d<^F$)u%Rr{OJc-Gu{8>c+9x<%$avsND*KE8%!-H8z773c>{OfkM+H20$^1_#H#LGGp@r^U4uVIir+7xt{W)3rA2as1psvEgwNSaiDt^dUSeymFHMOi>gW=o;WqZ*hybkkbR}!pfPBaXyv+Tbdyg2xAAWYoGNe=SsiJNy3HpYOlFDLdfsLL++o9uy&mTrPGZ>afn6T?oiZ;g_yF*2NXBLxxJ`-ae?(Q>k*uX+r>6Mu(Bq^5#O)(0Efhygp%AYb=mMagb2)=q{Q57tP}JNP6fwsay==0cjNhtDx-+2Lm59lh+i5`MHG$Y_bpUT_DoDxBVFB5TRvPseDzmoUK5PM4{GcW|2YzpTh+!=2)!j(tewh*u-LCAHdhHysk_&h@Y2pie*Lu~*B&lOhQx>G)MC2!slNuCvZEbF5gO<0)SeA%P_L#pKGSJ(T_#fI@wyLh)OoGuay
wG?z2*SS)}?Hyq{GH-`XK}5Y~LfRx6Z@^Kx}D;4ID8kua~wb-A9<@!r|3Q~^H%IYfqdbh&Z|N_~8p|vSlt?#h+V}tIR~SA;Yt?WURpp=)of;)pIR7~qAIPV$WZc>GJ)$A4Y>kSLkny7y3Es69b?&(A3Mu)|-27)hm#K@aFx&Bl6>nfK3g+iwvA=GGeDMtC*YNs+e2I8i_+X+3r-n{W{F=J@2NeW2{eE^5zhI(fmvqdCS=G92wcl1|1bC5qj}N7CNRI?w~*0g@a^uLL$Vf{VPaco6+_{~BSKmDTCoW61LB-6+J@yNf@AZ}{%Pb5k77|iLV1K{?YRK#;Br&a1r!e4{ltdt)bZ*wqD%EG{rqT%A2ICJ5I=J!>t-^Ij0iS|EJt2Ksasoa(+~-8erN;x9r(0AMg%o@t22vAH>!S-nk#=zD~1=u-#BJzb)8v#Y=&)C5yVqM+(VH`%zWQ4)ikYcjN0_EL8gpeaW&Kg-EK&a@v{d@j|bfOA)-hgSrpg*@L)Q<$$vL1O-F%A2zDMN#)e+J_iL%&E!jz80*x6?%i)LNbKfp#E>IIZ(kHR<@B}Au~%3?FVpP?9Y1KVQ^bQ=T#GSz<`Aj9QYtY&-Pa`VL~c0d};RdThHR0iF7Ol|2ui0I}h-G-1BRN`fTm^b)pl`m)R@0cJ**LIp){Pjj97YMQ-+@;Ax;%;N7Bxr`0&hTB1z4Q?>o>&LC>76+M`>e=9bUgHFM;k{{wQQin)vQ)hr=Nxs9fxWk#R!UJ&jiGemzW*RZqf^y0f@?D1Q6d1KkEESDZ{qteU@jJ`gkz%hvlDkGYM73Fd*dn1O(nWcMZvk-B|YW2_#_aZh-ut25yydr!nT7nB(K<#UyPs3g0MQDRiUiR$743?BF;Ov1@(gxGwI5gkCv6(^*#`QO-47l+}92ke6?HKNVz0%PanP(BL7gp!$%J8}pE4=nZWsj>_#{Z!;?k4i`7;@@z$3vSq-g@@O`1`w>%FNo_OS#aNhs{m~HT5gP+J)^dFm1QP^{zptjxbW-0B{DEz1PSnq3rCN{y+lR9N$FQzVgO$uWaH`{m!pXLchd|+}Y~v4qOq6@Pvp&j_%Xhs$^j$WaUbntrS#O~LX=g5IO281TIF%ei1!-yDQes~;cV;%P{88-3nyo$Q5WG6PRil{S&*5G_0u{JIsF%Llo~nuK7s7ejFc^O+tv;_!0ZR?E`cpc|%f!gW5@xoMWeev-Yi-ldI^q=`xhE{zOo%;4eM^P6OM9ef4Idp8Si8+Wm=#ewBz4H)y0NPBQ__c+`&W=9+p%8sM&QaHn)SUKcjpG7wI!!kgI_^RQLnQA0MOdv9d_Lb{P*EQCg4q^&4qizZdHq@I-J)nQ>kBq-Pfz;J#FO1Tfx#G870PeBS)y+U_vnOiL=V$G-D}4-=o!!nTOHJm)Q&745aL0*BZiR!Wwuk#Td(eW;jA1o5T)GfH{$u}(5DdI@f&l#-*k8uu6mDj8que!#xZ_xUuFq0^xPSs0#_10TPE}7}ZYRlHpqbn&R9<;2=E2z?-AsIR{Ypy@ttp=Jj+%`Te4Tz|AGB#B;44&vrsf|mmL=_tlT9B&(?rRLLU_a@3F&t0>VM0f4exO6_qnq6LksOe|olcK`VEGeV9l1KJp(XqItpLl7SBrh47Xrx;xRtPP#+TNqhwf^Il||E5s~!j2w0Qg5fwpmmM7V*_}FvfRg!U`gL%4IbkG1{!!mX=iglftXqt@onww?mYz)BiuW^#`{wwumW4jnnxd<3pLKY#bfy?T@Mo9Fva#nVBW4GLr7$e~^zLm+S)ot710l@U8y&7I26&!+)4UMLi7>Bd`^;S#ogDs3Z7pQHtW?lZu3x_zbAd7F>))Q$oUNj?s<_Yh?ZpB!C{cw!;oxMctj&PApdeZOKLApZ)70mv;3|1-@H*h&5o=-GfY|fgC9Omj%q4cj8no*D+99A9iHZ7*IC7rWKH}wiyH=BFb0Y%O1B_8XQfre*d*9xN$q7$vGjgt@;(z6$3`9Pn8E^!!BK%ap2$sR2WA$n_AY>r
HTb<2>AM%|Z^k;FF{H#i9ab`^)e!ESvVk(=rt#w$p>sU0f?)n;Liu{d@pnJ1NKj&T)sQtcTb$Irc6J8sGTaNk^|==_?+3SCcRLoG5mFzD(jVN#3MN!*NI2Wypn7lt`7+KPJ$wj_#?oZGq&yMYJ5Jva!His}Le^uAtBuiVG{fvdsRk{TIlnS@_re{PYg@1Op*}H+xT_5UQ{#nH{?tJw^{q`SVnNe_CQ*LFu4!ywh$mzk?AN5Vleu_4@TQU1z^<0klUxDB$Q0Artvkrl;S`Lk6>h2|+{Pyw_rkPuA^rP8T|0g}EcSP6%s5}5dsNKUg>eh~{x>P3?`X%;qYj0P7)utO0J*6e=#mQa`$tppy3`^l$^8;Yurx@%6POfQ{oJes*yLbut;asee=X;hKaOf`U@_II0o7q2rV0AapCcGL_FGD@uE^}|KrvMfQo5qCT%PCHAS{XvOv*AK%?&hjTbPjwds!P-4N%!8YH5cYNSS7kw78eQVEQ4Co2=7{b^ak{DG9vu>!-lr+Lf%p3H=8)sM&C(*?iQoEL0yu+XeND58VMbDah#Mm@jfwUc+k$nA(!Ydl4sU@bbYclizq^I;=39QPM_5^lZsgjV*rK-3h`f<4ALj4{G@l=%xh9T%@Tgn0a-7HhwdaOF+s~t9iY0HTM0(mE5gI=fD;v6vU1za7a~gjIjmT=5U9-mF|6ADcI{WDoMq@l9fswQXf_=GN(odL==>Yikf4k3Z>Y3t;ZD-WC9UBdy;qADCs+d1kmc+rUq=HY1-)yUdC!}t8rfW)xKQNFN^_Pgf7E~l-Hz~h(rw4PJ`F2v+3Hs1736dP;C&Ey5b!d=H1zZyo`^KcqZPds3lRm|Y68QQSsF?3%OY4vQejD9@wB~G+($&7K-Uy~2Vo@03i#?lLUY2N0lIVvmr@|rcrRhNvgI-lL>3qN46Va9}$+UbuC9FD85Qy=nZjGP&wxSF!NNxh?Apx7c9RG+Xiqa-+S*Y%#AZq!QQsWfIBGlO})nSGj5%#tXfASo|PKQMzsGe#N%UW03jmtUwVF+leetE$&*gJ;@+isAu$Jnum_Z5%`u-PGVby~6}zqO%s%6*RrtA%z8L{3x;h6_{W_26@k#HBrM1USZk8Rb8N%X;P+@Yudf3XeV*5Estlgt>X{TEapXTB#R}`V;bFiaJ%tLqP1{zL1fI6Sh&~GTfEvttAP)F+IhM%BoOB6v|exvSfbJaW-L+12<#>|%gni+Alm$x-g#6|g(WZk{qUU+O|ksxJePe|$o)f6aOyr9t;E<+wLM{V;~rd3fS&Eo_X#>ad3kXT3_OpEga(t4oNyf`|EiCN3*MJUjI)PTC&oWPsbkd&ear~umT6a7zd|K^})hLk6bc{C)aXJoQBIcslRIC1fXcT+M6P{%Y<4<+hssZ{scI_ML=zbKM{DLrdu4BiRt-p=RF9JN)o`oWXbe1G*{*1bd_?zH8nntC87JyAef>Zh{PgI0Fla<(~cMmPi1u!h5n77K36cGI)#t$yxu++W&q;1wvL_{h2y=j3os!=H#RLwPAX?zt>Ds4+Du!i_s$b}kPPJMb+tuWvx%=?WMvy&)@Q4`UEsONE)YX*5$IWde;&dS9&1cV-jhXU4g|i(9AafyGw4)Q9S~P*g!tG*8p1E&_M0&w&?*Z$sHJu6Ivq3^)>Rp!j;z=tAME?+l!f+C1e_K0B%Kv%#C7Ht^5`GUvbGf8U^q#F)tUomUMWx1K?kA$#xEeTOS7=-8X^B!ro7iEOn$}u3oNOGJbFB_7dCTs8RA?y5h@H@{y8{pkveB4%FLGq)wG;c1r_Io+2nJxEN7|V9G-&2E1gUFX-A}QX@L%@--Dbr(u$q9Jk#9~Q6vUcFSLD`(~@i+sxI(#r<5=1?f>d&;uX1D&0Qy;Gb*W-wGm=BY3sbHYIO-Xc-Q+ghth2fCVPI=u_(z*WQj{@W&$kL%+%fY|6Z4K(CmaOSO~6tu9gHX`5O~g1M8~^vTDS6Ljdigeci}Z{3uO4Vi~~IJh3zJZdfVCWO(h8GYjr7C#&
{$K?J+RGEM>ugNCE!q%ozW!ePg6GylwSzfSH6?SRjIZQGCGS&Tyty6*%#T9pLW*;LIR%JjqP+!f>g00d<(=^xvv)iAr$(j3X!+U>OjClRSd>sgK7Qnn4a&tqkyaB79EnOFbWv$@2YjvfpoFB<~rQN=}?a?}5k@v^{JRI;b=(NJO2rP&x#gL}|4%(1cCZcVDKldsCjBf7pWS4dEyeK)x+Ojj$i4kfS*Ax!T*;~Fdl(9+wvX|*9z{ZB66?YW`dDRxn(Snx{sr5rgzPRP3t)-J*;lJ(6JIU?YC};MXd;{!`Pm`&@S!~{x61Oe)kCw_|TdaIn8cPjDMW-`lXm_K+Eb$?(CF$r>(CX~fjmvkEv%#zYH{8puPJJ|xh~iHH5+9Mj@6aUCRM%u5L|&DPmvS^1lijrC@iQfRhVRN2&u%NGEHxgAnKsHwLc#e@XP)Wg+)Xg`W?H~ydvlVE8eP1be2H(793msiIgX&D-=fC#N+#)%%#HYXJB@I)M8fkd$|wFgd~HP%gw0B~rd6Zw!t%1qPI>=TtCtm!LieN~a5?wvA}Eg4PW1NXeuYm@iL8SV%!&Y#*sV>tDRA4GsTp^P!W)xc6QbLiSPX2TeusG1^wC!9N28>>OXmR3C8Zth!2z9DYHHjAdwZn2jg!>h*}oX70p}PlWyWwpqE{+YgpeH&QD~#V5Bjb@V_jUU!c5O}{mIh6alZ(+24Ig0?yvMmeLAfi!vCHC&Y*Se{EH8rFwlASQA~?c({Hg$B4nL$9c(yuqhvnY=@=-OMMJ%@k-A!twi^B$s@D_o0jqa69i03csu@?_pKH`fX{SODRd~DG3UQps3jGASx8(g^k5W1OOa~IQ3XMotC65`9rd8{84>Yer0={8cj?^%I@5WUPHUIJWC@2xh8+Ko^RTPtmzRJHn-E~P~OAv3C-0VCy=a5B3CY#}YA`+t{kattZ)-vv^;1LgtH?ne@%&5$bEJt725C%Q$}PhhpRD|J1lByQR~`Ln%@wioJ33&fx%saIdU8qpY)u!#8!Nd{aDHqW8d@`g@io_u{^dsrz^tjY(tGu*Q?)4_zS(x$i6JRcUTTuxTvZHLtPiMyN#@!?_RERBvu!wX9&b@Oj2oj>dhVQg$wr=JnKv=XH4)0uMYbnca5#Q`%9?=pi^Cv7`U)!d-jr7jHTq0gg8W1JY;Q4)nn6|)#l62WsQeHuBo_c7B_H2qV`e-y0$5&7vYdqTj#0|%Ak9QMJcP->KMvI+SK?eUXfc2~@RGV=+IPl`t;!t$ahygXeKO;;`I0-4UWIeRqO9Vk=hF-onMr#qg^)?1fmU)MgaJ+5Gbn}J%=QIX#OWegDTqas2==T*N&TDW4D?Hv+I${J64wMiN`k6oY=a~?Y-0=LTr4-W;+K04(kvGsTZ%JV+JJPJv|i^_|5*6x7OWq@&#ycng@rVV+M2V$(aV3A$Ui{-|s`TM4g^d#xh<`TwiUayhQeS64VYwlX~P(9my+Ps^mz{ltWZ0@27_LzY^Bk*vkimMs4Sa*5YE3#&|rpY+}EcsyFLSqhFo8(Cs3oj3|NKPEP?ps#r+K=H+eHs7SRhB6^1cp60oz>45tY-6^3r#jPpoCXbf~PC3YpReWzSONu2pGXHek*h9^YgkE*~!ES!Fggi&r-ZQqgXt9Yu8^sgbLKkqztFs;A`-d_InFvISBgDjUEyWpuIYCq$35M_*;{CHErOr%jk9dXUkNS)vSyQt86(yPOsiP=bZo%&XaBBx>yd+eykT|*KggrDVJe>8v~&GXip~hDRu_6HX=ANu|2)@WwnR)ma~U3^nSEb9zohTU~4=3E$PO6SD`s;oKPh%6wC1!sLJip8mj8(23;wH{Ru%}3(>?&oKoonMHmqwdO$M2|gtF4IJoDAkU^oGO5dyXkUbZ!x-v1#qAOQ!N*FXQl1FPJ?8t$AnY`vv`F?gFqqOsb!q%&$19b^$jq%7!8DN~^s6(3$50wx=G6)D{6Z+?rG>!cB@mjY85{w{!No(U{G0g7VaMvMAKlID+9?Y0}3MZAx2w*
s}jf;BvGwuLTxxIPeAZt-ySldQ82`0=0LarJ5%UA@T+{Pv8)7``-E8ejAK{9y3ofMdEa@#8dfM9q(95%Hc^uhCK-g}_fO1HMJ?Ho3nJF?5NUmWw9#-)^q8Q}W;L_eyus*G6Dw^UAAZ0uQs?WQ&O|VnuLiGA3!L?Ct(o8@krbz8S>Z8Gm(yoeUhWbMm?!kGYu6G`>Fn65G1=5XLgj%Zs>xTb>RVBAL%!h;ru%Sy9gzdqWu4I?~iz~Omxe_n&U*T5)A;9}U@vWN?p2-at3}aJAdZ&&;e{SH%j0r(9<#u6sxCY4di0;?zgTpG^3E~QAeGK+2RAx0)8y|uLevrhN48Zt>OCgK&nl6~Ao()x5j@Qq?NGqo{hLM8=TYNHm5Bg_B|0~%2jk#$NE?=}c=Zz=Jucd^7K8aiU4*`ZpTojlV;R2NO_A($HRqap-^J2v$*bY#p41l8*o9B{Wz6n>a^63=GPCR`l-;FN|znZKS<6={mBoN|KGFAduRxea2LWYqaW$dwIlT}=k*bDk+q3$9D!Nt>{0clCfm1D|4tA)y^VqbXjuVqQq4ZYl%xREc_abpZXTV>5)DH=3G$YAK;;!S2=6;MO^AXdr{y=@%Mn4kn`&%MRF?|2~CI)#`mT$IqP1|4E0J#3>2b+msw5bqwg9o7pxlO^o)LcYRq_)hpwk!A~yZ7yX~ShAp+Lfo=O)|gqE7nIfnwAhkzz2L?Mll3f1iz9|t)PLWQ#b5I_Omeu%-cT8Y5NXMyhWa~>sB3ihDpnOsyuOM%RFMQt*xDmEI7iTXw~9hW=ZdCvrLvNp#+=wP)T5xo(tAR}DliaZwfe5^lQv@5?--H6C%biy}{&2Fgn(JQ&R;>B@Vq{QC_jfS!FAcIyqFkFV`JHX0PT>&sU_|g12pO%b)&gp*l0W@NZ2vjDs1jQLgOE{Bo>wI^x|%5LvNGDX;jKc@g)5uM5h(qn*sW(Xh}qBvzhD~u51ns)>K6p5C-*TWy|!h91Ad}2rzAL3n&>0g{ajEvF7eDjF6_bNG16UxZA@}sfEtaN@s|$*OIEaIq80W97&?&NN$9l(_(l&UsDn-`DN?6?4&2yp_VUS%Q1!&{q$e%*IXs?)hr<#y~&5INgSr<4V;W$GpIrxMVU;#uxo1cVby3@10IED9{Kz@);;s{ppo?%F3mNV@HP_NWkuqnIOEmJo0?ARJLfdZ;BgKTy%pKqgt#4aNXyShH==Q#X6SPnODH0lQ&pznRMi8!)L}1)|}=L)_)9SFsd2HTg~;PGV?uttj2zNP-2ImJ;d7UT7zOA_&aJ|ELz*m?KU|4o28YKHG1h$?UFrAv35XX!GLqId!7kxhkD#uG6tlk3ArhAF)HMr3Mk7*987Zm?R#|Hct>ks!6&l0x;19mPF+S-AQ+4R|#SIW0tMP5Yvnoj?4{`r!9|Y&AwqAySymZBF`wKX}!&J^_f^Rrg2ky7;!<^WHn#IA%jL{b0R+3#{#O1olwR|YTL4SalVfQTx&PvU>^4*v|{FkOtC#CxoEL~Q2e4$CRDohHrt8M$MnqxjMDfx4`COOSGH}jK*Izo$6Fr=oo#fivSbl=p>IkGp!#N#MjEDssSAG~I#lY~(;o;kZivpkHYD-scVqbUbWBCnralq|H6YF62+89WjtV**APm5`<)4mO9h4i^$piqI=QO&?NBHGQ+>k#0^lD9N$E-+-S=+pEobwZ>A~&`tTT}=Mllqe2ZpA4C48hYcID9zmH*_H4yab9um=CBmFWY5m{j0>X22oDnm3AZF8_7vIJ394)5Dx8=kL>T(WPzTroTi@l%=iWer#H+<~V`+{@c^2u5W!XFWqvrg0I#M4^6!*-d>yp+Nh6l^YbS=b~9nBK2Ap4$Vme22KImB>k6M{qKfCFSs@A_^+ZEe4jV`a5q++883Dc$}Zk0Z0ats-D1P}mFKMO)>muLN{jX>vhF_#K6!~xR+8+)zh($Ahux5~oQ{p|6nvmNaKZqyU$UQUhH6|wH@2--gdAfgt~>KGIPLf*x%zzW0U|L3N4kVEc*D
bURIYLGYElAI%tU7xY@zQ-G>pt5)s0%Yo0ianMYY8MFFtUTH)Ug%go^yL&p*m__vw=Th0MGh4Wz9wle>TOzTw@u7Of0dBMm%#^b>pH0P-C%hKt5ffD7nX~pBq?s$#Xm*M;I+T6D+`v0r_A4WIGMoQwLyRKX}>}XMaZQzJ8$S8T`BADsQv9}gY+1$a0IB?|+J5#1fqy&o5#=zPn4YQ|Vdor|;-$h0`kJU%(;-$-=LfvUMs9hSt~7Ae&G7^v-W_VeukS*pSMJ9?JU&SQD{#W<`n6@0K5-AJ4Pf@bN`;-z3^ufD;ZCRo?cHOSU9|8S20IS{_Dt&`^v50OotL(GTd)YBNu(Gs+>Q4V}*_uH2uQb@O{@2&Tp*P?aNo(W|NZhRL@xSfLSQb#%ys+;TwZOFa?eTrRw!HA6>>{Y4G#!2bj-3Azj(p>p%$1=fz+q(q`GGKYYUV-BR~sZe7;JGi)^HOC>Q+5LfM|5<{xRDzYRf3YSgG#Lua*2Okq$cuP+ba~e(duv|qM{Q-v9nPB8OMFI;7Ur2pX}o{J&A2gWZf)(U(7qUWROS$jf6`5d7*`tZPwewI~}?V8xa$CKg!r_&aIGy-FzKW9DZFdNfoa)6l^OHx6MWB385$6w6yJ`ws+MLa6q{9-KJAp{R4K`lZT%gk}H19%-U+=XG7JkHsnHKX>@K$l^rZ0zH9qVZnSYwmruD0mpaS=NAtvesiOICp8F$NThpX*%m*l&^Ch2l0t+xy;j9XgCqJ;FHW!G^2eA&HYSnkjMK9@P~@Zw$sZ-W3x9mq>oDi&S5naK>G3d<^0`pf-HZ*QWsJ(nqU2$5hn_j3ktM|{iQCKKjYRHtsAdP#b%S1Q(m|#5cN)qmKleY>?MTaYpMk?`Th$9K!hlIF8!Z({*5nVxPEZ_RHIN5=!FZJKHci()^sT8onqJ@o9RYe=$dXKFfgVy?0JNTYZjAP|t(Et"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) 
+exec(L.decompress(B.b85decode(";KB4X6I}o_n@VT6Qap3bt~@<3h>ok~)Km^%05+gpp)?!;9yG)0b0NuzL516!6&Ns7P~^TZ@KIzs7wEfONM{W@?nrrt*yA+xh!3WNK;Vd6J5RKTP{GUPI>FaKtsd3?vRpKGVSn&nGmA@ooJ4y}H(((9i+TIvSYWj`rZLHlguXl|qk`oC$N)T{PB)|Z!Hy~=6P&vm!2V~EZ{!+Jtk*7JZCC&6aP_yxnmci)j*dRQrwvH>M6gVzb?F7`@cMBA;iP6ud2RF`q6uAy|H@qPG9rE|2A25aLQxWg3I<)-$WW2Od0Jp3-AUwWlol1aY>E{a;^2DjJ;X!x4lS|F*qG!OUJ1RQH+-6yA=6&VwFwVeTWra}4ZYs+a4VRT)l=6}#f8|DpCiUOq%S7LTTi&*z}330%LS?}B~HN8HJjyLrZx;F7ITeD9xP-~tzXz8vIO7t@*dg#1*`)yk;o~1L%!VrYlF@2XoUqICglq)J;&nc9eijv2uv?OI{e(;;4BsYPvA$uW}iA6CnZX&#oM*q&6;VTvOB`vzr#SnWy3?0MZ@O!B54LYrZR(UJ5XrCkVjyPXNIuV_{*H)rNOIGM8>>Az8Nrgm$=Eq#hq|~Uq3ku&;VWo?@YBt%|oSP5kt&sL<=F2R|1qZ>Vn83H0V<9&87pZ5AuN(HD(e7xB%H<)(iMN<2cUrXaVM{@ZNT@}7x&Fv=pt_&1#yxFkFrk(Na~b_We!Toyxo}T)(lx6(0m`e&tp9bOsE^n)rEMZw(KMZVbe;N8Wfpr(Ewn(&(r2AtMZWH-(w|>Q2Wrsdgg`63KwV7-CT|Y+*;m}C3B_G(8PX#UZ5HC2|-4l^|M8_HArm|Q4!&vdz~qB9Pt{!JP0-ssk^FnYKWdBL=_qQosti38*OL(zMN(S@~tDnpv*nP;WSz^zkNY}T4!M6qbW$H7~SIb!8qgNj(7pSa_;3OrxhiVW)*+^^??gB27o!Atv7aO>@@`~EI`7h4OvfoQ6562d1UHe8lX@((BZjUb1YZR%;2nultmJ$w_)MzCcJ-0pI%6R<%-c7G>rsm0SQw^$Ezq%$z<9tTkS^)`2PuGZd<_i(mI=>r+wQ`qBCb)838i3w&mAsY$*VCVD$HfDKn_;&hwDo7()!co_{-hHOjrTKX{v}mt~8h&7}YA^UMT7En6{BPrNk-`QN$2o?_3#D7Q_6GoG;?zv(r)7R}WFGdD4RrvU%tpW{HjLSizGW4_m`0=!f6+E;+F67>+=zgkHh%gXqFzk7m6>%-1%;_JkQy)D%6!f5>3f^!Z{?t_me(L;1~i)l_tVGzT}9VwZwKSnIKa;aZug#5$HP#NoW8H^h=0bf+vM2+^Q36$6P#mr>sYr=xdM$RVxSf1M%>LVEI+wWk0d7^LO*sVOMuL{JEy4W5>ic@|s)QZ$Gl(KP5Q3NqNzAwKXvdoL!x*gn%-?8%Q_5l%E*R;pA^H&1f9`?~HUz05j8i5Yi@$#`x?v|6gvrQi=p>`W$r5fLzwrWbFH~?ES+Vsf?Ce-SlqtCfk`W?gUM0y_rXY*=r7vWG&k7Jd+-XS11`KJ#GTR!VB7|?mp7HN(j-dI2vUe7NjWR#!vai0}Xr4ZR>4O+cF)sqAW$)MQ_ms<58pFYJ{SuF~WS9K1-G=t+O>={Om=(xvP0^YJJ)RQe{0=l{P9$n8q}aaA*lF6%Mwtz0D8N+$z4v7ExAeBXPlK$*O&-JteZI5#o)g>Mf|93n?3Bgr(+=c(=j$+b$cmh->V!E8yp2ELzGkRi}bRwWd#Tpkt(IXJn!xff?~qfV{Pm~(wMcgIfA?HxORStCzcd>AR-bDyQODulMTOZ2$Ibqd-v6kItwD4URKEd3|NP9S4L(l+&o~w8xR52TyFR*zm#E_-ikNjaZSP%Y283;85`;gy^J2*5Cwt1sUlWLX(U-=EDuL=tVvqK5OeXNTYY~%T!Th+^(INiew=pG4|>>>RLzDmcrf62f;df_!%}Ir@!bt{9pIRRytM|lExcY
}g93+YnhJ-4KBfyeGUp>{-pyZ%ZCo9b5SXZGR%ZVj9qm^46l;ODk&Vr@UF9)b?s%bcT89udYruhG>lC7mchVDbG#Ju=v}Z;S8njoXHq<$hHD2;sUhFwOH>4DXfy|S!3C^7><8Qy}6SAk|@kv$!y)vM1Iv|=kZFAY?GbbL6hCGqbaU#ITVj6bsZpEA}@NxpTvT9vGNJ*e_syDj=@McyQh)Ps~dQ2=)5wCVfT`eI)```4ZWANrp(%b?6g0RVza8w*Cuy2KRKnOlYTU{~(lt-?5KO`@Ut$wmT--YT{eKI$IXg}AKG9?dR|)e_yZLN{4vf{c#_K~bzt#w?jBGI@wilGUW;SVX8nw1p)BP(%e6Lb=l)H-^lhhPv5#TDp{pHB2;=*RsmC2=%|b!la3{YPKwVceONH@!a}?1|yj&9LhKTeIfl~z{KAc!8TI%vjVDiWzBHu*sOS!a1cQo9f5htY60wEd8x}v{X@}s4)R{KQZ*+n3KpuJI=So7ne&g_10i8vr5IU;d;A42zLyosV!=_d-a)-`HagLkN#a_(}9ET9(0x9nDM{r`h2&pJod^0uJcH4sd-|ig2T6*B;FWXNeRG43$)Z8oux!0-G4Vc6?F#H4bhQDqtwDOOBe(N6mdcOT(JwvbA{+E|pC$s_~GuPN}3M&{#|hW1dFb~_z!NEy>ib_u+*s=BHhOEj+T9NYokjh`;tyPYOGCdpew|IbVJ%kAL{op3>_ISyr9<@>2Y;hC@u)K99aYl?u57eRY$`fgj}nEpMcixiM5@dE5+2M#~qW@Wc`j_G8QS-XT_1%RD)55jQgRWC*7~Y)|ycBxb3Il2+m~d!5_DQ8>?JY6-lmzf59dgRe2mEuSz%g*&$ATN#t5@kuUc$n?j;Y1uJZm}Iz<(Vj3CGnNmmK3Xh8t_L(in6xLvKmIO(Vug2ang*R&{7q0Q6e@q45@7d8o$Xla%5xp6H5vLs%%tKFa8;cM(H2st{w`Zjep-E?F8p{6%Kbt|L5fhFsgvFDw(SQ&>$sMjb5a^pZYzO`fNzM1G%Ryp$5O*6KLQd4E7edPuZLgQkq#Yo7-So@R~CV#${zUPB9@1XS?7C{O07jaH<9GzMKmO+Y>mv9i#1&i6s_d}?SWLGYt4@cx=n=RE%)nw;|NfcN7wHkNj(VvBvzh1B@D;fSU!xQ@COGo4^AP0Xw=)ansgiqVv3A_gwB9Hsmt(p51~XeGGCQm>YhHnUGID~8vN?zH084AW}VG}G)z)~TZ}In2;R-0n}wDRl<>f45D!T!p6t^88HX){gX75BKq8bsLiEChVQW*2dN8ITj$Vw-k!(ECnDU~KNGW&VB7ge2C;e$+!DwJ(;4f3Jckrs^veB5t=d#s(Zb5y9ER|lKQv+$9MX}K{INVQIDuz)fFaQe!9?<_vRh{}sbEX%er49niQPcyK#k4Tvlg{}Pm|SbPW+p{`cS6fW%p@09xi+aM@wOhIDzk@0(p)W)tq1c_9f}6d-S;*K3EY!__^3}ypJb14Q|r4gH!5cJ5fCDcE^XBq`)0P$f$Ssx34`jcsotJGF~oBr`lri0J*zLt5WU^K(d6i%`jOpKZ>RWD4CvokOGA$Dbc6X)EZ&U2+qj&B;DDSwE~~dqKaM45jc-$m&cN0l=ZCC+*KXHo@6=X?Mq}oK+1?sydMp6qE8d@bT?PpWf=P9d-ewGu%l(m0hnx-*&F|E|ZP1t~WFV7}sme$3;284bFB^`oiMziX(jn12$@n6Wq5F>~F=<^Vd(Y2ZxI$WZ`!^sN?$e6Mcd_n&}a7Fd2>4&;rTdYbpE4gIMGEE?Yo8;XFGN^86aA*a!`^N5TKg9v?w!6C>v3RxQOdK(RD_RINN6HudnNvXbne_km8IUrBKvQhhxF}~ffM#6myHpwVI2-57(vj)YK6xJA_1}DD_J(+Sga>Qzf&~aXt;Q&$cd%`#RJzL{(NOcum0*qpX`6|Vz_F$Qahpk`jS(dMNxbQ|NTTkmG4#z^x@79(FW8mQ5~3@8xDsRma!JS(OxMn?xb@YmN-gb
YZtYP%60E*W=cl~-uHE(U&rf>@k8&JO$zBa};L#;c)@QC`H*YP@CT?!rIy0~bXG?Z^$!1-aRIin1NMBtT^hX{7DsK%ceN7bB}PHAY-i%v0sKREqJGFO|B2GCsrh5_`x9cxPPo9}jZp<7r=oU11*Q^|?Q-z1?s_aWOPU8H>}hI1z^A2hwxQY1q!H!`=x^qABR(GK6hswE6=J%xie^BRI{vSdxBTKOPPoT*-@N4?t(8;6F*6J?sHqt=SoOQYhhfy%s?cNFZyb%a<1Krb?WaJnp&9MH?5#4VsL9yhAHYOMjS*dz2e-NA5aDnJtuBB2LsAkq+t672P6fX+zZumsD))>E5OFQ}R(Q*@jPYz2SAQ3&=?D8+=(52T8-y^aEl721|>-f3>O!9DG-BcX8*4iQ$9-w{6sd>sp=TN{&kEWywT!?*<@sz`yx$=v?}t|3V;R%FPMjqS-$U+Oy=~deg)6kv{SoEP3-#ilM56ir1QJoiDfKdoGjy!6q?}U}Oifa~dl6h*C8E{IEx9;K$K{*oD_MZPwRB3+MEaEomnphN>gW%&p_5xEsqn#(62m+H8;joX417@TygnW}c`!u{{K>^cwq$Q+KCtkJEPyNnuN^W?FN9y=ze{p{JSHh8N=aJY_v((01-%VZQ{n0#^dk?tF`qco*<%8%8^e**O5QZEcQE}=FC+J>;*dG6h;|hDOMV-5K2LkR4sU!J=ZVTP!KOU1C@TgFBrXTTyoO}aZwkE4=9^+gVnpCICAv3DKtcq!dV#oHsj(VX8rM6o-DO3lCQ|z67$eFQ)a{oBGNj~RoqA%G6K7?Jy+|AMS8iNY?J=hR|#^qpk8cWu5~F<;EeW8&!&tfQ30!RGj-nY~TNTa5H{ih3L=?v}@3eQ6>X%d2_dA3Frufw#DTYK!h3kRLvnXJvWu{<4SGYNFPfp`FSGIjlw(f?58`P+a1mfS@oeU5=SviDWY*>#S%}&%rC|C?O+NUzjD0DhCL}&vpO{e?L9{vka7|k%0D|>;}wB>&-_??H>n5>FIErG_eme@yF_DCIXSSxlLeyR>G+r%Xm|f3J%zm#$+NENIkVB)I9O$ZK!%cfaNp^>rNi`a5vQXYPI2zWXWFLKgMg-OSl=y!K?=0KweG^TtZ0Q$x$sN!-p3LL5>bDEBvW2{9yrAf&f^2RJLxY1d_j^L9rx|*oxW5P5S$30Rc?RX{g9HU}?MC?Mb&Hb|+DS)g!>03x2pg{>>n^1o^A_HMypsyRrj)+mfwVr+7&j~?ijWt6MJYBivl!Q)J_`Zv-Zfp~~W1dEdd8d}B8oen?&Hw`1L>lR4k;fQ($6#j12^)VwWB8=p$=#b%n&AK_sU^<``z$+qnLY*OJx6kJ`}Q}?cKsSHE{c%|)8OSN%1FFURH?S1}e+39sKVlk;G_#xbB82pB(m8JNDXrL^2hkKzz*b|PegI6lAsD;tobQJ>nSy|`-$`4DlK14H;&HtbtFeNK^SeiJ@@zr`y*Hq$ZgC6Y`at!r3XO*Zo_!Fp#IHxJ{`eEsF(dKMAbSmuvVk=$wYc*K6OL^9}GERi;yGFPP>l5L+`Z`V7OQ?AO!_*v#2%jhZq6af6Ejo#525%Kq2d8Zf>;IvdeUV*s_^-{r-iO&yxvnX==n*=$_tKQ@@$B}6K|Ks4acPI$J#VFc<_@hr}$0WvxAdhFeK{Op-6?R5yLbPLT5+!u)BQM<6%UW;XbQL-oy+!-9`ijAIxv%91cN~y^W*L%1?sw;+KZjJq;=@tWDi+Q@C*F4sG7HJBPCfqzMHnAK`-~IG-z_nV!jDfRkIF%&UMu3c-=IN&r}{T%t^d?}2X8H83g3!{@}X>Z|IQ)bCyCkA^5mN1m4a92#M30N2O0K_+TZeSV0Ca=E@Gjjf#z*{eDE&U^=CV|)pzH(;PP*NrK`AOxvJ<2|V%j=5RKUrbp&7yd6gpDTGf*a4?mzp%9w4iYW{d;ug%8!xn!d69DeI^CFd+)G=sOw&q{6x#*9Fxd-HLxBFJAjjdH%=o{pTMViR3gn44{hPQ#<2Ur
W78nX54-=@Fs7C#CF{Px`9OX9eWI(Dd4MV)OFm6GxTPIocB)^WN-ugVxX4U|*Nt}dIt?LVUpC9`yVgJ(<=k#mr*+dSXWE0QSi?8~Shij2qmZVv_o1gc56QT&B||!@p?rAHW~Yvg)Xfo&T+|byhzRclP%vP(YUS@4c{2=Nq|^mZ;dx4kAAIW}3PKBhB7AO%x87?vYgI

g$!SAg0K41tt&VXtc^jkz~l^#UWHj@xhX*?y+^)wggi%V2}dp>XeQ?f|*tEYq01vt;AF&wWV20Zl$?1pH8E$Lm__hhWp$8#Rq$8ImK&~3Z~iuWSb8qWju{XLO~nqYKs?7!VBMqv0gNwMfrx{WpU{FC{uiVzAo~?z29U+6vIiB>Q#HkGARp0t*1e3zi|*?cJ1Ie0OnDnxO(Y2hGs-@A*;+py!fZZ8m7~ukA4=W_|4gC%PCYe)%H*2A2V~F%}z&AoM~bzs*zx?M>jnS{ReMlDP9H*pK_U^aZ?BxEAzE3^|eu{*(De+@d9d=T?4Th`S9(LmZ=NCp+)cRmG{4;{`45P8=?v;Tw2Ah>1KwIQ)6Ys~_O?Wv7;})7Em7eIgFKXl+SMwU|GfRYnQzA0U6}Jeh)XH0bZl=)uHnNp^ugw53U!-0LRqeT!ULQQLYcLDuylAezoa&5K9fi4sA81EZBMXdjglZbS|-jQV}p_1oFg6rorM^Yak-;?J@tDuAx1i>S>J_%r@plgQDMTdi{ztwLoM&t$Ijo0FJNYq4G7w4QSya`M9e6y90uQV++77Wl?AP6TJsA|45Nh*(5n*xvf5fI1-+W&l?i@_11>Nw*eteB)K82_P8R$@-C{)nRYm*5aO2ZJ489;g?EPqXR2PiW>KRc5g@VKx}U3j;K0O9?^!Hgg9LI91oA$?%U10OHrNH*@km?p;mXl!SIm`6qYs8mXdaalxI;dNfO@O%_S8+RgXBf@toOSRqiw~P7ICy5Sq45OcsxI0SHs~drygWSM_4vzi75yCOT~nXEV;?d^Fcq?3S$n88CM5Yp0FhME{oBzs`}K4zxnb~JqhXngh5nP>MUxzF&b#$raun0b9lNUeD6*^WWy=~tmX?U1F(pu=VChLJKU9hNk3S6?)~446I4YPx$iuk%wEE{ON46F(!a$^I`k7@=ln@x+Ve2@zZmd?3YD8c;nbWgkjDFra9`ydA0QWH!#~wcHZb7DV0q#3X+i$%3DCXrKc%;IAzlcL22Vn@E0ceEpAV(`fGIhC*OgbX=ovr?tpV&0TPeQ4=xD(3U-mJu9>8z!eN4o!Uuioil%x@^lN(lgbQzj6*+!5SF5GXhTrGWpBqmJJBM{9=T`8n8!gOcQr9mFd&yA-Fr?EK;ndbFKZb+s&CheLS-edVp7l0rlCHlp-ecNIni`N~s}Ht7A4yu?1|bHNItW*OI+of-T3O|QAi?hnUE4|7H2uSMTYbD<{K&ezOf$f1a6>}Mn|f?M_taeV3zmg+LdF#_i`AJ?%R7_rXAES+4N6FOg{+6`XV;rtthIk8e&ctvJmc8={(=}eiV2ztm4+0noK-+K0lm1J+gNky_4V*TH;Y0?^mUJpeMaLI?zhTv#CD|h^Pp=n1V;4mo^I0&iUanN_Ak45EBL;*eVrezekEtru`??Tx-`h=~vxl!EcJ)B_m{xGnV6w5-f*xS9{K#Ed<^q>L#&H(AbkOU&?SMne>LP=N4c!a$GbvSLAES4;Lnx_m@n~2OgL^zp(y^})x8*?BX!FC#BZVV=_j`oJBiFr4m28X7MMXgS7*N1kJ~&5W_7kQdn(FiWX^4rbG^B0|CaYS;_@QK=lWnl9kh8HqTQdqW4eM-y%2$xqD^HPePlEEPJ(6Q_9y&Pvnyk2>6n^r7Z0hWuk~LeD_?QldLmq*6mZ*)&YR}Q(^u+n4wMMF^a<3hsaZ++V#$P}pWrF^b3&~?gP>2?gFL?*b;Atxvv6OZLvPu?-M&|UKiT5j}I9D=FqY)T^=@=W|)I!%QQu^*Gr+-d(fMbiDdi!4=ECEs=7R*jRZ1rs>)eWPX@FmpqJg9W8bY}to4_#54l0dum({u0NIENGYYs5nEp>WKaKLs$i>86t0~Qsbb;T>aBK>s-{8A&nNky&R~l1clsLyr8O5l_%%4Yv>UMR@-1ugMi0nXlSH*R!NmG*FQcGzq-%D)!&L`P?VoQtjI0i+b#Mm~^7Y;1?nvUe^joa8vl@EYTk<9q4du-8PaUf}o^u}-$KXq~$$_KY=wv>
9-J5qn26IV*b2UGec@yCe4T^nLuY2`Pwx!ELAn@UIvlDConV9!Bm=)6atE|eW1LJq9#HeAZ^`OwTpf@)$`%1zSdqm)_xi~tB-1v>{LKP+93jHNJ*$?ySS%NqHB7VMU}%UoYNjk$N?9$F$o!yWS<*OsGg6B@CL`!oMbtI;%aIue8XLtNHELrOfC|JDq~!BHCGN*%1+S=Pe-SX$jCI=xvR`q3O!k^NHRvS^;Q{Y3J3={jeN@NPn#i?uW8O%~6S&s}bx>e}1DFFPzp{lO@Uwud2O&0EgOd%ikqQ$))=K-R)xJm4wJ?5NLjw@+Qm!C>nDV)a*6eWcjWhOed7!FT7`4W1z0&2BuAo8ZhiD5+3=IhjaYR+CY4151y*dP>h;J$)ro#~gt25%|D24%$vCC2B*lw#<#JrPw>)YO{NBscftGq*adJ_2A|+JhJ}0GO6qVmH1_28iK%#5?An}3U3xtkYmN+20mjC(;cTs&MCS384C22z@hxz#lz+}+W)p)F=wYDMoXUBZSJ`y@oa=)`Hn2%Jn$Eh-%c@(U9)H4X=Ib5~skf*VONrx4pa4lchE^aXZrHm=!)~oEjIzy@GGb}ameJeh%r#TA~yB9{UbbJ-HtQ~J&1+{eg-Ka2^hzJ>@)!a6kqkmEQIg)COAbArB-n}7h50!un;8Xb09DxjbpWHg!)oxBFji}6Z5ZPjnK;GDIisSVoK-<|Ou-@)?=!r01{XxOrZFO*%^wf{tH`|lY+)l9-=r0|aDF%vFn>`})MnW9-10W|qQda|c>^Nov-hkILG4{n5a0)Qce-?6Uj9*uj)8E~H#7T0VtJQ&x8x&kSXRlXg>f10|h-9LqBrjN$N0DajiyiM(zio1A?pbDn2fC=j&paG-^{62^~fDS)wiiOY{B058Q($aFXQrMh)`kM?C4%u%f5->M}ydh-yS4sNbzf!%uH0%m5=MaIUX4Y{S*R!Yx5{o;YNaBIt0VZGY45+9Fv2z?1aO^-PP_HB%8;-JnRvPDE^o^UN@BLRS5C)GV1>&q`g?$An1<|5#hf*Z5)XtWP?wbEL#C(~P&7#XxdoB5>;&d&)EEbNt`A1KUw?0Lr9(;av>2yeQU1g%3(9Ybo(VMn0aV6N?w<{FTLW*KJuS=W72H^sV3sPF(&{HSjNFgdYVmZG(F**rmn9h$#6{cx6_B5&+AOh9LbI@J&h^Bq47n~sz|>sa$<6P=F&j2i$rx#qE!k>$Gncz&)mbk{#i|7rAun?iu^jfDJgf>sH=b7POJuqy{CdmV7l3upi"),format=L.FORMAT_RAW,filters=[{"id":L.FILTER_LZMA2}])) From bef8226ed9e126476452951eeaeb05df32b8b138 Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 12 Apr 2026 14:47:19 -0400 Subject: [PATCH 4/5] Record: SP8192 + Depth Recurrence x2 + GPTQ INT6 + Score-First TTT -- val_bpb 1.07974 (3-seed mean) Seeds 1337, 42, 2024 on 8xH100 SXM with fused-softcap-ce kernel integration. 
--- .../train_seed1337_frontier.log | 148 ++++++++++++++++++ .../train_seed2024_frontier.log | 148 ++++++++++++++++++ .../train_seed42_frontier.log | 148 ++++++++++++++++++ 3 files changed, 444 insertions(+) create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log create mode 100644 records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log new file mode 100644 index 0000000000..b3f4831c8e --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed1337_frontier.log @@ -0,0 +1,148 @@ +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] ***************************************** +W0412 17:41:11.842000 48239 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 17:41:11.842000 48239 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed1337.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed1337 + scalar_lr: 0.02 + seed: 1337 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
/workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0047 val_bpb: 3.4867 +1/20000 train_loss: 9.0080 train_time: 0.0m tok/s: 8336072 +2/20000 train_loss: 12.2992 train_time: 0.0m tok/s: 8184327 +3/20000 train_loss: 11.0456 train_time: 0.0m tok/s: 8084574 +4/20000 train_loss: 9.4139 train_time: 0.0m tok/s: 8030457 +5/20000 train_loss: 8.3296 train_time: 0.0m tok/s: 7997738 +500/20000 train_loss: 3.3332 train_time: 0.8m tok/s: 7731821 +1000/20000 train_loss: 3.2115 train_time: 1.7m tok/s: 7728010 +1500/20000 train_loss: 3.0985 train_time: 2.5m tok/s: 7736121 +2000/20000 train_loss: 3.0193 train_time: 3.4m tok/s: 7741721 +layer_loop:enabled step:2026 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 2.9987 train_time: 4.6m tok/s: 7114884 +3000/20000 train_loss: 3.0367 train_time: 5.8m tok/s: 6727898 +3500/20000 train_loss: 2.9188 train_time: 7.1m tok/s: 6476757 +4000/20000 train_loss: 2.9547 train_time: 8.3m tok/s: 6299690 +4000/20000 val_loss: 2.8728 val_bpb: 1.1124 +4500/20000 train_loss: 2.7579 train_time: 9.6m tok/s: 6170374 +4598/20000 val_loss: 2.8075 val_bpb: 1.0871 +stopping_early: wallclock_cap train_time: 588092ms step: 4598/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80424019 val_bpb:1.08583141 eval_time:6825ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15975659 bytes +Total submission size quantized+brotli: 15992450 bytes +quantized val_loss:2.83421669 val_bpb:1.09743862 eval_time:8477ms +quantized_sliding_window val_loss:2.79040941 val_bpb:1.08047598 eval_time:88503ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78678937 val_bpb:1.07907426 eval_time:334602ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log new file mode 100644 index 0000000000..005879a74f --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed2024_frontier.log @@ -0,0 +1,148 @@ +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] ***************************************** +W0412 18:26:02.275000 60284 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 18:26:02.275000 60284 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed2024.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed2024 + scalar_lr: 0.02 + seed: 2024 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
/workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0072 val_bpb: 3.4877 +1/20000 train_loss: 9.0094 train_time: 0.0m tok/s: 8317541 +2/20000 train_loss: 12.2867 train_time: 0.0m tok/s: 8171456 +3/20000 train_loss: 11.0810 train_time: 0.0m tok/s: 8077182 +4/20000 train_loss: 9.4616 train_time: 0.0m tok/s: 8025433 +5/20000 train_loss: 8.3776 train_time: 0.0m tok/s: 7991921 +500/20000 train_loss: 3.3317 train_time: 0.8m tok/s: 7750924 +1000/20000 train_loss: 3.2122 train_time: 1.7m tok/s: 7739976 +1500/20000 train_loss: 3.0989 train_time: 2.5m tok/s: 7739565 +2000/20000 train_loss: 3.0174 train_time: 3.4m tok/s: 7741289 +layer_loop:enabled step:2026 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 2.9978 train_time: 4.6m tok/s: 7114500 +3000/20000 train_loss: 3.0376 train_time: 5.8m tok/s: 6727692 +3500/20000 train_loss: 2.9237 train_time: 7.1m tok/s: 6476233 +4000/20000 train_loss: 2.9570 train_time: 8.3m tok/s: 6300234 +4000/20000 val_loss: 2.8752 val_bpb: 1.1133 +4500/20000 train_loss: 2.7565 train_time: 9.6m tok/s: 6170620 +4598/20000 val_loss: 2.8097 val_bpb: 1.0879 +stopping_early: wallclock_cap train_time: 588079ms step: 4598/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80642814 val_bpb:1.08667860 eval_time:6803ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.8s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15972913 bytes +Total submission size quantized+brotli: 15989704 bytes +quantized val_loss:2.83555862 val_bpb:1.09795824 eval_time:8518ms +quantized_sliding_window val_loss:2.79237236 val_bpb:1.08123606 eval_time:88486ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78921781 val_bpb:1.08001458 eval_time:333698ms diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log new file mode 100644 index 0000000000..fe0e26d97d --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/train_seed42_frontier.log @@ -0,0 +1,148 @@ +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] ***************************************** +W0412 18:04:24.140000 54797 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+W0412 18:04:24.140000 54797 torch/distributed/run.py:803] ***************************************** +Hyperparameters: + adam_eps: 1e-08 + adam_wd: 0.02 + beta1: 0.9 + beta2: 0.95 + compressor: brotli + data_dir: /workspace/data + datasets_dir: /workspace/data/datasets/fineweb10B_sp8192 + distributed: True + ema_decay: 0.9965 + embed_bits: 8 + embed_clip_sigmas: 20.0 + embed_lr: 0.6 + embed_wd: 0.085 + embedding_dim: 512 + enable_looping_at: 0.35 + etlb_clip: 3.0 + etlb_enabled: False + etlb_lr: 0.05 + etlb_steps: 5 + eval_seq_len: 2048 + eval_stride: 64 + gptq_calibration_batches: 64 + gptq_reserve_seconds: 12.0 + grad_accum_steps: 1 + grad_clip_norm: 0.3 + head_lr: 0.008 + is_main_process: True + iterations: 20000 + ln_scale: True + local_rank: 0 + logfile: logs/frontier_seed42.txt + logit_softcap: 30.0 + loop_end: 5 + loop_start: 3 + matrix_bits: 6 + matrix_clip_sigmas: 12.85 + matrix_lr: 0.022 + max_wallclock_seconds: 600.0 + min_lr: 0.0 + mlp_mult: 4.0 + model_dim: 512 + model_path: final_model.pt + muon_backend_steps: 5 + muon_beta2: 0.95 + muon_momentum: 0.99 + muon_momentum_warmup_start: 0.92 + muon_momentum_warmup_steps: 1500 + muon_row_normalize: True + muon_wd: 0.095 + num_heads: 8 + num_kv_heads: 4 + num_layers: 11 + num_loops: 2 + parallel_residual_start: 7 + qk_gain_init: 5.25 + quantized_model_path: final_model.int6.ptz + rank: 0 + rope_base: 10000.0 + rope_dims: 16 + rope_train_seq_len: 2048 + run_id: frontier_seed42 + scalar_lr: 0.02 + seed: 42 + skip_gates_enabled: True + sliding_window_enabled: True + tie_embeddings: True + tied_embed_init_std: 0.005 + tied_embed_lr: 0.03 + tokenizer_path: /workspace/data/tokenizers/fineweb_8192_bpe.model + train_batch_tokens: 786432 + train_files: /workspace/data/datasets/fineweb10B_sp8192/fineweb_train_*.bin + train_log_every: 500 + train_seq_len: 2048 + ttt_chunk_tokens: 32768 + ttt_enabled: True + ttt_epochs: 3 + ttt_lr: 0.005 + ttt_momentum: 0.9 + val_batch_tokens: 524288 + val_files: 
/workspace/data/datasets/fineweb10B_sp8192/fineweb_val_*.bin + val_loss_every: 4000 + vocab_size: 8192 + warmdown_frac: 0.72 + warmup_steps: 20 + world_size: 8 + xsa_last_n: 11 +train_shards: 80 +val_tokens: 40548352 +model_params:35944536 +gptq:reserving 12s, effective=588000ms +warmup_step: 1/20 +warmup_step: 2/20 +warmup_step: 3/20 +warmup_step: 4/20 +warmup_step: 5/20 +warmup_step: 6/20 +warmup_step: 10/20 +warmup_step: 20/20 +loop_warmup:enabled encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +loop_warmup_step: 1/20 +loop_warmup_step: 2/20 +loop_warmup_step: 3/20 +loop_warmup_step: 4/20 +loop_warmup_step: 5/20 +loop_warmup_step: 6/20 +loop_warmup_step: 10/20 +loop_warmup_step: 20/20 +0/20000 val_loss: 9.0091 val_bpb: 3.4884 +1/20000 train_loss: 9.0116 train_time: 0.0m tok/s: 8266749 +2/20000 train_loss: 12.3389 train_time: 0.0m tok/s: 8207894 +3/20000 train_loss: 11.1044 train_time: 0.0m tok/s: 8094344 +4/20000 train_loss: 9.3899 train_time: 0.0m tok/s: 8023875 +5/20000 train_loss: 8.2859 train_time: 0.0m tok/s: 8003812 +500/20000 train_loss: 3.3346 train_time: 0.8m tok/s: 7756714 +1000/20000 train_loss: 3.2165 train_time: 1.7m tok/s: 7748534 +1500/20000 train_loss: 3.1056 train_time: 2.5m tok/s: 7753442 +2000/20000 train_loss: 3.0221 train_time: 3.4m tok/s: 7757329 +layer_loop:enabled step:2031 frac:0.350 encoder:[0, 1, 2, 3, 4, 5, 3, 4] decoder:[5, 3, 4, 5, 6, 7, 8, 9, 10] +2500/20000 train_loss: 3.0017 train_time: 4.6m tok/s: 7134514 +3000/20000 train_loss: 3.0386 train_time: 5.8m tok/s: 6743066 +3500/20000 train_loss: 2.9250 train_time: 7.1m tok/s: 6490513 +4000/20000 train_loss: 2.9582 train_time: 8.3m tok/s: 6313257 +4000/20000 val_loss: 2.8764 val_bpb: 1.1138 +4500/20000 train_loss: 2.7575 train_time: 9.5m tok/s: 6182555 +4606/20000 val_loss: 2.8099 val_bpb: 1.0880 +stopping_early: wallclock_cap train_time: 588138ms step: 4606/20000 +peak memory allocated: 39046 MiB reserved: 39070 MiB +ema:applying EMA weights +pre-quantization 
post-ema val_loss:2.80669912 val_bpb:1.08678353 eval_time:6833ms +Serialized model: 135431033 bytes +Code size: 16791 bytes +GPTQ:collecting Hessians from calibration data... +GPTQ:collected 67 Hessians in 12.7s +Quantized weights: + gptq (int6): blocks.attn.c_k.weight, blocks.attn.c_q.weight, blocks.attn.c_v.weight, blocks.attn.proj.weight, blocks.mlp.fc.weight, blocks.mlp.proj.weight + gptq (int8): tok_emb.weight + passthrough (float16): blocks.attn.q_gain, blocks.attn_scale, blocks.mlp_scale, blocks.resid_mix, skip_gates, skip_weights +Serialized model quantized+brotli: 15973010 bytes +Total submission size quantized+brotli: 15989801 bytes +quantized val_loss:2.83668775 val_bpb:1.09839545 eval_time:8495ms +quantized_sliding_window val_loss:2.79328837 val_bpb:1.08159075 eval_time:88058ms +ttt:start chunks=1238 ttt_lr=0.005 ttt_epochs=3 +quantized_ttt val_loss:2.78953660 val_bpb:1.08013802 eval_time:323884ms From 3706d56d671d1c2682e2979c7530dfd22fddf47d Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 12 Apr 2026 16:29:15 -0400 Subject: [PATCH 5/5] Pin fused-softcap-ce to commit SHA for reproducibility --- .../2026-04-12_SP8192_Frontier/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt index 94f74a9cb8..dd845cb739 100644 --- a/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt +++ b/records/track_10min_16mb/2026-04-12_SP8192_Frontier/requirements.txt @@ -1 +1 @@ -fused-softcap-ce @ git+https://github.com/anthony-maio/fused-softcap-ce.git +fused-softcap-ce @ git+https://github.com/anthony-maio/fused-softcap-ce.git@25e7ad6292cd1e837eef592f50e4d9f5990b6c84