From 197ba3a4a88b0fa406cd4678b7bda4cc30f65cae Mon Sep 17 00:00:00 2001 From: Varun Date: Mon, 27 Oct 2025 22:34:27 -0400 Subject: [PATCH 1/2] Fix Muon LR --- .../3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt | 3814 +++++++++++++++++ .../6c588921-a777-458d-8003-f608774f040c.txt | 3814 +++++++++++++++++ .../6e1efe80-8453-4ef6-a34d-8c73543618a8.txt | 3814 +++++++++++++++++ .../72231598-c098-4e79-94f2-26952a4bbdc6.txt | 3814 +++++++++++++++++ .../74ef00d7-4030-46f2-a269-bea707f0f0bd.txt | 3814 +++++++++++++++++ .../2025-10-27_FixMuonLR/README.md | 65 + .../f196cb62-827b-4bb1-94f0-4169eb1c9375.txt | 3814 +++++++++++++++++ .../fc12c205-f953-4028-bfdf-0519c72fb269.txt | 3814 +++++++++++++++++ train_gpt.py | 397 +- 9 files changed, 26945 insertions(+), 215 deletions(-) create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/README.md create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt create mode 100644 records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt diff --git a/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt b/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt new file mode 100644 index 000000000..7004bc11e --- /dev/null +++ b/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt @@ -0,0 +1,3814 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) 
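+
+# A minimal sketch (assumed helper, not used in the training path) illustrating
+# the scale bookkeeping in mm_op above: x is divided by x_s before fp8
+# quantization, and torch._scaled_mm rescales the product by scale_a=x_s and
+# scale_b=w_s, so the op approximates the plain matmul x @ w.T up to fp8
+# rounding error.
+def _fp8_mm_max_abs_err(x: Tensor, w: Tensor, x_s: float = 1.0, w_s: float = 1.0) -> Tensor:
+    # assumes 2D contiguous CUDA tensors with dims divisible by 16
+    # (a torch._scaled_mm requirement)
+    out = torch.ops.nanogpt.mm(x.contiguous(), w.contiguous(), x_s=x_s, w_s=w_s, grad_s=1.0)[0]
+    return (out.float() - (x @ w.T).float()).abs().max()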
+ +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, 
+ LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + 
skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must 
use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad + This hyper-optimized class has faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
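+
+    Worked example of the sizing above (assuming world_size == 8): the custom
+    group sizes are [1, 10, 16, 16], so the per-rank chunk sizes are
+    ceil(size / 8) = [1, 2, 2, 2] and the padded group sizes are
+    [8, 16, 16, 16], matching the padding counts listed in the schedule.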
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+ for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine LR and WR + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call. + if num_params == 0: + v_chunk = updated_grads + elif params[module_idx].label == "smear_gate": + # dividing by magnitude is equivalent of SVN for 1d tensors + v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + # Apply weight decay directly to the buffer. 
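+            # Decoupled weight decay: p <- p * (1 - wd * wd_mul), then
+            # p <- p - lr * lr_mul * v. Note eff_wd is not scaled by lr here,
+            # unlike the lr-scaled decay in DistAdam below.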
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
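+        # c_fc is stored as (dim, hdim) but applied through its transpose in
+        # forward(), so Muon's max(1, rows/cols)**0.5 scaling sees aspect ratio 1;
+        # lr_mul = 2 restores the (hdim/dim)**0.5 = 2 factor a (hdim, dim)
+        # weight would have received.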
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 02:17:34 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 40C P0 130W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 38C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 39C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 38C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 115W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2285 train_time:121ms step_avg:121.14ms +step:2/2285 train_time:142ms step_avg:70.78ms +step:3/2285 train_time:180ms step_avg:60.11ms +step:4/2285 train_time:237ms step_avg:59.15ms +step:5/2285 train_time:296ms step_avg:59.16ms +step:6/2285 train_time:354ms step_avg:58.97ms +step:7/2285 train_time:414ms step_avg:59.19ms +step:8/2285 train_time:473ms step_avg:59.14ms +step:9/2285 train_time:534ms step_avg:59.34ms +step:10/2285 train_time:593ms step_avg:59.28ms +step:11/2285 train_time:653ms step_avg:59.39ms +step:12/2285 train_time:712ms step_avg:59.35ms +step:13/2285 train_time:773ms step_avg:59.45ms +step:14/2285 train_time:831ms step_avg:59.37ms +step:15/2285 train_time:892ms step_avg:59.49ms +step:16/2285 train_time:951ms step_avg:59.42ms +step:17/2285 
train_time:1014ms step_avg:59.62ms +step:18/2285 train_time:1077ms step_avg:59.81ms +step:19/2285 train_time:1141ms step_avg:60.08ms +step:20/2285 train_time:1204ms step_avg:60.18ms +step:21/2285 train_time:1265ms step_avg:60.24ms +step:22/2285 train_time:1323ms step_avg:60.15ms +step:23/2285 train_time:1385ms step_avg:60.20ms +step:24/2285 train_time:1444ms step_avg:60.16ms +step:25/2285 train_time:1505ms step_avg:60.19ms +step:26/2285 train_time:1563ms step_avg:60.13ms +step:27/2285 train_time:1625ms step_avg:60.19ms +step:28/2285 train_time:1683ms step_avg:60.12ms +step:29/2285 train_time:1744ms step_avg:60.14ms +step:30/2285 train_time:1803ms step_avg:60.10ms +step:31/2285 train_time:1864ms step_avg:60.14ms +step:32/2285 train_time:1923ms step_avg:60.11ms +step:33/2285 train_time:1985ms step_avg:60.16ms +step:34/2285 train_time:2045ms step_avg:60.15ms +step:35/2285 train_time:2107ms step_avg:60.21ms +step:36/2285 train_time:2166ms step_avg:60.18ms +step:37/2285 train_time:2228ms step_avg:60.22ms +step:38/2285 train_time:2287ms step_avg:60.19ms +step:39/2285 train_time:2348ms step_avg:60.21ms +step:40/2285 train_time:2408ms step_avg:60.19ms +step:41/2285 train_time:2469ms step_avg:60.22ms +step:42/2285 train_time:2528ms step_avg:60.18ms +step:43/2285 train_time:2589ms step_avg:60.21ms +step:44/2285 train_time:2648ms step_avg:60.17ms +step:45/2285 train_time:2709ms step_avg:60.21ms +step:46/2285 train_time:2769ms step_avg:60.19ms +step:47/2285 train_time:2830ms step_avg:60.22ms +step:48/2285 train_time:2889ms step_avg:60.20ms +step:49/2285 train_time:2952ms step_avg:60.24ms +step:50/2285 train_time:3011ms step_avg:60.22ms +step:51/2285 train_time:3074ms step_avg:60.27ms +step:52/2285 train_time:3133ms step_avg:60.25ms +step:53/2285 train_time:3195ms step_avg:60.28ms +step:54/2285 train_time:3254ms step_avg:60.26ms +step:55/2285 train_time:3316ms step_avg:60.29ms +step:56/2285 train_time:3375ms step_avg:60.27ms +step:57/2285 train_time:3436ms step_avg:60.29ms +step:58/2285 train_time:3496ms step_avg:60.27ms +step:59/2285 train_time:3557ms step_avg:60.30ms +step:60/2285 train_time:3618ms step_avg:60.29ms +step:61/2285 train_time:3679ms step_avg:60.31ms +step:62/2285 train_time:3739ms step_avg:60.30ms +step:63/2285 train_time:3800ms step_avg:60.32ms +step:64/2285 train_time:3859ms step_avg:60.30ms +step:65/2285 train_time:3922ms step_avg:60.33ms +step:66/2285 train_time:3980ms step_avg:60.31ms +step:67/2285 train_time:4042ms step_avg:60.33ms +step:68/2285 train_time:4101ms step_avg:60.32ms +step:69/2285 train_time:4163ms step_avg:60.33ms +step:70/2285 train_time:4221ms step_avg:60.30ms +step:71/2285 train_time:4283ms step_avg:60.32ms +step:72/2285 train_time:4341ms step_avg:60.29ms +step:73/2285 train_time:4403ms step_avg:60.31ms +step:74/2285 train_time:4461ms step_avg:60.29ms +step:75/2285 train_time:4523ms step_avg:60.30ms +step:76/2285 train_time:4581ms step_avg:60.28ms +step:77/2285 train_time:4643ms step_avg:60.29ms +step:78/2285 train_time:4701ms step_avg:60.27ms +step:79/2285 train_time:4763ms step_avg:60.29ms +step:80/2285 train_time:4821ms step_avg:60.27ms +step:81/2285 train_time:4882ms step_avg:60.28ms +step:82/2285 train_time:4941ms step_avg:60.26ms +step:83/2285 train_time:5002ms step_avg:60.27ms +step:84/2285 train_time:5062ms step_avg:60.26ms +step:85/2285 train_time:5123ms step_avg:60.27ms +step:86/2285 train_time:5182ms step_avg:60.25ms +step:87/2285 train_time:5243ms step_avg:60.26ms +step:88/2285 train_time:5301ms step_avg:60.24ms +step:89/2285 train_time:5363ms 
step_avg:60.26ms +step:90/2285 train_time:5421ms step_avg:60.24ms +step:91/2285 train_time:5483ms step_avg:60.25ms +step:92/2285 train_time:5541ms step_avg:60.23ms +step:93/2285 train_time:5602ms step_avg:60.24ms +step:94/2285 train_time:5661ms step_avg:60.22ms +step:95/2285 train_time:5722ms step_avg:60.24ms +step:96/2285 train_time:5781ms step_avg:60.22ms +step:97/2285 train_time:5843ms step_avg:60.23ms +step:98/2285 train_time:5902ms step_avg:60.22ms +step:99/2285 train_time:5963ms step_avg:60.23ms +step:100/2285 train_time:6022ms step_avg:60.22ms +step:101/2285 train_time:6083ms step_avg:60.22ms +step:102/2285 train_time:6142ms step_avg:60.22ms +step:103/2285 train_time:6203ms step_avg:60.22ms +step:104/2285 train_time:6261ms step_avg:60.21ms +step:105/2285 train_time:6322ms step_avg:60.21ms +step:106/2285 train_time:6381ms step_avg:60.20ms +step:107/2285 train_time:6442ms step_avg:60.21ms +step:108/2285 train_time:6501ms step_avg:60.20ms +step:109/2285 train_time:6562ms step_avg:60.20ms +step:110/2285 train_time:6621ms step_avg:60.19ms +step:111/2285 train_time:6682ms step_avg:60.20ms +step:112/2285 train_time:6741ms step_avg:60.18ms +step:113/2285 train_time:6802ms step_avg:60.19ms +step:114/2285 train_time:6860ms step_avg:60.18ms +step:115/2285 train_time:6922ms step_avg:60.19ms +step:116/2285 train_time:6980ms step_avg:60.17ms +step:117/2285 train_time:7041ms step_avg:60.18ms +step:118/2285 train_time:7099ms step_avg:60.16ms +step:119/2285 train_time:7160ms step_avg:60.17ms +step:120/2285 train_time:7219ms step_avg:60.16ms +step:121/2285 train_time:7280ms step_avg:60.17ms +step:122/2285 train_time:7339ms step_avg:60.16ms +step:123/2285 train_time:7400ms step_avg:60.16ms +step:124/2285 train_time:7459ms step_avg:60.15ms +step:125/2285 train_time:7520ms step_avg:60.16ms +step:126/2285 train_time:7579ms step_avg:60.15ms +step:127/2285 train_time:7639ms step_avg:60.15ms +step:128/2285 train_time:7698ms step_avg:60.14ms +step:129/2285 train_time:7759ms step_avg:60.15ms +step:130/2285 train_time:7818ms step_avg:60.14ms +step:131/2285 train_time:7879ms step_avg:60.15ms +step:132/2285 train_time:7938ms step_avg:60.14ms +step:133/2285 train_time:8000ms step_avg:60.15ms +step:134/2285 train_time:8059ms step_avg:60.14ms +step:135/2285 train_time:8120ms step_avg:60.15ms +step:136/2285 train_time:8179ms step_avg:60.14ms +step:137/2285 train_time:8240ms step_avg:60.14ms +step:138/2285 train_time:8298ms step_avg:60.13ms +step:139/2285 train_time:8359ms step_avg:60.14ms +step:140/2285 train_time:8418ms step_avg:60.13ms +step:141/2285 train_time:8479ms step_avg:60.14ms +step:142/2285 train_time:8538ms step_avg:60.13ms +step:143/2285 train_time:8599ms step_avg:60.14ms +step:144/2285 train_time:8658ms step_avg:60.13ms +step:145/2285 train_time:8719ms step_avg:60.13ms +step:146/2285 train_time:8778ms step_avg:60.12ms +step:147/2285 train_time:8839ms step_avg:60.13ms +step:148/2285 train_time:8898ms step_avg:60.12ms +step:149/2285 train_time:8959ms step_avg:60.13ms +step:150/2285 train_time:9018ms step_avg:60.12ms +step:151/2285 train_time:9079ms step_avg:60.13ms +step:152/2285 train_time:9138ms step_avg:60.12ms +step:153/2285 train_time:9200ms step_avg:60.13ms +step:154/2285 train_time:9258ms step_avg:60.12ms +step:155/2285 train_time:9319ms step_avg:60.12ms +step:156/2285 train_time:9378ms step_avg:60.12ms +step:157/2285 train_time:9439ms step_avg:60.12ms +step:158/2285 train_time:9498ms step_avg:60.11ms +step:159/2285 train_time:9559ms step_avg:60.12ms +step:160/2285 train_time:9617ms 
step_avg:60.11ms +step:161/2285 train_time:9679ms step_avg:60.12ms +step:162/2285 train_time:9737ms step_avg:60.11ms +step:163/2285 train_time:9799ms step_avg:60.11ms +step:164/2285 train_time:9857ms step_avg:60.11ms +step:165/2285 train_time:9919ms step_avg:60.11ms +step:166/2285 train_time:9977ms step_avg:60.10ms +step:167/2285 train_time:10038ms step_avg:60.11ms +step:168/2285 train_time:10096ms step_avg:60.10ms +step:169/2285 train_time:10158ms step_avg:60.10ms +step:170/2285 train_time:10217ms step_avg:60.10ms +step:171/2285 train_time:10278ms step_avg:60.11ms +step:172/2285 train_time:10336ms step_avg:60.09ms +step:173/2285 train_time:10398ms step_avg:60.10ms +step:174/2285 train_time:10456ms step_avg:60.09ms +step:175/2285 train_time:10517ms step_avg:60.10ms +step:176/2285 train_time:10576ms step_avg:60.09ms +step:177/2285 train_time:10637ms step_avg:60.10ms +step:178/2285 train_time:10696ms step_avg:60.09ms +step:179/2285 train_time:10756ms step_avg:60.09ms +step:180/2285 train_time:10815ms step_avg:60.08ms +step:181/2285 train_time:10876ms step_avg:60.09ms +step:182/2285 train_time:10935ms step_avg:60.08ms +step:183/2285 train_time:10997ms step_avg:60.09ms +step:184/2285 train_time:11055ms step_avg:60.08ms +step:185/2285 train_time:11117ms step_avg:60.09ms +step:186/2285 train_time:11175ms step_avg:60.08ms +step:187/2285 train_time:11236ms step_avg:60.08ms +step:188/2285 train_time:11294ms step_avg:60.08ms +step:189/2285 train_time:11356ms step_avg:60.08ms +step:190/2285 train_time:11415ms step_avg:60.08ms +step:191/2285 train_time:11476ms step_avg:60.09ms +step:192/2285 train_time:11535ms step_avg:60.08ms +step:193/2285 train_time:11596ms step_avg:60.08ms +step:194/2285 train_time:11654ms step_avg:60.07ms +step:195/2285 train_time:11715ms step_avg:60.08ms +step:196/2285 train_time:11774ms step_avg:60.07ms +step:197/2285 train_time:11836ms step_avg:60.08ms +step:198/2285 train_time:11894ms step_avg:60.07ms +step:199/2285 train_time:11956ms step_avg:60.08ms +step:200/2285 train_time:12015ms step_avg:60.07ms +step:201/2285 train_time:12076ms step_avg:60.08ms +step:202/2285 train_time:12134ms step_avg:60.07ms +step:203/2285 train_time:12195ms step_avg:60.07ms +step:204/2285 train_time:12254ms step_avg:60.07ms +step:205/2285 train_time:12315ms step_avg:60.07ms +step:206/2285 train_time:12375ms step_avg:60.07ms +step:207/2285 train_time:12435ms step_avg:60.07ms +step:208/2285 train_time:12494ms step_avg:60.07ms +step:209/2285 train_time:12555ms step_avg:60.07ms +step:210/2285 train_time:12614ms step_avg:60.07ms +step:211/2285 train_time:12675ms step_avg:60.07ms +step:212/2285 train_time:12734ms step_avg:60.06ms +step:213/2285 train_time:12795ms step_avg:60.07ms +step:214/2285 train_time:12853ms step_avg:60.06ms +step:215/2285 train_time:12914ms step_avg:60.07ms +step:216/2285 train_time:12973ms step_avg:60.06ms +step:217/2285 train_time:13034ms step_avg:60.07ms +step:218/2285 train_time:13093ms step_avg:60.06ms +step:219/2285 train_time:13154ms step_avg:60.06ms +step:220/2285 train_time:13213ms step_avg:60.06ms +step:221/2285 train_time:13274ms step_avg:60.06ms +step:222/2285 train_time:13334ms step_avg:60.06ms +step:223/2285 train_time:13395ms step_avg:60.07ms +step:224/2285 train_time:13454ms step_avg:60.06ms +step:225/2285 train_time:13515ms step_avg:60.07ms +step:226/2285 train_time:13575ms step_avg:60.06ms +step:227/2285 train_time:13636ms step_avg:60.07ms +step:228/2285 train_time:13695ms step_avg:60.07ms +step:229/2285 train_time:13756ms step_avg:60.07ms +step:230/2285 
train_time:13814ms step_avg:60.06ms +step:231/2285 train_time:13876ms step_avg:60.07ms +step:232/2285 train_time:13934ms step_avg:60.06ms +step:233/2285 train_time:13996ms step_avg:60.07ms +step:234/2285 train_time:14054ms step_avg:60.06ms +step:235/2285 train_time:14115ms step_avg:60.07ms +step:236/2285 train_time:14174ms step_avg:60.06ms +step:237/2285 train_time:14235ms step_avg:60.06ms +step:238/2285 train_time:14294ms step_avg:60.06ms +step:239/2285 train_time:14355ms step_avg:60.06ms +step:240/2285 train_time:14414ms step_avg:60.06ms +step:241/2285 train_time:14476ms step_avg:60.07ms +step:242/2285 train_time:14534ms step_avg:60.06ms +step:243/2285 train_time:14596ms step_avg:60.06ms +step:244/2285 train_time:14654ms step_avg:60.06ms +step:245/2285 train_time:14715ms step_avg:60.06ms +step:246/2285 train_time:14774ms step_avg:60.06ms +step:247/2285 train_time:14835ms step_avg:60.06ms +step:248/2285 train_time:14894ms step_avg:60.06ms +step:249/2285 train_time:14956ms step_avg:60.06ms +step:250/2285 train_time:15015ms step_avg:60.06ms +step:250/2285 val_loss:4.0863 train_time:15078ms step_avg:60.31ms +step:251/2285 train_time:15096ms step_avg:60.14ms +step:252/2285 train_time:15139ms step_avg:60.08ms +step:253/2285 train_time:15207ms step_avg:60.11ms +step:254/2285 train_time:15271ms step_avg:60.12ms +step:255/2285 train_time:15336ms step_avg:60.14ms +step:256/2285 train_time:15395ms step_avg:60.13ms +step:257/2285 train_time:15455ms step_avg:60.14ms +step:258/2285 train_time:15514ms step_avg:60.13ms +step:259/2285 train_time:15574ms step_avg:60.13ms +step:260/2285 train_time:15632ms step_avg:60.12ms +step:261/2285 train_time:15692ms step_avg:60.12ms +step:262/2285 train_time:15750ms step_avg:60.12ms +step:263/2285 train_time:15810ms step_avg:60.12ms +step:264/2285 train_time:15868ms step_avg:60.11ms +step:265/2285 train_time:15928ms step_avg:60.11ms +step:266/2285 train_time:15986ms step_avg:60.10ms +step:267/2285 train_time:16048ms step_avg:60.10ms +step:268/2285 train_time:16107ms step_avg:60.10ms +step:269/2285 train_time:16169ms step_avg:60.11ms +step:270/2285 train_time:16230ms step_avg:60.11ms +step:271/2285 train_time:16292ms step_avg:60.12ms +step:272/2285 train_time:16351ms step_avg:60.11ms +step:273/2285 train_time:16412ms step_avg:60.12ms +step:274/2285 train_time:16471ms step_avg:60.11ms +step:275/2285 train_time:16531ms step_avg:60.11ms +step:276/2285 train_time:16590ms step_avg:60.11ms +step:277/2285 train_time:16650ms step_avg:60.11ms +step:278/2285 train_time:16708ms step_avg:60.10ms +step:279/2285 train_time:16768ms step_avg:60.10ms +step:280/2285 train_time:16826ms step_avg:60.09ms +step:281/2285 train_time:16887ms step_avg:60.10ms +step:282/2285 train_time:16945ms step_avg:60.09ms +step:283/2285 train_time:17005ms step_avg:60.09ms +step:284/2285 train_time:17064ms step_avg:60.08ms +step:285/2285 train_time:17125ms step_avg:60.09ms +step:286/2285 train_time:17184ms step_avg:60.08ms +step:287/2285 train_time:17246ms step_avg:60.09ms +step:288/2285 train_time:17305ms step_avg:60.09ms +step:289/2285 train_time:17366ms step_avg:60.09ms +step:290/2285 train_time:17425ms step_avg:60.09ms +step:291/2285 train_time:17487ms step_avg:60.09ms +step:292/2285 train_time:17546ms step_avg:60.09ms +step:293/2285 train_time:17607ms step_avg:60.09ms +step:294/2285 train_time:17665ms step_avg:60.09ms +step:295/2285 train_time:17726ms step_avg:60.09ms +step:296/2285 train_time:17784ms step_avg:60.08ms +step:297/2285 train_time:17844ms step_avg:60.08ms +step:298/2285 train_time:17902ms 
step_avg:60.07ms +step:299/2285 train_time:17962ms step_avg:60.07ms +step:300/2285 train_time:18020ms step_avg:60.07ms +step:301/2285 train_time:18081ms step_avg:60.07ms +step:302/2285 train_time:18141ms step_avg:60.07ms +step:303/2285 train_time:18202ms step_avg:60.07ms +step:304/2285 train_time:18261ms step_avg:60.07ms +step:305/2285 train_time:18323ms step_avg:60.08ms +step:306/2285 train_time:18382ms step_avg:60.07ms +step:307/2285 train_time:18443ms step_avg:60.08ms +step:308/2285 train_time:18502ms step_avg:60.07ms +step:309/2285 train_time:18563ms step_avg:60.07ms +step:310/2285 train_time:18622ms step_avg:60.07ms +step:311/2285 train_time:18683ms step_avg:60.07ms +step:312/2285 train_time:18742ms step_avg:60.07ms +step:313/2285 train_time:18803ms step_avg:60.07ms +step:314/2285 train_time:18861ms step_avg:60.07ms +step:315/2285 train_time:18922ms step_avg:60.07ms +step:316/2285 train_time:18980ms step_avg:60.06ms +step:317/2285 train_time:19041ms step_avg:60.07ms +step:318/2285 train_time:19099ms step_avg:60.06ms +step:319/2285 train_time:19161ms step_avg:60.06ms +step:320/2285 train_time:19220ms step_avg:60.06ms +step:321/2285 train_time:19281ms step_avg:60.07ms +step:322/2285 train_time:19340ms step_avg:60.06ms +step:323/2285 train_time:19401ms step_avg:60.07ms +step:324/2285 train_time:19460ms step_avg:60.06ms +step:325/2285 train_time:19521ms step_avg:60.07ms +step:326/2285 train_time:19580ms step_avg:60.06ms +step:327/2285 train_time:19641ms step_avg:60.06ms +step:328/2285 train_time:19700ms step_avg:60.06ms +step:329/2285 train_time:19761ms step_avg:60.06ms +step:330/2285 train_time:19819ms step_avg:60.06ms +step:331/2285 train_time:19880ms step_avg:60.06ms +step:332/2285 train_time:19939ms step_avg:60.06ms +step:333/2285 train_time:20000ms step_avg:60.06ms +step:334/2285 train_time:20058ms step_avg:60.05ms +step:335/2285 train_time:20119ms step_avg:60.06ms +step:336/2285 train_time:20178ms step_avg:60.05ms +step:337/2285 train_time:20239ms step_avg:60.06ms +step:338/2285 train_time:20298ms step_avg:60.05ms +step:339/2285 train_time:20359ms step_avg:60.06ms +step:340/2285 train_time:20418ms step_avg:60.05ms +step:341/2285 train_time:20480ms step_avg:60.06ms +step:342/2285 train_time:20539ms step_avg:60.06ms +step:343/2285 train_time:20600ms step_avg:60.06ms +step:344/2285 train_time:20659ms step_avg:60.05ms +step:345/2285 train_time:20720ms step_avg:60.06ms +step:346/2285 train_time:20779ms step_avg:60.05ms +step:347/2285 train_time:20840ms step_avg:60.06ms +step:348/2285 train_time:20899ms step_avg:60.05ms +step:349/2285 train_time:20959ms step_avg:60.06ms +step:350/2285 train_time:21018ms step_avg:60.05ms +step:351/2285 train_time:21078ms step_avg:60.05ms +step:352/2285 train_time:21137ms step_avg:60.05ms +step:353/2285 train_time:21198ms step_avg:60.05ms +step:354/2285 train_time:21257ms step_avg:60.05ms +step:355/2285 train_time:21319ms step_avg:60.05ms +step:356/2285 train_time:21378ms step_avg:60.05ms +step:357/2285 train_time:21440ms step_avg:60.06ms +step:358/2285 train_time:21499ms step_avg:60.05ms +step:359/2285 train_time:21560ms step_avg:60.06ms +step:360/2285 train_time:21620ms step_avg:60.05ms +step:361/2285 train_time:21681ms step_avg:60.06ms +step:362/2285 train_time:21739ms step_avg:60.05ms +step:363/2285 train_time:21800ms step_avg:60.06ms +step:364/2285 train_time:21859ms step_avg:60.05ms +step:365/2285 train_time:21920ms step_avg:60.05ms +step:366/2285 train_time:21978ms step_avg:60.05ms +step:367/2285 train_time:22040ms step_avg:60.05ms +step:368/2285 
train_time:22098ms step_avg:60.05ms +step:369/2285 train_time:22159ms step_avg:60.05ms +step:370/2285 train_time:22218ms step_avg:60.05ms +step:371/2285 train_time:22279ms step_avg:60.05ms +step:372/2285 train_time:22339ms step_avg:60.05ms +step:373/2285 train_time:22400ms step_avg:60.05ms +step:374/2285 train_time:22458ms step_avg:60.05ms +step:375/2285 train_time:22520ms step_avg:60.05ms +step:376/2285 train_time:22579ms step_avg:60.05ms +step:377/2285 train_time:22639ms step_avg:60.05ms +step:378/2285 train_time:22698ms step_avg:60.05ms +step:379/2285 train_time:22759ms step_avg:60.05ms +step:380/2285 train_time:22818ms step_avg:60.05ms +step:381/2285 train_time:22879ms step_avg:60.05ms +step:382/2285 train_time:22938ms step_avg:60.05ms +step:383/2285 train_time:23000ms step_avg:60.05ms +step:384/2285 train_time:23059ms step_avg:60.05ms +step:385/2285 train_time:23121ms step_avg:60.05ms +step:386/2285 train_time:23180ms step_avg:60.05ms +step:387/2285 train_time:23242ms step_avg:60.06ms +step:388/2285 train_time:23301ms step_avg:60.05ms +step:389/2285 train_time:23363ms step_avg:60.06ms +step:390/2285 train_time:23422ms step_avg:60.06ms +step:391/2285 train_time:23483ms step_avg:60.06ms +step:392/2285 train_time:23543ms step_avg:60.06ms +step:393/2285 train_time:23603ms step_avg:60.06ms +step:394/2285 train_time:23662ms step_avg:60.06ms +step:395/2285 train_time:23723ms step_avg:60.06ms +step:396/2285 train_time:23782ms step_avg:60.06ms +step:397/2285 train_time:23843ms step_avg:60.06ms +step:398/2285 train_time:23902ms step_avg:60.06ms +step:399/2285 train_time:23963ms step_avg:60.06ms +step:400/2285 train_time:24022ms step_avg:60.06ms +step:401/2285 train_time:24083ms step_avg:60.06ms +step:402/2285 train_time:24142ms step_avg:60.06ms +step:403/2285 train_time:24204ms step_avg:60.06ms +step:404/2285 train_time:24263ms step_avg:60.06ms +step:405/2285 train_time:24324ms step_avg:60.06ms +step:406/2285 train_time:24383ms step_avg:60.06ms +step:407/2285 train_time:24445ms step_avg:60.06ms +step:408/2285 train_time:24504ms step_avg:60.06ms +step:409/2285 train_time:24566ms step_avg:60.06ms +step:410/2285 train_time:24625ms step_avg:60.06ms +step:411/2285 train_time:24686ms step_avg:60.06ms +step:412/2285 train_time:24745ms step_avg:60.06ms +step:413/2285 train_time:24806ms step_avg:60.06ms +step:414/2285 train_time:24864ms step_avg:60.06ms +step:415/2285 train_time:24926ms step_avg:60.06ms +step:416/2285 train_time:24985ms step_avg:60.06ms +step:417/2285 train_time:25046ms step_avg:60.06ms +step:418/2285 train_time:25105ms step_avg:60.06ms +step:419/2285 train_time:25166ms step_avg:60.06ms +step:420/2285 train_time:25225ms step_avg:60.06ms +step:421/2285 train_time:25287ms step_avg:60.06ms +step:422/2285 train_time:25346ms step_avg:60.06ms +step:423/2285 train_time:25408ms step_avg:60.07ms +step:424/2285 train_time:25467ms step_avg:60.06ms +step:425/2285 train_time:25528ms step_avg:60.07ms +step:426/2285 train_time:25587ms step_avg:60.06ms +step:427/2285 train_time:25648ms step_avg:60.07ms +step:428/2285 train_time:25707ms step_avg:60.06ms +step:429/2285 train_time:25768ms step_avg:60.07ms +step:430/2285 train_time:25827ms step_avg:60.06ms +step:431/2285 train_time:25888ms step_avg:60.06ms +step:432/2285 train_time:25947ms step_avg:60.06ms +step:433/2285 train_time:26008ms step_avg:60.07ms +step:434/2285 train_time:26067ms step_avg:60.06ms +step:435/2285 train_time:26129ms step_avg:60.07ms +step:436/2285 train_time:26188ms step_avg:60.07ms +step:437/2285 train_time:26250ms step_avg:60.07ms 
+step:438/2285 train_time:26309ms step_avg:60.07ms +step:439/2285 train_time:26370ms step_avg:60.07ms +step:440/2285 train_time:26429ms step_avg:60.07ms +step:441/2285 train_time:26491ms step_avg:60.07ms +step:442/2285 train_time:26550ms step_avg:60.07ms +step:443/2285 train_time:26610ms step_avg:60.07ms +step:444/2285 train_time:26669ms step_avg:60.07ms +step:445/2285 train_time:26730ms step_avg:60.07ms +step:446/2285 train_time:26790ms step_avg:60.07ms +step:447/2285 train_time:26851ms step_avg:60.07ms +step:448/2285 train_time:26909ms step_avg:60.07ms +step:449/2285 train_time:26971ms step_avg:60.07ms +step:450/2285 train_time:27029ms step_avg:60.07ms +step:451/2285 train_time:27091ms step_avg:60.07ms +step:452/2285 train_time:27149ms step_avg:60.07ms +step:453/2285 train_time:27211ms step_avg:60.07ms +step:454/2285 train_time:27271ms step_avg:60.07ms +step:455/2285 train_time:27332ms step_avg:60.07ms +step:456/2285 train_time:27392ms step_avg:60.07ms +step:457/2285 train_time:27453ms step_avg:60.07ms +step:458/2285 train_time:27512ms step_avg:60.07ms +step:459/2285 train_time:27573ms step_avg:60.07ms +step:460/2285 train_time:27632ms step_avg:60.07ms +step:461/2285 train_time:27694ms step_avg:60.07ms +step:462/2285 train_time:27753ms step_avg:60.07ms +step:463/2285 train_time:27814ms step_avg:60.07ms +step:464/2285 train_time:27873ms step_avg:60.07ms +step:465/2285 train_time:27934ms step_avg:60.07ms +step:466/2285 train_time:27993ms step_avg:60.07ms +step:467/2285 train_time:28054ms step_avg:60.07ms +step:468/2285 train_time:28113ms step_avg:60.07ms +step:469/2285 train_time:28175ms step_avg:60.07ms +step:470/2285 train_time:28235ms step_avg:60.07ms +step:471/2285 train_time:28296ms step_avg:60.08ms +step:472/2285 train_time:28355ms step_avg:60.07ms +step:473/2285 train_time:28416ms step_avg:60.08ms +step:474/2285 train_time:28475ms step_avg:60.07ms +step:475/2285 train_time:28537ms step_avg:60.08ms +step:476/2285 train_time:28596ms step_avg:60.08ms +step:477/2285 train_time:28657ms step_avg:60.08ms +step:478/2285 train_time:28717ms step_avg:60.08ms +step:479/2285 train_time:28778ms step_avg:60.08ms +step:480/2285 train_time:28837ms step_avg:60.08ms +step:481/2285 train_time:28898ms step_avg:60.08ms +step:482/2285 train_time:28957ms step_avg:60.08ms +step:483/2285 train_time:29019ms step_avg:60.08ms +step:484/2285 train_time:29078ms step_avg:60.08ms +step:485/2285 train_time:29140ms step_avg:60.08ms +step:486/2285 train_time:29199ms step_avg:60.08ms +step:487/2285 train_time:29260ms step_avg:60.08ms +step:488/2285 train_time:29320ms step_avg:60.08ms +step:489/2285 train_time:29382ms step_avg:60.09ms +step:490/2285 train_time:29441ms step_avg:60.08ms +step:491/2285 train_time:29502ms step_avg:60.09ms +step:492/2285 train_time:29561ms step_avg:60.08ms +step:493/2285 train_time:29623ms step_avg:60.09ms +step:494/2285 train_time:29682ms step_avg:60.08ms +step:495/2285 train_time:29744ms step_avg:60.09ms +step:496/2285 train_time:29803ms step_avg:60.09ms +step:497/2285 train_time:29864ms step_avg:60.09ms +step:498/2285 train_time:29923ms step_avg:60.09ms +step:499/2285 train_time:29984ms step_avg:60.09ms +step:500/2285 train_time:30043ms step_avg:60.09ms +step:500/2285 val_loss:3.7891 train_time:30106ms step_avg:60.21ms +step:501/2285 train_time:30125ms step_avg:60.13ms +step:502/2285 train_time:30166ms step_avg:60.09ms +step:503/2285 train_time:30227ms step_avg:60.09ms +step:504/2285 train_time:30287ms step_avg:60.09ms +step:505/2285 train_time:30349ms step_avg:60.10ms +step:506/2285 
train_time:30408ms step_avg:60.10ms +step:507/2285 train_time:30469ms step_avg:60.10ms +step:508/2285 train_time:30528ms step_avg:60.10ms +step:509/2285 train_time:30591ms step_avg:60.10ms +step:510/2285 train_time:30649ms step_avg:60.10ms +step:511/2285 train_time:30711ms step_avg:60.10ms +step:512/2285 train_time:30769ms step_avg:60.10ms +step:513/2285 train_time:30830ms step_avg:60.10ms +step:514/2285 train_time:30890ms step_avg:60.10ms +step:515/2285 train_time:30952ms step_avg:60.10ms +step:516/2285 train_time:31012ms step_avg:60.10ms +step:517/2285 train_time:31078ms step_avg:60.11ms +step:518/2285 train_time:31138ms step_avg:60.11ms +step:519/2285 train_time:31199ms step_avg:60.11ms +step:520/2285 train_time:31258ms step_avg:60.11ms +step:521/2285 train_time:31321ms step_avg:60.12ms +step:522/2285 train_time:31380ms step_avg:60.12ms +step:523/2285 train_time:31441ms step_avg:60.12ms +step:524/2285 train_time:31500ms step_avg:60.11ms +step:525/2285 train_time:31561ms step_avg:60.12ms +step:526/2285 train_time:31620ms step_avg:60.11ms +step:527/2285 train_time:31681ms step_avg:60.12ms +step:528/2285 train_time:31740ms step_avg:60.11ms +step:529/2285 train_time:31801ms step_avg:60.12ms +step:530/2285 train_time:31860ms step_avg:60.11ms +step:531/2285 train_time:31921ms step_avg:60.12ms +step:532/2285 train_time:31981ms step_avg:60.11ms +step:533/2285 train_time:32043ms step_avg:60.12ms +step:534/2285 train_time:32103ms step_avg:60.12ms +step:535/2285 train_time:32164ms step_avg:60.12ms +step:536/2285 train_time:32224ms step_avg:60.12ms +step:537/2285 train_time:32286ms step_avg:60.12ms +step:538/2285 train_time:32345ms step_avg:60.12ms +step:539/2285 train_time:32406ms step_avg:60.12ms +step:540/2285 train_time:32465ms step_avg:60.12ms +step:541/2285 train_time:32527ms step_avg:60.12ms +step:542/2285 train_time:32585ms step_avg:60.12ms +step:543/2285 train_time:32647ms step_avg:60.12ms +step:544/2285 train_time:32705ms step_avg:60.12ms +step:545/2285 train_time:32767ms step_avg:60.12ms +step:546/2285 train_time:32826ms step_avg:60.12ms +step:547/2285 train_time:32887ms step_avg:60.12ms +step:548/2285 train_time:32946ms step_avg:60.12ms +step:549/2285 train_time:33008ms step_avg:60.12ms +step:550/2285 train_time:33069ms step_avg:60.13ms +step:551/2285 train_time:33130ms step_avg:60.13ms +step:552/2285 train_time:33190ms step_avg:60.13ms +step:553/2285 train_time:33251ms step_avg:60.13ms +step:554/2285 train_time:33310ms step_avg:60.13ms +step:555/2285 train_time:33372ms step_avg:60.13ms +step:556/2285 train_time:33430ms step_avg:60.13ms +step:557/2285 train_time:33492ms step_avg:60.13ms +step:558/2285 train_time:33551ms step_avg:60.13ms +step:559/2285 train_time:33612ms step_avg:60.13ms +step:560/2285 train_time:33672ms step_avg:60.13ms +step:561/2285 train_time:33733ms step_avg:60.13ms +step:562/2285 train_time:33792ms step_avg:60.13ms +step:563/2285 train_time:33854ms step_avg:60.13ms +step:564/2285 train_time:33914ms step_avg:60.13ms +step:565/2285 train_time:33975ms step_avg:60.13ms +step:566/2285 train_time:34035ms step_avg:60.13ms +step:567/2285 train_time:34097ms step_avg:60.14ms +step:568/2285 train_time:34156ms step_avg:60.13ms +step:569/2285 train_time:34217ms step_avg:60.14ms +step:570/2285 train_time:34276ms step_avg:60.13ms +step:571/2285 train_time:34338ms step_avg:60.14ms +step:572/2285 train_time:34397ms step_avg:60.13ms +step:573/2285 train_time:34458ms step_avg:60.14ms +step:574/2285 train_time:34517ms step_avg:60.13ms +step:575/2285 train_time:34578ms step_avg:60.14ms 
+step:576/2285 train_time:34637ms step_avg:60.13ms +step:577/2285 train_time:34698ms step_avg:60.14ms +step:578/2285 train_time:34757ms step_avg:60.13ms +step:579/2285 train_time:34819ms step_avg:60.14ms +step:580/2285 train_time:34878ms step_avg:60.13ms +step:581/2285 train_time:34940ms step_avg:60.14ms +step:582/2285 train_time:34998ms step_avg:60.13ms +step:583/2285 train_time:35060ms step_avg:60.14ms +step:584/2285 train_time:35119ms step_avg:60.14ms +step:585/2285 train_time:35180ms step_avg:60.14ms +step:586/2285 train_time:35239ms step_avg:60.14ms +step:587/2285 train_time:35301ms step_avg:60.14ms +step:588/2285 train_time:35360ms step_avg:60.14ms +step:589/2285 train_time:35421ms step_avg:60.14ms +step:590/2285 train_time:35480ms step_avg:60.14ms +step:591/2285 train_time:35542ms step_avg:60.14ms +step:592/2285 train_time:35601ms step_avg:60.14ms +step:593/2285 train_time:35662ms step_avg:60.14ms +step:594/2285 train_time:35721ms step_avg:60.14ms +step:595/2285 train_time:35782ms step_avg:60.14ms +step:596/2285 train_time:35841ms step_avg:60.14ms +step:597/2285 train_time:35902ms step_avg:60.14ms +step:598/2285 train_time:35961ms step_avg:60.14ms +step:599/2285 train_time:36022ms step_avg:60.14ms +step:600/2285 train_time:36081ms step_avg:60.14ms +step:601/2285 train_time:36142ms step_avg:60.14ms +step:602/2285 train_time:36201ms step_avg:60.13ms +step:603/2285 train_time:36263ms step_avg:60.14ms +step:604/2285 train_time:36321ms step_avg:60.13ms +step:605/2285 train_time:36383ms step_avg:60.14ms +step:606/2285 train_time:36443ms step_avg:60.14ms +step:607/2285 train_time:36504ms step_avg:60.14ms +step:608/2285 train_time:36564ms step_avg:60.14ms +step:609/2285 train_time:36624ms step_avg:60.14ms +step:610/2285 train_time:36683ms step_avg:60.14ms +step:611/2285 train_time:36744ms step_avg:60.14ms +step:612/2285 train_time:36803ms step_avg:60.14ms +step:613/2285 train_time:36865ms step_avg:60.14ms +step:614/2285 train_time:36924ms step_avg:60.14ms +step:615/2285 train_time:36986ms step_avg:60.14ms +step:616/2285 train_time:37045ms step_avg:60.14ms +step:617/2285 train_time:37106ms step_avg:60.14ms +step:618/2285 train_time:37165ms step_avg:60.14ms +step:619/2285 train_time:37227ms step_avg:60.14ms +step:620/2285 train_time:37286ms step_avg:60.14ms +step:621/2285 train_time:37348ms step_avg:60.14ms +step:622/2285 train_time:37407ms step_avg:60.14ms +step:623/2285 train_time:37469ms step_avg:60.14ms +step:624/2285 train_time:37529ms step_avg:60.14ms +step:625/2285 train_time:37590ms step_avg:60.14ms +step:626/2285 train_time:37649ms step_avg:60.14ms +step:627/2285 train_time:37710ms step_avg:60.14ms +step:628/2285 train_time:37770ms step_avg:60.14ms +step:629/2285 train_time:37831ms step_avg:60.15ms +step:630/2285 train_time:37891ms step_avg:60.14ms +step:631/2285 train_time:37953ms step_avg:60.15ms +step:632/2285 train_time:38012ms step_avg:60.15ms +step:633/2285 train_time:38074ms step_avg:60.15ms +step:634/2285 train_time:38133ms step_avg:60.15ms +step:635/2285 train_time:38195ms step_avg:60.15ms +step:636/2285 train_time:38254ms step_avg:60.15ms +step:637/2285 train_time:38316ms step_avg:60.15ms +step:638/2285 train_time:38376ms step_avg:60.15ms +step:639/2285 train_time:38437ms step_avg:60.15ms +step:640/2285 train_time:38496ms step_avg:60.15ms +step:641/2285 train_time:38558ms step_avg:60.15ms +step:642/2285 train_time:38617ms step_avg:60.15ms +step:643/2285 train_time:38679ms step_avg:60.15ms +step:644/2285 train_time:38738ms step_avg:60.15ms +step:645/2285 train_time:38799ms 
step_avg:60.15ms +step:646/2285 train_time:38858ms step_avg:60.15ms +step:647/2285 train_time:38920ms step_avg:60.15ms +step:648/2285 train_time:38979ms step_avg:60.15ms +step:649/2285 train_time:39040ms step_avg:60.15ms +step:650/2285 train_time:39099ms step_avg:60.15ms +step:651/2285 train_time:39161ms step_avg:60.15ms +step:652/2285 train_time:39220ms step_avg:60.15ms +step:653/2285 train_time:39281ms step_avg:60.15ms +step:654/2285 train_time:39340ms step_avg:60.15ms +step:655/2285 train_time:39401ms step_avg:60.15ms +step:656/2285 train_time:39460ms step_avg:60.15ms +step:657/2285 train_time:39522ms step_avg:60.15ms +step:658/2285 train_time:39581ms step_avg:60.15ms +step:659/2285 train_time:39642ms step_avg:60.16ms +step:660/2285 train_time:39702ms step_avg:60.15ms +step:661/2285 train_time:39764ms step_avg:60.16ms +step:662/2285 train_time:39823ms step_avg:60.15ms +step:663/2285 train_time:39884ms step_avg:60.16ms +step:664/2285 train_time:39943ms step_avg:60.16ms +step:665/2285 train_time:40004ms step_avg:60.16ms +step:666/2285 train_time:40063ms step_avg:60.16ms +step:667/2285 train_time:40125ms step_avg:60.16ms +step:668/2285 train_time:40184ms step_avg:60.16ms +step:669/2285 train_time:40245ms step_avg:60.16ms +step:670/2285 train_time:40304ms step_avg:60.16ms +step:671/2285 train_time:40366ms step_avg:60.16ms +step:672/2285 train_time:40426ms step_avg:60.16ms +step:673/2285 train_time:40488ms step_avg:60.16ms +step:674/2285 train_time:40546ms step_avg:60.16ms +step:675/2285 train_time:40608ms step_avg:60.16ms +step:676/2285 train_time:40668ms step_avg:60.16ms +step:677/2285 train_time:40729ms step_avg:60.16ms +step:678/2285 train_time:40788ms step_avg:60.16ms +step:679/2285 train_time:40850ms step_avg:60.16ms +step:680/2285 train_time:40910ms step_avg:60.16ms +step:681/2285 train_time:40972ms step_avg:60.16ms +step:682/2285 train_time:41031ms step_avg:60.16ms +step:683/2285 train_time:41093ms step_avg:60.16ms +step:684/2285 train_time:41152ms step_avg:60.16ms +step:685/2285 train_time:41214ms step_avg:60.17ms +step:686/2285 train_time:41273ms step_avg:60.17ms +step:687/2285 train_time:41335ms step_avg:60.17ms +step:688/2285 train_time:41394ms step_avg:60.17ms +step:689/2285 train_time:41455ms step_avg:60.17ms +step:690/2285 train_time:41515ms step_avg:60.17ms +step:691/2285 train_time:41577ms step_avg:60.17ms +step:692/2285 train_time:41636ms step_avg:60.17ms +step:693/2285 train_time:41698ms step_avg:60.17ms +step:694/2285 train_time:41757ms step_avg:60.17ms +step:695/2285 train_time:41819ms step_avg:60.17ms +step:696/2285 train_time:41878ms step_avg:60.17ms +step:697/2285 train_time:41939ms step_avg:60.17ms +step:698/2285 train_time:41998ms step_avg:60.17ms +step:699/2285 train_time:42060ms step_avg:60.17ms +step:700/2285 train_time:42119ms step_avg:60.17ms +step:701/2285 train_time:42180ms step_avg:60.17ms +step:702/2285 train_time:42239ms step_avg:60.17ms +step:703/2285 train_time:42300ms step_avg:60.17ms +step:704/2285 train_time:42359ms step_avg:60.17ms +step:705/2285 train_time:42421ms step_avg:60.17ms +step:706/2285 train_time:42480ms step_avg:60.17ms +step:707/2285 train_time:42541ms step_avg:60.17ms +step:708/2285 train_time:42600ms step_avg:60.17ms +step:709/2285 train_time:42662ms step_avg:60.17ms +step:710/2285 train_time:42721ms step_avg:60.17ms +step:711/2285 train_time:42783ms step_avg:60.17ms +step:712/2285 train_time:42841ms step_avg:60.17ms +step:713/2285 train_time:42903ms step_avg:60.17ms +step:714/2285 train_time:42962ms step_avg:60.17ms +step:715/2285 
train_time:43023ms step_avg:60.17ms +step:716/2285 train_time:43082ms step_avg:60.17ms +step:717/2285 train_time:43144ms step_avg:60.17ms +step:718/2285 train_time:43203ms step_avg:60.17ms +step:719/2285 train_time:43264ms step_avg:60.17ms +step:720/2285 train_time:43323ms step_avg:60.17ms +step:721/2285 train_time:43384ms step_avg:60.17ms +step:722/2285 train_time:43443ms step_avg:60.17ms +step:723/2285 train_time:43505ms step_avg:60.17ms +step:724/2285 train_time:43564ms step_avg:60.17ms +step:725/2285 train_time:43626ms step_avg:60.17ms +step:726/2285 train_time:43685ms step_avg:60.17ms +step:727/2285 train_time:43746ms step_avg:60.17ms +step:728/2285 train_time:43806ms step_avg:60.17ms +step:729/2285 train_time:43868ms step_avg:60.17ms +step:730/2285 train_time:43927ms step_avg:60.17ms +step:731/2285 train_time:43988ms step_avg:60.18ms +step:732/2285 train_time:44048ms step_avg:60.17ms +step:733/2285 train_time:44110ms step_avg:60.18ms +step:734/2285 train_time:44170ms step_avg:60.18ms +step:735/2285 train_time:44231ms step_avg:60.18ms +step:736/2285 train_time:44290ms step_avg:60.18ms +step:737/2285 train_time:44353ms step_avg:60.18ms +step:738/2285 train_time:44412ms step_avg:60.18ms +step:739/2285 train_time:44473ms step_avg:60.18ms +step:740/2285 train_time:44532ms step_avg:60.18ms +step:741/2285 train_time:44595ms step_avg:60.18ms +step:742/2285 train_time:44654ms step_avg:60.18ms +step:743/2285 train_time:44715ms step_avg:60.18ms +step:744/2285 train_time:44775ms step_avg:60.18ms +step:745/2285 train_time:44836ms step_avg:60.18ms +step:746/2285 train_time:44895ms step_avg:60.18ms +step:747/2285 train_time:44957ms step_avg:60.18ms +step:748/2285 train_time:45016ms step_avg:60.18ms +step:749/2285 train_time:45078ms step_avg:60.18ms +step:750/2285 train_time:45138ms step_avg:60.18ms +step:750/2285 val_loss:3.6571 train_time:45200ms step_avg:60.27ms +step:751/2285 train_time:45218ms step_avg:60.21ms +step:752/2285 train_time:45260ms step_avg:60.19ms +step:753/2285 train_time:45324ms step_avg:60.19ms +step:754/2285 train_time:45387ms step_avg:60.20ms +step:755/2285 train_time:45449ms step_avg:60.20ms +step:756/2285 train_time:45508ms step_avg:60.20ms +step:757/2285 train_time:45570ms step_avg:60.20ms +step:758/2285 train_time:45628ms step_avg:60.20ms +step:759/2285 train_time:45689ms step_avg:60.20ms +step:760/2285 train_time:45747ms step_avg:60.19ms +step:761/2285 train_time:45808ms step_avg:60.19ms +step:762/2285 train_time:45866ms step_avg:60.19ms +step:763/2285 train_time:45927ms step_avg:60.19ms +step:764/2285 train_time:45986ms step_avg:60.19ms +step:765/2285 train_time:46047ms step_avg:60.19ms +step:766/2285 train_time:46107ms step_avg:60.19ms +step:767/2285 train_time:46170ms step_avg:60.20ms +step:768/2285 train_time:46231ms step_avg:60.20ms +step:769/2285 train_time:46295ms step_avg:60.20ms +step:770/2285 train_time:46355ms step_avg:60.20ms +step:771/2285 train_time:46418ms step_avg:60.20ms +step:772/2285 train_time:46477ms step_avg:60.20ms +step:773/2285 train_time:46539ms step_avg:60.21ms +step:774/2285 train_time:46598ms step_avg:60.20ms +step:775/2285 train_time:46661ms step_avg:60.21ms +step:776/2285 train_time:46720ms step_avg:60.21ms +step:777/2285 train_time:46781ms step_avg:60.21ms +step:778/2285 train_time:46840ms step_avg:60.21ms +step:779/2285 train_time:46902ms step_avg:60.21ms +step:780/2285 train_time:46960ms step_avg:60.21ms +step:781/2285 train_time:47021ms step_avg:60.21ms +step:782/2285 train_time:47081ms step_avg:60.21ms +step:783/2285 train_time:47143ms 
step_avg:60.21ms +step:784/2285 train_time:47203ms step_avg:60.21ms +step:785/2285 train_time:47265ms step_avg:60.21ms +step:786/2285 train_time:47325ms step_avg:60.21ms +step:787/2285 train_time:47388ms step_avg:60.21ms +step:788/2285 train_time:47447ms step_avg:60.21ms +step:789/2285 train_time:47509ms step_avg:60.21ms +step:790/2285 train_time:47569ms step_avg:60.21ms +step:791/2285 train_time:47631ms step_avg:60.22ms +step:792/2285 train_time:47690ms step_avg:60.22ms +step:793/2285 train_time:47752ms step_avg:60.22ms +step:794/2285 train_time:47812ms step_avg:60.22ms +step:795/2285 train_time:47873ms step_avg:60.22ms +step:796/2285 train_time:47933ms step_avg:60.22ms +step:797/2285 train_time:47995ms step_avg:60.22ms +step:798/2285 train_time:48055ms step_avg:60.22ms +step:799/2285 train_time:48118ms step_avg:60.22ms +step:800/2285 train_time:48177ms step_avg:60.22ms +step:801/2285 train_time:48240ms step_avg:60.22ms +step:802/2285 train_time:48299ms step_avg:60.22ms +step:803/2285 train_time:48361ms step_avg:60.23ms +step:804/2285 train_time:48421ms step_avg:60.22ms +step:805/2285 train_time:48483ms step_avg:60.23ms +step:806/2285 train_time:48542ms step_avg:60.23ms +step:807/2285 train_time:48605ms step_avg:60.23ms +step:808/2285 train_time:48664ms step_avg:60.23ms +step:809/2285 train_time:48725ms step_avg:60.23ms +step:810/2285 train_time:48785ms step_avg:60.23ms +step:811/2285 train_time:48846ms step_avg:60.23ms +step:812/2285 train_time:48906ms step_avg:60.23ms +step:813/2285 train_time:48968ms step_avg:60.23ms +step:814/2285 train_time:49028ms step_avg:60.23ms +step:815/2285 train_time:49090ms step_avg:60.23ms +step:816/2285 train_time:49149ms step_avg:60.23ms +step:817/2285 train_time:49212ms step_avg:60.23ms +step:818/2285 train_time:49271ms step_avg:60.23ms +step:819/2285 train_time:49333ms step_avg:60.24ms +step:820/2285 train_time:49393ms step_avg:60.24ms +step:821/2285 train_time:49455ms step_avg:60.24ms +step:822/2285 train_time:49514ms step_avg:60.24ms +step:823/2285 train_time:49576ms step_avg:60.24ms +step:824/2285 train_time:49636ms step_avg:60.24ms +step:825/2285 train_time:49699ms step_avg:60.24ms +step:826/2285 train_time:49758ms step_avg:60.24ms +step:827/2285 train_time:49820ms step_avg:60.24ms +step:828/2285 train_time:49879ms step_avg:60.24ms +step:829/2285 train_time:49941ms step_avg:60.24ms +step:830/2285 train_time:50000ms step_avg:60.24ms +step:831/2285 train_time:50062ms step_avg:60.24ms +step:832/2285 train_time:50121ms step_avg:60.24ms +step:833/2285 train_time:50184ms step_avg:60.24ms +step:834/2285 train_time:50243ms step_avg:60.24ms +step:835/2285 train_time:50305ms step_avg:60.25ms +step:836/2285 train_time:50365ms step_avg:60.24ms +step:837/2285 train_time:50426ms step_avg:60.25ms +step:838/2285 train_time:50486ms step_avg:60.25ms +step:839/2285 train_time:50547ms step_avg:60.25ms +step:840/2285 train_time:50607ms step_avg:60.25ms +step:841/2285 train_time:50669ms step_avg:60.25ms +step:842/2285 train_time:50729ms step_avg:60.25ms +step:843/2285 train_time:50791ms step_avg:60.25ms +step:844/2285 train_time:50851ms step_avg:60.25ms +step:845/2285 train_time:50912ms step_avg:60.25ms +step:846/2285 train_time:50972ms step_avg:60.25ms +step:847/2285 train_time:51034ms step_avg:60.25ms +step:848/2285 train_time:51094ms step_avg:60.25ms +step:849/2285 train_time:51157ms step_avg:60.26ms +step:850/2285 train_time:51217ms step_avg:60.26ms +step:851/2285 train_time:51279ms step_avg:60.26ms +step:852/2285 train_time:51338ms step_avg:60.26ms +step:853/2285 
train_time:51400ms step_avg:60.26ms +step:854/2285 train_time:51460ms step_avg:60.26ms +step:855/2285 train_time:51521ms step_avg:60.26ms +step:856/2285 train_time:51581ms step_avg:60.26ms +step:857/2285 train_time:51643ms step_avg:60.26ms +step:858/2285 train_time:51702ms step_avg:60.26ms +step:859/2285 train_time:51764ms step_avg:60.26ms +step:860/2285 train_time:51824ms step_avg:60.26ms +step:861/2285 train_time:51886ms step_avg:60.26ms +step:862/2285 train_time:51945ms step_avg:60.26ms +step:863/2285 train_time:52007ms step_avg:60.26ms +step:864/2285 train_time:52066ms step_avg:60.26ms +step:865/2285 train_time:52128ms step_avg:60.26ms +step:866/2285 train_time:52188ms step_avg:60.26ms +step:867/2285 train_time:52250ms step_avg:60.27ms +step:868/2285 train_time:52310ms step_avg:60.26ms +step:869/2285 train_time:52372ms step_avg:60.27ms +step:870/2285 train_time:52431ms step_avg:60.27ms +step:871/2285 train_time:52494ms step_avg:60.27ms +step:872/2285 train_time:52554ms step_avg:60.27ms +step:873/2285 train_time:52616ms step_avg:60.27ms +step:874/2285 train_time:52676ms step_avg:60.27ms +step:875/2285 train_time:52738ms step_avg:60.27ms +step:876/2285 train_time:52798ms step_avg:60.27ms +step:877/2285 train_time:52860ms step_avg:60.27ms +step:878/2285 train_time:52919ms step_avg:60.27ms +step:879/2285 train_time:52980ms step_avg:60.27ms +step:880/2285 train_time:53040ms step_avg:60.27ms +step:881/2285 train_time:53102ms step_avg:60.27ms +step:882/2285 train_time:53161ms step_avg:60.27ms +step:883/2285 train_time:53223ms step_avg:60.28ms +step:884/2285 train_time:53283ms step_avg:60.27ms +step:885/2285 train_time:53345ms step_avg:60.28ms +step:886/2285 train_time:53405ms step_avg:60.28ms +step:887/2285 train_time:53466ms step_avg:60.28ms +step:888/2285 train_time:53526ms step_avg:60.28ms +step:889/2285 train_time:53588ms step_avg:60.28ms +step:890/2285 train_time:53647ms step_avg:60.28ms +step:891/2285 train_time:53709ms step_avg:60.28ms +step:892/2285 train_time:53769ms step_avg:60.28ms +step:893/2285 train_time:53831ms step_avg:60.28ms +step:894/2285 train_time:53890ms step_avg:60.28ms +step:895/2285 train_time:53953ms step_avg:60.28ms +step:896/2285 train_time:54013ms step_avg:60.28ms +step:897/2285 train_time:54074ms step_avg:60.28ms +step:898/2285 train_time:54134ms step_avg:60.28ms +step:899/2285 train_time:54196ms step_avg:60.28ms +step:900/2285 train_time:54255ms step_avg:60.28ms +step:901/2285 train_time:54317ms step_avg:60.29ms +step:902/2285 train_time:54377ms step_avg:60.28ms +step:903/2285 train_time:54439ms step_avg:60.29ms +step:904/2285 train_time:54499ms step_avg:60.29ms +step:905/2285 train_time:54560ms step_avg:60.29ms +step:906/2285 train_time:54620ms step_avg:60.29ms +step:907/2285 train_time:54681ms step_avg:60.29ms +step:908/2285 train_time:54741ms step_avg:60.29ms +step:909/2285 train_time:54803ms step_avg:60.29ms +step:910/2285 train_time:54863ms step_avg:60.29ms +step:911/2285 train_time:54924ms step_avg:60.29ms +step:912/2285 train_time:54984ms step_avg:60.29ms +step:913/2285 train_time:55045ms step_avg:60.29ms +step:914/2285 train_time:55104ms step_avg:60.29ms +step:915/2285 train_time:55166ms step_avg:60.29ms +step:916/2285 train_time:55226ms step_avg:60.29ms +step:917/2285 train_time:55288ms step_avg:60.29ms +step:918/2285 train_time:55347ms step_avg:60.29ms +step:919/2285 train_time:55409ms step_avg:60.29ms +step:920/2285 train_time:55469ms step_avg:60.29ms +step:921/2285 train_time:55531ms step_avg:60.29ms +step:922/2285 train_time:55591ms step_avg:60.29ms 
+step:923/2285 train_time:55653ms step_avg:60.30ms +step:924/2285 train_time:55714ms step_avg:60.30ms +step:925/2285 train_time:55775ms step_avg:60.30ms +step:926/2285 train_time:55835ms step_avg:60.30ms +step:927/2285 train_time:55897ms step_avg:60.30ms +step:928/2285 train_time:55956ms step_avg:60.30ms +step:929/2285 train_time:56019ms step_avg:60.30ms +step:930/2285 train_time:56078ms step_avg:60.30ms +step:931/2285 train_time:56140ms step_avg:60.30ms +step:932/2285 train_time:56199ms step_avg:60.30ms +step:933/2285 train_time:56261ms step_avg:60.30ms +step:934/2285 train_time:56321ms step_avg:60.30ms +step:935/2285 train_time:56383ms step_avg:60.30ms +step:936/2285 train_time:56442ms step_avg:60.30ms +step:937/2285 train_time:56505ms step_avg:60.30ms +step:938/2285 train_time:56564ms step_avg:60.30ms +step:939/2285 train_time:56627ms step_avg:60.31ms +step:940/2285 train_time:56686ms step_avg:60.30ms +step:941/2285 train_time:56748ms step_avg:60.31ms +step:942/2285 train_time:56808ms step_avg:60.31ms +step:943/2285 train_time:56869ms step_avg:60.31ms +step:944/2285 train_time:56929ms step_avg:60.31ms +step:945/2285 train_time:56990ms step_avg:60.31ms +step:946/2285 train_time:57051ms step_avg:60.31ms +step:947/2285 train_time:57113ms step_avg:60.31ms +step:948/2285 train_time:57173ms step_avg:60.31ms +step:949/2285 train_time:57235ms step_avg:60.31ms +step:950/2285 train_time:57294ms step_avg:60.31ms +step:951/2285 train_time:57356ms step_avg:60.31ms +step:952/2285 train_time:57416ms step_avg:60.31ms +step:953/2285 train_time:57478ms step_avg:60.31ms +step:954/2285 train_time:57538ms step_avg:60.31ms +step:955/2285 train_time:57600ms step_avg:60.31ms +step:956/2285 train_time:57659ms step_avg:60.31ms +step:957/2285 train_time:57722ms step_avg:60.32ms +step:958/2285 train_time:57781ms step_avg:60.31ms +step:959/2285 train_time:57843ms step_avg:60.32ms +step:960/2285 train_time:57903ms step_avg:60.32ms +step:961/2285 train_time:57965ms step_avg:60.32ms +step:962/2285 train_time:58024ms step_avg:60.32ms +step:963/2285 train_time:58086ms step_avg:60.32ms +step:964/2285 train_time:58145ms step_avg:60.32ms +step:965/2285 train_time:58207ms step_avg:60.32ms +step:966/2285 train_time:58267ms step_avg:60.32ms +step:967/2285 train_time:58329ms step_avg:60.32ms +step:968/2285 train_time:58389ms step_avg:60.32ms +step:969/2285 train_time:58451ms step_avg:60.32ms +step:970/2285 train_time:58511ms step_avg:60.32ms +step:971/2285 train_time:58573ms step_avg:60.32ms +step:972/2285 train_time:58632ms step_avg:60.32ms +step:973/2285 train_time:58694ms step_avg:60.32ms +step:974/2285 train_time:58754ms step_avg:60.32ms +step:975/2285 train_time:58817ms step_avg:60.32ms +step:976/2285 train_time:58877ms step_avg:60.32ms +step:977/2285 train_time:58938ms step_avg:60.33ms +step:978/2285 train_time:58998ms step_avg:60.33ms +step:979/2285 train_time:59059ms step_avg:60.33ms +step:980/2285 train_time:59119ms step_avg:60.33ms +step:981/2285 train_time:59181ms step_avg:60.33ms +step:982/2285 train_time:59241ms step_avg:60.33ms +step:983/2285 train_time:59303ms step_avg:60.33ms +step:984/2285 train_time:59363ms step_avg:60.33ms +step:985/2285 train_time:59425ms step_avg:60.33ms +step:986/2285 train_time:59484ms step_avg:60.33ms +step:987/2285 train_time:59546ms step_avg:60.33ms +step:988/2285 train_time:59605ms step_avg:60.33ms +step:989/2285 train_time:59667ms step_avg:60.33ms +step:990/2285 train_time:59727ms step_avg:60.33ms +step:991/2285 train_time:59789ms step_avg:60.33ms +step:992/2285 train_time:59848ms 
step_avg:60.33ms +step:993/2285 train_time:59910ms step_avg:60.33ms +step:994/2285 train_time:59970ms step_avg:60.33ms +step:995/2285 train_time:60032ms step_avg:60.33ms +step:996/2285 train_time:60091ms step_avg:60.33ms +step:997/2285 train_time:60154ms step_avg:60.33ms +step:998/2285 train_time:60214ms step_avg:60.33ms +step:999/2285 train_time:60275ms step_avg:60.34ms +step:1000/2285 train_time:60335ms step_avg:60.33ms +step:1000/2285 val_loss:3.5674 train_time:60398ms step_avg:60.40ms +step:1001/2285 train_time:60418ms step_avg:60.36ms +step:1002/2285 train_time:60458ms step_avg:60.34ms +step:1003/2285 train_time:60519ms step_avg:60.34ms +step:1004/2285 train_time:60578ms step_avg:60.34ms +step:1005/2285 train_time:60641ms step_avg:60.34ms +step:1006/2285 train_time:60701ms step_avg:60.34ms +step:1007/2285 train_time:60762ms step_avg:60.34ms +step:1008/2285 train_time:60820ms step_avg:60.34ms +step:1009/2285 train_time:60881ms step_avg:60.34ms +step:1010/2285 train_time:60939ms step_avg:60.34ms +step:1011/2285 train_time:61000ms step_avg:60.34ms +step:1012/2285 train_time:61058ms step_avg:60.33ms +step:1013/2285 train_time:61121ms step_avg:60.34ms +step:1014/2285 train_time:61180ms step_avg:60.34ms +step:1015/2285 train_time:61241ms step_avg:60.34ms +step:1016/2285 train_time:61302ms step_avg:60.34ms +step:1017/2285 train_time:61368ms step_avg:60.34ms +step:1018/2285 train_time:61429ms step_avg:60.34ms +step:1019/2285 train_time:61491ms step_avg:60.34ms +step:1020/2285 train_time:61551ms step_avg:60.34ms +step:1021/2285 train_time:61613ms step_avg:60.35ms +step:1022/2285 train_time:61672ms step_avg:60.34ms +step:1023/2285 train_time:61734ms step_avg:60.35ms +step:1024/2285 train_time:61793ms step_avg:60.34ms +step:1025/2285 train_time:61855ms step_avg:60.35ms +step:1026/2285 train_time:61914ms step_avg:60.34ms +step:1027/2285 train_time:61975ms step_avg:60.35ms +step:1028/2285 train_time:62034ms step_avg:60.34ms +step:1029/2285 train_time:62096ms step_avg:60.35ms +step:1030/2285 train_time:62156ms step_avg:60.35ms +step:1031/2285 train_time:62217ms step_avg:60.35ms +step:1032/2285 train_time:62278ms step_avg:60.35ms +step:1033/2285 train_time:62340ms step_avg:60.35ms +step:1034/2285 train_time:62400ms step_avg:60.35ms +step:1035/2285 train_time:62462ms step_avg:60.35ms +step:1036/2285 train_time:62522ms step_avg:60.35ms +step:1037/2285 train_time:62584ms step_avg:60.35ms +step:1038/2285 train_time:62643ms step_avg:60.35ms +step:1039/2285 train_time:62706ms step_avg:60.35ms +step:1040/2285 train_time:62766ms step_avg:60.35ms +step:1041/2285 train_time:62827ms step_avg:60.35ms +step:1042/2285 train_time:62887ms step_avg:60.35ms +step:1043/2285 train_time:62949ms step_avg:60.35ms +step:1044/2285 train_time:63008ms step_avg:60.35ms +step:1045/2285 train_time:63070ms step_avg:60.35ms +step:1046/2285 train_time:63130ms step_avg:60.35ms +step:1047/2285 train_time:63192ms step_avg:60.35ms +step:1048/2285 train_time:63252ms step_avg:60.35ms +step:1049/2285 train_time:63314ms step_avg:60.36ms +step:1050/2285 train_time:63374ms step_avg:60.36ms +step:1051/2285 train_time:63436ms step_avg:60.36ms +step:1052/2285 train_time:63495ms step_avg:60.36ms +step:1053/2285 train_time:63557ms step_avg:60.36ms +step:1054/2285 train_time:63617ms step_avg:60.36ms +step:1055/2285 train_time:63679ms step_avg:60.36ms +step:1056/2285 train_time:63738ms step_avg:60.36ms +step:1057/2285 train_time:63800ms step_avg:60.36ms +step:1058/2285 train_time:63860ms step_avg:60.36ms +step:1059/2285 train_time:63922ms 
step_avg:60.36ms +step:1060/2285 train_time:63981ms step_avg:60.36ms +step:1061/2285 train_time:64043ms step_avg:60.36ms +step:1062/2285 train_time:64103ms step_avg:60.36ms +step:1063/2285 train_time:64165ms step_avg:60.36ms +step:1064/2285 train_time:64224ms step_avg:60.36ms +step:1065/2285 train_time:64286ms step_avg:60.36ms +step:1066/2285 train_time:64347ms step_avg:60.36ms +step:1067/2285 train_time:64409ms step_avg:60.36ms +step:1068/2285 train_time:64468ms step_avg:60.36ms +step:1069/2285 train_time:64530ms step_avg:60.37ms +step:1070/2285 train_time:64590ms step_avg:60.36ms +step:1071/2285 train_time:64652ms step_avg:60.37ms +step:1072/2285 train_time:64712ms step_avg:60.37ms +step:1073/2285 train_time:64774ms step_avg:60.37ms +step:1074/2285 train_time:64833ms step_avg:60.37ms +step:1075/2285 train_time:64896ms step_avg:60.37ms +step:1076/2285 train_time:64955ms step_avg:60.37ms +step:1077/2285 train_time:65017ms step_avg:60.37ms +step:1078/2285 train_time:65076ms step_avg:60.37ms +step:1079/2285 train_time:65137ms step_avg:60.37ms +step:1080/2285 train_time:65197ms step_avg:60.37ms +step:1081/2285 train_time:65259ms step_avg:60.37ms +step:1082/2285 train_time:65319ms step_avg:60.37ms +step:1083/2285 train_time:65382ms step_avg:60.37ms +step:1084/2285 train_time:65441ms step_avg:60.37ms +step:1085/2285 train_time:65503ms step_avg:60.37ms +step:1086/2285 train_time:65562ms step_avg:60.37ms +step:1087/2285 train_time:65624ms step_avg:60.37ms +step:1088/2285 train_time:65683ms step_avg:60.37ms +step:1089/2285 train_time:65745ms step_avg:60.37ms +step:1090/2285 train_time:65805ms step_avg:60.37ms +step:1091/2285 train_time:65867ms step_avg:60.37ms +step:1092/2285 train_time:65927ms step_avg:60.37ms +step:1093/2285 train_time:65990ms step_avg:60.38ms +step:1094/2285 train_time:66051ms step_avg:60.38ms +step:1095/2285 train_time:66112ms step_avg:60.38ms +step:1096/2285 train_time:66172ms step_avg:60.38ms +step:1097/2285 train_time:66235ms step_avg:60.38ms +step:1098/2285 train_time:66294ms step_avg:60.38ms +step:1099/2285 train_time:66356ms step_avg:60.38ms +step:1100/2285 train_time:66416ms step_avg:60.38ms +step:1101/2285 train_time:66478ms step_avg:60.38ms +step:1102/2285 train_time:66537ms step_avg:60.38ms +step:1103/2285 train_time:66598ms step_avg:60.38ms +step:1104/2285 train_time:66658ms step_avg:60.38ms +step:1105/2285 train_time:66720ms step_avg:60.38ms +step:1106/2285 train_time:66779ms step_avg:60.38ms +step:1107/2285 train_time:66842ms step_avg:60.38ms +step:1108/2285 train_time:66901ms step_avg:60.38ms +step:1109/2285 train_time:66963ms step_avg:60.38ms +step:1110/2285 train_time:67023ms step_avg:60.38ms +step:1111/2285 train_time:67084ms step_avg:60.38ms +step:1112/2285 train_time:67144ms step_avg:60.38ms +step:1113/2285 train_time:67206ms step_avg:60.38ms +step:1114/2285 train_time:67265ms step_avg:60.38ms +step:1115/2285 train_time:67327ms step_avg:60.38ms +step:1116/2285 train_time:67386ms step_avg:60.38ms +step:1117/2285 train_time:67449ms step_avg:60.38ms +step:1118/2285 train_time:67509ms step_avg:60.38ms +step:1119/2285 train_time:67571ms step_avg:60.38ms +step:1120/2285 train_time:67631ms step_avg:60.39ms +step:1121/2285 train_time:67693ms step_avg:60.39ms +step:1122/2285 train_time:67753ms step_avg:60.39ms +step:1123/2285 train_time:67815ms step_avg:60.39ms +step:1124/2285 train_time:67874ms step_avg:60.39ms +step:1125/2285 train_time:67937ms step_avg:60.39ms +step:1126/2285 train_time:67996ms step_avg:60.39ms +step:1127/2285 train_time:68058ms step_avg:60.39ms 
+step:1128/2285 train_time:68117ms step_avg:60.39ms +step:1129/2285 train_time:68179ms step_avg:60.39ms +step:1130/2285 train_time:68239ms step_avg:60.39ms +step:1131/2285 train_time:68301ms step_avg:60.39ms +step:1132/2285 train_time:68361ms step_avg:60.39ms +step:1133/2285 train_time:68422ms step_avg:60.39ms +step:1134/2285 train_time:68481ms step_avg:60.39ms +step:1135/2285 train_time:68543ms step_avg:60.39ms +step:1136/2285 train_time:68603ms step_avg:60.39ms +step:1137/2285 train_time:68664ms step_avg:60.39ms +step:1138/2285 train_time:68723ms step_avg:60.39ms +step:1139/2285 train_time:68785ms step_avg:60.39ms +step:1140/2285 train_time:68846ms step_avg:60.39ms +step:1141/2285 train_time:68908ms step_avg:60.39ms +step:1142/2285 train_time:68967ms step_avg:60.39ms +step:1143/2285 train_time:69029ms step_avg:60.39ms +step:1144/2285 train_time:69089ms step_avg:60.39ms +step:1145/2285 train_time:69152ms step_avg:60.39ms +step:1146/2285 train_time:69211ms step_avg:60.39ms +step:1147/2285 train_time:69274ms step_avg:60.40ms +step:1148/2285 train_time:69334ms step_avg:60.40ms +step:1149/2285 train_time:69396ms step_avg:60.40ms +step:1150/2285 train_time:69456ms step_avg:60.40ms +step:1151/2285 train_time:69517ms step_avg:60.40ms +step:1152/2285 train_time:69577ms step_avg:60.40ms +step:1153/2285 train_time:69639ms step_avg:60.40ms +step:1154/2285 train_time:69699ms step_avg:60.40ms +step:1155/2285 train_time:69761ms step_avg:60.40ms +step:1156/2285 train_time:69821ms step_avg:60.40ms +step:1157/2285 train_time:69883ms step_avg:60.40ms +step:1158/2285 train_time:69942ms step_avg:60.40ms +step:1159/2285 train_time:70004ms step_avg:60.40ms +step:1160/2285 train_time:70064ms step_avg:60.40ms +step:1161/2285 train_time:70126ms step_avg:60.40ms +step:1162/2285 train_time:70186ms step_avg:60.40ms +step:1163/2285 train_time:70248ms step_avg:60.40ms +step:1164/2285 train_time:70309ms step_avg:60.40ms +step:1165/2285 train_time:70371ms step_avg:60.40ms +step:1166/2285 train_time:70432ms step_avg:60.40ms +step:1167/2285 train_time:70495ms step_avg:60.41ms +step:1168/2285 train_time:70555ms step_avg:60.41ms +step:1169/2285 train_time:70617ms step_avg:60.41ms +step:1170/2285 train_time:70676ms step_avg:60.41ms +step:1171/2285 train_time:70738ms step_avg:60.41ms +step:1172/2285 train_time:70797ms step_avg:60.41ms +step:1173/2285 train_time:70859ms step_avg:60.41ms +step:1174/2285 train_time:70919ms step_avg:60.41ms +step:1175/2285 train_time:70982ms step_avg:60.41ms +step:1176/2285 train_time:71041ms step_avg:60.41ms +step:1177/2285 train_time:71103ms step_avg:60.41ms +step:1178/2285 train_time:71162ms step_avg:60.41ms +step:1179/2285 train_time:71225ms step_avg:60.41ms +step:1180/2285 train_time:71284ms step_avg:60.41ms +step:1181/2285 train_time:71347ms step_avg:60.41ms +step:1182/2285 train_time:71407ms step_avg:60.41ms +step:1183/2285 train_time:71469ms step_avg:60.41ms +step:1184/2285 train_time:71529ms step_avg:60.41ms +step:1185/2285 train_time:71592ms step_avg:60.42ms +step:1186/2285 train_time:71652ms step_avg:60.41ms +step:1187/2285 train_time:71714ms step_avg:60.42ms +step:1188/2285 train_time:71774ms step_avg:60.42ms +step:1189/2285 train_time:71836ms step_avg:60.42ms +step:1190/2285 train_time:71896ms step_avg:60.42ms +step:1191/2285 train_time:71958ms step_avg:60.42ms +step:1192/2285 train_time:72018ms step_avg:60.42ms +step:1193/2285 train_time:72080ms step_avg:60.42ms +step:1194/2285 train_time:72139ms step_avg:60.42ms +step:1195/2285 train_time:72202ms step_avg:60.42ms +step:1196/2285 
train_time:72261ms step_avg:60.42ms +step:1197/2285 train_time:72323ms step_avg:60.42ms +step:1198/2285 train_time:72383ms step_avg:60.42ms +step:1199/2285 train_time:72445ms step_avg:60.42ms +step:1200/2285 train_time:72504ms step_avg:60.42ms +step:1201/2285 train_time:72567ms step_avg:60.42ms +step:1202/2285 train_time:72627ms step_avg:60.42ms +step:1203/2285 train_time:72689ms step_avg:60.42ms +step:1204/2285 train_time:72748ms step_avg:60.42ms +step:1205/2285 train_time:72810ms step_avg:60.42ms +step:1206/2285 train_time:72872ms step_avg:60.42ms +step:1207/2285 train_time:72935ms step_avg:60.43ms +step:1208/2285 train_time:72994ms step_avg:60.43ms +step:1209/2285 train_time:73056ms step_avg:60.43ms +step:1210/2285 train_time:73116ms step_avg:60.43ms +step:1211/2285 train_time:73178ms step_avg:60.43ms +step:1212/2285 train_time:73238ms step_avg:60.43ms +step:1213/2285 train_time:73301ms step_avg:60.43ms +step:1214/2285 train_time:73361ms step_avg:60.43ms +step:1215/2285 train_time:73423ms step_avg:60.43ms +step:1216/2285 train_time:73482ms step_avg:60.43ms +step:1217/2285 train_time:73544ms step_avg:60.43ms +step:1218/2285 train_time:73604ms step_avg:60.43ms +step:1219/2285 train_time:73667ms step_avg:60.43ms +step:1220/2285 train_time:73726ms step_avg:60.43ms +step:1221/2285 train_time:73788ms step_avg:60.43ms +step:1222/2285 train_time:73849ms step_avg:60.43ms +step:1223/2285 train_time:73912ms step_avg:60.43ms +step:1224/2285 train_time:73972ms step_avg:60.43ms +step:1225/2285 train_time:74034ms step_avg:60.44ms +step:1226/2285 train_time:74094ms step_avg:60.44ms +step:1227/2285 train_time:74157ms step_avg:60.44ms +step:1228/2285 train_time:74217ms step_avg:60.44ms +step:1229/2285 train_time:74279ms step_avg:60.44ms +step:1230/2285 train_time:74339ms step_avg:60.44ms +step:1231/2285 train_time:74402ms step_avg:60.44ms +step:1232/2285 train_time:74462ms step_avg:60.44ms +step:1233/2285 train_time:74524ms step_avg:60.44ms +step:1234/2285 train_time:74583ms step_avg:60.44ms +step:1235/2285 train_time:74645ms step_avg:60.44ms +step:1236/2285 train_time:74705ms step_avg:60.44ms +step:1237/2285 train_time:74768ms step_avg:60.44ms +step:1238/2285 train_time:74828ms step_avg:60.44ms +step:1239/2285 train_time:74890ms step_avg:60.44ms +step:1240/2285 train_time:74950ms step_avg:60.44ms +step:1241/2285 train_time:75012ms step_avg:60.44ms +step:1242/2285 train_time:75072ms step_avg:60.44ms +step:1243/2285 train_time:75135ms step_avg:60.45ms +step:1244/2285 train_time:75195ms step_avg:60.45ms +step:1245/2285 train_time:75257ms step_avg:60.45ms +step:1246/2285 train_time:75316ms step_avg:60.45ms +step:1247/2285 train_time:75378ms step_avg:60.45ms +step:1248/2285 train_time:75438ms step_avg:60.45ms +step:1249/2285 train_time:75499ms step_avg:60.45ms +step:1250/2285 train_time:75560ms step_avg:60.45ms +step:1250/2285 val_loss:3.4957 train_time:75624ms step_avg:60.50ms +step:1251/2285 train_time:75642ms step_avg:60.47ms +step:1252/2285 train_time:75684ms step_avg:60.45ms +step:1253/2285 train_time:75747ms step_avg:60.45ms +step:1254/2285 train_time:75806ms step_avg:60.45ms +step:1255/2285 train_time:75869ms step_avg:60.45ms +step:1256/2285 train_time:75928ms step_avg:60.45ms +step:1257/2285 train_time:75990ms step_avg:60.45ms +step:1258/2285 train_time:76049ms step_avg:60.45ms +step:1259/2285 train_time:76110ms step_avg:60.45ms +step:1260/2285 train_time:76169ms step_avg:60.45ms +step:1261/2285 train_time:76230ms step_avg:60.45ms +step:1262/2285 train_time:76289ms step_avg:60.45ms +step:1263/2285 
train_time:76350ms step_avg:60.45ms +step:1264/2285 train_time:76409ms step_avg:60.45ms +step:1265/2285 train_time:76470ms step_avg:60.45ms +step:1266/2285 train_time:76536ms step_avg:60.45ms +step:1267/2285 train_time:76602ms step_avg:60.46ms +step:1268/2285 train_time:76663ms step_avg:60.46ms +step:1269/2285 train_time:76726ms step_avg:60.46ms +step:1270/2285 train_time:76785ms step_avg:60.46ms +step:1271/2285 train_time:76848ms step_avg:60.46ms +step:1272/2285 train_time:76907ms step_avg:60.46ms +step:1273/2285 train_time:76968ms step_avg:60.46ms +step:1274/2285 train_time:77028ms step_avg:60.46ms +step:1275/2285 train_time:77089ms step_avg:60.46ms +step:1276/2285 train_time:77148ms step_avg:60.46ms +step:1277/2285 train_time:77210ms step_avg:60.46ms +step:1278/2285 train_time:77268ms step_avg:60.46ms +step:1279/2285 train_time:77330ms step_avg:60.46ms +step:1280/2285 train_time:77389ms step_avg:60.46ms +step:1281/2285 train_time:77451ms step_avg:60.46ms +step:1282/2285 train_time:77513ms step_avg:60.46ms +step:1283/2285 train_time:77578ms step_avg:60.47ms +step:1284/2285 train_time:77638ms step_avg:60.47ms +step:1285/2285 train_time:77700ms step_avg:60.47ms +step:1286/2285 train_time:77760ms step_avg:60.47ms +step:1287/2285 train_time:77822ms step_avg:60.47ms +step:1288/2285 train_time:77882ms step_avg:60.47ms +step:1289/2285 train_time:77944ms step_avg:60.47ms +step:1290/2285 train_time:78003ms step_avg:60.47ms +step:1291/2285 train_time:78065ms step_avg:60.47ms +step:1292/2285 train_time:78124ms step_avg:60.47ms +step:1293/2285 train_time:78186ms step_avg:60.47ms +step:1294/2285 train_time:78246ms step_avg:60.47ms +step:1295/2285 train_time:78307ms step_avg:60.47ms +step:1296/2285 train_time:78366ms step_avg:60.47ms +step:1297/2285 train_time:78429ms step_avg:60.47ms +step:1298/2285 train_time:78489ms step_avg:60.47ms +step:1299/2285 train_time:78553ms step_avg:60.47ms +step:1300/2285 train_time:78613ms step_avg:60.47ms +step:1301/2285 train_time:78677ms step_avg:60.47ms +step:1302/2285 train_time:78737ms step_avg:60.47ms +step:1303/2285 train_time:78799ms step_avg:60.48ms +step:1304/2285 train_time:78859ms step_avg:60.47ms +step:1305/2285 train_time:78921ms step_avg:60.48ms +step:1306/2285 train_time:78980ms step_avg:60.48ms +step:1307/2285 train_time:79043ms step_avg:60.48ms +step:1308/2285 train_time:79102ms step_avg:60.48ms +step:1309/2285 train_time:79164ms step_avg:60.48ms +step:1310/2285 train_time:79223ms step_avg:60.48ms +step:1311/2285 train_time:79285ms step_avg:60.48ms +step:1312/2285 train_time:79345ms step_avg:60.48ms +step:1313/2285 train_time:79408ms step_avg:60.48ms +step:1314/2285 train_time:79468ms step_avg:60.48ms +step:1315/2285 train_time:79530ms step_avg:60.48ms +step:1316/2285 train_time:79590ms step_avg:60.48ms +step:1317/2285 train_time:79653ms step_avg:60.48ms +step:1318/2285 train_time:79714ms step_avg:60.48ms +step:1319/2285 train_time:79777ms step_avg:60.48ms +step:1320/2285 train_time:79837ms step_avg:60.48ms +step:1321/2285 train_time:79899ms step_avg:60.48ms +step:1322/2285 train_time:79958ms step_avg:60.48ms +step:1323/2285 train_time:80020ms step_avg:60.48ms +step:1324/2285 train_time:80080ms step_avg:60.48ms +step:1325/2285 train_time:80142ms step_avg:60.48ms +step:1326/2285 train_time:80202ms step_avg:60.48ms +step:1327/2285 train_time:80264ms step_avg:60.49ms +step:1328/2285 train_time:80323ms step_avg:60.48ms +step:1329/2285 train_time:80385ms step_avg:60.49ms +step:1330/2285 train_time:80445ms step_avg:60.48ms +step:1331/2285 train_time:80507ms 
step_avg:60.49ms +step:1332/2285 train_time:80568ms step_avg:60.49ms +step:1333/2285 train_time:80630ms step_avg:60.49ms +step:1334/2285 train_time:80691ms step_avg:60.49ms +step:1335/2285 train_time:80753ms step_avg:60.49ms +step:1336/2285 train_time:80813ms step_avg:60.49ms +step:1337/2285 train_time:80875ms step_avg:60.49ms +step:1338/2285 train_time:80936ms step_avg:60.49ms +step:1339/2285 train_time:80998ms step_avg:60.49ms +step:1340/2285 train_time:81057ms step_avg:60.49ms +step:1341/2285 train_time:81120ms step_avg:60.49ms +step:1342/2285 train_time:81180ms step_avg:60.49ms +step:1343/2285 train_time:81242ms step_avg:60.49ms +step:1344/2285 train_time:81302ms step_avg:60.49ms +step:1345/2285 train_time:81364ms step_avg:60.49ms +step:1346/2285 train_time:81424ms step_avg:60.49ms +step:1347/2285 train_time:81485ms step_avg:60.49ms +step:1348/2285 train_time:81545ms step_avg:60.49ms +step:1349/2285 train_time:81608ms step_avg:60.49ms +step:1350/2285 train_time:81668ms step_avg:60.49ms +step:1351/2285 train_time:81730ms step_avg:60.50ms +step:1352/2285 train_time:81790ms step_avg:60.50ms +step:1353/2285 train_time:81853ms step_avg:60.50ms +step:1354/2285 train_time:81912ms step_avg:60.50ms +step:1355/2285 train_time:81975ms step_avg:60.50ms +step:1356/2285 train_time:82035ms step_avg:60.50ms +step:1357/2285 train_time:82098ms step_avg:60.50ms +step:1358/2285 train_time:82158ms step_avg:60.50ms +step:1359/2285 train_time:82220ms step_avg:60.50ms +step:1360/2285 train_time:82280ms step_avg:60.50ms +step:1361/2285 train_time:82342ms step_avg:60.50ms +step:1362/2285 train_time:82402ms step_avg:60.50ms +step:1363/2285 train_time:82464ms step_avg:60.50ms +step:1364/2285 train_time:82524ms step_avg:60.50ms +step:1365/2285 train_time:82585ms step_avg:60.50ms +step:1366/2285 train_time:82645ms step_avg:60.50ms +step:1367/2285 train_time:82707ms step_avg:60.50ms +step:1368/2285 train_time:82767ms step_avg:60.50ms +step:1369/2285 train_time:82829ms step_avg:60.50ms +step:1370/2285 train_time:82889ms step_avg:60.50ms +step:1371/2285 train_time:82952ms step_avg:60.50ms +step:1372/2285 train_time:83012ms step_avg:60.50ms +step:1373/2285 train_time:83075ms step_avg:60.51ms +step:1374/2285 train_time:83135ms step_avg:60.51ms +step:1375/2285 train_time:83197ms step_avg:60.51ms +step:1376/2285 train_time:83257ms step_avg:60.51ms +step:1377/2285 train_time:83319ms step_avg:60.51ms +step:1378/2285 train_time:83378ms step_avg:60.51ms +step:1379/2285 train_time:83440ms step_avg:60.51ms +step:1380/2285 train_time:83500ms step_avg:60.51ms +step:1381/2285 train_time:83563ms step_avg:60.51ms +step:1382/2285 train_time:83623ms step_avg:60.51ms +step:1383/2285 train_time:83685ms step_avg:60.51ms +step:1384/2285 train_time:83745ms step_avg:60.51ms +step:1385/2285 train_time:83807ms step_avg:60.51ms +step:1386/2285 train_time:83867ms step_avg:60.51ms +step:1387/2285 train_time:83929ms step_avg:60.51ms +step:1388/2285 train_time:83989ms step_avg:60.51ms +step:1389/2285 train_time:84052ms step_avg:60.51ms +step:1390/2285 train_time:84112ms step_avg:60.51ms +step:1391/2285 train_time:84175ms step_avg:60.51ms +step:1392/2285 train_time:84236ms step_avg:60.51ms +step:1393/2285 train_time:84298ms step_avg:60.52ms +step:1394/2285 train_time:84358ms step_avg:60.52ms +step:1395/2285 train_time:84420ms step_avg:60.52ms +step:1396/2285 train_time:84479ms step_avg:60.52ms +step:1397/2285 train_time:84541ms step_avg:60.52ms +step:1398/2285 train_time:84601ms step_avg:60.52ms +step:1399/2285 train_time:84664ms step_avg:60.52ms 
+step:1400/2285 train_time:84724ms step_avg:60.52ms +step:1401/2285 train_time:84785ms step_avg:60.52ms +step:1402/2285 train_time:84845ms step_avg:60.52ms +step:1403/2285 train_time:84907ms step_avg:60.52ms +step:1404/2285 train_time:84967ms step_avg:60.52ms +step:1405/2285 train_time:85029ms step_avg:60.52ms +step:1406/2285 train_time:85089ms step_avg:60.52ms +step:1407/2285 train_time:85152ms step_avg:60.52ms +step:1408/2285 train_time:85212ms step_avg:60.52ms +step:1409/2285 train_time:85274ms step_avg:60.52ms +step:1410/2285 train_time:85334ms step_avg:60.52ms +step:1411/2285 train_time:85397ms step_avg:60.52ms +step:1412/2285 train_time:85457ms step_avg:60.52ms +step:1413/2285 train_time:85519ms step_avg:60.52ms +step:1414/2285 train_time:85578ms step_avg:60.52ms +step:1415/2285 train_time:85640ms step_avg:60.52ms +step:1416/2285 train_time:85700ms step_avg:60.52ms +step:1417/2285 train_time:85762ms step_avg:60.52ms +step:1418/2285 train_time:85822ms step_avg:60.52ms +step:1419/2285 train_time:85884ms step_avg:60.52ms +step:1420/2285 train_time:85944ms step_avg:60.52ms +step:1421/2285 train_time:86007ms step_avg:60.53ms +step:1422/2285 train_time:86067ms step_avg:60.53ms +step:1423/2285 train_time:86129ms step_avg:60.53ms +step:1424/2285 train_time:86188ms step_avg:60.53ms +step:1425/2285 train_time:86250ms step_avg:60.53ms +step:1426/2285 train_time:86311ms step_avg:60.53ms +step:1427/2285 train_time:86373ms step_avg:60.53ms +step:1428/2285 train_time:86433ms step_avg:60.53ms +step:1429/2285 train_time:86496ms step_avg:60.53ms +step:1430/2285 train_time:86556ms step_avg:60.53ms +step:1431/2285 train_time:86618ms step_avg:60.53ms +step:1432/2285 train_time:86677ms step_avg:60.53ms +step:1433/2285 train_time:86739ms step_avg:60.53ms +step:1434/2285 train_time:86799ms step_avg:60.53ms +step:1435/2285 train_time:86863ms step_avg:60.53ms +step:1436/2285 train_time:86922ms step_avg:60.53ms +step:1437/2285 train_time:86984ms step_avg:60.53ms +step:1438/2285 train_time:87044ms step_avg:60.53ms +step:1439/2285 train_time:87106ms step_avg:60.53ms +step:1440/2285 train_time:87165ms step_avg:60.53ms +step:1441/2285 train_time:87228ms step_avg:60.53ms +step:1442/2285 train_time:87288ms step_avg:60.53ms +step:1443/2285 train_time:87350ms step_avg:60.53ms +step:1444/2285 train_time:87410ms step_avg:60.53ms +step:1445/2285 train_time:87473ms step_avg:60.54ms +step:1446/2285 train_time:87534ms step_avg:60.54ms +step:1447/2285 train_time:87597ms step_avg:60.54ms +step:1448/2285 train_time:87657ms step_avg:60.54ms +step:1449/2285 train_time:87719ms step_avg:60.54ms +step:1450/2285 train_time:87778ms step_avg:60.54ms +step:1451/2285 train_time:87840ms step_avg:60.54ms +step:1452/2285 train_time:87900ms step_avg:60.54ms +step:1453/2285 train_time:87963ms step_avg:60.54ms +step:1454/2285 train_time:88022ms step_avg:60.54ms +step:1455/2285 train_time:88084ms step_avg:60.54ms +step:1456/2285 train_time:88144ms step_avg:60.54ms +step:1457/2285 train_time:88206ms step_avg:60.54ms +step:1458/2285 train_time:88266ms step_avg:60.54ms +step:1459/2285 train_time:88328ms step_avg:60.54ms +step:1460/2285 train_time:88388ms step_avg:60.54ms +step:1461/2285 train_time:88451ms step_avg:60.54ms +step:1462/2285 train_time:88511ms step_avg:60.54ms +step:1463/2285 train_time:88574ms step_avg:60.54ms +step:1464/2285 train_time:88635ms step_avg:60.54ms +step:1465/2285 train_time:88697ms step_avg:60.54ms +step:1466/2285 train_time:88757ms step_avg:60.54ms +step:1467/2285 train_time:88818ms step_avg:60.54ms +step:1468/2285 
train_time:88878ms step_avg:60.54ms +step:1469/2285 train_time:88940ms step_avg:60.54ms +step:1470/2285 train_time:89000ms step_avg:60.54ms +step:1471/2285 train_time:89062ms step_avg:60.55ms +step:1472/2285 train_time:89122ms step_avg:60.54ms +step:1473/2285 train_time:89184ms step_avg:60.55ms +step:1474/2285 train_time:89244ms step_avg:60.55ms +step:1475/2285 train_time:89306ms step_avg:60.55ms +step:1476/2285 train_time:89365ms step_avg:60.55ms +step:1477/2285 train_time:89428ms step_avg:60.55ms +step:1478/2285 train_time:89488ms step_avg:60.55ms +step:1479/2285 train_time:89550ms step_avg:60.55ms +step:1480/2285 train_time:89611ms step_avg:60.55ms +step:1481/2285 train_time:89674ms step_avg:60.55ms +step:1482/2285 train_time:89735ms step_avg:60.55ms +step:1483/2285 train_time:89797ms step_avg:60.55ms +step:1484/2285 train_time:89857ms step_avg:60.55ms +step:1485/2285 train_time:89919ms step_avg:60.55ms +step:1486/2285 train_time:89978ms step_avg:60.55ms +step:1487/2285 train_time:90040ms step_avg:60.55ms +step:1488/2285 train_time:90100ms step_avg:60.55ms +step:1489/2285 train_time:90162ms step_avg:60.55ms +step:1490/2285 train_time:90222ms step_avg:60.55ms +step:1491/2285 train_time:90284ms step_avg:60.55ms +step:1492/2285 train_time:90344ms step_avg:60.55ms +step:1493/2285 train_time:90406ms step_avg:60.55ms +step:1494/2285 train_time:90466ms step_avg:60.55ms +step:1495/2285 train_time:90529ms step_avg:60.55ms +step:1496/2285 train_time:90589ms step_avg:60.55ms +step:1497/2285 train_time:90652ms step_avg:60.56ms +step:1498/2285 train_time:90712ms step_avg:60.56ms +step:1499/2285 train_time:90774ms step_avg:60.56ms +step:1500/2285 train_time:90834ms step_avg:60.56ms +step:1500/2285 val_loss:3.4280 train_time:90899ms step_avg:60.60ms +step:1501/2285 train_time:90917ms step_avg:60.57ms +step:1502/2285 train_time:90960ms step_avg:60.56ms +step:1503/2285 train_time:91026ms step_avg:60.56ms +step:1504/2285 train_time:91089ms step_avg:60.56ms +step:1505/2285 train_time:91151ms step_avg:60.57ms +step:1506/2285 train_time:91211ms step_avg:60.57ms +step:1507/2285 train_time:91272ms step_avg:60.57ms +step:1508/2285 train_time:91331ms step_avg:60.56ms +step:1509/2285 train_time:91393ms step_avg:60.57ms +step:1510/2285 train_time:91452ms step_avg:60.56ms +step:1511/2285 train_time:91514ms step_avg:60.57ms +step:1512/2285 train_time:91573ms step_avg:60.56ms +step:1513/2285 train_time:91635ms step_avg:60.57ms +step:1514/2285 train_time:91694ms step_avg:60.56ms +step:1515/2285 train_time:91756ms step_avg:60.56ms +step:1516/2285 train_time:91815ms step_avg:60.56ms +step:1517/2285 train_time:91878ms step_avg:60.57ms +step:1518/2285 train_time:91940ms step_avg:60.57ms +step:1519/2285 train_time:92004ms step_avg:60.57ms +step:1520/2285 train_time:92064ms step_avg:60.57ms +step:1521/2285 train_time:92128ms step_avg:60.57ms +step:1522/2285 train_time:92188ms step_avg:60.57ms +step:1523/2285 train_time:92250ms step_avg:60.57ms +step:1524/2285 train_time:92309ms step_avg:60.57ms +step:1525/2285 train_time:92372ms step_avg:60.57ms +step:1526/2285 train_time:92431ms step_avg:60.57ms +step:1527/2285 train_time:92493ms step_avg:60.57ms +step:1528/2285 train_time:92552ms step_avg:60.57ms +step:1529/2285 train_time:92614ms step_avg:60.57ms +step:1530/2285 train_time:92674ms step_avg:60.57ms +step:1531/2285 train_time:92735ms step_avg:60.57ms +step:1532/2285 train_time:92795ms step_avg:60.57ms +step:1533/2285 train_time:92858ms step_avg:60.57ms +step:1534/2285 train_time:92919ms step_avg:60.57ms +step:1535/2285 
train_time:92982ms step_avg:60.57ms +step:1536/2285 train_time:93042ms step_avg:60.57ms +step:1537/2285 train_time:93105ms step_avg:60.58ms +step:1538/2285 train_time:93165ms step_avg:60.58ms +step:1539/2285 train_time:93227ms step_avg:60.58ms +step:1540/2285 train_time:93287ms step_avg:60.58ms +step:1541/2285 train_time:93349ms step_avg:60.58ms +step:1542/2285 train_time:93409ms step_avg:60.58ms +step:1543/2285 train_time:93472ms step_avg:60.58ms +step:1544/2285 train_time:93532ms step_avg:60.58ms +step:1545/2285 train_time:93594ms step_avg:60.58ms +step:1546/2285 train_time:93654ms step_avg:60.58ms +step:1547/2285 train_time:93716ms step_avg:60.58ms +step:1548/2285 train_time:93776ms step_avg:60.58ms +step:1549/2285 train_time:93838ms step_avg:60.58ms +step:1550/2285 train_time:93899ms step_avg:60.58ms +step:1551/2285 train_time:93962ms step_avg:60.58ms +step:1552/2285 train_time:94021ms step_avg:60.58ms +step:1553/2285 train_time:94084ms step_avg:60.58ms +step:1554/2285 train_time:94144ms step_avg:60.58ms +step:1555/2285 train_time:94207ms step_avg:60.58ms +step:1556/2285 train_time:94267ms step_avg:60.58ms +step:1557/2285 train_time:94330ms step_avg:60.58ms +step:1558/2285 train_time:94390ms step_avg:60.58ms +step:1559/2285 train_time:94452ms step_avg:60.58ms +step:1560/2285 train_time:94511ms step_avg:60.58ms +step:1561/2285 train_time:94574ms step_avg:60.59ms +step:1562/2285 train_time:94634ms step_avg:60.59ms +step:1563/2285 train_time:94696ms step_avg:60.59ms +step:1564/2285 train_time:94756ms step_avg:60.59ms +step:1565/2285 train_time:94818ms step_avg:60.59ms +step:1566/2285 train_time:94878ms step_avg:60.59ms +step:1567/2285 train_time:94940ms step_avg:60.59ms +step:1568/2285 train_time:95001ms step_avg:60.59ms +step:1569/2285 train_time:95064ms step_avg:60.59ms +step:1570/2285 train_time:95124ms step_avg:60.59ms +step:1571/2285 train_time:95186ms step_avg:60.59ms +step:1572/2285 train_time:95246ms step_avg:60.59ms +step:1573/2285 train_time:95309ms step_avg:60.59ms +step:1574/2285 train_time:95369ms step_avg:60.59ms +step:1575/2285 train_time:95431ms step_avg:60.59ms +step:1576/2285 train_time:95491ms step_avg:60.59ms +step:1577/2285 train_time:95553ms step_avg:60.59ms +step:1578/2285 train_time:95613ms step_avg:60.59ms +step:1579/2285 train_time:95676ms step_avg:60.59ms +step:1580/2285 train_time:95736ms step_avg:60.59ms +step:1581/2285 train_time:95798ms step_avg:60.59ms +step:1582/2285 train_time:95859ms step_avg:60.59ms +step:1583/2285 train_time:95921ms step_avg:60.59ms +step:1584/2285 train_time:95981ms step_avg:60.59ms +step:1585/2285 train_time:96043ms step_avg:60.60ms +step:1586/2285 train_time:96103ms step_avg:60.59ms +step:1587/2285 train_time:96166ms step_avg:60.60ms +step:1588/2285 train_time:96226ms step_avg:60.60ms +step:1589/2285 train_time:96288ms step_avg:60.60ms +step:1590/2285 train_time:96348ms step_avg:60.60ms +step:1591/2285 train_time:96411ms step_avg:60.60ms +step:1592/2285 train_time:96471ms step_avg:60.60ms +step:1593/2285 train_time:96534ms step_avg:60.60ms +step:1594/2285 train_time:96594ms step_avg:60.60ms +step:1595/2285 train_time:96656ms step_avg:60.60ms +step:1596/2285 train_time:96716ms step_avg:60.60ms +step:1597/2285 train_time:96779ms step_avg:60.60ms +step:1598/2285 train_time:96839ms step_avg:60.60ms +step:1599/2285 train_time:96901ms step_avg:60.60ms +step:1600/2285 train_time:96960ms step_avg:60.60ms +step:1601/2285 train_time:97023ms step_avg:60.60ms +step:1602/2285 train_time:97083ms step_avg:60.60ms +step:1603/2285 train_time:97145ms 
step_avg:60.60ms +step:1604/2285 train_time:97205ms step_avg:60.60ms +step:1605/2285 train_time:97267ms step_avg:60.60ms +step:1606/2285 train_time:97327ms step_avg:60.60ms +step:1607/2285 train_time:97390ms step_avg:60.60ms +step:1608/2285 train_time:97449ms step_avg:60.60ms +step:1609/2285 train_time:97512ms step_avg:60.60ms +step:1610/2285 train_time:97573ms step_avg:60.60ms +step:1611/2285 train_time:97636ms step_avg:60.61ms +step:1612/2285 train_time:97696ms step_avg:60.61ms +step:1613/2285 train_time:97759ms step_avg:60.61ms +step:1614/2285 train_time:97818ms step_avg:60.61ms +step:1615/2285 train_time:97881ms step_avg:60.61ms +step:1616/2285 train_time:97941ms step_avg:60.61ms +step:1617/2285 train_time:98003ms step_avg:60.61ms +step:1618/2285 train_time:98062ms step_avg:60.61ms +step:1619/2285 train_time:98124ms step_avg:60.61ms +step:1620/2285 train_time:98184ms step_avg:60.61ms +step:1621/2285 train_time:98246ms step_avg:60.61ms +step:1622/2285 train_time:98306ms step_avg:60.61ms +step:1623/2285 train_time:98368ms step_avg:60.61ms +step:1624/2285 train_time:98429ms step_avg:60.61ms +step:1625/2285 train_time:98492ms step_avg:60.61ms +step:1626/2285 train_time:98552ms step_avg:60.61ms +step:1627/2285 train_time:98614ms step_avg:60.61ms +step:1628/2285 train_time:98674ms step_avg:60.61ms +step:1629/2285 train_time:98737ms step_avg:60.61ms +step:1630/2285 train_time:98797ms step_avg:60.61ms +step:1631/2285 train_time:98859ms step_avg:60.61ms +step:1632/2285 train_time:98919ms step_avg:60.61ms +step:1633/2285 train_time:98982ms step_avg:60.61ms +step:1634/2285 train_time:99041ms step_avg:60.61ms +step:1635/2285 train_time:99103ms step_avg:60.61ms +step:1636/2285 train_time:99163ms step_avg:60.61ms +step:1637/2285 train_time:99225ms step_avg:60.61ms +step:1638/2285 train_time:99285ms step_avg:60.61ms +step:1639/2285 train_time:99347ms step_avg:60.61ms +step:1640/2285 train_time:99408ms step_avg:60.61ms +step:1641/2285 train_time:99470ms step_avg:60.62ms +step:1642/2285 train_time:99531ms step_avg:60.62ms +step:1643/2285 train_time:99594ms step_avg:60.62ms +step:1644/2285 train_time:99654ms step_avg:60.62ms +step:1645/2285 train_time:99716ms step_avg:60.62ms +step:1646/2285 train_time:99777ms step_avg:60.62ms +step:1647/2285 train_time:99839ms step_avg:60.62ms +step:1648/2285 train_time:99899ms step_avg:60.62ms +step:1649/2285 train_time:99961ms step_avg:60.62ms +step:1650/2285 train_time:100021ms step_avg:60.62ms +step:1651/2285 train_time:100084ms step_avg:60.62ms +step:1652/2285 train_time:100144ms step_avg:60.62ms +step:1653/2285 train_time:100206ms step_avg:60.62ms +step:1654/2285 train_time:100265ms step_avg:60.62ms +step:1655/2285 train_time:100327ms step_avg:60.62ms +step:1656/2285 train_time:100387ms step_avg:60.62ms +step:1657/2285 train_time:100450ms step_avg:60.62ms +step:1658/2285 train_time:100511ms step_avg:60.62ms +step:1659/2285 train_time:100574ms step_avg:60.62ms +step:1660/2285 train_time:100634ms step_avg:60.62ms +step:1661/2285 train_time:100696ms step_avg:60.62ms +step:1662/2285 train_time:100756ms step_avg:60.62ms +step:1663/2285 train_time:100818ms step_avg:60.62ms +step:1664/2285 train_time:100878ms step_avg:60.62ms +step:1665/2285 train_time:100941ms step_avg:60.63ms +step:1666/2285 train_time:101001ms step_avg:60.62ms +step:1667/2285 train_time:101063ms step_avg:60.63ms +step:1668/2285 train_time:101122ms step_avg:60.62ms +step:1669/2285 train_time:101184ms step_avg:60.63ms +step:1670/2285 train_time:101244ms step_avg:60.63ms +step:1671/2285 
train_time:101306ms step_avg:60.63ms +step:1672/2285 train_time:101367ms step_avg:60.63ms +step:1673/2285 train_time:101429ms step_avg:60.63ms +step:1674/2285 train_time:101489ms step_avg:60.63ms +step:1675/2285 train_time:101553ms step_avg:60.63ms +step:1676/2285 train_time:101613ms step_avg:60.63ms +step:1677/2285 train_time:101676ms step_avg:60.63ms +step:1678/2285 train_time:101736ms step_avg:60.63ms +step:1679/2285 train_time:101798ms step_avg:60.63ms +step:1680/2285 train_time:101858ms step_avg:60.63ms +step:1681/2285 train_time:101921ms step_avg:60.63ms +step:1682/2285 train_time:101980ms step_avg:60.63ms +step:1683/2285 train_time:102043ms step_avg:60.63ms +step:1684/2285 train_time:102103ms step_avg:60.63ms +step:1685/2285 train_time:102165ms step_avg:60.63ms +step:1686/2285 train_time:102225ms step_avg:60.63ms +step:1687/2285 train_time:102287ms step_avg:60.63ms +step:1688/2285 train_time:102347ms step_avg:60.63ms +step:1689/2285 train_time:102410ms step_avg:60.63ms +step:1690/2285 train_time:102471ms step_avg:60.63ms +step:1691/2285 train_time:102533ms step_avg:60.63ms +step:1692/2285 train_time:102594ms step_avg:60.63ms +step:1693/2285 train_time:102655ms step_avg:60.64ms +step:1694/2285 train_time:102715ms step_avg:60.63ms +step:1695/2285 train_time:102778ms step_avg:60.64ms +step:1696/2285 train_time:102838ms step_avg:60.64ms +step:1697/2285 train_time:102900ms step_avg:60.64ms +step:1698/2285 train_time:102960ms step_avg:60.64ms +step:1699/2285 train_time:103022ms step_avg:60.64ms +step:1700/2285 train_time:103082ms step_avg:60.64ms +step:1701/2285 train_time:103144ms step_avg:60.64ms +step:1702/2285 train_time:103204ms step_avg:60.64ms +step:1703/2285 train_time:103266ms step_avg:60.64ms +step:1704/2285 train_time:103325ms step_avg:60.64ms +step:1705/2285 train_time:103388ms step_avg:60.64ms +step:1706/2285 train_time:103448ms step_avg:60.64ms +step:1707/2285 train_time:103511ms step_avg:60.64ms +step:1708/2285 train_time:103571ms step_avg:60.64ms +step:1709/2285 train_time:103634ms step_avg:60.64ms +step:1710/2285 train_time:103694ms step_avg:60.64ms +step:1711/2285 train_time:103756ms step_avg:60.64ms +step:1712/2285 train_time:103816ms step_avg:60.64ms +step:1713/2285 train_time:103879ms step_avg:60.64ms +step:1714/2285 train_time:103939ms step_avg:60.64ms +step:1715/2285 train_time:104002ms step_avg:60.64ms +step:1716/2285 train_time:104061ms step_avg:60.64ms +step:1717/2285 train_time:104124ms step_avg:60.64ms +step:1718/2285 train_time:104184ms step_avg:60.64ms +step:1719/2285 train_time:104246ms step_avg:60.64ms +step:1720/2285 train_time:104306ms step_avg:60.64ms +step:1721/2285 train_time:104368ms step_avg:60.64ms +step:1722/2285 train_time:104428ms step_avg:60.64ms +step:1723/2285 train_time:104491ms step_avg:60.64ms +step:1724/2285 train_time:104551ms step_avg:60.64ms +step:1725/2285 train_time:104615ms step_avg:60.65ms +step:1726/2285 train_time:104675ms step_avg:60.65ms +step:1727/2285 train_time:104737ms step_avg:60.65ms +step:1728/2285 train_time:104797ms step_avg:60.65ms +step:1729/2285 train_time:104859ms step_avg:60.65ms +step:1730/2285 train_time:104919ms step_avg:60.65ms +step:1731/2285 train_time:104981ms step_avg:60.65ms +step:1732/2285 train_time:105041ms step_avg:60.65ms +step:1733/2285 train_time:105103ms step_avg:60.65ms +step:1734/2285 train_time:105163ms step_avg:60.65ms +step:1735/2285 train_time:105225ms step_avg:60.65ms +step:1736/2285 train_time:105284ms step_avg:60.65ms +step:1737/2285 train_time:105347ms step_avg:60.65ms +step:1738/2285 
train_time:105407ms step_avg:60.65ms +step:1739/2285 train_time:105470ms step_avg:60.65ms +step:1740/2285 train_time:105530ms step_avg:60.65ms +step:1741/2285 train_time:105593ms step_avg:60.65ms +step:1742/2285 train_time:105653ms step_avg:60.65ms +step:1743/2285 train_time:105716ms step_avg:60.65ms +step:1744/2285 train_time:105776ms step_avg:60.65ms +step:1745/2285 train_time:105838ms step_avg:60.65ms +step:1746/2285 train_time:105898ms step_avg:60.65ms +step:1747/2285 train_time:105960ms step_avg:60.65ms +step:1748/2285 train_time:106019ms step_avg:60.65ms +step:1749/2285 train_time:106082ms step_avg:60.65ms +step:1750/2285 train_time:106141ms step_avg:60.65ms +step:1750/2285 val_loss:3.3665 train_time:106205ms step_avg:60.69ms +step:1751/2285 train_time:106223ms step_avg:60.66ms +step:1752/2285 train_time:106266ms step_avg:60.65ms +step:1753/2285 train_time:106329ms step_avg:60.66ms +step:1754/2285 train_time:106389ms step_avg:60.66ms +step:1755/2285 train_time:106454ms step_avg:60.66ms +step:1756/2285 train_time:106515ms step_avg:60.66ms +step:1757/2285 train_time:106576ms step_avg:60.66ms +step:1758/2285 train_time:106635ms step_avg:60.66ms +step:1759/2285 train_time:106697ms step_avg:60.66ms +step:1760/2285 train_time:106756ms step_avg:60.66ms +step:1761/2285 train_time:106818ms step_avg:60.66ms +step:1762/2285 train_time:106877ms step_avg:60.66ms +step:1763/2285 train_time:106939ms step_avg:60.66ms +step:1764/2285 train_time:106998ms step_avg:60.66ms +step:1765/2285 train_time:107059ms step_avg:60.66ms +step:1766/2285 train_time:107119ms step_avg:60.66ms +step:1767/2285 train_time:107183ms step_avg:60.66ms +step:1768/2285 train_time:107244ms step_avg:60.66ms +step:1769/2285 train_time:107306ms step_avg:60.66ms +step:1770/2285 train_time:107367ms step_avg:60.66ms +step:1771/2285 train_time:107429ms step_avg:60.66ms +step:1772/2285 train_time:107489ms step_avg:60.66ms +step:1773/2285 train_time:107552ms step_avg:60.66ms +step:1774/2285 train_time:107612ms step_avg:60.66ms +step:1775/2285 train_time:107675ms step_avg:60.66ms +step:1776/2285 train_time:107735ms step_avg:60.66ms +step:1777/2285 train_time:107796ms step_avg:60.66ms +step:1778/2285 train_time:107855ms step_avg:60.66ms +step:1779/2285 train_time:107917ms step_avg:60.66ms +step:1780/2285 train_time:107976ms step_avg:60.66ms +step:1781/2285 train_time:108038ms step_avg:60.66ms +step:1782/2285 train_time:108098ms step_avg:60.66ms +step:1783/2285 train_time:108161ms step_avg:60.66ms +step:1784/2285 train_time:108221ms step_avg:60.66ms +step:1785/2285 train_time:108284ms step_avg:60.66ms +step:1786/2285 train_time:108345ms step_avg:60.66ms +step:1787/2285 train_time:108407ms step_avg:60.66ms +step:1788/2285 train_time:108467ms step_avg:60.66ms +step:1789/2285 train_time:108530ms step_avg:60.67ms +step:1790/2285 train_time:108590ms step_avg:60.66ms +step:1791/2285 train_time:108653ms step_avg:60.67ms +step:1792/2285 train_time:108713ms step_avg:60.67ms +step:1793/2285 train_time:108774ms step_avg:60.67ms +step:1794/2285 train_time:108834ms step_avg:60.67ms +step:1795/2285 train_time:108896ms step_avg:60.67ms +step:1796/2285 train_time:108955ms step_avg:60.67ms +step:1797/2285 train_time:109017ms step_avg:60.67ms +step:1798/2285 train_time:109077ms step_avg:60.67ms +step:1799/2285 train_time:109140ms step_avg:60.67ms +step:1800/2285 train_time:109200ms step_avg:60.67ms +step:1801/2285 train_time:109263ms step_avg:60.67ms +step:1802/2285 train_time:109324ms step_avg:60.67ms +step:1803/2285 train_time:109386ms step_avg:60.67ms 
+step:1804/2285 train_time:109446ms step_avg:60.67ms +step:1805/2285 train_time:109508ms step_avg:60.67ms +step:1806/2285 train_time:109568ms step_avg:60.67ms +step:1807/2285 train_time:109631ms step_avg:60.67ms +step:1808/2285 train_time:109691ms step_avg:60.67ms +step:1809/2285 train_time:109753ms step_avg:60.67ms +step:1810/2285 train_time:109813ms step_avg:60.67ms +step:1811/2285 train_time:109876ms step_avg:60.67ms +step:1812/2285 train_time:109936ms step_avg:60.67ms +step:1813/2285 train_time:109998ms step_avg:60.67ms +step:1814/2285 train_time:110057ms step_avg:60.67ms +step:1815/2285 train_time:110120ms step_avg:60.67ms +step:1816/2285 train_time:110180ms step_avg:60.67ms +step:1817/2285 train_time:110244ms step_avg:60.67ms +step:1818/2285 train_time:110303ms step_avg:60.67ms +step:1819/2285 train_time:110366ms step_avg:60.67ms +step:1820/2285 train_time:110425ms step_avg:60.67ms +step:1821/2285 train_time:110488ms step_avg:60.67ms +step:1822/2285 train_time:110548ms step_avg:60.67ms +step:1823/2285 train_time:110610ms step_avg:60.67ms +step:1824/2285 train_time:110670ms step_avg:60.67ms +step:1825/2285 train_time:110733ms step_avg:60.68ms +step:1826/2285 train_time:110793ms step_avg:60.68ms +step:1827/2285 train_time:110856ms step_avg:60.68ms +step:1828/2285 train_time:110916ms step_avg:60.68ms +step:1829/2285 train_time:110978ms step_avg:60.68ms +step:1830/2285 train_time:111037ms step_avg:60.68ms +step:1831/2285 train_time:111100ms step_avg:60.68ms +step:1832/2285 train_time:111160ms step_avg:60.68ms +step:1833/2285 train_time:111223ms step_avg:60.68ms +step:1834/2285 train_time:111282ms step_avg:60.68ms +step:1835/2285 train_time:111344ms step_avg:60.68ms +step:1836/2285 train_time:111404ms step_avg:60.68ms +step:1837/2285 train_time:111466ms step_avg:60.68ms +step:1838/2285 train_time:111526ms step_avg:60.68ms +step:1839/2285 train_time:111588ms step_avg:60.68ms +step:1840/2285 train_time:111648ms step_avg:60.68ms +step:1841/2285 train_time:111710ms step_avg:60.68ms +step:1842/2285 train_time:111771ms step_avg:60.68ms +step:1843/2285 train_time:111834ms step_avg:60.68ms +step:1844/2285 train_time:111894ms step_avg:60.68ms +step:1845/2285 train_time:111957ms step_avg:60.68ms +step:1846/2285 train_time:112017ms step_avg:60.68ms +step:1847/2285 train_time:112079ms step_avg:60.68ms +step:1848/2285 train_time:112139ms step_avg:60.68ms +step:1849/2285 train_time:112201ms step_avg:60.68ms +step:1850/2285 train_time:112261ms step_avg:60.68ms +step:1851/2285 train_time:112324ms step_avg:60.68ms +step:1852/2285 train_time:112384ms step_avg:60.68ms +step:1853/2285 train_time:112445ms step_avg:60.68ms +step:1854/2285 train_time:112505ms step_avg:60.68ms +step:1855/2285 train_time:112567ms step_avg:60.68ms +step:1856/2285 train_time:112627ms step_avg:60.68ms +step:1857/2285 train_time:112689ms step_avg:60.68ms +step:1858/2285 train_time:112749ms step_avg:60.68ms +step:1859/2285 train_time:112812ms step_avg:60.68ms +step:1860/2285 train_time:112872ms step_avg:60.68ms +step:1861/2285 train_time:112936ms step_avg:60.69ms +step:1862/2285 train_time:112997ms step_avg:60.69ms +step:1863/2285 train_time:113058ms step_avg:60.69ms +step:1864/2285 train_time:113118ms step_avg:60.69ms +step:1865/2285 train_time:113181ms step_avg:60.69ms +step:1866/2285 train_time:113241ms step_avg:60.69ms +step:1867/2285 train_time:113303ms step_avg:60.69ms +step:1868/2285 train_time:113363ms step_avg:60.69ms +step:1869/2285 train_time:113424ms step_avg:60.69ms +step:1870/2285 train_time:113484ms step_avg:60.69ms 
+step:1871/2285 train_time:113546ms step_avg:60.69ms +step:1872/2285 train_time:113606ms step_avg:60.69ms +step:1873/2285 train_time:113669ms step_avg:60.69ms +step:1874/2285 train_time:113729ms step_avg:60.69ms +step:1875/2285 train_time:113792ms step_avg:60.69ms +step:1876/2285 train_time:113852ms step_avg:60.69ms +step:1877/2285 train_time:113916ms step_avg:60.69ms +step:1878/2285 train_time:113976ms step_avg:60.69ms +step:1879/2285 train_time:114038ms step_avg:60.69ms +step:1880/2285 train_time:114098ms step_avg:60.69ms +step:1881/2285 train_time:114160ms step_avg:60.69ms +step:1882/2285 train_time:114220ms step_avg:60.69ms +step:1883/2285 train_time:114282ms step_avg:60.69ms +step:1884/2285 train_time:114342ms step_avg:60.69ms +step:1885/2285 train_time:114404ms step_avg:60.69ms +step:1886/2285 train_time:114464ms step_avg:60.69ms +step:1887/2285 train_time:114526ms step_avg:60.69ms +step:1888/2285 train_time:114586ms step_avg:60.69ms +step:1889/2285 train_time:114648ms step_avg:60.69ms +step:1890/2285 train_time:114708ms step_avg:60.69ms +step:1891/2285 train_time:114770ms step_avg:60.69ms +step:1892/2285 train_time:114831ms step_avg:60.69ms +step:1893/2285 train_time:114893ms step_avg:60.69ms +step:1894/2285 train_time:114953ms step_avg:60.69ms +step:1895/2285 train_time:115016ms step_avg:60.69ms +step:1896/2285 train_time:115076ms step_avg:60.69ms +step:1897/2285 train_time:115138ms step_avg:60.69ms +step:1898/2285 train_time:115198ms step_avg:60.69ms +step:1899/2285 train_time:115260ms step_avg:60.70ms +step:1900/2285 train_time:115320ms step_avg:60.69ms +step:1901/2285 train_time:115382ms step_avg:60.70ms +step:1902/2285 train_time:115442ms step_avg:60.70ms +step:1903/2285 train_time:115505ms step_avg:60.70ms +step:1904/2285 train_time:115565ms step_avg:60.70ms +step:1905/2285 train_time:115628ms step_avg:60.70ms +step:1906/2285 train_time:115688ms step_avg:60.70ms +step:1907/2285 train_time:115751ms step_avg:60.70ms +step:1908/2285 train_time:115811ms step_avg:60.70ms +step:1909/2285 train_time:115873ms step_avg:60.70ms +step:1910/2285 train_time:115934ms step_avg:60.70ms +step:1911/2285 train_time:115996ms step_avg:60.70ms +step:1912/2285 train_time:116057ms step_avg:60.70ms +step:1913/2285 train_time:116119ms step_avg:60.70ms +step:1914/2285 train_time:116179ms step_avg:60.70ms +step:1915/2285 train_time:116242ms step_avg:60.70ms +step:1916/2285 train_time:116302ms step_avg:60.70ms +step:1917/2285 train_time:116364ms step_avg:60.70ms +step:1918/2285 train_time:116424ms step_avg:60.70ms +step:1919/2285 train_time:116487ms step_avg:60.70ms +step:1920/2285 train_time:116547ms step_avg:60.70ms +step:1921/2285 train_time:116609ms step_avg:60.70ms +step:1922/2285 train_time:116669ms step_avg:60.70ms +step:1923/2285 train_time:116732ms step_avg:60.70ms +step:1924/2285 train_time:116792ms step_avg:60.70ms +step:1925/2285 train_time:116855ms step_avg:60.70ms +step:1926/2285 train_time:116915ms step_avg:60.70ms +step:1927/2285 train_time:116977ms step_avg:60.70ms +step:1928/2285 train_time:117038ms step_avg:60.70ms +step:1929/2285 train_time:117100ms step_avg:60.70ms +step:1930/2285 train_time:117160ms step_avg:60.70ms +step:1931/2285 train_time:117222ms step_avg:60.71ms +step:1932/2285 train_time:117282ms step_avg:60.71ms +step:1933/2285 train_time:117344ms step_avg:60.71ms +step:1934/2285 train_time:117404ms step_avg:60.71ms +step:1935/2285 train_time:117467ms step_avg:60.71ms +step:1936/2285 train_time:117527ms step_avg:60.71ms +step:1937/2285 train_time:117589ms step_avg:60.71ms 
+step:1938/2285 train_time:117649ms step_avg:60.71ms +step:1939/2285 train_time:117712ms step_avg:60.71ms +step:1940/2285 train_time:117772ms step_avg:60.71ms +step:1941/2285 train_time:117835ms step_avg:60.71ms +step:1942/2285 train_time:117894ms step_avg:60.71ms +step:1943/2285 train_time:117957ms step_avg:60.71ms +step:1944/2285 train_time:118017ms step_avg:60.71ms +step:1945/2285 train_time:118079ms step_avg:60.71ms +step:1946/2285 train_time:118140ms step_avg:60.71ms +step:1947/2285 train_time:118203ms step_avg:60.71ms +step:1948/2285 train_time:118263ms step_avg:60.71ms +step:1949/2285 train_time:118325ms step_avg:60.71ms +step:1950/2285 train_time:118385ms step_avg:60.71ms +step:1951/2285 train_time:118447ms step_avg:60.71ms +step:1952/2285 train_time:118507ms step_avg:60.71ms +step:1953/2285 train_time:118569ms step_avg:60.71ms +step:1954/2285 train_time:118629ms step_avg:60.71ms +step:1955/2285 train_time:118692ms step_avg:60.71ms +step:1956/2285 train_time:118752ms step_avg:60.71ms +step:1957/2285 train_time:118814ms step_avg:60.71ms +step:1958/2285 train_time:118874ms step_avg:60.71ms +step:1959/2285 train_time:118937ms step_avg:60.71ms +step:1960/2285 train_time:118997ms step_avg:60.71ms +step:1961/2285 train_time:119059ms step_avg:60.71ms +step:1962/2285 train_time:119119ms step_avg:60.71ms +step:1963/2285 train_time:119182ms step_avg:60.71ms +step:1964/2285 train_time:119242ms step_avg:60.71ms +step:1965/2285 train_time:119304ms step_avg:60.71ms +step:1966/2285 train_time:119364ms step_avg:60.71ms +step:1967/2285 train_time:119426ms step_avg:60.72ms +step:1968/2285 train_time:119486ms step_avg:60.71ms +step:1969/2285 train_time:119549ms step_avg:60.72ms +step:1970/2285 train_time:119609ms step_avg:60.72ms +step:1971/2285 train_time:119672ms step_avg:60.72ms +step:1972/2285 train_time:119732ms step_avg:60.72ms +step:1973/2285 train_time:119794ms step_avg:60.72ms +step:1974/2285 train_time:119855ms step_avg:60.72ms +step:1975/2285 train_time:119917ms step_avg:60.72ms +step:1976/2285 train_time:119977ms step_avg:60.72ms +step:1977/2285 train_time:120039ms step_avg:60.72ms +step:1978/2285 train_time:120099ms step_avg:60.72ms +step:1979/2285 train_time:120162ms step_avg:60.72ms +step:1980/2285 train_time:120222ms step_avg:60.72ms +step:1981/2285 train_time:120284ms step_avg:60.72ms +step:1982/2285 train_time:120344ms step_avg:60.72ms +step:1983/2285 train_time:120407ms step_avg:60.72ms +step:1984/2285 train_time:120466ms step_avg:60.72ms +step:1985/2285 train_time:120528ms step_avg:60.72ms +step:1986/2285 train_time:120589ms step_avg:60.72ms +step:1987/2285 train_time:120652ms step_avg:60.72ms +step:1988/2285 train_time:120713ms step_avg:60.72ms +step:1989/2285 train_time:120775ms step_avg:60.72ms +step:1990/2285 train_time:120835ms step_avg:60.72ms +step:1991/2285 train_time:120898ms step_avg:60.72ms +step:1992/2285 train_time:120958ms step_avg:60.72ms +step:1993/2285 train_time:121021ms step_avg:60.72ms +step:1994/2285 train_time:121081ms step_avg:60.72ms +step:1995/2285 train_time:121144ms step_avg:60.72ms +step:1996/2285 train_time:121204ms step_avg:60.72ms +step:1997/2285 train_time:121266ms step_avg:60.72ms +step:1998/2285 train_time:121326ms step_avg:60.72ms +step:1999/2285 train_time:121388ms step_avg:60.72ms +step:2000/2285 train_time:121448ms step_avg:60.72ms +step:2000/2285 val_loss:3.3174 train_time:121512ms step_avg:60.76ms +step:2001/2285 train_time:121530ms step_avg:60.73ms +step:2002/2285 train_time:121574ms step_avg:60.73ms +step:2003/2285 train_time:121636ms 
step_avg:60.73ms +step:2004/2285 train_time:121697ms step_avg:60.73ms +step:2005/2285 train_time:121762ms step_avg:60.73ms +step:2006/2285 train_time:121822ms step_avg:60.73ms +step:2007/2285 train_time:121884ms step_avg:60.73ms +step:2008/2285 train_time:121945ms step_avg:60.73ms +step:2009/2285 train_time:122007ms step_avg:60.73ms +step:2010/2285 train_time:122066ms step_avg:60.73ms +step:2011/2285 train_time:122127ms step_avg:60.73ms +step:2012/2285 train_time:122186ms step_avg:60.73ms +step:2013/2285 train_time:122248ms step_avg:60.73ms +step:2014/2285 train_time:122309ms step_avg:60.73ms +step:2015/2285 train_time:122371ms step_avg:60.73ms +step:2016/2285 train_time:122432ms step_avg:60.73ms +step:2017/2285 train_time:122496ms step_avg:60.73ms +step:2018/2285 train_time:122557ms step_avg:60.73ms +step:2019/2285 train_time:122620ms step_avg:60.73ms +step:2020/2285 train_time:122681ms step_avg:60.73ms +step:2021/2285 train_time:122744ms step_avg:60.73ms +step:2022/2285 train_time:122804ms step_avg:60.73ms +step:2023/2285 train_time:122867ms step_avg:60.73ms +step:2024/2285 train_time:122927ms step_avg:60.73ms +step:2025/2285 train_time:122988ms step_avg:60.74ms +step:2026/2285 train_time:123048ms step_avg:60.73ms +step:2027/2285 train_time:123109ms step_avg:60.73ms +step:2028/2285 train_time:123169ms step_avg:60.73ms +step:2029/2285 train_time:123230ms step_avg:60.73ms +step:2030/2285 train_time:123290ms step_avg:60.73ms +step:2031/2285 train_time:123352ms step_avg:60.73ms +step:2032/2285 train_time:123413ms step_avg:60.73ms +step:2033/2285 train_time:123476ms step_avg:60.74ms +step:2034/2285 train_time:123536ms step_avg:60.74ms +step:2035/2285 train_time:123599ms step_avg:60.74ms +step:2036/2285 train_time:123660ms step_avg:60.74ms +step:2037/2285 train_time:123723ms step_avg:60.74ms +step:2038/2285 train_time:123784ms step_avg:60.74ms +step:2039/2285 train_time:123847ms step_avg:60.74ms +step:2040/2285 train_time:123907ms step_avg:60.74ms +step:2041/2285 train_time:123969ms step_avg:60.74ms +step:2042/2285 train_time:124029ms step_avg:60.74ms +step:2043/2285 train_time:124091ms step_avg:60.74ms +step:2044/2285 train_time:124151ms step_avg:60.74ms +step:2045/2285 train_time:124213ms step_avg:60.74ms +step:2046/2285 train_time:124272ms step_avg:60.74ms +step:2047/2285 train_time:124335ms step_avg:60.74ms +step:2048/2285 train_time:124395ms step_avg:60.74ms +step:2049/2285 train_time:124457ms step_avg:60.74ms +step:2050/2285 train_time:124517ms step_avg:60.74ms +step:2051/2285 train_time:124579ms step_avg:60.74ms +step:2052/2285 train_time:124640ms step_avg:60.74ms +step:2053/2285 train_time:124703ms step_avg:60.74ms +step:2054/2285 train_time:124764ms step_avg:60.74ms +step:2055/2285 train_time:124826ms step_avg:60.74ms +step:2056/2285 train_time:124886ms step_avg:60.74ms +step:2057/2285 train_time:124948ms step_avg:60.74ms +step:2058/2285 train_time:125008ms step_avg:60.74ms +step:2059/2285 train_time:125070ms step_avg:60.74ms +step:2060/2285 train_time:125130ms step_avg:60.74ms +step:2061/2285 train_time:125191ms step_avg:60.74ms +step:2062/2285 train_time:125251ms step_avg:60.74ms +step:2063/2285 train_time:125313ms step_avg:60.74ms +step:2064/2285 train_time:125373ms step_avg:60.74ms +step:2065/2285 train_time:125436ms step_avg:60.74ms +step:2066/2285 train_time:125496ms step_avg:60.74ms +step:2067/2285 train_time:125558ms step_avg:60.74ms +step:2068/2285 train_time:125619ms step_avg:60.74ms +step:2069/2285 train_time:125682ms step_avg:60.75ms +step:2070/2285 train_time:125742ms 
step_avg:60.75ms +step:2071/2285 train_time:125805ms step_avg:60.75ms +step:2072/2285 train_time:125866ms step_avg:60.75ms +step:2073/2285 train_time:125928ms step_avg:60.75ms +step:2074/2285 train_time:125988ms step_avg:60.75ms +step:2075/2285 train_time:126050ms step_avg:60.75ms +step:2076/2285 train_time:126110ms step_avg:60.75ms +step:2077/2285 train_time:126172ms step_avg:60.75ms +step:2078/2285 train_time:126232ms step_avg:60.75ms +step:2079/2285 train_time:126295ms step_avg:60.75ms +step:2080/2285 train_time:126355ms step_avg:60.75ms +step:2081/2285 train_time:126417ms step_avg:60.75ms +step:2082/2285 train_time:126477ms step_avg:60.75ms +step:2083/2285 train_time:126539ms step_avg:60.75ms +step:2084/2285 train_time:126600ms step_avg:60.75ms +step:2085/2285 train_time:126663ms step_avg:60.75ms +step:2086/2285 train_time:126723ms step_avg:60.75ms +step:2087/2285 train_time:126787ms step_avg:60.75ms +step:2088/2285 train_time:126847ms step_avg:60.75ms +step:2089/2285 train_time:126909ms step_avg:60.75ms +step:2090/2285 train_time:126969ms step_avg:60.75ms +step:2091/2285 train_time:127031ms step_avg:60.75ms +step:2092/2285 train_time:127091ms step_avg:60.75ms +step:2093/2285 train_time:127154ms step_avg:60.75ms +step:2094/2285 train_time:127214ms step_avg:60.75ms +step:2095/2285 train_time:127275ms step_avg:60.75ms +step:2096/2285 train_time:127335ms step_avg:60.75ms +step:2097/2285 train_time:127398ms step_avg:60.75ms +step:2098/2285 train_time:127457ms step_avg:60.75ms +step:2099/2285 train_time:127519ms step_avg:60.75ms +step:2100/2285 train_time:127580ms step_avg:60.75ms +step:2101/2285 train_time:127642ms step_avg:60.75ms +step:2102/2285 train_time:127703ms step_avg:60.75ms +step:2103/2285 train_time:127766ms step_avg:60.75ms +step:2104/2285 train_time:127826ms step_avg:60.75ms +step:2105/2285 train_time:127888ms step_avg:60.75ms +step:2106/2285 train_time:127948ms step_avg:60.75ms +step:2107/2285 train_time:128010ms step_avg:60.75ms +step:2108/2285 train_time:128071ms step_avg:60.75ms +step:2109/2285 train_time:128132ms step_avg:60.76ms +step:2110/2285 train_time:128192ms step_avg:60.75ms +step:2111/2285 train_time:128254ms step_avg:60.76ms +step:2112/2285 train_time:128314ms step_avg:60.75ms +step:2113/2285 train_time:128376ms step_avg:60.76ms +step:2114/2285 train_time:128436ms step_avg:60.75ms +step:2115/2285 train_time:128497ms step_avg:60.76ms +step:2116/2285 train_time:128558ms step_avg:60.76ms +step:2117/2285 train_time:128621ms step_avg:60.76ms +step:2118/2285 train_time:128681ms step_avg:60.76ms +step:2119/2285 train_time:128744ms step_avg:60.76ms +step:2120/2285 train_time:128805ms step_avg:60.76ms +step:2121/2285 train_time:128867ms step_avg:60.76ms +step:2122/2285 train_time:128928ms step_avg:60.76ms +step:2123/2285 train_time:128990ms step_avg:60.76ms +step:2124/2285 train_time:129050ms step_avg:60.76ms +step:2125/2285 train_time:129112ms step_avg:60.76ms +step:2126/2285 train_time:129173ms step_avg:60.76ms +step:2127/2285 train_time:129235ms step_avg:60.76ms +step:2128/2285 train_time:129294ms step_avg:60.76ms +step:2129/2285 train_time:129356ms step_avg:60.76ms +step:2130/2285 train_time:129416ms step_avg:60.76ms +step:2131/2285 train_time:129479ms step_avg:60.76ms +step:2132/2285 train_time:129539ms step_avg:60.76ms +step:2133/2285 train_time:129601ms step_avg:60.76ms +step:2134/2285 train_time:129663ms step_avg:60.76ms +step:2135/2285 train_time:129725ms step_avg:60.76ms +step:2136/2285 train_time:129785ms step_avg:60.76ms +step:2137/2285 train_time:129848ms 
step_avg:60.76ms +step:2138/2285 train_time:129908ms step_avg:60.76ms +step:2139/2285 train_time:129971ms step_avg:60.76ms +step:2140/2285 train_time:130031ms step_avg:60.76ms +step:2141/2285 train_time:130093ms step_avg:60.76ms +step:2142/2285 train_time:130153ms step_avg:60.76ms +step:2143/2285 train_time:130215ms step_avg:60.76ms +step:2144/2285 train_time:130275ms step_avg:60.76ms +step:2145/2285 train_time:130337ms step_avg:60.76ms +step:2146/2285 train_time:130397ms step_avg:60.76ms +step:2147/2285 train_time:130459ms step_avg:60.76ms +step:2148/2285 train_time:130520ms step_avg:60.76ms +step:2149/2285 train_time:130582ms step_avg:60.76ms +step:2150/2285 train_time:130642ms step_avg:60.76ms +step:2151/2285 train_time:130705ms step_avg:60.76ms +step:2152/2285 train_time:130766ms step_avg:60.76ms +step:2153/2285 train_time:130828ms step_avg:60.77ms +step:2154/2285 train_time:130888ms step_avg:60.77ms +step:2155/2285 train_time:130951ms step_avg:60.77ms +step:2156/2285 train_time:131011ms step_avg:60.77ms +step:2157/2285 train_time:131074ms step_avg:60.77ms +step:2158/2285 train_time:131134ms step_avg:60.77ms +step:2159/2285 train_time:131196ms step_avg:60.77ms +step:2160/2285 train_time:131256ms step_avg:60.77ms +step:2161/2285 train_time:131318ms step_avg:60.77ms +step:2162/2285 train_time:131378ms step_avg:60.77ms +step:2163/2285 train_time:131441ms step_avg:60.77ms +step:2164/2285 train_time:131501ms step_avg:60.77ms +step:2165/2285 train_time:131564ms step_avg:60.77ms +step:2166/2285 train_time:131624ms step_avg:60.77ms +step:2167/2285 train_time:131686ms step_avg:60.77ms +step:2168/2285 train_time:131746ms step_avg:60.77ms +step:2169/2285 train_time:131808ms step_avg:60.77ms +step:2170/2285 train_time:131868ms step_avg:60.77ms +step:2171/2285 train_time:131931ms step_avg:60.77ms +step:2172/2285 train_time:131991ms step_avg:60.77ms +step:2173/2285 train_time:132054ms step_avg:60.77ms +step:2174/2285 train_time:132113ms step_avg:60.77ms +step:2175/2285 train_time:132175ms step_avg:60.77ms +step:2176/2285 train_time:132235ms step_avg:60.77ms +step:2177/2285 train_time:132297ms step_avg:60.77ms +step:2178/2285 train_time:132357ms step_avg:60.77ms +step:2179/2285 train_time:132419ms step_avg:60.77ms +step:2180/2285 train_time:132479ms step_avg:60.77ms +step:2181/2285 train_time:132541ms step_avg:60.77ms +step:2182/2285 train_time:132602ms step_avg:60.77ms +step:2183/2285 train_time:132665ms step_avg:60.77ms +step:2184/2285 train_time:132725ms step_avg:60.77ms +step:2185/2285 train_time:132788ms step_avg:60.77ms +step:2186/2285 train_time:132848ms step_avg:60.77ms +step:2187/2285 train_time:132911ms step_avg:60.77ms +step:2188/2285 train_time:132971ms step_avg:60.77ms +step:2189/2285 train_time:133034ms step_avg:60.77ms +step:2190/2285 train_time:133093ms step_avg:60.77ms +step:2191/2285 train_time:133156ms step_avg:60.77ms +step:2192/2285 train_time:133216ms step_avg:60.77ms +step:2193/2285 train_time:133277ms step_avg:60.77ms +step:2194/2285 train_time:133337ms step_avg:60.77ms +step:2195/2285 train_time:133399ms step_avg:60.77ms +step:2196/2285 train_time:133459ms step_avg:60.77ms +step:2197/2285 train_time:133522ms step_avg:60.77ms +step:2198/2285 train_time:133582ms step_avg:60.77ms +step:2199/2285 train_time:133645ms step_avg:60.78ms +step:2200/2285 train_time:133705ms step_avg:60.78ms +step:2201/2285 train_time:133768ms step_avg:60.78ms +step:2202/2285 train_time:133828ms step_avg:60.78ms +step:2203/2285 train_time:133890ms step_avg:60.78ms +step:2204/2285 train_time:133951ms 
step_avg:60.78ms +step:2205/2285 train_time:134013ms step_avg:60.78ms +step:2206/2285 train_time:134073ms step_avg:60.78ms +step:2207/2285 train_time:134135ms step_avg:60.78ms +step:2208/2285 train_time:134195ms step_avg:60.78ms +step:2209/2285 train_time:134257ms step_avg:60.78ms +step:2210/2285 train_time:134317ms step_avg:60.78ms +step:2211/2285 train_time:134380ms step_avg:60.78ms +step:2212/2285 train_time:134439ms step_avg:60.78ms +step:2213/2285 train_time:134502ms step_avg:60.78ms +step:2214/2285 train_time:134563ms step_avg:60.78ms +step:2215/2285 train_time:134626ms step_avg:60.78ms +step:2216/2285 train_time:134685ms step_avg:60.78ms +step:2217/2285 train_time:134748ms step_avg:60.78ms +step:2218/2285 train_time:134808ms step_avg:60.78ms +step:2219/2285 train_time:134870ms step_avg:60.78ms +step:2220/2285 train_time:134930ms step_avg:60.78ms +step:2221/2285 train_time:134992ms step_avg:60.78ms +step:2222/2285 train_time:135052ms step_avg:60.78ms +step:2223/2285 train_time:135114ms step_avg:60.78ms +step:2224/2285 train_time:135175ms step_avg:60.78ms +step:2225/2285 train_time:135237ms step_avg:60.78ms +step:2226/2285 train_time:135297ms step_avg:60.78ms +step:2227/2285 train_time:135360ms step_avg:60.78ms +step:2228/2285 train_time:135420ms step_avg:60.78ms +step:2229/2285 train_time:135482ms step_avg:60.78ms +step:2230/2285 train_time:135543ms step_avg:60.78ms +step:2231/2285 train_time:135606ms step_avg:60.78ms +step:2232/2285 train_time:135666ms step_avg:60.78ms +step:2233/2285 train_time:135729ms step_avg:60.78ms +step:2234/2285 train_time:135788ms step_avg:60.78ms +step:2235/2285 train_time:135850ms step_avg:60.78ms +step:2236/2285 train_time:135910ms step_avg:60.78ms +step:2237/2285 train_time:135973ms step_avg:60.78ms +step:2238/2285 train_time:136032ms step_avg:60.78ms +step:2239/2285 train_time:136095ms step_avg:60.78ms +step:2240/2285 train_time:136155ms step_avg:60.78ms +step:2241/2285 train_time:136217ms step_avg:60.78ms +step:2242/2285 train_time:136277ms step_avg:60.78ms +step:2243/2285 train_time:136340ms step_avg:60.78ms +step:2244/2285 train_time:136400ms step_avg:60.78ms +step:2245/2285 train_time:136463ms step_avg:60.79ms +step:2246/2285 train_time:136523ms step_avg:60.79ms +step:2247/2285 train_time:136586ms step_avg:60.79ms +step:2248/2285 train_time:136646ms step_avg:60.79ms +step:2249/2285 train_time:136708ms step_avg:60.79ms +step:2250/2285 train_time:136769ms step_avg:60.79ms +step:2250/2285 val_loss:3.2821 train_time:136832ms step_avg:60.81ms +step:2251/2285 train_time:136851ms step_avg:60.80ms +step:2252/2285 train_time:136896ms step_avg:60.79ms +step:2253/2285 train_time:136961ms step_avg:60.79ms +step:2254/2285 train_time:137022ms step_avg:60.79ms +step:2255/2285 train_time:137085ms step_avg:60.79ms +step:2256/2285 train_time:137145ms step_avg:60.79ms +step:2257/2285 train_time:137206ms step_avg:60.79ms +step:2258/2285 train_time:137266ms step_avg:60.79ms +step:2259/2285 train_time:137327ms step_avg:60.79ms +step:2260/2285 train_time:137387ms step_avg:60.79ms +step:2261/2285 train_time:137449ms step_avg:60.79ms +step:2262/2285 train_time:137508ms step_avg:60.79ms +step:2263/2285 train_time:137570ms step_avg:60.79ms +step:2264/2285 train_time:137630ms step_avg:60.79ms +step:2265/2285 train_time:137692ms step_avg:60.79ms +step:2266/2285 train_time:137753ms step_avg:60.79ms +step:2267/2285 train_time:137818ms step_avg:60.79ms +step:2268/2285 train_time:137880ms step_avg:60.79ms +step:2269/2285 train_time:137943ms step_avg:60.79ms +step:2270/2285 
train_time:138004ms step_avg:60.79ms +step:2271/2285 train_time:138067ms step_avg:60.80ms +step:2272/2285 train_time:138128ms step_avg:60.80ms +step:2273/2285 train_time:138190ms step_avg:60.80ms +step:2274/2285 train_time:138250ms step_avg:60.80ms +step:2275/2285 train_time:138312ms step_avg:60.80ms +step:2276/2285 train_time:138371ms step_avg:60.80ms +step:2277/2285 train_time:138433ms step_avg:60.80ms +step:2278/2285 train_time:138493ms step_avg:60.80ms +step:2279/2285 train_time:138555ms step_avg:60.80ms +step:2280/2285 train_time:138615ms step_avg:60.80ms +step:2281/2285 train_time:138677ms step_avg:60.80ms +step:2282/2285 train_time:138738ms step_avg:60.80ms +step:2283/2285 train_time:138802ms step_avg:60.80ms +step:2284/2285 train_time:138863ms step_avg:60.80ms +step:2285/2285 train_time:138925ms step_avg:60.80ms +step:2285/2285 val_loss:3.2766 train_time:138986ms step_avg:60.83ms +peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt b/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt new file mode 100644 index 000000000..473bd8971 --- /dev/null +++ b/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt @@ -0,0 +1,3814 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad + This hyper-optimized class has faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            elif params[module_idx].label == "smear_gate":
+                # dividing by the norm is equivalent to SVD orthogonalization for 1-D tensors
+                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+            # Apply weight decay directly to the buffer.
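+            # Note: this is decoupled, multiplicative weight decay, p <- p * (1 - wd * wd_mul),
+            # independent of the learning rate. (Both optimizers in this script are constructed
+            # with weight_decay=0.0, so it is a no-op for this run.)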
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
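+        # Why 2: Muon scales each update by max(1, d_out/d_in)**0.5 of the *stored* shape.
+        # c_fc is stored as (dim, hdim) = (768, 3072), giving a scale of 1, but it is applied
+        # transposed as (hdim, dim), for which the intended scale is (3072/768)**0.5 = 2.
+        # lr_mul = 2 restores that scale; c_proj is used in its stored orientation, so it
+        # needs no correction.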
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
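+        # lr_mul is consumed by DistAdam (effective lr = group lr * lr_mul), so the token
+        # and value embeddings train at 75x the 0.008 base Adam LR set below.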
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
+    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+        # Precompute BOS positions once per shard
+        self.tokens = tokens
+        self.size = tokens.numel()
+        self.quickload = quickload
+        if quickload:
+            # only scan the first 4 million tokens, then kick off an async thread to scan the rest
+            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+            self.thread = None
+            self.ready = threading.Event()
+            self.start()
+        else:
+            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.i = 0
+        self.world_size = world_size
+        self.batch_iter = 0
+
+    def _load(self):
+        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+            self.bos_idx = self.bos_idx_async
+
+    def next_batch(self, num_tokens_local: int, max_seq_len: int):
+        # if quickload was used, repoint to the full dataset after 5 batches
+        if self.quickload and self.batch_iter == 5:
+            self.get()
+        n = len(self.bos_idx)
+        starts = [[] for _ in range(self.world_size)]
+        ends = [[] for _ in range(self.world_size)]
+
+        idx = self.i
+        for r in range(self.world_size):
+            cur_len = 0
+            while cur_len <= num_tokens_local:
+                if idx >= n:
+                    # use idx here: cur may be unbound if the shard is exhausted on the first iteration
+                    raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
+                cur = self.bos_idx[idx]
+                starts[r].append(cur)
+                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+                          cur + max_seq_len,
+                          cur + num_tokens_local - cur_len + 1)
+                ends[r].append(end)
+                cur_len += end - cur
+                idx += 1
+
+            assert cur_len == num_tokens_local + 1
+        self.i = idx
+        self.batch_iter += 1
+        return starts, ends
+
+class DataPreloader:
+    # Helper for asynchronously loading the next shard and indexing BOS tokens
+    def __init__(self, file_iter, world_size: int = 1):
+        self.file_iter = file_iter
+        self.world_size = world_size
+        self.thread = None
+        self.data = None
+        self.ready = threading.Event()
+
+    def _load(self):
+        tokens = _load_data_shard(next(self.file_iter))
+        self.data = (tokens, BOSFinder(tokens, self.world_size))
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+        return self.data
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a Beginning-of-Sequence (BOS) token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader =
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 01:55:48 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 40C P0 128W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 128W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 37C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 38C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 37C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 116W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2285 train_time:116ms step_avg:115.76ms +step:2/2285 train_time:137ms step_avg:68.68ms +step:3/2285 train_time:174ms step_avg:58.06ms +step:4/2285 train_time:230ms step_avg:57.53ms +step:5/2285 train_time:289ms step_avg:57.89ms +step:6/2285 train_time:348ms step_avg:57.93ms +step:7/2285 train_time:408ms step_avg:58.30ms +step:8/2285 train_time:466ms step_avg:58.28ms +step:9/2285 train_time:527ms step_avg:58.60ms +step:10/2285 train_time:585ms step_avg:58.55ms +step:11/2285 train_time:646ms step_avg:58.73ms +step:12/2285 train_time:704ms step_avg:58.68ms +step:13/2285 train_time:765ms step_avg:58.87ms +step:14/2285 train_time:824ms step_avg:58.83ms +step:15/2285 train_time:884ms step_avg:58.96ms +step:16/2285 train_time:943ms step_avg:58.95ms +step:17/2285 
+step:18/2285 train_time:1070ms step_avg:59.42ms
+step:19/2285 train_time:1134ms step_avg:59.70ms
+step:20/2285 train_time:1195ms step_avg:59.73ms
+step:21/2285 train_time:1257ms step_avg:59.85ms
+step:22/2285 train_time:1315ms step_avg:59.79ms
+step:23/2285 train_time:1377ms step_avg:59.89ms
+step:24/2285 train_time:1436ms step_avg:59.85ms
+step:25/2285 train_time:1498ms step_avg:59.91ms
+step:26/2285 train_time:1557ms step_avg:59.88ms
+step:27/2285 train_time:1618ms step_avg:59.94ms
+step:28/2285 train_time:1677ms step_avg:59.90ms
+step:29/2285 train_time:1738ms step_avg:59.93ms
+step:30/2285 train_time:1796ms step_avg:59.88ms
+step:31/2285 train_time:1858ms step_avg:59.93ms
+step:32/2285 train_time:1917ms step_avg:59.90ms
+step:33/2285 train_time:1979ms step_avg:59.97ms
+step:34/2285 train_time:2039ms step_avg:59.98ms
+step:35/2285 train_time:2103ms step_avg:60.07ms
+step:36/2285 train_time:2164ms step_avg:60.11ms
+step:37/2285 train_time:2227ms step_avg:60.18ms
+step:38/2285 train_time:2286ms step_avg:60.15ms
+step:39/2285 train_time:2347ms step_avg:60.19ms
+step:40/2285 train_time:2407ms step_avg:60.18ms
+step:41/2285 train_time:2467ms step_avg:60.18ms
+step:42/2285 train_time:2526ms step_avg:60.15ms
+step:43/2285 train_time:2588ms step_avg:60.18ms
+step:44/2285 train_time:2646ms step_avg:60.14ms
+step:45/2285 train_time:2708ms step_avg:60.18ms
+step:46/2285 train_time:2768ms step_avg:60.18ms
+step:47/2285 train_time:2830ms step_avg:60.20ms
+step:48/2285 train_time:2888ms step_avg:60.17ms
+step:49/2285 train_time:2950ms step_avg:60.20ms
+step:50/2285 train_time:3009ms step_avg:60.18ms
+step:51/2285 train_time:3070ms step_avg:60.21ms
+step:52/2285 train_time:3130ms step_avg:60.18ms
+step:53/2285 train_time:3191ms step_avg:60.21ms
+step:54/2285 train_time:3250ms step_avg:60.19ms
+step:55/2285 train_time:3312ms step_avg:60.21ms
+step:56/2285 train_time:3371ms step_avg:60.20ms
+step:57/2285 train_time:3432ms step_avg:60.21ms
+step:58/2285 train_time:3490ms step_avg:60.18ms
+step:59/2285 train_time:3551ms step_avg:60.19ms
+step:60/2285 train_time:3610ms step_avg:60.17ms
+step:61/2285 train_time:3672ms step_avg:60.19ms
+step:62/2285 train_time:3730ms step_avg:60.16ms
+step:63/2285 train_time:3792ms step_avg:60.19ms
+step:64/2285 train_time:3850ms step_avg:60.16ms
+step:65/2285 train_time:3911ms step_avg:60.17ms
+step:66/2285 train_time:3970ms step_avg:60.15ms
+step:67/2285 train_time:4032ms step_avg:60.18ms
+step:68/2285 train_time:4091ms step_avg:60.15ms
+step:69/2285 train_time:4152ms step_avg:60.17ms
+step:70/2285 train_time:4211ms step_avg:60.15ms
+step:71/2285 train_time:4271ms step_avg:60.16ms
+step:72/2285 train_time:4330ms step_avg:60.14ms
+step:73/2285 train_time:4391ms step_avg:60.15ms
+step:74/2285 train_time:4450ms step_avg:60.13ms
+step:75/2285 train_time:4511ms step_avg:60.15ms
+step:76/2285 train_time:4570ms step_avg:60.13ms
+step:77/2285 train_time:4631ms step_avg:60.14ms
+step:78/2285 train_time:4690ms step_avg:60.13ms
+step:79/2285 train_time:4751ms step_avg:60.14ms
+step:80/2285 train_time:4809ms step_avg:60.12ms
+step:81/2285 train_time:4871ms step_avg:60.14ms
+step:82/2285 train_time:4930ms step_avg:60.12ms
+step:83/2285 train_time:4992ms step_avg:60.14ms
+step:84/2285 train_time:5050ms step_avg:60.12ms
+step:85/2285 train_time:5112ms step_avg:60.14ms
+step:86/2285 train_time:5170ms step_avg:60.12ms
+step:87/2285 train_time:5232ms step_avg:60.14ms
+step:88/2285 train_time:5290ms step_avg:60.12ms
+step:89/2285 train_time:5351ms step_avg:60.13ms
+step:90/2285 train_time:5410ms step_avg:60.11ms
+step:91/2285 train_time:5471ms step_avg:60.12ms
+step:92/2285 train_time:5530ms step_avg:60.11ms
+step:93/2285 train_time:5590ms step_avg:60.11ms
+step:94/2285 train_time:5649ms step_avg:60.10ms
+step:95/2285 train_time:5710ms step_avg:60.11ms
+step:96/2285 train_time:5769ms step_avg:60.09ms
+step:97/2285 train_time:5830ms step_avg:60.10ms
+step:98/2285 train_time:5889ms step_avg:60.09ms
+step:99/2285 train_time:5950ms step_avg:60.10ms
+step:100/2285 train_time:6009ms step_avg:60.09ms
+step:101/2285 train_time:6070ms step_avg:60.10ms
+step:102/2285 train_time:6129ms step_avg:60.09ms
+step:103/2285 train_time:6190ms step_avg:60.10ms
+step:104/2285 train_time:6249ms step_avg:60.09ms
+step:105/2285 train_time:6310ms step_avg:60.10ms
+step:106/2285 train_time:6369ms step_avg:60.08ms
+step:107/2285 train_time:6430ms step_avg:60.09ms
+step:108/2285 train_time:6489ms step_avg:60.08ms
+step:109/2285 train_time:6549ms step_avg:60.09ms
+step:110/2285 train_time:6608ms step_avg:60.07ms
+step:111/2285 train_time:6669ms step_avg:60.08ms
+step:112/2285 train_time:6727ms step_avg:60.07ms
+step:113/2285 train_time:6788ms step_avg:60.07ms
+step:114/2285 train_time:6847ms step_avg:60.06ms
+step:115/2285 train_time:6908ms step_avg:60.07ms
+step:116/2285 train_time:6967ms step_avg:60.06ms
+step:117/2285 train_time:7029ms step_avg:60.08ms
+step:118/2285 train_time:7088ms step_avg:60.07ms
+step:119/2285 train_time:7149ms step_avg:60.08ms
+step:120/2285 train_time:7208ms step_avg:60.07ms
+step:121/2285 train_time:7269ms step_avg:60.08ms
+step:122/2285 train_time:7328ms step_avg:60.07ms
+step:123/2285 train_time:7389ms step_avg:60.07ms
+step:124/2285 train_time:7447ms step_avg:60.06ms
+step:125/2285 train_time:7509ms step_avg:60.07ms
+step:126/2285 train_time:7567ms step_avg:60.06ms
+step:127/2285 train_time:7628ms step_avg:60.06ms
+step:128/2285 train_time:7687ms step_avg:60.05ms
+step:129/2285 train_time:7748ms step_avg:60.06ms
+step:130/2285 train_time:7807ms step_avg:60.05ms
+step:131/2285 train_time:7868ms step_avg:60.06ms
+step:132/2285 train_time:7927ms step_avg:60.05ms
+step:133/2285 train_time:7988ms step_avg:60.06ms
+step:134/2285 train_time:8047ms step_avg:60.05ms
+step:135/2285 train_time:8108ms step_avg:60.06ms
+step:136/2285 train_time:8167ms step_avg:60.05ms
+step:137/2285 train_time:8228ms step_avg:60.06ms
+step:138/2285 train_time:8287ms step_avg:60.05ms
+step:139/2285 train_time:8348ms step_avg:60.06ms
+step:140/2285 train_time:8407ms step_avg:60.05ms
+step:141/2285 train_time:8468ms step_avg:60.06ms
+step:142/2285 train_time:8527ms step_avg:60.05ms
+step:143/2285 train_time:8588ms step_avg:60.06ms
+step:144/2285 train_time:8646ms step_avg:60.04ms
+step:145/2285 train_time:8708ms step_avg:60.05ms
+step:146/2285 train_time:8766ms step_avg:60.04ms
+step:147/2285 train_time:8828ms step_avg:60.05ms
+step:148/2285 train_time:8886ms step_avg:60.04ms
+step:149/2285 train_time:8947ms step_avg:60.05ms
+step:150/2285 train_time:9006ms step_avg:60.04ms
+step:151/2285 train_time:9067ms step_avg:60.05ms
+step:152/2285 train_time:9127ms step_avg:60.04ms
+step:153/2285 train_time:9188ms step_avg:60.05ms
+step:154/2285 train_time:9246ms step_avg:60.04ms
+step:155/2285 train_time:9307ms step_avg:60.05ms
+step:156/2285 train_time:9366ms step_avg:60.04ms
+step:157/2285 train_time:9428ms step_avg:60.05ms
+step:158/2285 train_time:9486ms step_avg:60.04ms
+step:159/2285 train_time:9547ms step_avg:60.04ms
+step:160/2285 train_time:9605ms step_avg:60.03ms
+step:161/2285 train_time:9667ms step_avg:60.04ms
+step:162/2285 train_time:9726ms step_avg:60.04ms
+step:163/2285 train_time:9787ms step_avg:60.04ms
+step:164/2285 train_time:9845ms step_avg:60.03ms
+step:165/2285 train_time:9906ms step_avg:60.04ms
+step:166/2285 train_time:9965ms step_avg:60.03ms
+step:167/2285 train_time:10026ms step_avg:60.04ms
+step:168/2285 train_time:10086ms step_avg:60.03ms
+step:169/2285 train_time:10146ms step_avg:60.04ms
+step:170/2285 train_time:10205ms step_avg:60.03ms
+step:171/2285 train_time:10266ms step_avg:60.04ms
+step:172/2285 train_time:10325ms step_avg:60.03ms
+step:173/2285 train_time:10387ms step_avg:60.04ms
+step:174/2285 train_time:10445ms step_avg:60.03ms
+step:175/2285 train_time:10506ms step_avg:60.04ms
+step:176/2285 train_time:10565ms step_avg:60.03ms
+step:177/2285 train_time:10627ms step_avg:60.04ms
+step:178/2285 train_time:10685ms step_avg:60.03ms
+step:179/2285 train_time:10746ms step_avg:60.03ms
+step:180/2285 train_time:10805ms step_avg:60.03ms
+step:181/2285 train_time:10866ms step_avg:60.03ms
+step:182/2285 train_time:10926ms step_avg:60.04ms
+step:183/2285 train_time:10987ms step_avg:60.04ms
+step:184/2285 train_time:11045ms step_avg:60.03ms
+step:185/2285 train_time:11106ms step_avg:60.03ms
+step:186/2285 train_time:11165ms step_avg:60.03ms
+step:187/2285 train_time:11226ms step_avg:60.03ms
+step:188/2285 train_time:11285ms step_avg:60.03ms
+step:189/2285 train_time:11346ms step_avg:60.03ms
+step:190/2285 train_time:11405ms step_avg:60.03ms
+step:191/2285 train_time:11466ms step_avg:60.03ms
+step:192/2285 train_time:11525ms step_avg:60.02ms
+step:193/2285 train_time:11586ms step_avg:60.03ms
+step:194/2285 train_time:11644ms step_avg:60.02ms
+step:195/2285 train_time:11705ms step_avg:60.03ms
+step:196/2285 train_time:11764ms step_avg:60.02ms
+step:197/2285 train_time:11825ms step_avg:60.03ms
+step:198/2285 train_time:11884ms step_avg:60.02ms
+step:199/2285 train_time:11946ms step_avg:60.03ms
+step:200/2285 train_time:12005ms step_avg:60.02ms
+step:201/2285 train_time:12066ms step_avg:60.03ms
+step:202/2285 train_time:12125ms step_avg:60.03ms
+step:203/2285 train_time:12186ms step_avg:60.03ms
+step:204/2285 train_time:12245ms step_avg:60.02ms
+step:205/2285 train_time:12306ms step_avg:60.03ms
+step:206/2285 train_time:12366ms step_avg:60.03ms
+step:207/2285 train_time:12427ms step_avg:60.03ms
+step:208/2285 train_time:12485ms step_avg:60.02ms
+step:209/2285 train_time:12546ms step_avg:60.03ms
+step:210/2285 train_time:12606ms step_avg:60.03ms
+step:211/2285 train_time:12667ms step_avg:60.04ms
+step:212/2285 train_time:12726ms step_avg:60.03ms
+step:213/2285 train_time:12787ms step_avg:60.03ms
+step:214/2285 train_time:12846ms step_avg:60.03ms
+step:215/2285 train_time:12907ms step_avg:60.03ms
+step:216/2285 train_time:12966ms step_avg:60.03ms
+step:217/2285 train_time:13027ms step_avg:60.03ms
+step:218/2285 train_time:13086ms step_avg:60.03ms
+step:219/2285 train_time:13147ms step_avg:60.03ms
+step:220/2285 train_time:13206ms step_avg:60.03ms
+step:221/2285 train_time:13267ms step_avg:60.03ms
+step:222/2285 train_time:13326ms step_avg:60.03ms
+step:223/2285 train_time:13387ms step_avg:60.03ms
+step:224/2285 train_time:13445ms step_avg:60.02ms
+step:225/2285 train_time:13507ms step_avg:60.03ms
+step:226/2285 train_time:13565ms step_avg:60.02ms
+step:227/2285 train_time:13627ms step_avg:60.03ms
+step:228/2285 train_time:13685ms step_avg:60.02ms
+step:229/2285 train_time:13746ms step_avg:60.03ms
+step:230/2285 train_time:13805ms step_avg:60.02ms
+step:231/2285 train_time:13866ms step_avg:60.03ms
+step:232/2285 train_time:13925ms step_avg:60.02ms
+step:233/2285 train_time:13986ms step_avg:60.03ms
+step:234/2285 train_time:14045ms step_avg:60.02ms
+step:235/2285 train_time:14107ms step_avg:60.03ms
+step:236/2285 train_time:14166ms step_avg:60.02ms
+step:237/2285 train_time:14227ms step_avg:60.03ms
+step:238/2285 train_time:14286ms step_avg:60.02ms
+step:239/2285 train_time:14346ms step_avg:60.03ms
+step:240/2285 train_time:14406ms step_avg:60.02ms
+step:241/2285 train_time:14467ms step_avg:60.03ms
+step:242/2285 train_time:14526ms step_avg:60.02ms
+step:243/2285 train_time:14586ms step_avg:60.03ms
+step:244/2285 train_time:14645ms step_avg:60.02ms
+step:245/2285 train_time:14706ms step_avg:60.02ms
+step:246/2285 train_time:14765ms step_avg:60.02ms
+step:247/2285 train_time:14826ms step_avg:60.02ms
+step:248/2285 train_time:14884ms step_avg:60.02ms
+step:249/2285 train_time:14945ms step_avg:60.02ms
+step:250/2285 train_time:15004ms step_avg:60.01ms
+step:250/2285 val_loss:4.0876 train_time:15067ms step_avg:60.27ms
+step:251/2285 train_time:15086ms step_avg:60.10ms
+step:252/2285 train_time:15126ms step_avg:60.02ms
+step:253/2285 train_time:15194ms step_avg:60.06ms
+step:254/2285 train_time:15259ms step_avg:60.08ms
+step:255/2285 train_time:15320ms step_avg:60.08ms
+step:256/2285 train_time:15379ms step_avg:60.07ms
+step:257/2285 train_time:15439ms step_avg:60.07ms
+step:258/2285 train_time:15497ms step_avg:60.07ms
+step:259/2285 train_time:15557ms step_avg:60.07ms
+step:260/2285 train_time:15615ms step_avg:60.06ms
+step:261/2285 train_time:15676ms step_avg:60.06ms
+step:262/2285 train_time:15733ms step_avg:60.05ms
+step:263/2285 train_time:15793ms step_avg:60.05ms
+step:264/2285 train_time:15851ms step_avg:60.04ms
+step:265/2285 train_time:15911ms step_avg:60.04ms
+step:266/2285 train_time:15969ms step_avg:60.03ms
+step:267/2285 train_time:16031ms step_avg:60.04ms
+step:268/2285 train_time:16090ms step_avg:60.04ms
+step:269/2285 train_time:16153ms step_avg:60.05ms
+step:270/2285 train_time:16216ms step_avg:60.06ms
+step:271/2285 train_time:16277ms step_avg:60.06ms
+step:272/2285 train_time:16335ms step_avg:60.06ms
+step:273/2285 train_time:16397ms step_avg:60.06ms
+step:274/2285 train_time:16455ms step_avg:60.06ms
+step:275/2285 train_time:16516ms step_avg:60.06ms
+step:276/2285 train_time:16576ms step_avg:60.06ms
+step:277/2285 train_time:16635ms step_avg:60.05ms
+step:278/2285 train_time:16693ms step_avg:60.05ms
+step:279/2285 train_time:16753ms step_avg:60.05ms
+step:280/2285 train_time:16811ms step_avg:60.04ms
+step:281/2285 train_time:16870ms step_avg:60.04ms
+step:282/2285 train_time:16928ms step_avg:60.03ms
+step:283/2285 train_time:16989ms step_avg:60.03ms
+step:284/2285 train_time:17047ms step_avg:60.02ms
+step:285/2285 train_time:17109ms step_avg:60.03ms
+step:286/2285 train_time:17169ms step_avg:60.03ms
+step:287/2285 train_time:17231ms step_avg:60.04ms
+step:288/2285 train_time:17290ms step_avg:60.04ms
+step:289/2285 train_time:17352ms step_avg:60.04ms
+step:290/2285 train_time:17411ms step_avg:60.04ms
+step:291/2285 train_time:17472ms step_avg:60.04ms
+step:292/2285 train_time:17531ms step_avg:60.04ms
+step:293/2285 train_time:17592ms step_avg:60.04ms
+step:294/2285 train_time:17651ms step_avg:60.04ms
+step:295/2285 train_time:17712ms step_avg:60.04ms
+step:296/2285 train_time:17770ms step_avg:60.03ms
+step:297/2285 train_time:17829ms step_avg:60.03ms
+step:298/2285 train_time:17887ms step_avg:60.02ms
+step:299/2285 train_time:17948ms step_avg:60.03ms
+step:300/2285 train_time:18006ms step_avg:60.02ms
+step:301/2285 train_time:18067ms step_avg:60.02ms
+step:302/2285 train_time:18126ms step_avg:60.02ms
+step:303/2285 train_time:18187ms step_avg:60.02ms
+step:304/2285 train_time:18246ms step_avg:60.02ms
+step:305/2285 train_time:18309ms step_avg:60.03ms
+step:306/2285 train_time:18367ms step_avg:60.02ms
+step:307/2285 train_time:18429ms step_avg:60.03ms
+step:308/2285 train_time:18488ms step_avg:60.03ms
+step:309/2285 train_time:18549ms step_avg:60.03ms
+step:310/2285 train_time:18609ms step_avg:60.03ms
+step:311/2285 train_time:18669ms step_avg:60.03ms
+step:312/2285 train_time:18728ms step_avg:60.02ms
+step:313/2285 train_time:18788ms step_avg:60.03ms
+step:314/2285 train_time:18846ms step_avg:60.02ms
+step:315/2285 train_time:18907ms step_avg:60.02ms
+step:316/2285 train_time:18965ms step_avg:60.02ms
+step:317/2285 train_time:19027ms step_avg:60.02ms
+step:318/2285 train_time:19085ms step_avg:60.01ms
+step:319/2285 train_time:19147ms step_avg:60.02ms
+step:320/2285 train_time:19206ms step_avg:60.02ms
+step:321/2285 train_time:19268ms step_avg:60.03ms
+step:322/2285 train_time:19327ms step_avg:60.02ms
+step:323/2285 train_time:19389ms step_avg:60.03ms
+step:324/2285 train_time:19448ms step_avg:60.02ms
+step:325/2285 train_time:19510ms step_avg:60.03ms
+step:326/2285 train_time:19569ms step_avg:60.03ms
+step:327/2285 train_time:19629ms step_avg:60.03ms
+step:328/2285 train_time:19688ms step_avg:60.02ms
+step:329/2285 train_time:19748ms step_avg:60.02ms
+step:330/2285 train_time:19807ms step_avg:60.02ms
+step:331/2285 train_time:19867ms step_avg:60.02ms
+step:332/2285 train_time:19925ms step_avg:60.02ms
+step:333/2285 train_time:19986ms step_avg:60.02ms
+step:334/2285 train_time:20045ms step_avg:60.01ms
+step:335/2285 train_time:20105ms step_avg:60.02ms
+step:336/2285 train_time:20164ms step_avg:60.01ms
+step:337/2285 train_time:20225ms step_avg:60.01ms
+step:338/2285 train_time:20284ms step_avg:60.01ms
+step:339/2285 train_time:20345ms step_avg:60.01ms
+step:340/2285 train_time:20404ms step_avg:60.01ms
+step:341/2285 train_time:20467ms step_avg:60.02ms
+step:342/2285 train_time:20526ms step_avg:60.02ms
+step:343/2285 train_time:20587ms step_avg:60.02ms
+step:344/2285 train_time:20646ms step_avg:60.02ms
+step:345/2285 train_time:20707ms step_avg:60.02ms
+step:346/2285 train_time:20766ms step_avg:60.02ms
+step:347/2285 train_time:20827ms step_avg:60.02ms
+step:348/2285 train_time:20885ms step_avg:60.01ms
+step:349/2285 train_time:20946ms step_avg:60.02ms
+step:350/2285 train_time:21004ms step_avg:60.01ms
+step:351/2285 train_time:21065ms step_avg:60.01ms
+step:352/2285 train_time:21123ms step_avg:60.01ms
+step:353/2285 train_time:21184ms step_avg:60.01ms
+step:354/2285 train_time:21242ms step_avg:60.01ms
+step:355/2285 train_time:21304ms step_avg:60.01ms
+step:356/2285 train_time:21362ms step_avg:60.01ms
+step:357/2285 train_time:21425ms step_avg:60.01ms
+step:358/2285 train_time:21483ms step_avg:60.01ms
+step:359/2285 train_time:21544ms step_avg:60.01ms
+step:360/2285 train_time:21603ms step_avg:60.01ms
+step:361/2285 train_time:21664ms step_avg:60.01ms
+step:362/2285 train_time:21723ms step_avg:60.01ms
+step:363/2285 train_time:21783ms step_avg:60.01ms
+step:364/2285 train_time:21842ms step_avg:60.01ms
+step:365/2285 train_time:21902ms step_avg:60.01ms
+step:366/2285 train_time:21960ms step_avg:60.00ms
+step:367/2285 train_time:22021ms step_avg:60.00ms
+step:368/2285 train_time:22079ms step_avg:60.00ms
+step:369/2285 train_time:22140ms step_avg:60.00ms
+step:370/2285 train_time:22197ms step_avg:59.99ms
+step:371/2285 train_time:22259ms step_avg:60.00ms
+step:372/2285 train_time:22317ms step_avg:59.99ms
+step:373/2285 train_time:22378ms step_avg:60.00ms
+step:374/2285 train_time:22437ms step_avg:59.99ms
+step:375/2285 train_time:22497ms step_avg:59.99ms
+step:376/2285 train_time:22556ms step_avg:59.99ms
+step:377/2285 train_time:22617ms step_avg:59.99ms
+step:378/2285 train_time:22676ms step_avg:59.99ms
+step:379/2285 train_time:22736ms step_avg:59.99ms
+step:380/2285 train_time:22794ms step_avg:59.99ms
+step:381/2285 train_time:22855ms step_avg:59.99ms
+step:382/2285 train_time:22914ms step_avg:59.98ms
+step:383/2285 train_time:22975ms step_avg:59.99ms
+step:384/2285 train_time:23034ms step_avg:59.98ms
+step:385/2285 train_time:23095ms step_avg:59.99ms
+step:386/2285 train_time:23154ms step_avg:59.98ms
+step:387/2285 train_time:23215ms step_avg:59.99ms
+step:388/2285 train_time:23274ms step_avg:59.98ms
+step:389/2285 train_time:23335ms step_avg:59.99ms
+step:390/2285 train_time:23394ms step_avg:59.99ms
+step:391/2285 train_time:23456ms step_avg:59.99ms
+step:392/2285 train_time:23515ms step_avg:59.99ms
+step:393/2285 train_time:23576ms step_avg:59.99ms
+step:394/2285 train_time:23635ms step_avg:59.99ms
+step:395/2285 train_time:23696ms step_avg:59.99ms
+step:396/2285 train_time:23755ms step_avg:59.99ms
+step:397/2285 train_time:23817ms step_avg:59.99ms
+step:398/2285 train_time:23875ms step_avg:59.99ms
+step:399/2285 train_time:23936ms step_avg:59.99ms
+step:400/2285 train_time:23995ms step_avg:59.99ms
+step:401/2285 train_time:24055ms step_avg:59.99ms
+step:402/2285 train_time:24114ms step_avg:59.99ms
+step:403/2285 train_time:24175ms step_avg:59.99ms
+step:404/2285 train_time:24234ms step_avg:59.98ms
+step:405/2285 train_time:24296ms step_avg:59.99ms
+step:406/2285 train_time:24355ms step_avg:59.99ms
+step:407/2285 train_time:24417ms step_avg:59.99ms
+step:408/2285 train_time:24475ms step_avg:59.99ms
+step:409/2285 train_time:24536ms step_avg:59.99ms
+step:410/2285 train_time:24595ms step_avg:59.99ms
+step:411/2285 train_time:24657ms step_avg:59.99ms
+step:412/2285 train_time:24715ms step_avg:59.99ms
+step:413/2285 train_time:24776ms step_avg:59.99ms
+step:414/2285 train_time:24835ms step_avg:59.99ms
+step:415/2285 train_time:24896ms step_avg:59.99ms
+step:416/2285 train_time:24955ms step_avg:59.99ms
+step:417/2285 train_time:25017ms step_avg:59.99ms
+step:418/2285 train_time:25075ms step_avg:59.99ms
+step:419/2285 train_time:25136ms step_avg:59.99ms
+step:420/2285 train_time:25195ms step_avg:59.99ms
+step:421/2285 train_time:25256ms step_avg:59.99ms
+step:422/2285 train_time:25315ms step_avg:59.99ms
+step:423/2285 train_time:25376ms step_avg:59.99ms
+step:424/2285 train_time:25435ms step_avg:59.99ms
+step:425/2285 train_time:25496ms step_avg:59.99ms
+step:426/2285 train_time:25555ms step_avg:59.99ms
+step:427/2285 train_time:25617ms step_avg:59.99ms
+step:428/2285 train_time:25675ms step_avg:59.99ms
+step:429/2285 train_time:25736ms step_avg:59.99ms
+step:430/2285 train_time:25795ms step_avg:59.99ms
+step:431/2285 train_time:25856ms step_avg:59.99ms
+step:432/2285 train_time:25915ms step_avg:59.99ms
+step:433/2285 train_time:25976ms step_avg:59.99ms
+step:434/2285 train_time:26035ms step_avg:59.99ms
+step:435/2285 train_time:26096ms step_avg:59.99ms
+step:436/2285 train_time:26155ms step_avg:59.99ms
+step:437/2285 train_time:26216ms step_avg:59.99ms
+step:438/2285 train_time:26275ms step_avg:59.99ms
+step:439/2285 train_time:26336ms step_avg:59.99ms
+step:440/2285 train_time:26394ms step_avg:59.99ms
+step:441/2285 train_time:26456ms step_avg:59.99ms
+step:442/2285 train_time:26515ms step_avg:59.99ms
+step:443/2285 train_time:26576ms step_avg:59.99ms
+step:444/2285 train_time:26635ms step_avg:59.99ms
+step:445/2285 train_time:26696ms step_avg:59.99ms
+step:446/2285 train_time:26755ms step_avg:59.99ms
+step:447/2285 train_time:26816ms step_avg:59.99ms
+step:448/2285 train_time:26875ms step_avg:59.99ms
+step:449/2285 train_time:26936ms step_avg:59.99ms
+step:450/2285 train_time:26995ms step_avg:59.99ms
+step:451/2285 train_time:27056ms step_avg:59.99ms
+step:452/2285 train_time:27115ms step_avg:59.99ms
+step:453/2285 train_time:27177ms step_avg:59.99ms
+step:454/2285 train_time:27236ms step_avg:59.99ms
+step:455/2285 train_time:27297ms step_avg:59.99ms
+step:456/2285 train_time:27356ms step_avg:59.99ms
+step:457/2285 train_time:27417ms step_avg:59.99ms
+step:458/2285 train_time:27476ms step_avg:59.99ms
+step:459/2285 train_time:27537ms step_avg:59.99ms
+step:460/2285 train_time:27595ms step_avg:59.99ms
+step:461/2285 train_time:27656ms step_avg:59.99ms
+step:462/2285 train_time:27715ms step_avg:59.99ms
+step:463/2285 train_time:27779ms step_avg:60.00ms
+step:464/2285 train_time:27834ms step_avg:59.99ms
+step:465/2285 train_time:27895ms step_avg:59.99ms
+step:466/2285 train_time:27954ms step_avg:59.99ms
+step:467/2285 train_time:28015ms step_avg:59.99ms
+step:468/2285 train_time:28074ms step_avg:59.99ms
+step:469/2285 train_time:28136ms step_avg:59.99ms
+step:470/2285 train_time:28195ms step_avg:59.99ms
+step:471/2285 train_time:28256ms step_avg:59.99ms
+step:472/2285 train_time:28315ms step_avg:59.99ms
+step:473/2285 train_time:28376ms step_avg:59.99ms
+step:474/2285 train_time:28435ms step_avg:59.99ms
+step:475/2285 train_time:28496ms step_avg:59.99ms
+step:476/2285 train_time:28555ms step_avg:59.99ms
+step:477/2285 train_time:28616ms step_avg:59.99ms
+step:478/2285 train_time:28675ms step_avg:59.99ms
+step:479/2285 train_time:28736ms step_avg:59.99ms
+step:480/2285 train_time:28795ms step_avg:59.99ms
+step:481/2285 train_time:28856ms step_avg:59.99ms
+step:482/2285 train_time:28915ms step_avg:59.99ms
+step:483/2285 train_time:28975ms step_avg:59.99ms
+step:484/2285 train_time:29034ms step_avg:59.99ms
+step:485/2285 train_time:29095ms step_avg:59.99ms
+step:486/2285 train_time:29154ms step_avg:59.99ms
+step:487/2285 train_time:29216ms step_avg:59.99ms
+step:488/2285 train_time:29275ms step_avg:59.99ms
+step:489/2285 train_time:29336ms step_avg:59.99ms
+step:490/2285 train_time:29395ms step_avg:59.99ms
+step:491/2285 train_time:29457ms step_avg:59.99ms
+step:492/2285 train_time:29516ms step_avg:59.99ms
+step:493/2285 train_time:29578ms step_avg:60.00ms
+step:494/2285 train_time:29636ms step_avg:59.99ms
+step:495/2285 train_time:29697ms step_avg:59.99ms
+step:496/2285 train_time:29757ms step_avg:59.99ms
+step:497/2285 train_time:29818ms step_avg:60.00ms
+step:498/2285 train_time:29877ms step_avg:59.99ms
+step:499/2285 train_time:29938ms step_avg:60.00ms
+step:500/2285 train_time:29996ms step_avg:59.99ms
+step:500/2285 val_loss:3.7874 train_time:30059ms step_avg:60.12ms
+step:501/2285 train_time:30087ms step_avg:60.05ms
+step:502/2285 train_time:30120ms step_avg:60.00ms
+step:503/2285 train_time:30180ms step_avg:60.00ms
+step:504/2285 train_time:30241ms step_avg:60.00ms
+step:505/2285 train_time:30303ms step_avg:60.01ms
+step:506/2285 train_time:30362ms step_avg:60.00ms
+step:507/2285 train_time:30423ms step_avg:60.01ms
+step:508/2285 train_time:30481ms step_avg:60.00ms
+step:509/2285 train_time:30542ms step_avg:60.00ms
+step:510/2285 train_time:30600ms step_avg:60.00ms
+step:511/2285 train_time:30660ms step_avg:60.00ms
+step:512/2285 train_time:30719ms step_avg:60.00ms
+step:513/2285 train_time:30779ms step_avg:60.00ms
+step:514/2285 train_time:30838ms step_avg:60.00ms
+step:515/2285 train_time:30898ms step_avg:60.00ms
+step:516/2285 train_time:30957ms step_avg:60.00ms
+step:517/2285 train_time:31023ms step_avg:60.01ms
+step:518/2285 train_time:31085ms step_avg:60.01ms
+step:519/2285 train_time:31147ms step_avg:60.01ms
+step:520/2285 train_time:31206ms step_avg:60.01ms
+step:521/2285 train_time:31267ms step_avg:60.01ms
+step:522/2285 train_time:31326ms step_avg:60.01ms
+step:523/2285 train_time:31387ms step_avg:60.01ms
+step:524/2285 train_time:31446ms step_avg:60.01ms
+step:525/2285 train_time:31507ms step_avg:60.01ms
+step:526/2285 train_time:31566ms step_avg:60.01ms
+step:527/2285 train_time:31627ms step_avg:60.01ms
+step:528/2285 train_time:31686ms step_avg:60.01ms
+step:529/2285 train_time:31747ms step_avg:60.01ms
+step:530/2285 train_time:31806ms step_avg:60.01ms
+step:531/2285 train_time:31867ms step_avg:60.01ms
+step:532/2285 train_time:31926ms step_avg:60.01ms
+step:533/2285 train_time:31989ms step_avg:60.02ms
+step:534/2285 train_time:32048ms step_avg:60.02ms
+step:535/2285 train_time:32110ms step_avg:60.02ms
+step:536/2285 train_time:32170ms step_avg:60.02ms
+step:537/2285 train_time:32233ms step_avg:60.02ms
+step:538/2285 train_time:32293ms step_avg:60.02ms
+step:539/2285 train_time:32354ms step_avg:60.03ms
+step:540/2285 train_time:32413ms step_avg:60.02ms
+step:541/2285 train_time:32474ms step_avg:60.03ms
+step:542/2285 train_time:32533ms step_avg:60.02ms
+step:543/2285 train_time:32594ms step_avg:60.03ms
+step:544/2285 train_time:32653ms step_avg:60.02ms
+step:545/2285 train_time:32714ms step_avg:60.03ms
+step:546/2285 train_time:32773ms step_avg:60.02ms
+step:547/2285 train_time:32835ms step_avg:60.03ms
+step:548/2285 train_time:32894ms step_avg:60.03ms
+step:549/2285 train_time:32957ms step_avg:60.03ms
+step:550/2285 train_time:33016ms step_avg:60.03ms
+step:551/2285 train_time:33077ms step_avg:60.03ms
+step:552/2285 train_time:33137ms step_avg:60.03ms
+step:553/2285 train_time:33199ms step_avg:60.03ms
+step:554/2285 train_time:33258ms step_avg:60.03ms
+step:555/2285 train_time:33320ms step_avg:60.04ms
+step:556/2285 train_time:33378ms step_avg:60.03ms
+step:557/2285 train_time:33439ms step_avg:60.03ms
+step:558/2285 train_time:33498ms step_avg:60.03ms
+step:559/2285 train_time:33559ms step_avg:60.03ms
+step:560/2285 train_time:33618ms step_avg:60.03ms
+step:561/2285 train_time:33680ms step_avg:60.04ms
+step:562/2285 train_time:33739ms step_avg:60.03ms
+step:563/2285 train_time:33800ms step_avg:60.04ms
+step:564/2285 train_time:33859ms step_avg:60.03ms
+step:565/2285 train_time:33921ms step_avg:60.04ms
+step:566/2285 train_time:33979ms step_avg:60.03ms
+step:567/2285 train_time:34041ms step_avg:60.04ms
+step:568/2285 train_time:34100ms step_avg:60.04ms
+step:569/2285 train_time:34162ms step_avg:60.04ms
+step:570/2285 train_time:34220ms step_avg:60.04ms
+step:571/2285 train_time:34282ms step_avg:60.04ms
+step:572/2285 train_time:34341ms step_avg:60.04ms
+step:573/2285 train_time:34402ms step_avg:60.04ms
+step:574/2285 train_time:34461ms step_avg:60.04ms
+step:575/2285 train_time:34522ms step_avg:60.04ms
+step:576/2285 train_time:34580ms step_avg:60.04ms
+step:577/2285 train_time:34642ms step_avg:60.04ms
+step:578/2285 train_time:34700ms step_avg:60.04ms
+step:579/2285 train_time:34762ms step_avg:60.04ms
+step:580/2285 train_time:34820ms step_avg:60.04ms
+step:581/2285 train_time:34881ms step_avg:60.04ms
+step:582/2285 train_time:34940ms step_avg:60.03ms
+step:583/2285 train_time:35002ms step_avg:60.04ms
+step:584/2285 train_time:35061ms step_avg:60.04ms
+step:585/2285 train_time:35122ms step_avg:60.04ms
+step:586/2285 train_time:35181ms step_avg:60.04ms
+step:587/2285 train_time:35242ms step_avg:60.04ms
+step:588/2285 train_time:35301ms step_avg:60.04ms
+step:589/2285 train_time:35362ms step_avg:60.04ms
+step:590/2285 train_time:35421ms step_avg:60.04ms
+step:591/2285 train_time:35482ms step_avg:60.04ms
+step:592/2285 train_time:35541ms step_avg:60.04ms
+step:593/2285 train_time:35602ms step_avg:60.04ms
+step:594/2285 train_time:35661ms step_avg:60.04ms
+step:595/2285 train_time:35722ms step_avg:60.04ms
+step:596/2285 train_time:35781ms step_avg:60.03ms
+step:597/2285 train_time:35842ms step_avg:60.04ms
+step:598/2285 train_time:35901ms step_avg:60.03ms
+step:599/2285 train_time:35962ms step_avg:60.04ms
+step:600/2285 train_time:36021ms step_avg:60.03ms
+step:601/2285 train_time:36082ms step_avg:60.04ms
+step:602/2285 train_time:36141ms step_avg:60.03ms
+step:603/2285 train_time:36202ms step_avg:60.04ms
+step:604/2285 train_time:36261ms step_avg:60.03ms
+step:605/2285 train_time:36322ms step_avg:60.04ms
+step:606/2285 train_time:36381ms step_avg:60.03ms
+step:607/2285 train_time:36443ms step_avg:60.04ms
+step:608/2285 train_time:36502ms step_avg:60.04ms
+step:609/2285 train_time:36562ms step_avg:60.04ms
+step:610/2285 train_time:36621ms step_avg:60.03ms
+step:611/2285 train_time:36682ms step_avg:60.04ms
+step:612/2285 train_time:36741ms step_avg:60.03ms
+step:613/2285 train_time:36802ms step_avg:60.04ms
+step:614/2285 train_time:36861ms step_avg:60.03ms
+step:615/2285 train_time:36922ms step_avg:60.04ms
+step:616/2285 train_time:36981ms step_avg:60.03ms
+step:617/2285 train_time:37043ms step_avg:60.04ms
+step:618/2285 train_time:37102ms step_avg:60.04ms
+step:619/2285 train_time:37163ms step_avg:60.04ms
+step:620/2285 train_time:37221ms step_avg:60.03ms
+step:621/2285 train_time:37282ms step_avg:60.04ms
+step:622/2285 train_time:37341ms step_avg:60.03ms
+step:623/2285 train_time:37402ms step_avg:60.04ms
+step:624/2285 train_time:37461ms step_avg:60.03ms
+step:625/2285 train_time:37522ms step_avg:60.04ms
+step:626/2285 train_time:37581ms step_avg:60.03ms
+step:627/2285 train_time:37642ms step_avg:60.04ms
+step:628/2285 train_time:37701ms step_avg:60.03ms
+step:629/2285 train_time:37763ms step_avg:60.04ms
+step:630/2285 train_time:37821ms step_avg:60.03ms
+step:631/2285 train_time:37882ms step_avg:60.04ms
+step:632/2285 train_time:37941ms step_avg:60.03ms
+step:633/2285 train_time:38003ms step_avg:60.04ms
+step:634/2285 train_time:38061ms step_avg:60.03ms
+step:635/2285 train_time:38122ms step_avg:60.03ms
+step:636/2285 train_time:38181ms step_avg:60.03ms
+step:637/2285 train_time:38242ms step_avg:60.03ms
+step:638/2285 train_time:38301ms step_avg:60.03ms
+step:639/2285 train_time:38362ms step_avg:60.03ms
+step:640/2285 train_time:38420ms step_avg:60.03ms
+step:641/2285 train_time:38482ms step_avg:60.03ms
+step:642/2285 train_time:38541ms step_avg:60.03ms
+step:643/2285 train_time:38602ms step_avg:60.03ms
+step:644/2285 train_time:38661ms step_avg:60.03ms
+step:645/2285 train_time:38723ms step_avg:60.03ms
+step:646/2285 train_time:38781ms step_avg:60.03ms
+step:647/2285 train_time:38842ms step_avg:60.03ms
+step:648/2285 train_time:38901ms step_avg:60.03ms
+step:649/2285 train_time:38962ms step_avg:60.03ms
+step:650/2285 train_time:39021ms step_avg:60.03ms
+step:651/2285 train_time:39081ms step_avg:60.03ms
+step:652/2285 train_time:39140ms step_avg:60.03ms
+step:653/2285 train_time:39202ms step_avg:60.03ms
+step:654/2285 train_time:39260ms step_avg:60.03ms
+step:655/2285 train_time:39322ms step_avg:60.03ms
+step:656/2285 train_time:39381ms step_avg:60.03ms
+step:657/2285 train_time:39442ms step_avg:60.03ms
+step:658/2285 train_time:39501ms step_avg:60.03ms
+step:659/2285 train_time:39563ms step_avg:60.03ms
+step:660/2285 train_time:39622ms step_avg:60.03ms
+step:661/2285 train_time:39684ms step_avg:60.04ms
+step:662/2285 train_time:39743ms step_avg:60.03ms
+step:663/2285 train_time:39804ms step_avg:60.04ms
+step:664/2285 train_time:39862ms step_avg:60.03ms
+step:665/2285 train_time:39923ms step_avg:60.03ms
+step:666/2285 train_time:39982ms step_avg:60.03ms
+step:667/2285 train_time:40042ms step_avg:60.03ms
+step:668/2285 train_time:40101ms step_avg:60.03ms
+step:669/2285 train_time:40163ms step_avg:60.03ms
+step:670/2285 train_time:40221ms step_avg:60.03ms
+step:671/2285 train_time:40282ms step_avg:60.03ms
+step:672/2285 train_time:40341ms step_avg:60.03ms
+step:673/2285 train_time:40402ms step_avg:60.03ms
+step:674/2285 train_time:40461ms step_avg:60.03ms
+step:675/2285 train_time:40522ms step_avg:60.03ms
+step:676/2285 train_time:40582ms step_avg:60.03ms
+step:677/2285 train_time:40643ms step_avg:60.03ms
+step:678/2285 train_time:40702ms step_avg:60.03ms
+step:679/2285 train_time:40764ms step_avg:60.03ms
+step:680/2285 train_time:40822ms step_avg:60.03ms
+step:681/2285 train_time:40884ms step_avg:60.03ms
+step:682/2285 train_time:40942ms step_avg:60.03ms
+step:683/2285 train_time:41003ms step_avg:60.03ms
+step:684/2285 train_time:41062ms step_avg:60.03ms
+step:685/2285 train_time:41123ms step_avg:60.03ms
+step:686/2285 train_time:41181ms step_avg:60.03ms
+step:687/2285 train_time:41242ms step_avg:60.03ms
+step:688/2285 train_time:41301ms step_avg:60.03ms
+step:689/2285 train_time:41364ms step_avg:60.03ms
+step:690/2285 train_time:41422ms step_avg:60.03ms
+step:691/2285 train_time:41483ms step_avg:60.03ms
+step:692/2285 train_time:41542ms step_avg:60.03ms
+step:693/2285 train_time:41603ms step_avg:60.03ms
+step:694/2285 train_time:41662ms step_avg:60.03ms
+step:695/2285 train_time:41724ms step_avg:60.03ms
+step:696/2285 train_time:41783ms step_avg:60.03ms
+step:697/2285 train_time:41843ms step_avg:60.03ms
+step:698/2285 train_time:41902ms step_avg:60.03ms
+step:699/2285 train_time:41964ms step_avg:60.03ms
+step:700/2285 train_time:42022ms step_avg:60.03ms
+step:701/2285 train_time:42083ms step_avg:60.03ms
+step:702/2285 train_time:42142ms step_avg:60.03ms
+step:703/2285 train_time:42203ms step_avg:60.03ms
+step:704/2285 train_time:42262ms step_avg:60.03ms
+step:705/2285 train_time:42323ms step_avg:60.03ms
+step:706/2285 train_time:42382ms step_avg:60.03ms
+step:707/2285 train_time:42443ms step_avg:60.03ms
+step:708/2285 train_time:42502ms step_avg:60.03ms
+step:709/2285 train_time:42563ms step_avg:60.03ms
+step:710/2285 train_time:42622ms step_avg:60.03ms
+step:711/2285 train_time:42684ms step_avg:60.03ms
+step:712/2285 train_time:42742ms step_avg:60.03ms
+step:713/2285 train_time:42804ms step_avg:60.03ms
+step:714/2285 train_time:42863ms step_avg:60.03ms
+step:715/2285 train_time:42925ms step_avg:60.03ms
+step:716/2285 train_time:42983ms step_avg:60.03ms
+step:717/2285 train_time:43044ms step_avg:60.03ms
+step:718/2285 train_time:43104ms step_avg:60.03ms
+step:719/2285 train_time:43165ms step_avg:60.03ms
+step:720/2285 train_time:43223ms step_avg:60.03ms
+step:721/2285 train_time:43284ms step_avg:60.03ms
+step:722/2285 train_time:43343ms step_avg:60.03ms
+step:723/2285 train_time:43404ms step_avg:60.03ms
+step:724/2285 train_time:43463ms step_avg:60.03ms
+step:725/2285 train_time:43524ms step_avg:60.03ms
+step:726/2285 train_time:43583ms step_avg:60.03ms
+step:727/2285 train_time:43645ms step_avg:60.03ms
+step:728/2285 train_time:43703ms step_avg:60.03ms
+step:729/2285 train_time:43765ms step_avg:60.03ms
+step:730/2285 train_time:43823ms step_avg:60.03ms
+step:731/2285 train_time:43885ms step_avg:60.03ms
+step:732/2285 train_time:43944ms step_avg:60.03ms
+step:733/2285 train_time:44005ms step_avg:60.03ms
+step:734/2285 train_time:44064ms step_avg:60.03ms
+step:735/2285 train_time:44125ms step_avg:60.03ms
+step:736/2285 train_time:44184ms step_avg:60.03ms
+step:737/2285 train_time:44245ms step_avg:60.03ms
+step:738/2285 train_time:44304ms step_avg:60.03ms
+step:739/2285 train_time:44366ms step_avg:60.04ms
+step:740/2285 train_time:44424ms step_avg:60.03ms
+step:741/2285 train_time:44486ms step_avg:60.04ms
+step:742/2285 train_time:44545ms step_avg:60.03ms
+step:743/2285 train_time:44607ms step_avg:60.04ms
+step:744/2285 train_time:44666ms step_avg:60.03ms
+step:745/2285 train_time:44727ms step_avg:60.04ms
+step:746/2285 train_time:44786ms step_avg:60.04ms
+step:747/2285 train_time:44848ms step_avg:60.04ms
+step:748/2285 train_time:44907ms step_avg:60.04ms
+step:749/2285 train_time:44968ms step_avg:60.04ms
+step:750/2285 train_time:45028ms step_avg:60.04ms
+step:750/2285 val_loss:3.6604 train_time:45092ms step_avg:60.12ms
+step:751/2285 train_time:45116ms step_avg:60.08ms
+step:752/2285 train_time:45153ms step_avg:60.04ms
+step:753/2285 train_time:45216ms step_avg:60.05ms
+step:754/2285 train_time:45281ms step_avg:60.06ms
+step:755/2285 train_time:45343ms step_avg:60.06ms
+step:756/2285 train_time:45401ms step_avg:60.05ms
+step:757/2285 train_time:45462ms step_avg:60.06ms
+step:758/2285 train_time:45521ms step_avg:60.05ms
+step:759/2285 train_time:45583ms step_avg:60.06ms
+step:760/2285 train_time:45641ms step_avg:60.05ms
+step:761/2285 train_time:45702ms step_avg:60.06ms
+step:762/2285 train_time:45760ms step_avg:60.05ms
+step:763/2285 train_time:45821ms step_avg:60.05ms
+step:764/2285 train_time:45880ms step_avg:60.05ms
+step:765/2285 train_time:45941ms step_avg:60.05ms
+step:766/2285 train_time:46001ms step_avg:60.05ms
+step:767/2285 train_time:46065ms step_avg:60.06ms
+step:768/2285 train_time:46126ms step_avg:60.06ms
+step:769/2285 train_time:46190ms step_avg:60.06ms
+step:770/2285 train_time:46251ms step_avg:60.07ms
+step:771/2285 train_time:46313ms step_avg:60.07ms
+step:772/2285 train_time:46372ms step_avg:60.07ms
+step:773/2285 train_time:46433ms step_avg:60.07ms
+step:774/2285 train_time:46492ms step_avg:60.07ms
+step:775/2285 train_time:46553ms step_avg:60.07ms
+step:776/2285 train_time:46613ms step_avg:60.07ms
+step:777/2285 train_time:46674ms step_avg:60.07ms
+step:778/2285 train_time:46733ms step_avg:60.07ms
+step:779/2285 train_time:46795ms step_avg:60.07ms
+step:780/2285 train_time:46854ms step_avg:60.07ms
+step:781/2285 train_time:46915ms step_avg:60.07ms
+step:782/2285 train_time:46975ms step_avg:60.07ms
+step:783/2285 train_time:47037ms step_avg:60.07ms
+step:784/2285 train_time:47097ms step_avg:60.07ms
+step:785/2285 train_time:47161ms step_avg:60.08ms
+step:786/2285 train_time:47221ms step_avg:60.08ms
+step:787/2285 train_time:47284ms step_avg:60.08ms
+step:788/2285 train_time:47343ms step_avg:60.08ms
+step:789/2285 train_time:47405ms step_avg:60.08ms
+step:790/2285 train_time:47465ms step_avg:60.08ms
+step:791/2285 train_time:47526ms step_avg:60.08ms
+step:792/2285 train_time:47586ms step_avg:60.08ms
+step:793/2285 train_time:47648ms step_avg:60.09ms
+step:794/2285 train_time:47706ms step_avg:60.08ms
+step:795/2285 train_time:47768ms step_avg:60.09ms
+step:796/2285 train_time:47827ms step_avg:60.08ms
+step:797/2285 train_time:47888ms step_avg:60.09ms
+step:798/2285 train_time:47947ms step_avg:60.08ms
+step:799/2285 train_time:48010ms step_avg:60.09ms
+step:800/2285 train_time:48069ms step_avg:60.09ms
+step:801/2285 train_time:48132ms step_avg:60.09ms
+step:802/2285 train_time:48191ms step_avg:60.09ms
+step:803/2285 train_time:48254ms step_avg:60.09ms
+step:804/2285 train_time:48312ms step_avg:60.09ms
+step:805/2285 train_time:48374ms step_avg:60.09ms
+step:806/2285 train_time:48434ms step_avg:60.09ms
+step:807/2285 train_time:48496ms step_avg:60.09ms
+step:808/2285 train_time:48555ms step_avg:60.09ms
+step:809/2285 train_time:48617ms step_avg:60.10ms
+step:810/2285 train_time:48677ms step_avg:60.09ms
+step:811/2285 train_time:48738ms step_avg:60.10ms
+step:812/2285 train_time:48797ms step_avg:60.10ms
+step:813/2285 train_time:48859ms step_avg:60.10ms
+step:814/2285 train_time:48919ms step_avg:60.10ms
+step:815/2285 train_time:48981ms step_avg:60.10ms
+step:816/2285 train_time:49040ms step_avg:60.10ms
+step:817/2285 train_time:49102ms step_avg:60.10ms
+step:818/2285 train_time:49162ms step_avg:60.10ms
+step:819/2285 train_time:49224ms step_avg:60.10ms
+step:820/2285 train_time:49283ms step_avg:60.10ms
+step:821/2285 train_time:49345ms step_avg:60.10ms
+step:822/2285 train_time:49405ms step_avg:60.10ms
+step:823/2285 train_time:49467ms step_avg:60.11ms
+step:824/2285 train_time:49527ms step_avg:60.11ms
+step:825/2285 train_time:49589ms step_avg:60.11ms
+step:826/2285 train_time:49648ms step_avg:60.11ms
+step:827/2285 train_time:49709ms step_avg:60.11ms
+step:828/2285 train_time:49768ms step_avg:60.11ms
+step:829/2285 train_time:49830ms step_avg:60.11ms
+step:830/2285 train_time:49889ms step_avg:60.11ms
+step:831/2285 train_time:49951ms step_avg:60.11ms
+step:832/2285 train_time:50011ms step_avg:60.11ms
+step:833/2285 train_time:50073ms step_avg:60.11ms
+step:834/2285 train_time:50132ms step_avg:60.11ms
+step:835/2285 train_time:50194ms step_avg:60.11ms
+step:836/2285 train_time:50253ms step_avg:60.11ms
+step:837/2285 train_time:50315ms step_avg:60.11ms
+step:838/2285 train_time:50374ms step_avg:60.11ms
+step:839/2285 train_time:50437ms step_avg:60.12ms
+step:840/2285 train_time:50496ms step_avg:60.11ms
+step:841/2285 train_time:50560ms step_avg:60.12ms
+step:842/2285 train_time:50618ms step_avg:60.12ms
+step:843/2285 train_time:50680ms step_avg:60.12ms
+step:844/2285 train_time:50739ms step_avg:60.12ms
+step:845/2285 train_time:50801ms step_avg:60.12ms
+step:846/2285 train_time:50861ms step_avg:60.12ms
+step:847/2285 train_time:50923ms step_avg:60.12ms
+step:848/2285 train_time:50982ms step_avg:60.12ms
+step:849/2285 train_time:51045ms step_avg:60.12ms
+step:850/2285 train_time:51104ms step_avg:60.12ms
+step:851/2285 train_time:51166ms step_avg:60.12ms
+step:852/2285 train_time:51225ms step_avg:60.12ms
+step:853/2285 train_time:51287ms step_avg:60.13ms
+step:854/2285 train_time:51347ms step_avg:60.13ms
+step:855/2285 train_time:51408ms step_avg:60.13ms
+step:856/2285 train_time:51468ms step_avg:60.13ms
+step:857/2285 train_time:51530ms step_avg:60.13ms
+step:858/2285 train_time:51589ms step_avg:60.13ms
+step:859/2285 train_time:51651ms step_avg:60.13ms
+step:860/2285 train_time:51710ms step_avg:60.13ms
+step:861/2285 train_time:51771ms step_avg:60.13ms
+step:862/2285 train_time:51831ms step_avg:60.13ms
+step:863/2285 train_time:51894ms step_avg:60.13ms
+step:864/2285 train_time:51952ms step_avg:60.13ms
+step:865/2285 train_time:52013ms step_avg:60.13ms
+step:866/2285 train_time:52072ms step_avg:60.13ms
+step:867/2285 train_time:52134ms step_avg:60.13ms
+step:868/2285 train_time:52193ms step_avg:60.13ms
+step:869/2285 train_time:52255ms step_avg:60.13ms
+step:870/2285 train_time:52314ms step_avg:60.13ms
+step:871/2285 train_time:52376ms step_avg:60.13ms
+step:872/2285 train_time:52436ms step_avg:60.13ms
+step:873/2285 train_time:52498ms step_avg:60.14ms
+step:874/2285 train_time:52558ms step_avg:60.13ms
+step:875/2285 train_time:52620ms step_avg:60.14ms
+step:876/2285 train_time:52680ms step_avg:60.14ms
+step:877/2285 train_time:52743ms step_avg:60.14ms
+step:878/2285 train_time:52802ms step_avg:60.14ms
+step:879/2285 train_time:52864ms step_avg:60.14ms
+step:880/2285 train_time:52924ms step_avg:60.14ms
+step:881/2285 train_time:52986ms step_avg:60.14ms
+step:882/2285 train_time:53045ms step_avg:60.14ms
+step:883/2285 train_time:53107ms step_avg:60.14ms
+step:884/2285 train_time:53166ms step_avg:60.14ms
+step:885/2285 train_time:53228ms step_avg:60.14ms
+step:886/2285 train_time:53288ms step_avg:60.14ms
+step:887/2285 train_time:53350ms step_avg:60.15ms
+step:888/2285 train_time:53409ms step_avg:60.15ms
+step:889/2285 train_time:53472ms step_avg:60.15ms
+step:890/2285 train_time:53531ms step_avg:60.15ms
+step:891/2285 train_time:53593ms step_avg:60.15ms
+step:892/2285 train_time:53652ms step_avg:60.15ms
+step:893/2285 train_time:53714ms step_avg:60.15ms
+step:894/2285 train_time:53773ms step_avg:60.15ms
+step:895/2285 train_time:53835ms step_avg:60.15ms
+step:896/2285 train_time:53894ms step_avg:60.15ms
+step:897/2285 train_time:53956ms step_avg:60.15ms
+step:898/2285 train_time:54015ms step_avg:60.15ms
+step:899/2285 train_time:54078ms step_avg:60.15ms
+step:900/2285 train_time:54137ms step_avg:60.15ms
+step:901/2285 train_time:54200ms step_avg:60.16ms
+step:902/2285 train_time:54260ms step_avg:60.15ms
+step:903/2285 train_time:54322ms step_avg:60.16ms
+step:904/2285 train_time:54381ms step_avg:60.16ms
+step:905/2285 train_time:54444ms step_avg:60.16ms
+step:906/2285 train_time:54504ms step_avg:60.16ms
+step:907/2285 train_time:54565ms step_avg:60.16ms
+step:908/2285 train_time:54625ms step_avg:60.16ms
+step:909/2285 train_time:54686ms step_avg:60.16ms
+step:910/2285 train_time:54746ms step_avg:60.16ms
+step:911/2285 train_time:54807ms step_avg:60.16ms
+step:912/2285 train_time:54866ms step_avg:60.16ms
+step:913/2285 train_time:54928ms step_avg:60.16ms
+step:914/2285 train_time:54988ms step_avg:60.16ms
+step:915/2285 train_time:55050ms step_avg:60.16ms
+step:916/2285 train_time:55109ms step_avg:60.16ms
+step:917/2285 train_time:55170ms step_avg:60.16ms
+step:918/2285 train_time:55230ms step_avg:60.16ms
+step:919/2285 train_time:55292ms step_avg:60.16ms
+step:920/2285 train_time:55350ms step_avg:60.16ms
+step:921/2285 train_time:55412ms step_avg:60.17ms
+step:922/2285 train_time:55471ms step_avg:60.16ms
+step:923/2285 train_time:55533ms step_avg:60.17ms
+step:924/2285 train_time:55592ms step_avg:60.16ms
+step:925/2285 train_time:55654ms step_avg:60.17ms
+step:926/2285 train_time:55713ms step_avg:60.16ms
+step:927/2285 train_time:55775ms step_avg:60.17ms
+step:928/2285 train_time:55834ms step_avg:60.17ms
+step:929/2285 train_time:55896ms step_avg:60.17ms
+step:930/2285 train_time:55955ms step_avg:60.17ms
+step:931/2285 train_time:56017ms step_avg:60.17ms
+step:932/2285 train_time:56077ms step_avg:60.17ms
+step:933/2285 train_time:56139ms step_avg:60.17ms
+step:934/2285 train_time:56199ms step_avg:60.17ms
+step:935/2285 train_time:56261ms step_avg:60.17ms
+step:936/2285 train_time:56321ms step_avg:60.17ms
+step:937/2285 train_time:56382ms step_avg:60.17ms
+step:938/2285 train_time:56443ms step_avg:60.17ms
+step:939/2285 train_time:56505ms step_avg:60.18ms
+step:940/2285 train_time:56564ms step_avg:60.17ms
+step:941/2285 train_time:56626ms step_avg:60.18ms
+step:942/2285 train_time:56686ms step_avg:60.18ms
+step:943/2285 train_time:56748ms step_avg:60.18ms
+step:944/2285 train_time:56808ms step_avg:60.18ms
+step:945/2285 train_time:56869ms step_avg:60.18ms
+step:946/2285 train_time:56928ms step_avg:60.18ms
+step:947/2285 train_time:56990ms step_avg:60.18ms
+step:948/2285 train_time:57050ms step_avg:60.18ms
+step:949/2285 train_time:57111ms step_avg:60.18ms
+step:950/2285 train_time:57171ms step_avg:60.18ms
+step:951/2285 train_time:57232ms step_avg:60.18ms
+step:952/2285 train_time:57292ms step_avg:60.18ms
+step:953/2285 train_time:57354ms step_avg:60.18ms
+step:954/2285 train_time:57413ms step_avg:60.18ms
+step:955/2285 train_time:57475ms step_avg:60.18ms
+step:956/2285 train_time:57534ms step_avg:60.18ms
+step:957/2285 train_time:57596ms step_avg:60.18ms
+step:958/2285 train_time:57655ms step_avg:60.18ms
+step:959/2285 train_time:57717ms step_avg:60.18ms
+step:960/2285 train_time:57776ms step_avg:60.18ms
+step:961/2285 train_time:57839ms step_avg:60.19ms
+step:962/2285 train_time:57898ms step_avg:60.18ms
+step:963/2285 train_time:57960ms step_avg:60.19ms
+step:964/2285 train_time:58019ms step_avg:60.19ms
+step:965/2285 train_time:58081ms step_avg:60.19ms
+step:966/2285 train_time:58142ms step_avg:60.19ms
+step:967/2285 train_time:58204ms step_avg:60.19ms
+step:968/2285 train_time:58264ms step_avg:60.19ms
+step:969/2285 train_time:58325ms step_avg:60.19ms
+step:970/2285 train_time:58385ms step_avg:60.19ms
+step:971/2285 train_time:58447ms step_avg:60.19ms
+step:972/2285 train_time:58506ms step_avg:60.19ms
+step:973/2285 train_time:58569ms step_avg:60.19ms
+step:974/2285 train_time:58628ms step_avg:60.19ms
+step:975/2285 train_time:58690ms step_avg:60.19ms
+step:976/2285 train_time:58749ms step_avg:60.19ms
+step:977/2285 train_time:58810ms step_avg:60.19ms
+step:978/2285 train_time:58870ms step_avg:60.19ms
+step:979/2285 train_time:58931ms step_avg:60.20ms
+step:980/2285 train_time:58991ms step_avg:60.20ms
+step:981/2285 train_time:59054ms step_avg:60.20ms
+step:982/2285 train_time:59113ms step_avg:60.20ms
+step:983/2285 train_time:59175ms step_avg:60.20ms
+step:984/2285 train_time:59235ms step_avg:60.20ms
+step:985/2285 train_time:59297ms step_avg:60.20ms
+step:986/2285 train_time:59357ms step_avg:60.20ms
+step:987/2285 train_time:59419ms step_avg:60.20ms
+step:988/2285 train_time:59478ms step_avg:60.20ms
+step:989/2285 train_time:59540ms step_avg:60.20ms
+step:990/2285 train_time:59600ms step_avg:60.20ms
+step:991/2285 train_time:59662ms step_avg:60.20ms
+step:992/2285 train_time:59721ms step_avg:60.20ms
+step:993/2285 train_time:59784ms step_avg:60.21ms
+step:994/2285 train_time:59844ms step_avg:60.20ms
+step:995/2285 train_time:59906ms step_avg:60.21ms
+step:996/2285 train_time:59965ms step_avg:60.21ms
+step:997/2285 train_time:60027ms step_avg:60.21ms
+step:998/2285 train_time:60087ms step_avg:60.21ms
+step:999/2285 train_time:60149ms step_avg:60.21ms
+step:1000/2285 train_time:60209ms step_avg:60.21ms
+step:1000/2285 val_loss:3.5730 train_time:60272ms step_avg:60.27ms
+step:1001/2285 train_time:60293ms step_avg:60.23ms
+step:1002/2285 train_time:60334ms step_avg:60.21ms
+step:1003/2285 train_time:60398ms step_avg:60.22ms
+step:1004/2285 train_time:60459ms step_avg:60.22ms
+step:1005/2285 train_time:60522ms step_avg:60.22ms
+step:1006/2285 train_time:60583ms step_avg:60.22ms
+step:1007/2285 train_time:60644ms step_avg:60.22ms
+step:1008/2285 train_time:60702ms step_avg:60.22ms
+step:1009/2285 train_time:60764ms step_avg:60.22ms
+step:1010/2285 train_time:60822ms step_avg:60.22ms
+step:1011/2285 train_time:60883ms step_avg:60.22ms
+step:1012/2285 train_time:60942ms step_avg:60.22ms
+step:1013/2285 train_time:61003ms step_avg:60.22ms
+step:1014/2285 train_time:61062ms step_avg:60.22ms
+step:1015/2285 train_time:61123ms step_avg:60.22ms
+step:1016/2285 train_time:61184ms step_avg:60.22ms
+step:1017/2285 train_time:61250ms step_avg:60.23ms
+step:1018/2285 train_time:61310ms step_avg:60.23ms
+step:1019/2285 train_time:61372ms step_avg:60.23ms
+step:1020/2285 train_time:61432ms step_avg:60.23ms
+step:1021/2285 train_time:61495ms step_avg:60.23ms
+step:1022/2285 train_time:61555ms step_avg:60.23ms
+step:1023/2285 train_time:61617ms step_avg:60.23ms
+step:1024/2285 train_time:61677ms step_avg:60.23ms
+step:1025/2285 train_time:61738ms step_avg:60.23ms
+step:1026/2285 train_time:61797ms step_avg:60.23ms
+step:1027/2285 train_time:61859ms step_avg:60.23ms
+step:1028/2285 train_time:61918ms step_avg:60.23ms
+step:1029/2285 train_time:61979ms step_avg:60.23ms
+step:1030/2285 train_time:62038ms step_avg:60.23ms
+step:1031/2285 train_time:62100ms step_avg:60.23ms
+step:1032/2285 train_time:62160ms step_avg:60.23ms
+step:1033/2285 train_time:62223ms step_avg:60.23ms
+step:1034/2285 train_time:62283ms step_avg:60.23ms
+step:1035/2285 train_time:62345ms step_avg:60.24ms
+step:1036/2285 train_time:62405ms step_avg:60.24ms
+step:1037/2285 train_time:62467ms step_avg:60.24ms
+step:1038/2285 train_time:62527ms step_avg:60.24ms
+step:1039/2285 train_time:62588ms step_avg:60.24ms
+step:1040/2285 train_time:62647ms step_avg:60.24ms
+step:1041/2285 train_time:62709ms step_avg:60.24ms
+step:1042/2285 train_time:62768ms step_avg:60.24ms
+step:1043/2285 train_time:62830ms step_avg:60.24ms
+step:1044/2285 train_time:62889ms step_avg:60.24ms
+step:1045/2285 train_time:62950ms step_avg:60.24ms
+step:1046/2285 train_time:63010ms step_avg:60.24ms
+step:1047/2285 train_time:63072ms step_avg:60.24ms
+step:1048/2285 train_time:63131ms step_avg:60.24ms
+step:1049/2285 train_time:63193ms step_avg:60.24ms
+step:1050/2285 train_time:63253ms step_avg:60.24ms
+step:1051/2285 train_time:63316ms step_avg:60.24ms
+step:1052/2285 train_time:63377ms step_avg:60.24ms
+step:1053/2285 train_time:63439ms step_avg:60.25ms
+step:1054/2285 train_time:63499ms step_avg:60.25ms
+step:1055/2285 train_time:63561ms step_avg:60.25ms
+step:1056/2285 train_time:63620ms step_avg:60.25ms
+step:1057/2285 train_time:63682ms step_avg:60.25ms
+step:1058/2285 train_time:63742ms step_avg:60.25ms
+step:1059/2285 train_time:63803ms step_avg:60.25ms
+step:1060/2285 train_time:63863ms step_avg:60.25ms
+step:1061/2285 train_time:63925ms step_avg:60.25ms
+step:1062/2285 train_time:63984ms step_avg:60.25ms
+step:1063/2285 train_time:64046ms step_avg:60.25ms
+step:1064/2285 train_time:64105ms step_avg:60.25ms
+step:1065/2285 train_time:64167ms step_avg:60.25ms
+step:1066/2285 train_time:64226ms step_avg:60.25ms
+step:1067/2285 train_time:64288ms step_avg:60.25ms
+step:1068/2285 train_time:64348ms step_avg:60.25ms
+step:1069/2285 train_time:64410ms step_avg:60.25ms
+step:1070/2285 train_time:64469ms step_avg:60.25ms
+step:1071/2285 train_time:64531ms step_avg:60.25ms
+step:1072/2285 train_time:64591ms step_avg:60.25ms
+step:1073/2285 train_time:64653ms step_avg:60.25ms
+step:1074/2285 train_time:64712ms step_avg:60.25ms
+step:1075/2285 train_time:64774ms step_avg:60.26ms
+step:1076/2285 train_time:64834ms step_avg:60.25ms
+step:1077/2285 train_time:64897ms step_avg:60.26ms
+step:1078/2285 train_time:64956ms step_avg:60.26ms
+step:1079/2285 train_time:65019ms step_avg:60.26ms
+step:1080/2285 train_time:65078ms step_avg:60.26ms
+step:1081/2285 train_time:65140ms step_avg:60.26ms
+step:1082/2285 train_time:65200ms step_avg:60.26ms
+step:1083/2285 train_time:65262ms step_avg:60.26ms
+step:1084/2285 train_time:65321ms step_avg:60.26ms
+step:1085/2285 train_time:65383ms step_avg:60.26ms
+step:1086/2285 train_time:65443ms step_avg:60.26ms
+step:1087/2285 train_time:65505ms step_avg:60.26ms
+step:1088/2285 train_time:65565ms step_avg:60.26ms
+step:1089/2285 train_time:65627ms step_avg:60.26ms
+step:1090/2285 train_time:65686ms step_avg:60.26ms
+step:1091/2285 train_time:65747ms step_avg:60.26ms
+step:1092/2285 train_time:65807ms step_avg:60.26ms
+step:1093/2285 train_time:65868ms step_avg:60.26ms
+step:1094/2285 train_time:65928ms step_avg:60.26ms
+step:1095/2285 train_time:65990ms step_avg:60.26ms
+step:1096/2285 train_time:66049ms step_avg:60.26ms
+step:1097/2285 train_time:66111ms step_avg:60.27ms
+step:1098/2285 train_time:66171ms step_avg:60.27ms
+step:1099/2285 train_time:66234ms step_avg:60.27ms
+step:1100/2285 train_time:66293ms step_avg:60.27ms
+step:1101/2285 train_time:66355ms step_avg:60.27ms
+step:1102/2285 train_time:66415ms step_avg:60.27ms
+step:1103/2285 train_time:66478ms step_avg:60.27ms
+step:1104/2285 train_time:66538ms step_avg:60.27ms
+step:1105/2285 train_time:66600ms step_avg:60.27ms
+step:1106/2285 train_time:66659ms step_avg:60.27ms
+step:1107/2285 train_time:66722ms step_avg:60.27ms
+step:1108/2285 train_time:66781ms step_avg:60.27ms
+step:1109/2285 train_time:66843ms step_avg:60.27ms
+step:1110/2285 train_time:66903ms step_avg:60.27ms
+step:1111/2285 train_time:66965ms step_avg:60.27ms
+step:1112/2285 train_time:67024ms step_avg:60.27ms
+step:1113/2285 train_time:67086ms step_avg:60.27ms
+step:1114/2285 train_time:67145ms step_avg:60.27ms
+step:1115/2285 train_time:67207ms step_avg:60.28ms
+step:1116/2285 train_time:67266ms step_avg:60.27ms
+step:1117/2285 train_time:67328ms step_avg:60.28ms
+step:1118/2285 train_time:67388ms step_avg:60.28ms
+step:1119/2285 train_time:67449ms step_avg:60.28ms
+step:1120/2285 train_time:67508ms step_avg:60.28ms
+step:1121/2285 train_time:67571ms step_avg:60.28ms
+step:1122/2285 train_time:67630ms step_avg:60.28ms
+step:1123/2285 train_time:67692ms step_avg:60.28ms
+step:1124/2285 train_time:67752ms step_avg:60.28ms
+step:1125/2285 train_time:67814ms step_avg:60.28ms
+step:1126/2285 train_time:67873ms step_avg:60.28ms
+step:1127/2285 train_time:67936ms step_avg:60.28ms
+step:1128/2285 train_time:67996ms step_avg:60.28ms
+step:1129/2285 train_time:68058ms step_avg:60.28ms
+step:1130/2285 train_time:68117ms step_avg:60.28ms
+step:1131/2285 train_time:68180ms step_avg:60.28ms
+step:1132/2285 train_time:68239ms step_avg:60.28ms
+step:1133/2285 train_time:68301ms step_avg:60.28ms
+step:1134/2285 train_time:68361ms step_avg:60.28ms
+step:1135/2285 train_time:68423ms step_avg:60.28ms
+step:1136/2285 train_time:68483ms step_avg:60.28ms
+step:1137/2285 train_time:68544ms step_avg:60.29ms
+step:1138/2285 train_time:68604ms step_avg:60.28ms
+step:1139/2285 train_time:68666ms step_avg:60.29ms
+step:1140/2285 train_time:68726ms step_avg:60.29ms
+step:1141/2285 train_time:68788ms step_avg:60.29ms
+step:1142/2285 train_time:68847ms step_avg:60.29ms
+step:1143/2285 train_time:68909ms step_avg:60.29ms
+step:1144/2285 train_time:68968ms step_avg:60.29ms
+step:1145/2285 train_time:69030ms step_avg:60.29ms
+step:1146/2285 train_time:69089ms step_avg:60.29ms
+step:1147/2285 train_time:69152ms step_avg:60.29ms
+step:1148/2285 train_time:69211ms step_avg:60.29ms
+step:1149/2285 train_time:69274ms step_avg:60.29ms
+step:1150/2285 train_time:69334ms step_avg:60.29ms
+step:1151/2285 train_time:69397ms step_avg:60.29ms
+step:1152/2285 train_time:69457ms step_avg:60.29ms
+step:1153/2285 train_time:69520ms step_avg:60.29ms
+step:1154/2285 train_time:69581ms step_avg:60.30ms
+step:1155/2285 train_time:69643ms step_avg:60.30ms
+step:1156/2285 train_time:69703ms step_avg:60.30ms
+step:1157/2285 train_time:69765ms step_avg:60.30ms
+step:1158/2285 train_time:69826ms step_avg:60.30ms
+step:1159/2285 train_time:69887ms step_avg:60.30ms
+step:1160/2285 train_time:69946ms step_avg:60.30ms
+step:1161/2285 train_time:70008ms step_avg:60.30ms
+step:1162/2285 train_time:70067ms step_avg:60.30ms
+step:1163/2285 train_time:70129ms step_avg:60.30ms
+step:1164/2285 train_time:70189ms step_avg:60.30ms
+step:1165/2285 train_time:70251ms step_avg:60.30ms
+step:1166/2285 train_time:70310ms step_avg:60.30ms
+step:1167/2285 train_time:70373ms step_avg:60.30ms
+step:1168/2285 train_time:70433ms step_avg:60.30ms
+step:1169/2285 train_time:70496ms step_avg:60.30ms
+step:1170/2285 train_time:70556ms step_avg:60.30ms
+step:1171/2285 train_time:70620ms step_avg:60.31ms
+step:1172/2285 train_time:70679ms step_avg:60.31ms
+step:1173/2285 train_time:70741ms step_avg:60.31ms
+step:1174/2285 train_time:70801ms step_avg:60.31ms
+step:1175/2285 train_time:70863ms step_avg:60.31ms
+step:1176/2285 train_time:70923ms step_avg:60.31ms
+step:1177/2285 train_time:70986ms step_avg:60.31ms
+step:1178/2285 train_time:71045ms step_avg:60.31ms
+step:1179/2285 train_time:71106ms step_avg:60.31ms
+step:1180/2285 train_time:71166ms step_avg:60.31ms
+step:1181/2285 train_time:71228ms step_avg:60.31ms
+step:1182/2285 train_time:71288ms step_avg:60.31ms
+step:1183/2285 train_time:71350ms step_avg:60.31ms
+step:1184/2285 train_time:71410ms step_avg:60.31ms
+step:1185/2285 train_time:71472ms step_avg:60.31ms
+step:1186/2285 train_time:71532ms step_avg:60.31ms
+step:1187/2285 train_time:71595ms step_avg:60.32ms
+step:1188/2285 train_time:71655ms step_avg:60.32ms
+step:1189/2285 train_time:71717ms step_avg:60.32ms
+step:1190/2285 train_time:71778ms step_avg:60.32ms
+step:1191/2285 train_time:71840ms step_avg:60.32ms
+step:1192/2285 train_time:71900ms step_avg:60.32ms
+step:1193/2285 train_time:71962ms step_avg:60.32ms
+step:1194/2285 train_time:72021ms step_avg:60.32ms
+step:1195/2285 train_time:72084ms step_avg:60.32ms
+step:1196/2285 train_time:72144ms step_avg:60.32ms
train_time:72144ms step_avg:60.32ms +step:1197/2285 train_time:72205ms step_avg:60.32ms +step:1198/2285 train_time:72264ms step_avg:60.32ms +step:1199/2285 train_time:72326ms step_avg:60.32ms +step:1200/2285 train_time:72386ms step_avg:60.32ms +step:1201/2285 train_time:72448ms step_avg:60.32ms +step:1202/2285 train_time:72507ms step_avg:60.32ms +step:1203/2285 train_time:72570ms step_avg:60.32ms +step:1204/2285 train_time:72630ms step_avg:60.32ms +step:1205/2285 train_time:72692ms step_avg:60.33ms +step:1206/2285 train_time:72751ms step_avg:60.32ms +step:1207/2285 train_time:72814ms step_avg:60.33ms +step:1208/2285 train_time:72875ms step_avg:60.33ms +step:1209/2285 train_time:72937ms step_avg:60.33ms +step:1210/2285 train_time:72997ms step_avg:60.33ms +step:1211/2285 train_time:73059ms step_avg:60.33ms +step:1212/2285 train_time:73119ms step_avg:60.33ms +step:1213/2285 train_time:73182ms step_avg:60.33ms +step:1214/2285 train_time:73242ms step_avg:60.33ms +step:1215/2285 train_time:73304ms step_avg:60.33ms +step:1216/2285 train_time:73364ms step_avg:60.33ms +step:1217/2285 train_time:73426ms step_avg:60.33ms +step:1218/2285 train_time:73485ms step_avg:60.33ms +step:1219/2285 train_time:73547ms step_avg:60.33ms +step:1220/2285 train_time:73606ms step_avg:60.33ms +step:1221/2285 train_time:73668ms step_avg:60.33ms +step:1222/2285 train_time:73728ms step_avg:60.33ms +step:1223/2285 train_time:73791ms step_avg:60.34ms +step:1224/2285 train_time:73850ms step_avg:60.34ms +step:1225/2285 train_time:73912ms step_avg:60.34ms +step:1226/2285 train_time:73972ms step_avg:60.34ms +step:1227/2285 train_time:74034ms step_avg:60.34ms +step:1228/2285 train_time:74095ms step_avg:60.34ms +step:1229/2285 train_time:74158ms step_avg:60.34ms +step:1230/2285 train_time:74218ms step_avg:60.34ms +step:1231/2285 train_time:74280ms step_avg:60.34ms +step:1232/2285 train_time:74340ms step_avg:60.34ms +step:1233/2285 train_time:74402ms step_avg:60.34ms +step:1234/2285 train_time:74462ms step_avg:60.34ms +step:1235/2285 train_time:74525ms step_avg:60.34ms +step:1236/2285 train_time:74585ms step_avg:60.34ms +step:1237/2285 train_time:74646ms step_avg:60.34ms +step:1238/2285 train_time:74706ms step_avg:60.34ms +step:1239/2285 train_time:74767ms step_avg:60.34ms +step:1240/2285 train_time:74827ms step_avg:60.34ms +step:1241/2285 train_time:74889ms step_avg:60.35ms +step:1242/2285 train_time:74949ms step_avg:60.35ms +step:1243/2285 train_time:75012ms step_avg:60.35ms +step:1244/2285 train_time:75071ms step_avg:60.35ms +step:1245/2285 train_time:75134ms step_avg:60.35ms +step:1246/2285 train_time:75195ms step_avg:60.35ms +step:1247/2285 train_time:75256ms step_avg:60.35ms +step:1248/2285 train_time:75315ms step_avg:60.35ms +step:1249/2285 train_time:75378ms step_avg:60.35ms +step:1250/2285 train_time:75439ms step_avg:60.35ms +step:1250/2285 val_loss:3.4968 train_time:75503ms step_avg:60.40ms +step:1251/2285 train_time:75531ms step_avg:60.38ms +step:1252/2285 train_time:75563ms step_avg:60.35ms +step:1253/2285 train_time:75624ms step_avg:60.35ms +step:1254/2285 train_time:75684ms step_avg:60.35ms +step:1255/2285 train_time:75748ms step_avg:60.36ms +step:1256/2285 train_time:75808ms step_avg:60.36ms +step:1257/2285 train_time:75870ms step_avg:60.36ms +step:1258/2285 train_time:75929ms step_avg:60.36ms +step:1259/2285 train_time:75990ms step_avg:60.36ms +step:1260/2285 train_time:76048ms step_avg:60.36ms +step:1261/2285 train_time:76109ms step_avg:60.36ms +step:1262/2285 train_time:76168ms step_avg:60.35ms +step:1263/2285 
train_time:76229ms step_avg:60.36ms +step:1264/2285 train_time:76287ms step_avg:60.35ms +step:1265/2285 train_time:76348ms step_avg:60.35ms +step:1266/2285 train_time:76410ms step_avg:60.36ms +step:1267/2285 train_time:76477ms step_avg:60.36ms +step:1268/2285 train_time:76537ms step_avg:60.36ms +step:1269/2285 train_time:76599ms step_avg:60.36ms +step:1270/2285 train_time:76659ms step_avg:60.36ms +step:1271/2285 train_time:76721ms step_avg:60.36ms +step:1272/2285 train_time:76781ms step_avg:60.36ms +step:1273/2285 train_time:76842ms step_avg:60.36ms +step:1274/2285 train_time:76902ms step_avg:60.36ms +step:1275/2285 train_time:76965ms step_avg:60.36ms +step:1276/2285 train_time:77024ms step_avg:60.36ms +step:1277/2285 train_time:77085ms step_avg:60.36ms +step:1278/2285 train_time:77144ms step_avg:60.36ms +step:1279/2285 train_time:77205ms step_avg:60.36ms +step:1280/2285 train_time:77265ms step_avg:60.36ms +step:1281/2285 train_time:77327ms step_avg:60.36ms +step:1282/2285 train_time:77388ms step_avg:60.37ms +step:1283/2285 train_time:77452ms step_avg:60.37ms +step:1284/2285 train_time:77511ms step_avg:60.37ms +step:1285/2285 train_time:77573ms step_avg:60.37ms +step:1286/2285 train_time:77633ms step_avg:60.37ms +step:1287/2285 train_time:77696ms step_avg:60.37ms +step:1288/2285 train_time:77755ms step_avg:60.37ms +step:1289/2285 train_time:77818ms step_avg:60.37ms +step:1290/2285 train_time:77878ms step_avg:60.37ms +step:1291/2285 train_time:77939ms step_avg:60.37ms +step:1292/2285 train_time:77999ms step_avg:60.37ms +step:1293/2285 train_time:78061ms step_avg:60.37ms +step:1294/2285 train_time:78120ms step_avg:60.37ms +step:1295/2285 train_time:78182ms step_avg:60.37ms +step:1296/2285 train_time:78241ms step_avg:60.37ms +step:1297/2285 train_time:78304ms step_avg:60.37ms +step:1298/2285 train_time:78364ms step_avg:60.37ms +step:1299/2285 train_time:78426ms step_avg:60.37ms +step:1300/2285 train_time:78487ms step_avg:60.37ms +step:1301/2285 train_time:78550ms step_avg:60.38ms +step:1302/2285 train_time:78609ms step_avg:60.38ms +step:1303/2285 train_time:78671ms step_avg:60.38ms +step:1304/2285 train_time:78731ms step_avg:60.38ms +step:1305/2285 train_time:78793ms step_avg:60.38ms +step:1306/2285 train_time:78853ms step_avg:60.38ms +step:1307/2285 train_time:78915ms step_avg:60.38ms +step:1308/2285 train_time:78975ms step_avg:60.38ms +step:1309/2285 train_time:79036ms step_avg:60.38ms +step:1310/2285 train_time:79096ms step_avg:60.38ms +step:1311/2285 train_time:79158ms step_avg:60.38ms +step:1312/2285 train_time:79218ms step_avg:60.38ms +step:1313/2285 train_time:79281ms step_avg:60.38ms +step:1314/2285 train_time:79341ms step_avg:60.38ms +step:1315/2285 train_time:79404ms step_avg:60.38ms +step:1316/2285 train_time:79464ms step_avg:60.38ms +step:1317/2285 train_time:79526ms step_avg:60.38ms +step:1318/2285 train_time:79586ms step_avg:60.38ms +step:1319/2285 train_time:79648ms step_avg:60.39ms +step:1320/2285 train_time:79708ms step_avg:60.39ms +step:1321/2285 train_time:79770ms step_avg:60.39ms +step:1322/2285 train_time:79830ms step_avg:60.39ms +step:1323/2285 train_time:79891ms step_avg:60.39ms +step:1324/2285 train_time:79950ms step_avg:60.39ms +step:1325/2285 train_time:80013ms step_avg:60.39ms +step:1326/2285 train_time:80073ms step_avg:60.39ms +step:1327/2285 train_time:80135ms step_avg:60.39ms +step:1328/2285 train_time:80194ms step_avg:60.39ms +step:1329/2285 train_time:80256ms step_avg:60.39ms +step:1330/2285 train_time:80316ms step_avg:60.39ms +step:1331/2285 train_time:80378ms 
step_avg:60.39ms +step:1332/2285 train_time:80438ms step_avg:60.39ms +step:1333/2285 train_time:80501ms step_avg:60.39ms +step:1334/2285 train_time:80562ms step_avg:60.39ms +step:1335/2285 train_time:80624ms step_avg:60.39ms +step:1336/2285 train_time:80684ms step_avg:60.39ms +step:1337/2285 train_time:80746ms step_avg:60.39ms +step:1338/2285 train_time:80805ms step_avg:60.39ms +step:1339/2285 train_time:80867ms step_avg:60.39ms +step:1340/2285 train_time:80927ms step_avg:60.39ms +step:1341/2285 train_time:80989ms step_avg:60.39ms +step:1342/2285 train_time:81048ms step_avg:60.39ms +step:1343/2285 train_time:81110ms step_avg:60.39ms +step:1344/2285 train_time:81170ms step_avg:60.39ms +step:1345/2285 train_time:81232ms step_avg:60.40ms +step:1346/2285 train_time:81292ms step_avg:60.40ms +step:1347/2285 train_time:81354ms step_avg:60.40ms +step:1348/2285 train_time:81414ms step_avg:60.40ms +step:1349/2285 train_time:81476ms step_avg:60.40ms +step:1350/2285 train_time:81535ms step_avg:60.40ms +step:1351/2285 train_time:81598ms step_avg:60.40ms +step:1352/2285 train_time:81658ms step_avg:60.40ms +step:1353/2285 train_time:81720ms step_avg:60.40ms +step:1354/2285 train_time:81780ms step_avg:60.40ms +step:1355/2285 train_time:81843ms step_avg:60.40ms +step:1356/2285 train_time:81902ms step_avg:60.40ms +step:1357/2285 train_time:81966ms step_avg:60.40ms +step:1358/2285 train_time:82025ms step_avg:60.40ms +step:1359/2285 train_time:82088ms step_avg:60.40ms +step:1360/2285 train_time:82147ms step_avg:60.40ms +step:1361/2285 train_time:82209ms step_avg:60.40ms +step:1362/2285 train_time:82269ms step_avg:60.40ms +step:1363/2285 train_time:82330ms step_avg:60.40ms +step:1364/2285 train_time:82390ms step_avg:60.40ms +step:1365/2285 train_time:82452ms step_avg:60.40ms +step:1366/2285 train_time:82512ms step_avg:60.40ms +step:1367/2285 train_time:82574ms step_avg:60.41ms +step:1368/2285 train_time:82634ms step_avg:60.41ms +step:1369/2285 train_time:82697ms step_avg:60.41ms +step:1370/2285 train_time:82756ms step_avg:60.41ms +step:1371/2285 train_time:82818ms step_avg:60.41ms +step:1372/2285 train_time:82878ms step_avg:60.41ms +step:1373/2285 train_time:82940ms step_avg:60.41ms +step:1374/2285 train_time:83000ms step_avg:60.41ms +step:1375/2285 train_time:83063ms step_avg:60.41ms +step:1376/2285 train_time:83123ms step_avg:60.41ms +step:1377/2285 train_time:83185ms step_avg:60.41ms +step:1378/2285 train_time:83244ms step_avg:60.41ms +step:1379/2285 train_time:83306ms step_avg:60.41ms +step:1380/2285 train_time:83367ms step_avg:60.41ms +step:1381/2285 train_time:83429ms step_avg:60.41ms +step:1382/2285 train_time:83489ms step_avg:60.41ms +step:1383/2285 train_time:83551ms step_avg:60.41ms +step:1384/2285 train_time:83610ms step_avg:60.41ms +step:1385/2285 train_time:83672ms step_avg:60.41ms +step:1386/2285 train_time:83732ms step_avg:60.41ms +step:1387/2285 train_time:83794ms step_avg:60.41ms +step:1388/2285 train_time:83853ms step_avg:60.41ms +step:1389/2285 train_time:83915ms step_avg:60.41ms +step:1390/2285 train_time:83975ms step_avg:60.41ms +step:1391/2285 train_time:84037ms step_avg:60.41ms +step:1392/2285 train_time:84096ms step_avg:60.41ms +step:1393/2285 train_time:84159ms step_avg:60.42ms +step:1394/2285 train_time:84219ms step_avg:60.42ms +step:1395/2285 train_time:84281ms step_avg:60.42ms +step:1396/2285 train_time:84341ms step_avg:60.42ms +step:1397/2285 train_time:84404ms step_avg:60.42ms +step:1398/2285 train_time:84464ms step_avg:60.42ms +step:1399/2285 train_time:84526ms step_avg:60.42ms 
+step:1400/2285 train_time:84586ms step_avg:60.42ms +step:1401/2285 train_time:84648ms step_avg:60.42ms +step:1402/2285 train_time:84707ms step_avg:60.42ms +step:1403/2285 train_time:84770ms step_avg:60.42ms +step:1404/2285 train_time:84830ms step_avg:60.42ms +step:1405/2285 train_time:84891ms step_avg:60.42ms +step:1406/2285 train_time:84951ms step_avg:60.42ms +step:1407/2285 train_time:85013ms step_avg:60.42ms +step:1408/2285 train_time:85073ms step_avg:60.42ms +step:1409/2285 train_time:85136ms step_avg:60.42ms +step:1410/2285 train_time:85194ms step_avg:60.42ms +step:1411/2285 train_time:85257ms step_avg:60.42ms +step:1412/2285 train_time:85317ms step_avg:60.42ms +step:1413/2285 train_time:85379ms step_avg:60.42ms +step:1414/2285 train_time:85439ms step_avg:60.42ms +step:1415/2285 train_time:85501ms step_avg:60.42ms +step:1416/2285 train_time:85562ms step_avg:60.43ms +step:1417/2285 train_time:85625ms step_avg:60.43ms +step:1418/2285 train_time:85684ms step_avg:60.43ms +step:1419/2285 train_time:85747ms step_avg:60.43ms +step:1420/2285 train_time:85807ms step_avg:60.43ms +step:1421/2285 train_time:85870ms step_avg:60.43ms +step:1422/2285 train_time:85929ms step_avg:60.43ms +step:1423/2285 train_time:85991ms step_avg:60.43ms +step:1424/2285 train_time:86050ms step_avg:60.43ms +step:1425/2285 train_time:86112ms step_avg:60.43ms +step:1426/2285 train_time:86171ms step_avg:60.43ms +step:1427/2285 train_time:86233ms step_avg:60.43ms +step:1428/2285 train_time:86292ms step_avg:60.43ms +step:1429/2285 train_time:86354ms step_avg:60.43ms +step:1430/2285 train_time:86414ms step_avg:60.43ms +step:1431/2285 train_time:86476ms step_avg:60.43ms +step:1432/2285 train_time:86536ms step_avg:60.43ms +step:1433/2285 train_time:86598ms step_avg:60.43ms +step:1434/2285 train_time:86658ms step_avg:60.43ms +step:1435/2285 train_time:86721ms step_avg:60.43ms +step:1436/2285 train_time:86781ms step_avg:60.43ms +step:1437/2285 train_time:86844ms step_avg:60.43ms +step:1438/2285 train_time:86903ms step_avg:60.43ms +step:1439/2285 train_time:86965ms step_avg:60.43ms +step:1440/2285 train_time:87025ms step_avg:60.43ms +step:1441/2285 train_time:87087ms step_avg:60.44ms +step:1442/2285 train_time:87147ms step_avg:60.43ms +step:1443/2285 train_time:87209ms step_avg:60.44ms +step:1444/2285 train_time:87268ms step_avg:60.44ms +step:1445/2285 train_time:87330ms step_avg:60.44ms +step:1446/2285 train_time:87390ms step_avg:60.44ms +step:1447/2285 train_time:87452ms step_avg:60.44ms +step:1448/2285 train_time:87512ms step_avg:60.44ms +step:1449/2285 train_time:87575ms step_avg:60.44ms +step:1450/2285 train_time:87635ms step_avg:60.44ms +step:1451/2285 train_time:87697ms step_avg:60.44ms +step:1452/2285 train_time:87757ms step_avg:60.44ms +step:1453/2285 train_time:87819ms step_avg:60.44ms +step:1454/2285 train_time:87879ms step_avg:60.44ms +step:1455/2285 train_time:87941ms step_avg:60.44ms +step:1456/2285 train_time:88001ms step_avg:60.44ms +step:1457/2285 train_time:88063ms step_avg:60.44ms +step:1458/2285 train_time:88123ms step_avg:60.44ms +step:1459/2285 train_time:88185ms step_avg:60.44ms +step:1460/2285 train_time:88245ms step_avg:60.44ms +step:1461/2285 train_time:88307ms step_avg:60.44ms +step:1462/2285 train_time:88367ms step_avg:60.44ms +step:1463/2285 train_time:88430ms step_avg:60.44ms +step:1464/2285 train_time:88489ms step_avg:60.44ms +step:1465/2285 train_time:88551ms step_avg:60.44ms +step:1466/2285 train_time:88611ms step_avg:60.44ms +step:1467/2285 train_time:88673ms step_avg:60.45ms +step:1468/2285 
train_time:88733ms step_avg:60.44ms +step:1469/2285 train_time:88795ms step_avg:60.45ms +step:1470/2285 train_time:88855ms step_avg:60.45ms +step:1471/2285 train_time:88917ms step_avg:60.45ms +step:1472/2285 train_time:88976ms step_avg:60.45ms +step:1473/2285 train_time:89039ms step_avg:60.45ms +step:1474/2285 train_time:89099ms step_avg:60.45ms +step:1475/2285 train_time:89162ms step_avg:60.45ms +step:1476/2285 train_time:89221ms step_avg:60.45ms +step:1477/2285 train_time:89283ms step_avg:60.45ms +step:1478/2285 train_time:89343ms step_avg:60.45ms +step:1479/2285 train_time:89405ms step_avg:60.45ms +step:1480/2285 train_time:89466ms step_avg:60.45ms +step:1481/2285 train_time:89528ms step_avg:60.45ms +step:1482/2285 train_time:89588ms step_avg:60.45ms +step:1483/2285 train_time:89650ms step_avg:60.45ms +step:1484/2285 train_time:89709ms step_avg:60.45ms +step:1485/2285 train_time:89771ms step_avg:60.45ms +step:1486/2285 train_time:89831ms step_avg:60.45ms +step:1487/2285 train_time:89893ms step_avg:60.45ms +step:1488/2285 train_time:89953ms step_avg:60.45ms +step:1489/2285 train_time:90015ms step_avg:60.45ms +step:1490/2285 train_time:90075ms step_avg:60.45ms +step:1491/2285 train_time:90137ms step_avg:60.45ms +step:1492/2285 train_time:90197ms step_avg:60.45ms +step:1493/2285 train_time:90260ms step_avg:60.46ms +step:1494/2285 train_time:90320ms step_avg:60.45ms +step:1495/2285 train_time:90382ms step_avg:60.46ms +step:1496/2285 train_time:90442ms step_avg:60.46ms +step:1497/2285 train_time:90505ms step_avg:60.46ms +step:1498/2285 train_time:90565ms step_avg:60.46ms +step:1499/2285 train_time:90627ms step_avg:60.46ms +step:1500/2285 train_time:90687ms step_avg:60.46ms +step:1500/2285 val_loss:3.4294 train_time:90751ms step_avg:60.50ms +step:1501/2285 train_time:90773ms step_avg:60.48ms +step:1502/2285 train_time:90814ms step_avg:60.46ms +step:1503/2285 train_time:90879ms step_avg:60.47ms +step:1504/2285 train_time:90940ms step_avg:60.47ms +step:1505/2285 train_time:91002ms step_avg:60.47ms +step:1506/2285 train_time:91062ms step_avg:60.47ms +step:1507/2285 train_time:91123ms step_avg:60.47ms +step:1508/2285 train_time:91182ms step_avg:60.47ms +step:1509/2285 train_time:91244ms step_avg:60.47ms +step:1510/2285 train_time:91303ms step_avg:60.47ms +step:1511/2285 train_time:91364ms step_avg:60.47ms +step:1512/2285 train_time:91423ms step_avg:60.47ms +step:1513/2285 train_time:91484ms step_avg:60.47ms +step:1514/2285 train_time:91544ms step_avg:60.46ms +step:1515/2285 train_time:91606ms step_avg:60.47ms +step:1516/2285 train_time:91666ms step_avg:60.47ms +step:1517/2285 train_time:91729ms step_avg:60.47ms +step:1518/2285 train_time:91790ms step_avg:60.47ms +step:1519/2285 train_time:91853ms step_avg:60.47ms +step:1520/2285 train_time:91914ms step_avg:60.47ms +step:1521/2285 train_time:91976ms step_avg:60.47ms +step:1522/2285 train_time:92036ms step_avg:60.47ms +step:1523/2285 train_time:92098ms step_avg:60.47ms +step:1524/2285 train_time:92157ms step_avg:60.47ms +step:1525/2285 train_time:92220ms step_avg:60.47ms +step:1526/2285 train_time:92280ms step_avg:60.47ms +step:1527/2285 train_time:92342ms step_avg:60.47ms +step:1528/2285 train_time:92402ms step_avg:60.47ms +step:1529/2285 train_time:92464ms step_avg:60.47ms +step:1530/2285 train_time:92523ms step_avg:60.47ms +step:1531/2285 train_time:92585ms step_avg:60.47ms +step:1532/2285 train_time:92645ms step_avg:60.47ms +step:1533/2285 train_time:92708ms step_avg:60.47ms +step:1534/2285 train_time:92768ms step_avg:60.47ms +step:1535/2285 
train_time:92831ms step_avg:60.48ms +step:1536/2285 train_time:92892ms step_avg:60.48ms +step:1537/2285 train_time:92954ms step_avg:60.48ms +step:1538/2285 train_time:93015ms step_avg:60.48ms +step:1539/2285 train_time:93077ms step_avg:60.48ms +step:1540/2285 train_time:93137ms step_avg:60.48ms +step:1541/2285 train_time:93200ms step_avg:60.48ms +step:1542/2285 train_time:93260ms step_avg:60.48ms +step:1543/2285 train_time:93322ms step_avg:60.48ms +step:1544/2285 train_time:93381ms step_avg:60.48ms +step:1545/2285 train_time:93443ms step_avg:60.48ms +step:1546/2285 train_time:93503ms step_avg:60.48ms +step:1547/2285 train_time:93565ms step_avg:60.48ms +step:1548/2285 train_time:93625ms step_avg:60.48ms +step:1549/2285 train_time:93688ms step_avg:60.48ms +step:1550/2285 train_time:93748ms step_avg:60.48ms +step:1551/2285 train_time:93811ms step_avg:60.48ms +step:1552/2285 train_time:93871ms step_avg:60.48ms +step:1553/2285 train_time:93934ms step_avg:60.49ms +step:1554/2285 train_time:93993ms step_avg:60.48ms +step:1555/2285 train_time:94056ms step_avg:60.49ms +step:1556/2285 train_time:94116ms step_avg:60.49ms +step:1557/2285 train_time:94178ms step_avg:60.49ms +step:1558/2285 train_time:94238ms step_avg:60.49ms +step:1559/2285 train_time:94301ms step_avg:60.49ms +step:1560/2285 train_time:94361ms step_avg:60.49ms +step:1561/2285 train_time:94423ms step_avg:60.49ms +step:1562/2285 train_time:94483ms step_avg:60.49ms +step:1563/2285 train_time:94544ms step_avg:60.49ms +step:1564/2285 train_time:94604ms step_avg:60.49ms +step:1565/2285 train_time:94667ms step_avg:60.49ms +step:1566/2285 train_time:94727ms step_avg:60.49ms +step:1567/2285 train_time:94789ms step_avg:60.49ms +step:1568/2285 train_time:94849ms step_avg:60.49ms +step:1569/2285 train_time:94912ms step_avg:60.49ms +step:1570/2285 train_time:94972ms step_avg:60.49ms +step:1571/2285 train_time:95034ms step_avg:60.49ms +step:1572/2285 train_time:95094ms step_avg:60.49ms +step:1573/2285 train_time:95156ms step_avg:60.49ms +step:1574/2285 train_time:95216ms step_avg:60.49ms +step:1575/2285 train_time:95279ms step_avg:60.49ms +step:1576/2285 train_time:95339ms step_avg:60.49ms +step:1577/2285 train_time:95402ms step_avg:60.50ms +step:1578/2285 train_time:95462ms step_avg:60.50ms +step:1579/2285 train_time:95524ms step_avg:60.50ms +step:1580/2285 train_time:95584ms step_avg:60.50ms +step:1581/2285 train_time:95646ms step_avg:60.50ms +step:1582/2285 train_time:95707ms step_avg:60.50ms +step:1583/2285 train_time:95769ms step_avg:60.50ms +step:1584/2285 train_time:95829ms step_avg:60.50ms +step:1585/2285 train_time:95891ms step_avg:60.50ms +step:1586/2285 train_time:95952ms step_avg:60.50ms +step:1587/2285 train_time:96014ms step_avg:60.50ms +step:1588/2285 train_time:96073ms step_avg:60.50ms +step:1589/2285 train_time:96136ms step_avg:60.50ms +step:1590/2285 train_time:96196ms step_avg:60.50ms +step:1591/2285 train_time:96258ms step_avg:60.50ms +step:1592/2285 train_time:96319ms step_avg:60.50ms +step:1593/2285 train_time:96381ms step_avg:60.50ms +step:1594/2285 train_time:96441ms step_avg:60.50ms +step:1595/2285 train_time:96504ms step_avg:60.50ms +step:1596/2285 train_time:96564ms step_avg:60.50ms +step:1597/2285 train_time:96626ms step_avg:60.50ms +step:1598/2285 train_time:96686ms step_avg:60.50ms +step:1599/2285 train_time:96749ms step_avg:60.51ms +step:1600/2285 train_time:96810ms step_avg:60.51ms +step:1601/2285 train_time:96872ms step_avg:60.51ms +step:1602/2285 train_time:96932ms step_avg:60.51ms +step:1603/2285 train_time:96994ms 
step_avg:60.51ms +step:1604/2285 train_time:97053ms step_avg:60.51ms +step:1605/2285 train_time:97116ms step_avg:60.51ms +step:1606/2285 train_time:97175ms step_avg:60.51ms +step:1607/2285 train_time:97237ms step_avg:60.51ms +step:1608/2285 train_time:97298ms step_avg:60.51ms +step:1609/2285 train_time:97360ms step_avg:60.51ms +step:1610/2285 train_time:97420ms step_avg:60.51ms +step:1611/2285 train_time:97482ms step_avg:60.51ms +step:1612/2285 train_time:97542ms step_avg:60.51ms +step:1613/2285 train_time:97605ms step_avg:60.51ms +step:1614/2285 train_time:97665ms step_avg:60.51ms +step:1615/2285 train_time:97728ms step_avg:60.51ms +step:1616/2285 train_time:97788ms step_avg:60.51ms +step:1617/2285 train_time:97851ms step_avg:60.51ms +step:1618/2285 train_time:97911ms step_avg:60.51ms +step:1619/2285 train_time:97973ms step_avg:60.51ms +step:1620/2285 train_time:98032ms step_avg:60.51ms +step:1621/2285 train_time:98094ms step_avg:60.51ms +step:1622/2285 train_time:98154ms step_avg:60.51ms +step:1623/2285 train_time:98216ms step_avg:60.52ms +step:1624/2285 train_time:98276ms step_avg:60.51ms +step:1625/2285 train_time:98339ms step_avg:60.52ms +step:1626/2285 train_time:98400ms step_avg:60.52ms +step:1627/2285 train_time:98463ms step_avg:60.52ms +step:1628/2285 train_time:98523ms step_avg:60.52ms +step:1629/2285 train_time:98585ms step_avg:60.52ms +step:1630/2285 train_time:98645ms step_avg:60.52ms +step:1631/2285 train_time:98708ms step_avg:60.52ms +step:1632/2285 train_time:98767ms step_avg:60.52ms +step:1633/2285 train_time:98830ms step_avg:60.52ms +step:1634/2285 train_time:98890ms step_avg:60.52ms +step:1635/2285 train_time:98952ms step_avg:60.52ms +step:1636/2285 train_time:99012ms step_avg:60.52ms +step:1637/2285 train_time:99074ms step_avg:60.52ms +step:1638/2285 train_time:99133ms step_avg:60.52ms +step:1639/2285 train_time:99195ms step_avg:60.52ms +step:1640/2285 train_time:99255ms step_avg:60.52ms +step:1641/2285 train_time:99318ms step_avg:60.52ms +step:1642/2285 train_time:99378ms step_avg:60.52ms +step:1643/2285 train_time:99441ms step_avg:60.52ms +step:1644/2285 train_time:99502ms step_avg:60.52ms +step:1645/2285 train_time:99564ms step_avg:60.53ms +step:1646/2285 train_time:99624ms step_avg:60.52ms +step:1647/2285 train_time:99686ms step_avg:60.53ms +step:1648/2285 train_time:99746ms step_avg:60.53ms +step:1649/2285 train_time:99808ms step_avg:60.53ms +step:1650/2285 train_time:99869ms step_avg:60.53ms +step:1651/2285 train_time:99930ms step_avg:60.53ms +step:1652/2285 train_time:99990ms step_avg:60.53ms +step:1653/2285 train_time:100052ms step_avg:60.53ms +step:1654/2285 train_time:100112ms step_avg:60.53ms +step:1655/2285 train_time:100175ms step_avg:60.53ms +step:1656/2285 train_time:100234ms step_avg:60.53ms +step:1657/2285 train_time:100297ms step_avg:60.53ms +step:1658/2285 train_time:100357ms step_avg:60.53ms +step:1659/2285 train_time:100421ms step_avg:60.53ms +step:1660/2285 train_time:100480ms step_avg:60.53ms +step:1661/2285 train_time:100543ms step_avg:60.53ms +step:1662/2285 train_time:100603ms step_avg:60.53ms +step:1663/2285 train_time:100665ms step_avg:60.53ms +step:1664/2285 train_time:100725ms step_avg:60.53ms +step:1665/2285 train_time:100788ms step_avg:60.53ms +step:1666/2285 train_time:100847ms step_avg:60.53ms +step:1667/2285 train_time:100910ms step_avg:60.53ms +step:1668/2285 train_time:100969ms step_avg:60.53ms +step:1669/2285 train_time:101031ms step_avg:60.53ms +step:1670/2285 train_time:101092ms step_avg:60.53ms +step:1671/2285 train_time:101154ms 
step_avg:60.54ms +step:1672/2285 train_time:101215ms step_avg:60.54ms +step:1673/2285 train_time:101277ms step_avg:60.54ms +step:1674/2285 train_time:101337ms step_avg:60.54ms +step:1675/2285 train_time:101399ms step_avg:60.54ms +step:1676/2285 train_time:101459ms step_avg:60.54ms +step:1677/2285 train_time:101522ms step_avg:60.54ms +step:1678/2285 train_time:101582ms step_avg:60.54ms +step:1679/2285 train_time:101645ms step_avg:60.54ms +step:1680/2285 train_time:101705ms step_avg:60.54ms +step:1681/2285 train_time:101768ms step_avg:60.54ms +step:1682/2285 train_time:101828ms step_avg:60.54ms +step:1683/2285 train_time:101890ms step_avg:60.54ms +step:1684/2285 train_time:101949ms step_avg:60.54ms +step:1685/2285 train_time:102012ms step_avg:60.54ms +step:1686/2285 train_time:102071ms step_avg:60.54ms +step:1687/2285 train_time:102133ms step_avg:60.54ms +step:1688/2285 train_time:102193ms step_avg:60.54ms +step:1689/2285 train_time:102255ms step_avg:60.54ms +step:1690/2285 train_time:102315ms step_avg:60.54ms +step:1691/2285 train_time:102378ms step_avg:60.54ms +step:1692/2285 train_time:102439ms step_avg:60.54ms +step:1693/2285 train_time:102502ms step_avg:60.54ms +step:1694/2285 train_time:102562ms step_avg:60.54ms +step:1695/2285 train_time:102624ms step_avg:60.55ms +step:1696/2285 train_time:102684ms step_avg:60.54ms +step:1697/2285 train_time:102746ms step_avg:60.55ms +step:1698/2285 train_time:102807ms step_avg:60.55ms +step:1699/2285 train_time:102869ms step_avg:60.55ms +step:1700/2285 train_time:102929ms step_avg:60.55ms +step:1701/2285 train_time:102991ms step_avg:60.55ms +step:1702/2285 train_time:103050ms step_avg:60.55ms +step:1703/2285 train_time:103113ms step_avg:60.55ms +step:1704/2285 train_time:103172ms step_avg:60.55ms +step:1705/2285 train_time:103235ms step_avg:60.55ms +step:1706/2285 train_time:103294ms step_avg:60.55ms +step:1707/2285 train_time:103358ms step_avg:60.55ms +step:1708/2285 train_time:103418ms step_avg:60.55ms +step:1709/2285 train_time:103481ms step_avg:60.55ms +step:1710/2285 train_time:103541ms step_avg:60.55ms +step:1711/2285 train_time:103604ms step_avg:60.55ms +step:1712/2285 train_time:103663ms step_avg:60.55ms +step:1713/2285 train_time:103726ms step_avg:60.55ms +step:1714/2285 train_time:103786ms step_avg:60.55ms +step:1715/2285 train_time:103848ms step_avg:60.55ms +step:1716/2285 train_time:103909ms step_avg:60.55ms +step:1717/2285 train_time:103970ms step_avg:60.55ms +step:1718/2285 train_time:104030ms step_avg:60.55ms +step:1719/2285 train_time:104092ms step_avg:60.55ms +step:1720/2285 train_time:104152ms step_avg:60.55ms +step:1721/2285 train_time:104214ms step_avg:60.55ms +step:1722/2285 train_time:104274ms step_avg:60.55ms +step:1723/2285 train_time:104336ms step_avg:60.56ms +step:1724/2285 train_time:104396ms step_avg:60.55ms +step:1725/2285 train_time:104459ms step_avg:60.56ms +step:1726/2285 train_time:104520ms step_avg:60.56ms +step:1727/2285 train_time:104582ms step_avg:60.56ms +step:1728/2285 train_time:104642ms step_avg:60.56ms +step:1729/2285 train_time:104705ms step_avg:60.56ms +step:1730/2285 train_time:104765ms step_avg:60.56ms +step:1731/2285 train_time:104827ms step_avg:60.56ms +step:1732/2285 train_time:104887ms step_avg:60.56ms +step:1733/2285 train_time:104949ms step_avg:60.56ms +step:1734/2285 train_time:105009ms step_avg:60.56ms +step:1735/2285 train_time:105072ms step_avg:60.56ms +step:1736/2285 train_time:105131ms step_avg:60.56ms +step:1737/2285 train_time:105193ms step_avg:60.56ms +step:1738/2285 train_time:105253ms 
step_avg:60.56ms +step:1739/2285 train_time:105315ms step_avg:60.56ms +step:1740/2285 train_time:105374ms step_avg:60.56ms +step:1741/2285 train_time:105437ms step_avg:60.56ms +step:1742/2285 train_time:105497ms step_avg:60.56ms +step:1743/2285 train_time:105560ms step_avg:60.56ms +step:1744/2285 train_time:105621ms step_avg:60.56ms +step:1745/2285 train_time:105683ms step_avg:60.56ms +step:1746/2285 train_time:105743ms step_avg:60.56ms +step:1747/2285 train_time:105806ms step_avg:60.56ms +step:1748/2285 train_time:105866ms step_avg:60.56ms +step:1749/2285 train_time:105928ms step_avg:60.57ms +step:1750/2285 train_time:105988ms step_avg:60.56ms +step:1750/2285 val_loss:3.3689 train_time:106052ms step_avg:60.60ms +step:1751/2285 train_time:106072ms step_avg:60.58ms +step:1752/2285 train_time:106112ms step_avg:60.57ms +step:1753/2285 train_time:106176ms step_avg:60.57ms +step:1754/2285 train_time:106237ms step_avg:60.57ms +step:1755/2285 train_time:106301ms step_avg:60.57ms +step:1756/2285 train_time:106362ms step_avg:60.57ms +step:1757/2285 train_time:106424ms step_avg:60.57ms +step:1758/2285 train_time:106483ms step_avg:60.57ms +step:1759/2285 train_time:106544ms step_avg:60.57ms +step:1760/2285 train_time:106603ms step_avg:60.57ms +step:1761/2285 train_time:106664ms step_avg:60.57ms +step:1762/2285 train_time:106723ms step_avg:60.57ms +step:1763/2285 train_time:106784ms step_avg:60.57ms +step:1764/2285 train_time:106843ms step_avg:60.57ms +step:1765/2285 train_time:106904ms step_avg:60.57ms +step:1766/2285 train_time:106966ms step_avg:60.57ms +step:1767/2285 train_time:107033ms step_avg:60.57ms +step:1768/2285 train_time:107094ms step_avg:60.57ms +step:1769/2285 train_time:107156ms step_avg:60.57ms +step:1770/2285 train_time:107217ms step_avg:60.57ms +step:1771/2285 train_time:107280ms step_avg:60.58ms +step:1772/2285 train_time:107340ms step_avg:60.58ms +step:1773/2285 train_time:107402ms step_avg:60.58ms +step:1774/2285 train_time:107462ms step_avg:60.58ms +step:1775/2285 train_time:107523ms step_avg:60.58ms +step:1776/2285 train_time:107583ms step_avg:60.58ms +step:1777/2285 train_time:107644ms step_avg:60.58ms +step:1778/2285 train_time:107703ms step_avg:60.58ms +step:1779/2285 train_time:107765ms step_avg:60.58ms +step:1780/2285 train_time:107824ms step_avg:60.58ms +step:1781/2285 train_time:107885ms step_avg:60.58ms +step:1782/2285 train_time:107946ms step_avg:60.58ms +step:1783/2285 train_time:108010ms step_avg:60.58ms +step:1784/2285 train_time:108071ms step_avg:60.58ms +step:1785/2285 train_time:108133ms step_avg:60.58ms +step:1786/2285 train_time:108193ms step_avg:60.58ms +step:1787/2285 train_time:108256ms step_avg:60.58ms +step:1788/2285 train_time:108316ms step_avg:60.58ms +step:1789/2285 train_time:108379ms step_avg:60.58ms +step:1790/2285 train_time:108439ms step_avg:60.58ms +step:1791/2285 train_time:108501ms step_avg:60.58ms +step:1792/2285 train_time:108561ms step_avg:60.58ms +step:1793/2285 train_time:108623ms step_avg:60.58ms +step:1794/2285 train_time:108682ms step_avg:60.58ms +step:1795/2285 train_time:108744ms step_avg:60.58ms +step:1796/2285 train_time:108803ms step_avg:60.58ms +step:1797/2285 train_time:108865ms step_avg:60.58ms +step:1798/2285 train_time:108926ms step_avg:60.58ms +step:1799/2285 train_time:108988ms step_avg:60.58ms +step:1800/2285 train_time:109048ms step_avg:60.58ms +step:1801/2285 train_time:109111ms step_avg:60.58ms +step:1802/2285 train_time:109172ms step_avg:60.58ms +step:1803/2285 train_time:109234ms step_avg:60.58ms +step:1804/2285 
train_time:109294ms step_avg:60.58ms +step:1805/2285 train_time:109358ms step_avg:60.59ms +step:1806/2285 train_time:109418ms step_avg:60.59ms +step:1807/2285 train_time:109480ms step_avg:60.59ms +step:1808/2285 train_time:109540ms step_avg:60.59ms +step:1809/2285 train_time:109602ms step_avg:60.59ms +step:1810/2285 train_time:109662ms step_avg:60.59ms +step:1811/2285 train_time:109724ms step_avg:60.59ms +step:1812/2285 train_time:109783ms step_avg:60.59ms +step:1813/2285 train_time:109845ms step_avg:60.59ms +step:1814/2285 train_time:109905ms step_avg:60.59ms +step:1815/2285 train_time:109967ms step_avg:60.59ms +step:1816/2285 train_time:110027ms step_avg:60.59ms +step:1817/2285 train_time:110090ms step_avg:60.59ms +step:1818/2285 train_time:110150ms step_avg:60.59ms +step:1819/2285 train_time:110213ms step_avg:60.59ms +step:1820/2285 train_time:110272ms step_avg:60.59ms +step:1821/2285 train_time:110334ms step_avg:60.59ms +step:1822/2285 train_time:110394ms step_avg:60.59ms +step:1823/2285 train_time:110457ms step_avg:60.59ms +step:1824/2285 train_time:110516ms step_avg:60.59ms +step:1825/2285 train_time:110579ms step_avg:60.59ms +step:1826/2285 train_time:110639ms step_avg:60.59ms +step:1827/2285 train_time:110701ms step_avg:60.59ms +step:1828/2285 train_time:110761ms step_avg:60.59ms +step:1829/2285 train_time:110824ms step_avg:60.59ms +step:1830/2285 train_time:110883ms step_avg:60.59ms +step:1831/2285 train_time:110945ms step_avg:60.59ms +step:1832/2285 train_time:111005ms step_avg:60.59ms +step:1833/2285 train_time:111067ms step_avg:60.59ms +step:1834/2285 train_time:111127ms step_avg:60.59ms +step:1835/2285 train_time:111189ms step_avg:60.59ms +step:1836/2285 train_time:111249ms step_avg:60.59ms +step:1837/2285 train_time:111311ms step_avg:60.59ms +step:1838/2285 train_time:111371ms step_avg:60.59ms +step:1839/2285 train_time:111434ms step_avg:60.59ms +step:1840/2285 train_time:111494ms step_avg:60.59ms +step:1841/2285 train_time:111558ms step_avg:60.60ms +step:1842/2285 train_time:111618ms step_avg:60.60ms +step:1843/2285 train_time:111680ms step_avg:60.60ms +step:1844/2285 train_time:111740ms step_avg:60.60ms +step:1845/2285 train_time:111802ms step_avg:60.60ms +step:1846/2285 train_time:111862ms step_avg:60.60ms +step:1847/2285 train_time:111925ms step_avg:60.60ms +step:1848/2285 train_time:111985ms step_avg:60.60ms +step:1849/2285 train_time:112046ms step_avg:60.60ms +step:1850/2285 train_time:112107ms step_avg:60.60ms +step:1851/2285 train_time:112169ms step_avg:60.60ms +step:1852/2285 train_time:112228ms step_avg:60.60ms +step:1853/2285 train_time:112291ms step_avg:60.60ms +step:1854/2285 train_time:112350ms step_avg:60.60ms +step:1855/2285 train_time:112413ms step_avg:60.60ms +step:1856/2285 train_time:112473ms step_avg:60.60ms +step:1857/2285 train_time:112536ms step_avg:60.60ms +step:1858/2285 train_time:112596ms step_avg:60.60ms +step:1859/2285 train_time:112660ms step_avg:60.60ms +step:1860/2285 train_time:112720ms step_avg:60.60ms +step:1861/2285 train_time:112782ms step_avg:60.60ms +step:1862/2285 train_time:112841ms step_avg:60.60ms +step:1863/2285 train_time:112904ms step_avg:60.60ms +step:1864/2285 train_time:112964ms step_avg:60.60ms +step:1865/2285 train_time:113026ms step_avg:60.60ms +step:1866/2285 train_time:113085ms step_avg:60.60ms +step:1867/2285 train_time:113147ms step_avg:60.60ms +step:1868/2285 train_time:113207ms step_avg:60.60ms +step:1869/2285 train_time:113270ms step_avg:60.60ms +step:1870/2285 train_time:113330ms step_avg:60.60ms +step:1871/2285 
train_time:113392ms step_avg:60.60ms +step:1872/2285 train_time:113452ms step_avg:60.60ms +step:1873/2285 train_time:113515ms step_avg:60.61ms +step:1874/2285 train_time:113575ms step_avg:60.61ms +step:1875/2285 train_time:113638ms step_avg:60.61ms +step:1876/2285 train_time:113698ms step_avg:60.61ms +step:1877/2285 train_time:113761ms step_avg:60.61ms +step:1878/2285 train_time:113821ms step_avg:60.61ms +step:1879/2285 train_time:113883ms step_avg:60.61ms +step:1880/2285 train_time:113944ms step_avg:60.61ms +step:1881/2285 train_time:114006ms step_avg:60.61ms +step:1882/2285 train_time:114065ms step_avg:60.61ms +step:1883/2285 train_time:114127ms step_avg:60.61ms +step:1884/2285 train_time:114187ms step_avg:60.61ms +step:1885/2285 train_time:114249ms step_avg:60.61ms +step:1886/2285 train_time:114309ms step_avg:60.61ms +step:1887/2285 train_time:114372ms step_avg:60.61ms +step:1888/2285 train_time:114432ms step_avg:60.61ms +step:1889/2285 train_time:114495ms step_avg:60.61ms +step:1890/2285 train_time:114554ms step_avg:60.61ms +step:1891/2285 train_time:114618ms step_avg:60.61ms +step:1892/2285 train_time:114678ms step_avg:60.61ms +step:1893/2285 train_time:114741ms step_avg:60.61ms +step:1894/2285 train_time:114800ms step_avg:60.61ms +step:1895/2285 train_time:114863ms step_avg:60.61ms +step:1896/2285 train_time:114923ms step_avg:60.61ms +step:1897/2285 train_time:114985ms step_avg:60.61ms +step:1898/2285 train_time:115044ms step_avg:60.61ms +step:1899/2285 train_time:115106ms step_avg:60.61ms +step:1900/2285 train_time:115166ms step_avg:60.61ms +step:1901/2285 train_time:115229ms step_avg:60.61ms +step:1902/2285 train_time:115289ms step_avg:60.61ms +step:1903/2285 train_time:115351ms step_avg:60.62ms +step:1904/2285 train_time:115411ms step_avg:60.62ms +step:1905/2285 train_time:115473ms step_avg:60.62ms +step:1906/2285 train_time:115533ms step_avg:60.62ms +step:1907/2285 train_time:115596ms step_avg:60.62ms +step:1908/2285 train_time:115656ms step_avg:60.62ms +step:1909/2285 train_time:115720ms step_avg:60.62ms +step:1910/2285 train_time:115780ms step_avg:60.62ms +step:1911/2285 train_time:115842ms step_avg:60.62ms +step:1912/2285 train_time:115902ms step_avg:60.62ms +step:1913/2285 train_time:115964ms step_avg:60.62ms +step:1914/2285 train_time:116025ms step_avg:60.62ms +step:1915/2285 train_time:116087ms step_avg:60.62ms +step:1916/2285 train_time:116146ms step_avg:60.62ms +step:1917/2285 train_time:116209ms step_avg:60.62ms +step:1918/2285 train_time:116269ms step_avg:60.62ms +step:1919/2285 train_time:116331ms step_avg:60.62ms +step:1920/2285 train_time:116391ms step_avg:60.62ms +step:1921/2285 train_time:116453ms step_avg:60.62ms +step:1922/2285 train_time:116514ms step_avg:60.62ms +step:1923/2285 train_time:116577ms step_avg:60.62ms +step:1924/2285 train_time:116638ms step_avg:60.62ms +step:1925/2285 train_time:116701ms step_avg:60.62ms +step:1926/2285 train_time:116761ms step_avg:60.62ms +step:1927/2285 train_time:116824ms step_avg:60.62ms +step:1928/2285 train_time:116884ms step_avg:60.62ms +step:1929/2285 train_time:116947ms step_avg:60.63ms +step:1930/2285 train_time:117006ms step_avg:60.63ms +step:1931/2285 train_time:117069ms step_avg:60.63ms +step:1932/2285 train_time:117129ms step_avg:60.63ms +step:1933/2285 train_time:117192ms step_avg:60.63ms +step:1934/2285 train_time:117251ms step_avg:60.63ms +step:1935/2285 train_time:117314ms step_avg:60.63ms +step:1936/2285 train_time:117374ms step_avg:60.63ms +step:1937/2285 train_time:117436ms step_avg:60.63ms +step:1938/2285 
train_time:117496ms step_avg:60.63ms +step:1939/2285 train_time:117559ms step_avg:60.63ms +step:1940/2285 train_time:117619ms step_avg:60.63ms +step:1941/2285 train_time:117682ms step_avg:60.63ms +step:1942/2285 train_time:117742ms step_avg:60.63ms +step:1943/2285 train_time:117805ms step_avg:60.63ms +step:1944/2285 train_time:117865ms step_avg:60.63ms +step:1945/2285 train_time:117927ms step_avg:60.63ms +step:1946/2285 train_time:117987ms step_avg:60.63ms +step:1947/2285 train_time:118049ms step_avg:60.63ms +step:1948/2285 train_time:118109ms step_avg:60.63ms +step:1949/2285 train_time:118171ms step_avg:60.63ms +step:1950/2285 train_time:118231ms step_avg:60.63ms +step:1951/2285 train_time:118293ms step_avg:60.63ms +step:1952/2285 train_time:118353ms step_avg:60.63ms +step:1953/2285 train_time:118416ms step_avg:60.63ms +step:1954/2285 train_time:118476ms step_avg:60.63ms +step:1955/2285 train_time:118539ms step_avg:60.63ms +step:1956/2285 train_time:118598ms step_avg:60.63ms +step:1957/2285 train_time:118661ms step_avg:60.63ms +step:1958/2285 train_time:118721ms step_avg:60.63ms +step:1959/2285 train_time:118784ms step_avg:60.63ms +step:1960/2285 train_time:118844ms step_avg:60.63ms +step:1961/2285 train_time:118906ms step_avg:60.64ms +step:1962/2285 train_time:118966ms step_avg:60.63ms +step:1963/2285 train_time:119028ms step_avg:60.64ms +step:1964/2285 train_time:119088ms step_avg:60.64ms +step:1965/2285 train_time:119150ms step_avg:60.64ms +step:1966/2285 train_time:119210ms step_avg:60.64ms +step:1967/2285 train_time:119272ms step_avg:60.64ms +step:1968/2285 train_time:119332ms step_avg:60.64ms +step:1969/2285 train_time:119394ms step_avg:60.64ms +step:1970/2285 train_time:119455ms step_avg:60.64ms +step:1971/2285 train_time:119517ms step_avg:60.64ms +step:1972/2285 train_time:119577ms step_avg:60.64ms +step:1973/2285 train_time:119641ms step_avg:60.64ms +step:1974/2285 train_time:119701ms step_avg:60.64ms +step:1975/2285 train_time:119764ms step_avg:60.64ms +step:1976/2285 train_time:119824ms step_avg:60.64ms +step:1977/2285 train_time:119886ms step_avg:60.64ms +step:1978/2285 train_time:119945ms step_avg:60.64ms +step:1979/2285 train_time:120008ms step_avg:60.64ms +step:1980/2285 train_time:120068ms step_avg:60.64ms +step:1981/2285 train_time:120130ms step_avg:60.64ms +step:1982/2285 train_time:120190ms step_avg:60.64ms +step:1983/2285 train_time:120252ms step_avg:60.64ms +step:1984/2285 train_time:120312ms step_avg:60.64ms +step:1985/2285 train_time:120375ms step_avg:60.64ms +step:1986/2285 train_time:120435ms step_avg:60.64ms +step:1987/2285 train_time:120497ms step_avg:60.64ms +step:1988/2285 train_time:120558ms step_avg:60.64ms +step:1989/2285 train_time:120620ms step_avg:60.64ms +step:1990/2285 train_time:120680ms step_avg:60.64ms +step:1991/2285 train_time:120744ms step_avg:60.64ms +step:1992/2285 train_time:120803ms step_avg:60.64ms +step:1993/2285 train_time:120865ms step_avg:60.64ms +step:1994/2285 train_time:120925ms step_avg:60.64ms +step:1995/2285 train_time:120988ms step_avg:60.65ms +step:1996/2285 train_time:121048ms step_avg:60.65ms +step:1997/2285 train_time:121110ms step_avg:60.65ms +step:1998/2285 train_time:121170ms step_avg:60.65ms +step:1999/2285 train_time:121232ms step_avg:60.65ms +step:2000/2285 train_time:121292ms step_avg:60.65ms +step:2000/2285 val_loss:3.3201 train_time:121356ms step_avg:60.68ms +step:2001/2285 train_time:121381ms step_avg:60.66ms +step:2002/2285 train_time:121417ms step_avg:60.65ms +step:2003/2285 train_time:121479ms step_avg:60.65ms 
+step:2004/2285 train_time:121540ms step_avg:60.65ms +step:2005/2285 train_time:121605ms step_avg:60.65ms +step:2006/2285 train_time:121664ms step_avg:60.65ms +step:2007/2285 train_time:121726ms step_avg:60.65ms +step:2008/2285 train_time:121786ms step_avg:60.65ms +step:2009/2285 train_time:121848ms step_avg:60.65ms +step:2010/2285 train_time:121907ms step_avg:60.65ms +step:2011/2285 train_time:121968ms step_avg:60.65ms +step:2012/2285 train_time:122028ms step_avg:60.65ms +step:2013/2285 train_time:122092ms step_avg:60.65ms +step:2014/2285 train_time:122152ms step_avg:60.65ms +step:2015/2285 train_time:122213ms step_avg:60.65ms +step:2016/2285 train_time:122276ms step_avg:60.65ms +step:2017/2285 train_time:122340ms step_avg:60.65ms +step:2018/2285 train_time:122401ms step_avg:60.65ms +step:2019/2285 train_time:122465ms step_avg:60.66ms +step:2020/2285 train_time:122527ms step_avg:60.66ms +step:2021/2285 train_time:122590ms step_avg:60.66ms +step:2022/2285 train_time:122650ms step_avg:60.66ms +step:2023/2285 train_time:122713ms step_avg:60.66ms +step:2024/2285 train_time:122774ms step_avg:60.66ms +step:2025/2285 train_time:122835ms step_avg:60.66ms +step:2026/2285 train_time:122896ms step_avg:60.66ms +step:2027/2285 train_time:122957ms step_avg:60.66ms +step:2028/2285 train_time:123017ms step_avg:60.66ms +step:2029/2285 train_time:123079ms step_avg:60.66ms +step:2030/2285 train_time:123138ms step_avg:60.66ms +step:2031/2285 train_time:123200ms step_avg:60.66ms +step:2032/2285 train_time:123260ms step_avg:60.66ms +step:2033/2285 train_time:123323ms step_avg:60.66ms +step:2034/2285 train_time:123383ms step_avg:60.66ms +step:2035/2285 train_time:123447ms step_avg:60.66ms +step:2036/2285 train_time:123507ms step_avg:60.66ms +step:2037/2285 train_time:123570ms step_avg:60.66ms +step:2038/2285 train_time:123631ms step_avg:60.66ms +step:2039/2285 train_time:123694ms step_avg:60.66ms +step:2040/2285 train_time:123754ms step_avg:60.66ms +step:2041/2285 train_time:123817ms step_avg:60.66ms +step:2042/2285 train_time:123876ms step_avg:60.66ms +step:2043/2285 train_time:123939ms step_avg:60.67ms +step:2044/2285 train_time:123998ms step_avg:60.66ms +step:2045/2285 train_time:124060ms step_avg:60.67ms +step:2046/2285 train_time:124120ms step_avg:60.66ms +step:2047/2285 train_time:124182ms step_avg:60.67ms +step:2048/2285 train_time:124242ms step_avg:60.67ms +step:2049/2285 train_time:124305ms step_avg:60.67ms +step:2050/2285 train_time:124365ms step_avg:60.67ms +step:2051/2285 train_time:124429ms step_avg:60.67ms +step:2052/2285 train_time:124489ms step_avg:60.67ms +step:2053/2285 train_time:124552ms step_avg:60.67ms +step:2054/2285 train_time:124612ms step_avg:60.67ms +step:2055/2285 train_time:124675ms step_avg:60.67ms +step:2056/2285 train_time:124735ms step_avg:60.67ms +step:2057/2285 train_time:124798ms step_avg:60.67ms +step:2058/2285 train_time:124857ms step_avg:60.67ms +step:2059/2285 train_time:124920ms step_avg:60.67ms +step:2060/2285 train_time:124979ms step_avg:60.67ms +step:2061/2285 train_time:125041ms step_avg:60.67ms +step:2062/2285 train_time:125101ms step_avg:60.67ms +step:2063/2285 train_time:125163ms step_avg:60.67ms +step:2064/2285 train_time:125223ms step_avg:60.67ms +step:2065/2285 train_time:125285ms step_avg:60.67ms +step:2066/2285 train_time:125346ms step_avg:60.67ms +step:2067/2285 train_time:125409ms step_avg:60.67ms +step:2068/2285 train_time:125469ms step_avg:60.67ms +step:2069/2285 train_time:125533ms step_avg:60.67ms +step:2070/2285 train_time:125593ms step_avg:60.67ms 
+step:2071/2285 train_time:125655ms step_avg:60.67ms +step:2072/2285 train_time:125715ms step_avg:60.67ms +step:2073/2285 train_time:125778ms step_avg:60.67ms +step:2074/2285 train_time:125838ms step_avg:60.67ms +step:2075/2285 train_time:125900ms step_avg:60.67ms +step:2076/2285 train_time:125960ms step_avg:60.67ms +step:2077/2285 train_time:126022ms step_avg:60.68ms +step:2078/2285 train_time:126082ms step_avg:60.67ms +step:2079/2285 train_time:126145ms step_avg:60.68ms +step:2080/2285 train_time:126205ms step_avg:60.68ms +step:2081/2285 train_time:126268ms step_avg:60.68ms +step:2082/2285 train_time:126328ms step_avg:60.68ms +step:2083/2285 train_time:126391ms step_avg:60.68ms +step:2084/2285 train_time:126450ms step_avg:60.68ms +step:2085/2285 train_time:126513ms step_avg:60.68ms +step:2086/2285 train_time:126574ms step_avg:60.68ms +step:2087/2285 train_time:126637ms step_avg:60.68ms +step:2088/2285 train_time:126696ms step_avg:60.68ms +step:2089/2285 train_time:126759ms step_avg:60.68ms +step:2090/2285 train_time:126818ms step_avg:60.68ms +step:2091/2285 train_time:126881ms step_avg:60.68ms +step:2092/2285 train_time:126940ms step_avg:60.68ms +step:2093/2285 train_time:127003ms step_avg:60.68ms +step:2094/2285 train_time:127063ms step_avg:60.68ms +step:2095/2285 train_time:127125ms step_avg:60.68ms +step:2096/2285 train_time:127185ms step_avg:60.68ms +step:2097/2285 train_time:127248ms step_avg:60.68ms +step:2098/2285 train_time:127308ms step_avg:60.68ms +step:2099/2285 train_time:127371ms step_avg:60.68ms +step:2100/2285 train_time:127431ms step_avg:60.68ms +step:2101/2285 train_time:127493ms step_avg:60.68ms +step:2102/2285 train_time:127553ms step_avg:60.68ms +step:2103/2285 train_time:127616ms step_avg:60.68ms +step:2104/2285 train_time:127676ms step_avg:60.68ms +step:2105/2285 train_time:127740ms step_avg:60.68ms +step:2106/2285 train_time:127799ms step_avg:60.68ms +step:2107/2285 train_time:127862ms step_avg:60.68ms +step:2108/2285 train_time:127922ms step_avg:60.68ms +step:2109/2285 train_time:127985ms step_avg:60.69ms +step:2110/2285 train_time:128045ms step_avg:60.68ms +step:2111/2285 train_time:128108ms step_avg:60.69ms +step:2112/2285 train_time:128168ms step_avg:60.69ms +step:2113/2285 train_time:128231ms step_avg:60.69ms +step:2114/2285 train_time:128291ms step_avg:60.69ms +step:2115/2285 train_time:128354ms step_avg:60.69ms +step:2116/2285 train_time:128413ms step_avg:60.69ms +step:2117/2285 train_time:128475ms step_avg:60.69ms +step:2118/2285 train_time:128536ms step_avg:60.69ms +step:2119/2285 train_time:128598ms step_avg:60.69ms +step:2120/2285 train_time:128658ms step_avg:60.69ms +step:2121/2285 train_time:128720ms step_avg:60.69ms +step:2122/2285 train_time:128780ms step_avg:60.69ms +step:2123/2285 train_time:128843ms step_avg:60.69ms +step:2124/2285 train_time:128903ms step_avg:60.69ms +step:2125/2285 train_time:128965ms step_avg:60.69ms +step:2126/2285 train_time:129026ms step_avg:60.69ms +step:2127/2285 train_time:129089ms step_avg:60.69ms +step:2128/2285 train_time:129149ms step_avg:60.69ms +step:2129/2285 train_time:129211ms step_avg:60.69ms +step:2130/2285 train_time:129272ms step_avg:60.69ms +step:2131/2285 train_time:129334ms step_avg:60.69ms +step:2132/2285 train_time:129395ms step_avg:60.69ms +step:2133/2285 train_time:129457ms step_avg:60.69ms +step:2134/2285 train_time:129517ms step_avg:60.69ms +step:2135/2285 train_time:129579ms step_avg:60.69ms +step:2136/2285 train_time:129640ms step_avg:60.69ms +step:2137/2285 train_time:129703ms step_avg:60.69ms 
+step:2138/2285 train_time:129763ms step_avg:60.69ms +step:2139/2285 train_time:129825ms step_avg:60.69ms +step:2140/2285 train_time:129885ms step_avg:60.69ms +step:2141/2285 train_time:129947ms step_avg:60.69ms +step:2142/2285 train_time:130007ms step_avg:60.69ms +step:2143/2285 train_time:130070ms step_avg:60.70ms +step:2144/2285 train_time:130130ms step_avg:60.69ms +step:2145/2285 train_time:130193ms step_avg:60.70ms +step:2146/2285 train_time:130252ms step_avg:60.70ms +step:2147/2285 train_time:130315ms step_avg:60.70ms +step:2148/2285 train_time:130376ms step_avg:60.70ms +step:2149/2285 train_time:130438ms step_avg:60.70ms +step:2150/2285 train_time:130498ms step_avg:60.70ms +step:2151/2285 train_time:130561ms step_avg:60.70ms +step:2152/2285 train_time:130620ms step_avg:60.70ms +step:2153/2285 train_time:130683ms step_avg:60.70ms +step:2154/2285 train_time:130743ms step_avg:60.70ms +step:2155/2285 train_time:130805ms step_avg:60.70ms +step:2156/2285 train_time:130867ms step_avg:60.70ms +step:2157/2285 train_time:130929ms step_avg:60.70ms +step:2158/2285 train_time:130990ms step_avg:60.70ms +step:2159/2285 train_time:131052ms step_avg:60.70ms +step:2160/2285 train_time:131112ms step_avg:60.70ms +step:2161/2285 train_time:131174ms step_avg:60.70ms +step:2162/2285 train_time:131235ms step_avg:60.70ms +step:2163/2285 train_time:131297ms step_avg:60.70ms +step:2164/2285 train_time:131358ms step_avg:60.70ms +step:2165/2285 train_time:131421ms step_avg:60.70ms +step:2166/2285 train_time:131481ms step_avg:60.70ms +step:2167/2285 train_time:131544ms step_avg:60.70ms +step:2168/2285 train_time:131604ms step_avg:60.70ms +step:2169/2285 train_time:131667ms step_avg:60.70ms +step:2170/2285 train_time:131727ms step_avg:60.70ms +step:2171/2285 train_time:131789ms step_avg:60.70ms +step:2172/2285 train_time:131850ms step_avg:60.70ms +step:2173/2285 train_time:131912ms step_avg:60.71ms +step:2174/2285 train_time:131972ms step_avg:60.70ms +step:2175/2285 train_time:132035ms step_avg:60.71ms +step:2176/2285 train_time:132094ms step_avg:60.71ms +step:2177/2285 train_time:132157ms step_avg:60.71ms +step:2178/2285 train_time:132217ms step_avg:60.71ms +step:2179/2285 train_time:132279ms step_avg:60.71ms +step:2180/2285 train_time:132339ms step_avg:60.71ms +step:2181/2285 train_time:132401ms step_avg:60.71ms +step:2182/2285 train_time:132461ms step_avg:60.71ms +step:2183/2285 train_time:132523ms step_avg:60.71ms +step:2184/2285 train_time:132584ms step_avg:60.71ms +step:2185/2285 train_time:132646ms step_avg:60.71ms +step:2186/2285 train_time:132706ms step_avg:60.71ms +step:2187/2285 train_time:132769ms step_avg:60.71ms +step:2188/2285 train_time:132829ms step_avg:60.71ms +step:2189/2285 train_time:132892ms step_avg:60.71ms +step:2190/2285 train_time:132952ms step_avg:60.71ms +step:2191/2285 train_time:133014ms step_avg:60.71ms +step:2192/2285 train_time:133075ms step_avg:60.71ms +step:2193/2285 train_time:133137ms step_avg:60.71ms +step:2194/2285 train_time:133197ms step_avg:60.71ms +step:2195/2285 train_time:133259ms step_avg:60.71ms +step:2196/2285 train_time:133319ms step_avg:60.71ms +step:2197/2285 train_time:133382ms step_avg:60.71ms +step:2198/2285 train_time:133441ms step_avg:60.71ms +step:2199/2285 train_time:133504ms step_avg:60.71ms +step:2200/2285 train_time:133564ms step_avg:60.71ms +step:2201/2285 train_time:133626ms step_avg:60.71ms +step:2202/2285 train_time:133686ms step_avg:60.71ms +step:2203/2285 train_time:133750ms step_avg:60.71ms +step:2204/2285 train_time:133810ms step_avg:60.71ms 
+step:2205/2285 train_time:133873ms step_avg:60.71ms +step:2206/2285 train_time:133933ms step_avg:60.71ms +step:2207/2285 train_time:133996ms step_avg:60.71ms +step:2208/2285 train_time:134056ms step_avg:60.71ms +step:2209/2285 train_time:134118ms step_avg:60.71ms +step:2210/2285 train_time:134178ms step_avg:60.71ms +step:2211/2285 train_time:134240ms step_avg:60.71ms +step:2212/2285 train_time:134301ms step_avg:60.71ms +step:2213/2285 train_time:134363ms step_avg:60.72ms +step:2214/2285 train_time:134424ms step_avg:60.72ms +step:2215/2285 train_time:134487ms step_avg:60.72ms +step:2216/2285 train_time:134547ms step_avg:60.72ms +step:2217/2285 train_time:134610ms step_avg:60.72ms +step:2218/2285 train_time:134670ms step_avg:60.72ms +step:2219/2285 train_time:134733ms step_avg:60.72ms +step:2220/2285 train_time:134792ms step_avg:60.72ms +step:2221/2285 train_time:134854ms step_avg:60.72ms +step:2222/2285 train_time:134914ms step_avg:60.72ms +step:2223/2285 train_time:134976ms step_avg:60.72ms +step:2224/2285 train_time:135037ms step_avg:60.72ms +step:2225/2285 train_time:135099ms step_avg:60.72ms +step:2226/2285 train_time:135159ms step_avg:60.72ms +step:2227/2285 train_time:135221ms step_avg:60.72ms +step:2228/2285 train_time:135281ms step_avg:60.72ms +step:2229/2285 train_time:135343ms step_avg:60.72ms +step:2230/2285 train_time:135403ms step_avg:60.72ms +step:2231/2285 train_time:135465ms step_avg:60.72ms +step:2232/2285 train_time:135525ms step_avg:60.72ms +step:2233/2285 train_time:135588ms step_avg:60.72ms +step:2234/2285 train_time:135649ms step_avg:60.72ms +step:2235/2285 train_time:135711ms step_avg:60.72ms +step:2236/2285 train_time:135771ms step_avg:60.72ms +step:2237/2285 train_time:135833ms step_avg:60.72ms +step:2238/2285 train_time:135894ms step_avg:60.72ms +step:2239/2285 train_time:135955ms step_avg:60.72ms +step:2240/2285 train_time:136015ms step_avg:60.72ms +step:2241/2285 train_time:136078ms step_avg:60.72ms +step:2242/2285 train_time:136138ms step_avg:60.72ms +step:2243/2285 train_time:136200ms step_avg:60.72ms +step:2244/2285 train_time:136260ms step_avg:60.72ms +step:2245/2285 train_time:136323ms step_avg:60.72ms +step:2246/2285 train_time:136383ms step_avg:60.72ms +step:2247/2285 train_time:136446ms step_avg:60.72ms +step:2248/2285 train_time:136505ms step_avg:60.72ms +step:2249/2285 train_time:136568ms step_avg:60.72ms +step:2250/2285 train_time:136628ms step_avg:60.72ms +step:2250/2285 val_loss:3.2856 train_time:136692ms step_avg:60.75ms +step:2251/2285 train_time:136714ms step_avg:60.73ms +step:2252/2285 train_time:136754ms step_avg:60.73ms +step:2253/2285 train_time:136819ms step_avg:60.73ms +step:2254/2285 train_time:136880ms step_avg:60.73ms +step:2255/2285 train_time:136942ms step_avg:60.73ms +step:2256/2285 train_time:137002ms step_avg:60.73ms +step:2257/2285 train_time:137064ms step_avg:60.73ms +step:2258/2285 train_time:137123ms step_avg:60.73ms +step:2259/2285 train_time:137184ms step_avg:60.73ms +step:2260/2285 train_time:137244ms step_avg:60.73ms +step:2261/2285 train_time:137306ms step_avg:60.73ms +step:2262/2285 train_time:137366ms step_avg:60.73ms +step:2263/2285 train_time:137428ms step_avg:60.73ms +step:2264/2285 train_time:137488ms step_avg:60.73ms +step:2265/2285 train_time:137550ms step_avg:60.73ms +step:2266/2285 train_time:137610ms step_avg:60.73ms +step:2267/2285 train_time:137674ms step_avg:60.73ms +step:2268/2285 train_time:137735ms step_avg:60.73ms +step:2269/2285 train_time:137799ms step_avg:60.73ms +step:2270/2285 train_time:137859ms 
step_avg:60.73ms +step:2271/2285 train_time:137923ms step_avg:60.73ms +step:2272/2285 train_time:137983ms step_avg:60.73ms +step:2273/2285 train_time:138045ms step_avg:60.73ms +step:2274/2285 train_time:138104ms step_avg:60.73ms +step:2275/2285 train_time:138166ms step_avg:60.73ms +step:2276/2285 train_time:138225ms step_avg:60.73ms +step:2277/2285 train_time:138287ms step_avg:60.73ms +step:2278/2285 train_time:138346ms step_avg:60.73ms +step:2279/2285 train_time:138408ms step_avg:60.73ms +step:2280/2285 train_time:138468ms step_avg:60.73ms +step:2281/2285 train_time:138530ms step_avg:60.73ms +step:2282/2285 train_time:138590ms step_avg:60.73ms +step:2283/2285 train_time:138653ms step_avg:60.73ms +step:2284/2285 train_time:138714ms step_avg:60.73ms +step:2285/2285 train_time:138777ms step_avg:60.73ms +step:2285/2285 val_loss:3.2794 train_time:138838ms step_avg:60.76ms +peak memory allocated: 29626 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt b/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt
new file mode 100644
index 000000000..17f0ced3a
--- /dev/null
+++ b/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt
@@ -0,0 +1,3814 @@
+import os
+import sys
+
+with open(sys.argv[0]) as f:
+    code = f.read() # read the code of this file ASAP, for logging
+import copy
+import glob
+import math
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from collections import defaultdict
+from itertools import accumulate
+from pathlib import Path
+
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
+import torch
+
+torch.empty(
+    1, device="cuda", requires_grad=True
+).backward() # prevents a bug on some systems
+import torch._dynamo as dynamo
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
+import triton
+import triton.language as tl
+from kernels import get_kernel
+from torch import Tensor, nn
+
+dynamo.config.recompile_limit = 64
+
+# -----------------------------------------------------------------------------
+# Custom operators: FP8 matmul by @YouJiacheng
+
+
+@torch.library.custom_op("nanogpt::mm", mutates_args=())
+def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
+    @torch.compile
+    def impl(x: Tensor, w: Tensor):
+        assert x.is_contiguous() and w.is_contiguous()
+        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
+        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
+        out = torch._scaled_mm(
+            x_f8,
+            w_f8.T,
+            out_dtype=torch.bfloat16,
+            scale_a=x.new_tensor(x_s, dtype=torch.float32),
+            scale_b=x.new_tensor(w_s, dtype=torch.float32),
+            use_fast_accum=True,
+        )
+        return out, x_f8, w_f8
+
+    return impl(x, w)
+
+@mm_op.register_fake
+def _(x: Tensor, w: Tensor, *_):
+    assert x.ndim == w.ndim == 2
+    assert x.shape[1] == w.shape[1]
+    assert x.device == w.device
+    assert x.is_contiguous() and w.is_contiguous()
+    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
+
+@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
+def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
+    @torch.compile
+    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
+        assert grad.is_contiguous()
+        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
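+        # x_inv_s, w_inv_s, and grad_inv_s are the rescaling factors that
+        # torch._scaled_mm applies after each low-precision multiply. The
+        # incoming gradient is quantized to float8_e5m2 below (wider exponent
+        # range than e4m3 at the cost of mantissa bits, which suits gradient
+        # magnitudes), and grad_x / grad_w are then computed as scaled FP8
+        # matmuls, mirroring the forward pass.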
w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, 
BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + 
(offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
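+ Each coefficient triple (a, b, c) applies X <- a*X + b*(X @ X.T) @ X + c*(X @ X.T)^2 @ X, + a quintic iteration that drives every singular value of X toward 1.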
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Schulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically, small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad, and + this hyper-optimized class has a faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param, 7 padding params) + 2. reduce scatter attn_gate (10 params, 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement.
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations.
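+ # Each rank updates the slice params[rank*chunk_size : rank*chunk_size + num_params]; + # ranks whose slice falls entirely past the end of the param list get num_params == 0 + # and only contribute zero padding to the all_gather.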
+ for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine LR and WD + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call. + if num_params == 0: + v_chunk = updated_grads + elif params[module_idx].label == "smear_gate": + # dividing by magnitude is the equivalent of SVD for 1d tensors + v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + # Apply weight decay directly to the buffer.
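+ # decoupled weight decay: p <- p * (1 - weight_decay * wd_mul) first, then the + # orthogonalized update is subtracted at rate eff_lr below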
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
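+ # (Muon scales each update by max(1, rows/cols)**0.5 on the stored shape; c_fc is + # stored as (dim, 4*dim) but applied transposed, so lr_mul = (4*dim / dim)**0.5 = 2 + # restores the scale the effective (4*dim, dim) shape would have received)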
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
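+ # lr_mul is read via getattr(param, "lr_mul", 1.0) inside DistAdam, so the embedding + # tables train at 75x the base Adam lr while the head below stays at 1x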
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 02:00:09 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 41C P0 129W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 127W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 38C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 39C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 38C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 115W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.11ms +step:1/2285 train_time:110ms step_avg:109.58ms +step:2/2285 train_time:131ms step_avg:65.39ms +step:3/2285 train_time:168ms step_avg:56.15ms +step:4/2285 train_time:225ms step_avg:56.20ms +step:5/2285 train_time:284ms step_avg:56.77ms +step:6/2285 train_time:342ms step_avg:56.97ms +step:7/2285 train_time:402ms step_avg:57.48ms +step:8/2285 train_time:461ms step_avg:57.59ms +step:9/2285 train_time:521ms step_avg:57.93ms +step:10/2285 train_time:580ms step_avg:58.01ms +step:11/2285 train_time:641ms step_avg:58.29ms +step:12/2285 train_time:700ms step_avg:58.33ms +step:13/2285 train_time:760ms step_avg:58.48ms +step:14/2285 train_time:820ms step_avg:58.55ms +step:15/2285 train_time:881ms step_avg:58.70ms +step:16/2285 train_time:939ms step_avg:58.69ms +step:17/2285 
+step:18/2285 train_time:1065ms step_avg:59.19ms
+step:19/2285 train_time:1131ms step_avg:59.50ms
+step:20/2285 train_time:1190ms step_avg:59.50ms
+step:21/2285 train_time:1252ms step_avg:59.60ms
+step:22/2285 train_time:1311ms step_avg:59.57ms
+step:23/2285 train_time:1371ms step_avg:59.62ms
+step:24/2285 train_time:1430ms step_avg:59.58ms
+step:25/2285 train_time:1491ms step_avg:59.63ms
+step:26/2285 train_time:1549ms step_avg:59.59ms
+step:27/2285 train_time:1610ms step_avg:59.65ms
+step:28/2285 train_time:1669ms step_avg:59.61ms
+step:29/2285 train_time:1730ms step_avg:59.67ms
+step:30/2285 train_time:1789ms step_avg:59.64ms
+step:31/2285 train_time:1851ms step_avg:59.70ms
+step:32/2285 train_time:1910ms step_avg:59.68ms
+step:33/2285 train_time:1972ms step_avg:59.75ms
+step:34/2285 train_time:2031ms step_avg:59.74ms
+step:35/2285 train_time:2093ms step_avg:59.81ms
+step:36/2285 train_time:2152ms step_avg:59.78ms
+step:37/2285 train_time:2214ms step_avg:59.84ms
+step:38/2285 train_time:2273ms step_avg:59.81ms
+step:39/2285 train_time:2334ms step_avg:59.86ms
+step:40/2285 train_time:2393ms step_avg:59.83ms
+step:41/2285 train_time:2454ms step_avg:59.85ms
+step:42/2285 train_time:2512ms step_avg:59.82ms
+step:43/2285 train_time:2574ms step_avg:59.85ms
+step:44/2285 train_time:2633ms step_avg:59.83ms
+step:45/2285 train_time:2694ms step_avg:59.86ms
+step:46/2285 train_time:2753ms step_avg:59.84ms
+step:47/2285 train_time:2814ms step_avg:59.87ms
+step:48/2285 train_time:2873ms step_avg:59.85ms
+step:49/2285 train_time:2934ms step_avg:59.88ms
+step:50/2285 train_time:2993ms step_avg:59.86ms
+step:51/2285 train_time:3054ms step_avg:59.89ms
+step:52/2285 train_time:3113ms step_avg:59.87ms
+step:53/2285 train_time:3175ms step_avg:59.91ms
+step:54/2285 train_time:3234ms step_avg:59.89ms
+step:55/2285 train_time:3296ms step_avg:59.92ms
+step:56/2285 train_time:3355ms step_avg:59.91ms
+step:57/2285 train_time:3416ms step_avg:59.94ms
+step:58/2285 train_time:3475ms step_avg:59.92ms
+step:59/2285 train_time:3537ms step_avg:59.95ms
+step:60/2285 train_time:3596ms step_avg:59.93ms
+step:61/2285 train_time:3657ms step_avg:59.96ms
+step:62/2285 train_time:3717ms step_avg:59.95ms
+step:63/2285 train_time:3778ms step_avg:59.97ms
+step:64/2285 train_time:3838ms step_avg:59.96ms
+step:65/2285 train_time:3899ms step_avg:59.99ms
+step:66/2285 train_time:3958ms step_avg:59.98ms
+step:67/2285 train_time:4020ms step_avg:60.00ms
+step:68/2285 train_time:4080ms step_avg:60.00ms
+step:69/2285 train_time:4142ms step_avg:60.03ms
+step:70/2285 train_time:4201ms step_avg:60.02ms
+step:71/2285 train_time:4263ms step_avg:60.04ms
+step:72/2285 train_time:4322ms step_avg:60.03ms
+step:73/2285 train_time:4383ms step_avg:60.04ms
+step:74/2285 train_time:4442ms step_avg:60.03ms
+step:75/2285 train_time:4503ms step_avg:60.04ms
+step:76/2285 train_time:4562ms step_avg:60.03ms
+step:77/2285 train_time:4624ms step_avg:60.05ms
+step:78/2285 train_time:4684ms step_avg:60.05ms
+step:79/2285 train_time:4745ms step_avg:60.07ms
+step:80/2285 train_time:4804ms step_avg:60.06ms
+step:81/2285 train_time:4866ms step_avg:60.07ms
+step:82/2285 train_time:4925ms step_avg:60.06ms
+step:83/2285 train_time:4986ms step_avg:60.07ms
+step:84/2285 train_time:5045ms step_avg:60.06ms
+step:85/2285 train_time:5106ms step_avg:60.08ms
+step:86/2285 train_time:5165ms step_avg:60.06ms
+step:87/2285 train_time:5226ms step_avg:60.07ms
+step:88/2285 train_time:5284ms step_avg:60.05ms
+step:89/2285 train_time:5345ms step_avg:60.06ms
+step:90/2285 train_time:5404ms step_avg:60.04ms
+step:91/2285 train_time:5465ms step_avg:60.06ms
+step:92/2285 train_time:5524ms step_avg:60.04ms
+step:93/2285 train_time:5585ms step_avg:60.05ms
+step:94/2285 train_time:5644ms step_avg:60.04ms
+step:95/2285 train_time:5705ms step_avg:60.05ms
+step:96/2285 train_time:5764ms step_avg:60.04ms
+step:97/2285 train_time:5825ms step_avg:60.06ms
+step:98/2285 train_time:5884ms step_avg:60.04ms
+step:99/2285 train_time:5945ms step_avg:60.06ms
+step:100/2285 train_time:6004ms step_avg:60.04ms
+step:101/2285 train_time:6066ms step_avg:60.06ms
+step:102/2285 train_time:6124ms step_avg:60.04ms
+step:103/2285 train_time:6186ms step_avg:60.06ms
+step:104/2285 train_time:6245ms step_avg:60.05ms
+step:105/2285 train_time:6305ms step_avg:60.05ms
+step:106/2285 train_time:6364ms step_avg:60.03ms
+step:107/2285 train_time:6425ms step_avg:60.05ms
+step:108/2285 train_time:6484ms step_avg:60.03ms
+step:109/2285 train_time:6545ms step_avg:60.04ms
+step:110/2285 train_time:6604ms step_avg:60.03ms
+step:111/2285 train_time:6666ms step_avg:60.05ms
+step:112/2285 train_time:6725ms step_avg:60.04ms
+step:113/2285 train_time:6786ms step_avg:60.05ms
+step:114/2285 train_time:6845ms step_avg:60.04ms
+step:115/2285 train_time:6906ms step_avg:60.05ms
+step:116/2285 train_time:6966ms step_avg:60.05ms
+step:117/2285 train_time:7027ms step_avg:60.06ms
+step:118/2285 train_time:7086ms step_avg:60.05ms
+step:119/2285 train_time:7147ms step_avg:60.06ms
+step:120/2285 train_time:7206ms step_avg:60.05ms
+step:121/2285 train_time:7267ms step_avg:60.06ms
+step:122/2285 train_time:7327ms step_avg:60.06ms
+step:123/2285 train_time:7388ms step_avg:60.07ms
+step:124/2285 train_time:7447ms step_avg:60.05ms
+step:125/2285 train_time:7507ms step_avg:60.06ms
+step:126/2285 train_time:7566ms step_avg:60.05ms
+step:127/2285 train_time:7627ms step_avg:60.06ms
+step:128/2285 train_time:7687ms step_avg:60.05ms
+step:129/2285 train_time:7748ms step_avg:60.06ms
+step:130/2285 train_time:7807ms step_avg:60.06ms
+step:131/2285 train_time:7869ms step_avg:60.07ms
+step:132/2285 train_time:7929ms step_avg:60.06ms
+step:133/2285 train_time:7990ms step_avg:60.07ms
+step:134/2285 train_time:8049ms step_avg:60.07ms
+step:135/2285 train_time:8110ms step_avg:60.07ms
+step:136/2285 train_time:8169ms step_avg:60.06ms
+step:137/2285 train_time:8230ms step_avg:60.07ms
+step:138/2285 train_time:8289ms step_avg:60.06ms
+step:139/2285 train_time:8350ms step_avg:60.07ms
+step:140/2285 train_time:8408ms step_avg:60.06ms
+step:141/2285 train_time:8469ms step_avg:60.06ms
+step:142/2285 train_time:8528ms step_avg:60.06ms
+step:143/2285 train_time:8588ms step_avg:60.06ms
+step:144/2285 train_time:8647ms step_avg:60.05ms
+step:145/2285 train_time:8708ms step_avg:60.06ms
+step:146/2285 train_time:8767ms step_avg:60.05ms
+step:147/2285 train_time:8829ms step_avg:60.06ms
+step:148/2285 train_time:8888ms step_avg:60.05ms
+step:149/2285 train_time:8949ms step_avg:60.06ms
+step:150/2285 train_time:9008ms step_avg:60.05ms
+step:151/2285 train_time:9069ms step_avg:60.06ms
+step:152/2285 train_time:9128ms step_avg:60.05ms
+step:153/2285 train_time:9188ms step_avg:60.05ms
+step:154/2285 train_time:9247ms step_avg:60.05ms
+step:155/2285 train_time:9308ms step_avg:60.05ms
+step:156/2285 train_time:9367ms step_avg:60.05ms
+step:157/2285 train_time:9428ms step_avg:60.05ms
+step:158/2285 train_time:9487ms step_avg:60.04ms
+step:159/2285 train_time:9548ms step_avg:60.05ms
+step:160/2285 train_time:9606ms step_avg:60.04ms
+step:161/2285 train_time:9666ms step_avg:60.04ms
+step:162/2285 train_time:9725ms step_avg:60.03ms
+step:163/2285 train_time:9786ms step_avg:60.04ms
+step:164/2285 train_time:9845ms step_avg:60.03ms
+step:165/2285 train_time:9907ms step_avg:60.04ms
+step:166/2285 train_time:9966ms step_avg:60.03ms
+step:167/2285 train_time:10027ms step_avg:60.04ms
+step:168/2285 train_time:10086ms step_avg:60.03ms
+step:169/2285 train_time:10147ms step_avg:60.04ms
+step:170/2285 train_time:10205ms step_avg:60.03ms
+step:171/2285 train_time:10266ms step_avg:60.04ms
+step:172/2285 train_time:10325ms step_avg:60.03ms
+step:173/2285 train_time:10386ms step_avg:60.04ms
+step:174/2285 train_time:10445ms step_avg:60.03ms
+step:175/2285 train_time:10506ms step_avg:60.04ms
+step:176/2285 train_time:10565ms step_avg:60.03ms
+step:177/2285 train_time:10626ms step_avg:60.03ms
+step:178/2285 train_time:10685ms step_avg:60.03ms
+step:179/2285 train_time:10746ms step_avg:60.03ms
+step:180/2285 train_time:10805ms step_avg:60.03ms
+step:181/2285 train_time:10866ms step_avg:60.03ms
+step:182/2285 train_time:10925ms step_avg:60.03ms
+step:183/2285 train_time:10985ms step_avg:60.03ms
+step:184/2285 train_time:11044ms step_avg:60.02ms
+step:185/2285 train_time:11105ms step_avg:60.03ms
+step:186/2285 train_time:11165ms step_avg:60.03ms
+step:187/2285 train_time:11226ms step_avg:60.03ms
+step:188/2285 train_time:11284ms step_avg:60.02ms
+step:189/2285 train_time:11345ms step_avg:60.03ms
+step:190/2285 train_time:11404ms step_avg:60.02ms
+step:191/2285 train_time:11465ms step_avg:60.03ms
+step:192/2285 train_time:11524ms step_avg:60.02ms
+step:193/2285 train_time:11585ms step_avg:60.03ms
+step:194/2285 train_time:11644ms step_avg:60.02ms
+step:195/2285 train_time:11705ms step_avg:60.03ms
+step:196/2285 train_time:11763ms step_avg:60.02ms
+step:197/2285 train_time:11825ms step_avg:60.02ms
+step:198/2285 train_time:11883ms step_avg:60.02ms
+step:199/2285 train_time:11944ms step_avg:60.02ms
+step:200/2285 train_time:12003ms step_avg:60.01ms
+step:201/2285 train_time:12064ms step_avg:60.02ms
+step:202/2285 train_time:12123ms step_avg:60.02ms
+step:203/2285 train_time:12184ms step_avg:60.02ms
+step:204/2285 train_time:12243ms step_avg:60.02ms
+step:205/2285 train_time:12304ms step_avg:60.02ms
+step:206/2285 train_time:12363ms step_avg:60.02ms
+step:207/2285 train_time:12425ms step_avg:60.02ms
+step:208/2285 train_time:12483ms step_avg:60.01ms
+step:209/2285 train_time:12544ms step_avg:60.02ms
+step:210/2285 train_time:12603ms step_avg:60.01ms
+step:211/2285 train_time:12664ms step_avg:60.02ms
+step:212/2285 train_time:12723ms step_avg:60.01ms
+step:213/2285 train_time:12784ms step_avg:60.02ms
+step:214/2285 train_time:12843ms step_avg:60.01ms
+step:215/2285 train_time:12904ms step_avg:60.02ms
+step:216/2285 train_time:12963ms step_avg:60.01ms
+step:217/2285 train_time:13025ms step_avg:60.02ms
+step:218/2285 train_time:13083ms step_avg:60.01ms
+step:219/2285 train_time:13145ms step_avg:60.02ms
+step:220/2285 train_time:13203ms step_avg:60.01ms
+step:221/2285 train_time:13264ms step_avg:60.02ms
+step:222/2285 train_time:13323ms step_avg:60.01ms
+step:223/2285 train_time:13384ms step_avg:60.02ms
+step:224/2285 train_time:13443ms step_avg:60.01ms
+step:225/2285 train_time:13504ms step_avg:60.02ms
+step:226/2285 train_time:13563ms step_avg:60.01ms
+step:227/2285 train_time:13625ms step_avg:60.02ms
+step:228/2285 train_time:13683ms step_avg:60.01ms
+step:229/2285 train_time:13744ms step_avg:60.02ms
+step:230/2285 train_time:13802ms step_avg:60.01ms
+step:231/2285 train_time:13865ms step_avg:60.02ms
+step:232/2285 train_time:13922ms step_avg:60.01ms
+step:233/2285 train_time:13983ms step_avg:60.01ms
+step:234/2285 train_time:14042ms step_avg:60.01ms
+step:235/2285 train_time:14103ms step_avg:60.01ms
+step:236/2285 train_time:14162ms step_avg:60.01ms
+step:237/2285 train_time:14224ms step_avg:60.02ms
+step:238/2285 train_time:14282ms step_avg:60.01ms
+step:239/2285 train_time:14343ms step_avg:60.01ms
+step:240/2285 train_time:14402ms step_avg:60.01ms
+step:241/2285 train_time:14464ms step_avg:60.02ms
+step:242/2285 train_time:14522ms step_avg:60.01ms
+step:243/2285 train_time:14583ms step_avg:60.01ms
+step:244/2285 train_time:14642ms step_avg:60.01ms
+step:245/2285 train_time:14703ms step_avg:60.01ms
+step:246/2285 train_time:14761ms step_avg:60.00ms
+step:247/2285 train_time:14823ms step_avg:60.01ms
+step:248/2285 train_time:14882ms step_avg:60.01ms
+step:249/2285 train_time:14943ms step_avg:60.01ms
+step:250/2285 train_time:15001ms step_avg:60.01ms
+step:250/2285 val_loss:4.0805 train_time:15064ms step_avg:60.26ms
+step:251/2285 train_time:15083ms step_avg:60.09ms
+step:252/2285 train_time:15123ms step_avg:60.01ms
+step:253/2285 train_time:15191ms step_avg:60.04ms
+step:254/2285 train_time:15258ms step_avg:60.07ms
+step:255/2285 train_time:15320ms step_avg:60.08ms
+step:256/2285 train_time:15379ms step_avg:60.08ms
+step:257/2285 train_time:15440ms step_avg:60.08ms
+step:258/2285 train_time:15498ms step_avg:60.07ms
+step:259/2285 train_time:15558ms step_avg:60.07ms
+step:260/2285 train_time:15616ms step_avg:60.06ms
+step:261/2285 train_time:15676ms step_avg:60.06ms
+step:262/2285 train_time:15734ms step_avg:60.05ms
+step:263/2285 train_time:15794ms step_avg:60.05ms
+step:264/2285 train_time:15852ms step_avg:60.04ms
+step:265/2285 train_time:15912ms step_avg:60.04ms
+step:266/2285 train_time:15969ms step_avg:60.04ms
+step:267/2285 train_time:16030ms step_avg:60.04ms
+step:268/2285 train_time:16090ms step_avg:60.04ms
+step:269/2285 train_time:16153ms step_avg:60.05ms
+step:270/2285 train_time:16213ms step_avg:60.05ms
+step:271/2285 train_time:16276ms step_avg:60.06ms
+step:272/2285 train_time:16335ms step_avg:60.06ms
+step:273/2285 train_time:16396ms step_avg:60.06ms
+step:274/2285 train_time:16455ms step_avg:60.05ms
+step:275/2285 train_time:16515ms step_avg:60.06ms
+step:276/2285 train_time:16574ms step_avg:60.05ms
+step:277/2285 train_time:16634ms step_avg:60.05ms
+step:278/2285 train_time:16692ms step_avg:60.04ms
+step:279/2285 train_time:16753ms step_avg:60.05ms
+step:280/2285 train_time:16811ms step_avg:60.04ms
+step:281/2285 train_time:16872ms step_avg:60.04ms
+step:282/2285 train_time:16930ms step_avg:60.04ms
+step:283/2285 train_time:16991ms step_avg:60.04ms
+step:284/2285 train_time:17049ms step_avg:60.03ms
+step:285/2285 train_time:17110ms step_avg:60.04ms
+step:286/2285 train_time:17170ms step_avg:60.04ms
+step:287/2285 train_time:17232ms step_avg:60.04ms
+step:288/2285 train_time:17291ms step_avg:60.04ms
+step:289/2285 train_time:17353ms step_avg:60.05ms
+step:290/2285 train_time:17412ms step_avg:60.04ms
+step:291/2285 train_time:17473ms step_avg:60.05ms
+step:292/2285 train_time:17532ms step_avg:60.04ms
+step:293/2285 train_time:17593ms step_avg:60.04ms
+step:294/2285 train_time:17651ms step_avg:60.04ms
+step:295/2285 train_time:17711ms step_avg:60.04ms
+step:296/2285 train_time:17770ms step_avg:60.03ms
+step:297/2285 train_time:17830ms step_avg:60.03ms
+step:298/2285 train_time:17888ms step_avg:60.03ms
+step:299/2285 train_time:17949ms step_avg:60.03ms
+step:300/2285 train_time:18008ms step_avg:60.03ms
+step:301/2285 train_time:18069ms step_avg:60.03ms
+step:302/2285 train_time:18128ms step_avg:60.03ms
+step:303/2285 train_time:18190ms step_avg:60.03ms
+step:304/2285 train_time:18248ms step_avg:60.03ms
+step:305/2285 train_time:18310ms step_avg:60.03ms
+step:306/2285 train_time:18369ms step_avg:60.03ms
+step:307/2285 train_time:18431ms step_avg:60.04ms
+step:308/2285 train_time:18489ms step_avg:60.03ms
+step:309/2285 train_time:18551ms step_avg:60.03ms
+step:310/2285 train_time:18609ms step_avg:60.03ms
+step:311/2285 train_time:18670ms step_avg:60.03ms
+step:312/2285 train_time:18728ms step_avg:60.03ms
+step:313/2285 train_time:18789ms step_avg:60.03ms
+step:314/2285 train_time:18847ms step_avg:60.02ms
+step:315/2285 train_time:18908ms step_avg:60.03ms
+step:316/2285 train_time:18967ms step_avg:60.02ms
+step:317/2285 train_time:19027ms step_avg:60.02ms
+step:318/2285 train_time:19086ms step_avg:60.02ms
+step:319/2285 train_time:19147ms step_avg:60.02ms
+step:320/2285 train_time:19206ms step_avg:60.02ms
+step:321/2285 train_time:19268ms step_avg:60.02ms
+step:322/2285 train_time:19326ms step_avg:60.02ms
+step:323/2285 train_time:19387ms step_avg:60.02ms
+step:324/2285 train_time:19446ms step_avg:60.02ms
+step:325/2285 train_time:19507ms step_avg:60.02ms
+step:326/2285 train_time:19566ms step_avg:60.02ms
+step:327/2285 train_time:19627ms step_avg:60.02ms
+step:328/2285 train_time:19686ms step_avg:60.02ms
+step:329/2285 train_time:19746ms step_avg:60.02ms
+step:330/2285 train_time:19804ms step_avg:60.01ms
+step:331/2285 train_time:19865ms step_avg:60.01ms
+step:332/2285 train_time:19923ms step_avg:60.01ms
+step:333/2285 train_time:19984ms step_avg:60.01ms
+step:334/2285 train_time:20042ms step_avg:60.01ms
+step:335/2285 train_time:20103ms step_avg:60.01ms
+step:336/2285 train_time:20161ms step_avg:60.00ms
+step:337/2285 train_time:20222ms step_avg:60.01ms
+step:338/2285 train_time:20281ms step_avg:60.00ms
+step:339/2285 train_time:20342ms step_avg:60.01ms
+step:340/2285 train_time:20401ms step_avg:60.00ms
+step:341/2285 train_time:20462ms step_avg:60.01ms
+step:342/2285 train_time:20521ms step_avg:60.00ms
+step:343/2285 train_time:20582ms step_avg:60.01ms
+step:344/2285 train_time:20641ms step_avg:60.00ms
+step:345/2285 train_time:20702ms step_avg:60.01ms
+step:346/2285 train_time:20761ms step_avg:60.00ms
+step:347/2285 train_time:20822ms step_avg:60.00ms
+step:348/2285 train_time:20880ms step_avg:60.00ms
+step:349/2285 train_time:20941ms step_avg:60.00ms
+step:350/2285 train_time:20999ms step_avg:60.00ms
+step:351/2285 train_time:21060ms step_avg:60.00ms
+step:352/2285 train_time:21119ms step_avg:60.00ms
+step:353/2285 train_time:21179ms step_avg:60.00ms
+step:354/2285 train_time:21238ms step_avg:59.99ms
+step:355/2285 train_time:21298ms step_avg:60.00ms
+step:356/2285 train_time:21357ms step_avg:59.99ms
+step:357/2285 train_time:21419ms step_avg:60.00ms
+step:358/2285 train_time:21478ms step_avg:59.99ms
+step:359/2285 train_time:21539ms step_avg:60.00ms
+step:360/2285 train_time:21598ms step_avg:59.99ms
+step:361/2285 train_time:21659ms step_avg:60.00ms
+step:362/2285 train_time:21718ms step_avg:59.99ms
+step:363/2285 train_time:21779ms step_avg:60.00ms
+step:364/2285 train_time:21837ms step_avg:59.99ms
+step:365/2285 train_time:21898ms step_avg:59.99ms
+step:366/2285 train_time:21956ms step_avg:59.99ms
+step:367/2285 train_time:22017ms step_avg:59.99ms
+step:368/2285 train_time:22075ms step_avg:59.99ms
+step:369/2285 train_time:22136ms step_avg:59.99ms
+step:370/2285 train_time:22194ms step_avg:59.98ms
+step:371/2285 train_time:22255ms step_avg:59.99ms
+step:372/2285 train_time:22314ms step_avg:59.98ms
+step:373/2285 train_time:22375ms step_avg:59.99ms
+step:374/2285 train_time:22434ms step_avg:59.98ms
+step:375/2285 train_time:22495ms step_avg:59.99ms
+step:376/2285 train_time:22553ms step_avg:59.98ms
+step:377/2285 train_time:22615ms step_avg:59.99ms
+step:378/2285 train_time:22674ms step_avg:59.98ms
+step:379/2285 train_time:22734ms step_avg:59.98ms
+step:380/2285 train_time:22793ms step_avg:59.98ms
+step:381/2285 train_time:22853ms step_avg:59.98ms
+step:382/2285 train_time:22912ms step_avg:59.98ms
+step:383/2285 train_time:22973ms step_avg:59.98ms
+step:384/2285 train_time:23032ms step_avg:59.98ms
+step:385/2285 train_time:23094ms step_avg:59.98ms
+step:386/2285 train_time:23153ms step_avg:59.98ms
+step:387/2285 train_time:23214ms step_avg:59.98ms
+step:388/2285 train_time:23273ms step_avg:59.98ms
+step:389/2285 train_time:23335ms step_avg:59.99ms
+step:390/2285 train_time:23394ms step_avg:59.98ms
+step:391/2285 train_time:23455ms step_avg:59.99ms
+step:392/2285 train_time:23514ms step_avg:59.98ms
+step:393/2285 train_time:23575ms step_avg:59.99ms
+step:394/2285 train_time:23634ms step_avg:59.99ms
+step:395/2285 train_time:23696ms step_avg:59.99ms
+step:396/2285 train_time:23755ms step_avg:59.99ms
+step:397/2285 train_time:23816ms step_avg:59.99ms
+step:398/2285 train_time:23875ms step_avg:59.99ms
+step:399/2285 train_time:23936ms step_avg:59.99ms
+step:400/2285 train_time:23995ms step_avg:59.99ms
+step:401/2285 train_time:24056ms step_avg:59.99ms
+step:402/2285 train_time:24115ms step_avg:59.99ms
+step:403/2285 train_time:24177ms step_avg:59.99ms
+step:404/2285 train_time:24236ms step_avg:59.99ms
+step:405/2285 train_time:24297ms step_avg:59.99ms
+step:406/2285 train_time:24356ms step_avg:59.99ms
+step:407/2285 train_time:24418ms step_avg:60.00ms
+step:408/2285 train_time:24477ms step_avg:59.99ms
+step:409/2285 train_time:24539ms step_avg:60.00ms
+step:410/2285 train_time:24597ms step_avg:59.99ms
+step:411/2285 train_time:24658ms step_avg:60.00ms
+step:412/2285 train_time:24718ms step_avg:59.99ms
+step:413/2285 train_time:24779ms step_avg:60.00ms
+step:414/2285 train_time:24838ms step_avg:60.00ms
+step:415/2285 train_time:24899ms step_avg:60.00ms
+step:416/2285 train_time:24959ms step_avg:60.00ms
+step:417/2285 train_time:25020ms step_avg:60.00ms
+step:418/2285 train_time:25079ms step_avg:60.00ms
+step:419/2285 train_time:25140ms step_avg:60.00ms
+step:420/2285 train_time:25199ms step_avg:60.00ms
+step:421/2285 train_time:25261ms step_avg:60.00ms
+step:422/2285 train_time:25320ms step_avg:60.00ms
+step:423/2285 train_time:25381ms step_avg:60.00ms
+step:424/2285 train_time:25440ms step_avg:60.00ms
+step:425/2285 train_time:25501ms step_avg:60.00ms
+step:426/2285 train_time:25560ms step_avg:60.00ms
+step:427/2285 train_time:25622ms step_avg:60.00ms
+step:428/2285 train_time:25681ms step_avg:60.00ms
+step:429/2285 train_time:25742ms step_avg:60.00ms
+step:430/2285 train_time:25801ms step_avg:60.00ms
+step:431/2285 train_time:25863ms step_avg:60.01ms
+step:432/2285 train_time:25922ms step_avg:60.00ms
+step:433/2285 train_time:25983ms step_avg:60.01ms
+step:434/2285 train_time:26042ms step_avg:60.00ms
+step:435/2285 train_time:26104ms step_avg:60.01ms
+step:436/2285 train_time:26163ms step_avg:60.01ms
+step:437/2285 train_time:26225ms step_avg:60.01ms
+step:438/2285 train_time:26284ms step_avg:60.01ms
+step:439/2285 train_time:26345ms step_avg:60.01ms
+step:440/2285 train_time:26404ms step_avg:60.01ms
+step:441/2285 train_time:26466ms step_avg:60.01ms
+step:442/2285 train_time:26525ms step_avg:60.01ms
+step:443/2285 train_time:26587ms step_avg:60.02ms
+step:444/2285 train_time:26646ms step_avg:60.01ms
+step:445/2285 train_time:26707ms step_avg:60.02ms
+step:446/2285 train_time:26766ms step_avg:60.01ms
+step:447/2285 train_time:26827ms step_avg:60.02ms
+step:448/2285 train_time:26887ms step_avg:60.02ms
+step:449/2285 train_time:26948ms step_avg:60.02ms
+step:450/2285 train_time:27007ms step_avg:60.02ms
+step:451/2285 train_time:27068ms step_avg:60.02ms
+step:452/2285 train_time:27127ms step_avg:60.02ms
+step:453/2285 train_time:27189ms step_avg:60.02ms
+step:454/2285 train_time:27247ms step_avg:60.02ms
+step:455/2285 train_time:27309ms step_avg:60.02ms
+step:456/2285 train_time:27367ms step_avg:60.02ms
+step:457/2285 train_time:27429ms step_avg:60.02ms
+step:458/2285 train_time:27489ms step_avg:60.02ms
+step:459/2285 train_time:27550ms step_avg:60.02ms
+step:460/2285 train_time:27609ms step_avg:60.02ms
+step:461/2285 train_time:27671ms step_avg:60.02ms
+step:462/2285 train_time:27730ms step_avg:60.02ms
+step:463/2285 train_time:27791ms step_avg:60.02ms
+step:464/2285 train_time:27850ms step_avg:60.02ms
+step:465/2285 train_time:27911ms step_avg:60.02ms
+step:466/2285 train_time:27970ms step_avg:60.02ms
+step:467/2285 train_time:28032ms step_avg:60.03ms
+step:468/2285 train_time:28091ms step_avg:60.02ms
+step:469/2285 train_time:28153ms step_avg:60.03ms
+step:470/2285 train_time:28213ms step_avg:60.03ms
+step:471/2285 train_time:28274ms step_avg:60.03ms
+step:472/2285 train_time:28333ms step_avg:60.03ms
+step:473/2285 train_time:28395ms step_avg:60.03ms
+step:474/2285 train_time:28454ms step_avg:60.03ms
+step:475/2285 train_time:28515ms step_avg:60.03ms
+step:476/2285 train_time:28574ms step_avg:60.03ms
+step:477/2285 train_time:28636ms step_avg:60.03ms
+step:478/2285 train_time:28694ms step_avg:60.03ms
+step:479/2285 train_time:28755ms step_avg:60.03ms
+step:480/2285 train_time:28814ms step_avg:60.03ms
+step:481/2285 train_time:28875ms step_avg:60.03ms
+step:482/2285 train_time:28934ms step_avg:60.03ms
+step:483/2285 train_time:28995ms step_avg:60.03ms
+step:484/2285 train_time:29054ms step_avg:60.03ms
+step:485/2285 train_time:29116ms step_avg:60.03ms
+step:486/2285 train_time:29175ms step_avg:60.03ms
+step:487/2285 train_time:29236ms step_avg:60.03ms
+step:488/2285 train_time:29295ms step_avg:60.03ms
+step:489/2285 train_time:29356ms step_avg:60.03ms
+step:490/2285 train_time:29415ms step_avg:60.03ms
+step:491/2285 train_time:29476ms step_avg:60.03ms
+step:492/2285 train_time:29536ms step_avg:60.03ms
+step:493/2285 train_time:29596ms step_avg:60.03ms
+step:494/2285 train_time:29655ms step_avg:60.03ms
+step:495/2285 train_time:29717ms step_avg:60.03ms
+step:496/2285 train_time:29776ms step_avg:60.03ms
+step:497/2285 train_time:29838ms step_avg:60.04ms
+step:498/2285 train_time:29896ms step_avg:60.03ms
+step:499/2285 train_time:29958ms step_avg:60.04ms
+step:500/2285 train_time:30017ms step_avg:60.03ms
+step:500/2285 val_loss:3.7901 train_time:30079ms step_avg:60.16ms
+step:501/2285 train_time:30099ms step_avg:60.08ms
+step:502/2285 train_time:30140ms step_avg:60.04ms
+step:503/2285 train_time:30200ms step_avg:60.04ms
+step:504/2285 train_time:30258ms step_avg:60.04ms
+step:505/2285 train_time:30319ms step_avg:60.04ms
+step:506/2285 train_time:30378ms step_avg:60.04ms
+step:507/2285 train_time:30439ms step_avg:60.04ms
+step:508/2285 train_time:30497ms step_avg:60.03ms
+step:509/2285 train_time:30557ms step_avg:60.03ms
+step:510/2285 train_time:30615ms step_avg:60.03ms
+step:511/2285 train_time:30676ms step_avg:60.03ms
+step:512/2285 train_time:30734ms step_avg:60.03ms
+step:513/2285 train_time:30794ms step_avg:60.03ms
+step:514/2285 train_time:30852ms step_avg:60.02ms
+step:515/2285 train_time:30913ms step_avg:60.02ms
+step:516/2285 train_time:30973ms step_avg:60.02ms
+step:517/2285 train_time:31038ms step_avg:60.04ms
+step:518/2285 train_time:31100ms step_avg:60.04ms
+step:519/2285 train_time:31161ms step_avg:60.04ms
+step:520/2285 train_time:31220ms step_avg:60.04ms
+step:521/2285 train_time:31282ms step_avg:60.04ms
+step:522/2285 train_time:31341ms step_avg:60.04ms
+step:523/2285 train_time:31402ms step_avg:60.04ms
+step:524/2285 train_time:31461ms step_avg:60.04ms
+step:525/2285 train_time:31522ms step_avg:60.04ms
+step:526/2285 train_time:31581ms step_avg:60.04ms
+step:527/2285 train_time:31642ms step_avg:60.04ms
+step:528/2285 train_time:31701ms step_avg:60.04ms
+step:529/2285 train_time:31762ms step_avg:60.04ms
+step:530/2285 train_time:31822ms step_avg:60.04ms
+step:531/2285 train_time:31883ms step_avg:60.04ms
+step:532/2285 train_time:31943ms step_avg:60.04ms
+step:533/2285 train_time:32007ms step_avg:60.05ms
+step:534/2285 train_time:32067ms step_avg:60.05ms
+step:535/2285 train_time:32130ms step_avg:60.06ms
+step:536/2285 train_time:32189ms step_avg:60.05ms
+step:537/2285 train_time:32251ms step_avg:60.06ms
+step:538/2285 train_time:32310ms step_avg:60.06ms
+step:539/2285 train_time:32371ms step_avg:60.06ms
+step:540/2285 train_time:32431ms step_avg:60.06ms
+step:541/2285 train_time:32492ms step_avg:60.06ms
+step:542/2285 train_time:32551ms step_avg:60.06ms
+step:543/2285 train_time:32612ms step_avg:60.06ms
+step:544/2285 train_time:32671ms step_avg:60.06ms
+step:545/2285 train_time:32733ms step_avg:60.06ms
+step:546/2285 train_time:32792ms step_avg:60.06ms
+step:547/2285 train_time:32853ms step_avg:60.06ms
+step:548/2285 train_time:32912ms step_avg:60.06ms
+step:549/2285 train_time:32974ms step_avg:60.06ms
+step:550/2285 train_time:33033ms step_avg:60.06ms
+step:551/2285 train_time:33095ms step_avg:60.06ms
+step:552/2285 train_time:33154ms step_avg:60.06ms
+step:553/2285 train_time:33216ms step_avg:60.06ms
+step:554/2285 train_time:33275ms step_avg:60.06ms
+step:555/2285 train_time:33336ms step_avg:60.06ms
+step:556/2285 train_time:33395ms step_avg:60.06ms
+step:557/2285 train_time:33456ms step_avg:60.06ms
+step:558/2285 train_time:33515ms step_avg:60.06ms
+step:559/2285 train_time:33576ms step_avg:60.06ms
+step:560/2285 train_time:33635ms step_avg:60.06ms
+step:561/2285 train_time:33696ms step_avg:60.06ms
+step:562/2285 train_time:33755ms step_avg:60.06ms
+step:563/2285 train_time:33817ms step_avg:60.07ms
+step:564/2285 train_time:33876ms step_avg:60.06ms
+step:565/2285 train_time:33938ms step_avg:60.07ms
+step:566/2285 train_time:33996ms step_avg:60.06ms
+step:567/2285 train_time:34058ms step_avg:60.07ms
+step:568/2285 train_time:34117ms step_avg:60.07ms
+step:569/2285 train_time:34178ms step_avg:60.07ms
+step:570/2285 train_time:34237ms step_avg:60.06ms
+step:571/2285 train_time:34298ms step_avg:60.07ms
+step:572/2285 train_time:34356ms step_avg:60.06ms
+step:573/2285 train_time:34418ms step_avg:60.07ms
+step:574/2285 train_time:34476ms step_avg:60.06ms
+step:575/2285 train_time:34537ms step_avg:60.06ms
+step:576/2285 train_time:34595ms step_avg:60.06ms
+step:577/2285 train_time:34656ms step_avg:60.06ms
+step:578/2285 train_time:34716ms step_avg:60.06ms
+step:579/2285 train_time:34777ms step_avg:60.06ms
+step:580/2285 train_time:34836ms step_avg:60.06ms
+step:581/2285 train_time:34897ms step_avg:60.06ms
+step:582/2285 train_time:34956ms step_avg:60.06ms
+step:583/2285 train_time:35017ms step_avg:60.06ms
+step:584/2285 train_time:35077ms step_avg:60.06ms
+step:585/2285 train_time:35138ms step_avg:60.07ms
+step:586/2285 train_time:35197ms step_avg:60.06ms
+step:587/2285 train_time:35258ms step_avg:60.06ms
+step:588/2285 train_time:35317ms step_avg:60.06ms
+step:589/2285 train_time:35378ms step_avg:60.06ms
+step:590/2285 train_time:35436ms step_avg:60.06ms
+step:591/2285 train_time:35497ms step_avg:60.06ms
+step:592/2285 train_time:35556ms step_avg:60.06ms
+step:593/2285 train_time:35617ms step_avg:60.06ms
+step:594/2285 train_time:35676ms step_avg:60.06ms
+step:595/2285 train_time:35739ms step_avg:60.07ms
+step:596/2285 train_time:35796ms step_avg:60.06ms
+step:597/2285 train_time:35857ms step_avg:60.06ms
+step:598/2285 train_time:35916ms step_avg:60.06ms
+step:599/2285 train_time:35978ms step_avg:60.06ms
+step:600/2285 train_time:36036ms step_avg:60.06ms
+step:601/2285 train_time:36098ms step_avg:60.06ms
+step:602/2285 train_time:36157ms step_avg:60.06ms
+step:603/2285 train_time:36218ms step_avg:60.06ms
+step:604/2285 train_time:36277ms step_avg:60.06ms
+step:605/2285 train_time:36339ms step_avg:60.06ms
+step:606/2285 train_time:36397ms step_avg:60.06ms
+step:607/2285 train_time:36459ms step_avg:60.06ms
+step:608/2285 train_time:36517ms step_avg:60.06ms
+step:609/2285 train_time:36578ms step_avg:60.06ms
+step:610/2285 train_time:36637ms step_avg:60.06ms
+step:611/2285 train_time:36698ms step_avg:60.06ms
+step:612/2285 train_time:36756ms step_avg:60.06ms
+step:613/2285 train_time:36818ms step_avg:60.06ms
+step:614/2285 train_time:36877ms step_avg:60.06ms
+step:615/2285 train_time:36938ms step_avg:60.06ms
+step:616/2285 train_time:36997ms step_avg:60.06ms
+step:617/2285 train_time:37059ms step_avg:60.06ms
+step:618/2285 train_time:37118ms step_avg:60.06ms
+step:619/2285 train_time:37179ms step_avg:60.06ms
+step:620/2285 train_time:37238ms step_avg:60.06ms
+step:621/2285 train_time:37299ms step_avg:60.06ms
+step:622/2285 train_time:37359ms step_avg:60.06ms
+step:623/2285 train_time:37420ms step_avg:60.06ms
+step:624/2285 train_time:37478ms step_avg:60.06ms
+step:625/2285 train_time:37540ms step_avg:60.06ms
+step:626/2285 train_time:37599ms step_avg:60.06ms
+step:627/2285 train_time:37660ms step_avg:60.06ms
+step:628/2285 train_time:37719ms step_avg:60.06ms
+step:629/2285 train_time:37781ms step_avg:60.06ms
+step:630/2285 train_time:37840ms step_avg:60.06ms
+step:631/2285 train_time:37902ms step_avg:60.07ms
+step:632/2285 train_time:37961ms step_avg:60.06ms
+step:633/2285 train_time:38022ms step_avg:60.07ms
+step:634/2285 train_time:38081ms step_avg:60.06ms
+step:635/2285 train_time:38143ms step_avg:60.07ms
+step:636/2285 train_time:38202ms step_avg:60.07ms
+step:637/2285 train_time:38263ms step_avg:60.07ms
+step:638/2285 train_time:38323ms step_avg:60.07ms
+step:639/2285 train_time:38384ms step_avg:60.07ms
+step:640/2285 train_time:38444ms step_avg:60.07ms
+step:641/2285 train_time:38506ms step_avg:60.07ms
+step:642/2285 train_time:38566ms step_avg:60.07ms
+step:643/2285 train_time:38627ms step_avg:60.07ms
+step:644/2285 train_time:38687ms step_avg:60.07ms
+step:645/2285 train_time:38748ms step_avg:60.07ms
+step:646/2285 train_time:38808ms step_avg:60.07ms
+step:647/2285 train_time:38869ms step_avg:60.08ms
+step:648/2285 train_time:38928ms step_avg:60.07ms
+step:649/2285 train_time:38990ms step_avg:60.08ms
+step:650/2285 train_time:39050ms step_avg:60.08ms
+step:651/2285 train_time:39111ms step_avg:60.08ms
+step:652/2285 train_time:39171ms step_avg:60.08ms
+step:653/2285 train_time:39232ms step_avg:60.08ms
+step:654/2285 train_time:39292ms step_avg:60.08ms
+step:655/2285 train_time:39353ms step_avg:60.08ms
+step:656/2285 train_time:39412ms step_avg:60.08ms
+step:657/2285 train_time:39473ms step_avg:60.08ms
+step:658/2285 train_time:39532ms step_avg:60.08ms
+step:659/2285 train_time:39594ms step_avg:60.08ms
+step:660/2285 train_time:39653ms step_avg:60.08ms
+step:661/2285 train_time:39714ms step_avg:60.08ms
+step:662/2285 train_time:39773ms step_avg:60.08ms
+step:663/2285 train_time:39835ms step_avg:60.08ms
+step:664/2285 train_time:39893ms step_avg:60.08ms
+step:665/2285 train_time:39955ms step_avg:60.08ms
+step:666/2285 train_time:40014ms step_avg:60.08ms
+step:667/2285 train_time:40075ms step_avg:60.08ms
+step:668/2285 train_time:40134ms step_avg:60.08ms
+step:669/2285 train_time:40196ms step_avg:60.08ms
+step:670/2285 train_time:40255ms step_avg:60.08ms
+step:671/2285 train_time:40317ms step_avg:60.08ms
+step:672/2285 train_time:40376ms step_avg:60.08ms
+step:673/2285 train_time:40437ms step_avg:60.08ms
+step:674/2285 train_time:40495ms step_avg:60.08ms
+step:675/2285 train_time:40557ms step_avg:60.08ms
+step:676/2285 train_time:40616ms step_avg:60.08ms
+step:677/2285 train_time:40677ms step_avg:60.08ms
+step:678/2285 train_time:40736ms step_avg:60.08ms
+step:679/2285 train_time:40797ms step_avg:60.08ms
+step:680/2285 train_time:40857ms step_avg:60.08ms
+step:681/2285 train_time:40918ms step_avg:60.09ms
+step:682/2285 train_time:40977ms step_avg:60.08ms
+step:683/2285 train_time:41039ms step_avg:60.09ms
+step:684/2285 train_time:41097ms step_avg:60.08ms
+step:685/2285 train_time:41159ms step_avg:60.09ms
+step:686/2285 train_time:41217ms step_avg:60.08ms
+step:687/2285 train_time:41278ms step_avg:60.08ms
+step:688/2285 train_time:41337ms step_avg:60.08ms
+step:689/2285 train_time:41398ms step_avg:60.08ms
+step:690/2285 train_time:41457ms step_avg:60.08ms
+step:691/2285 train_time:41518ms step_avg:60.08ms
+step:692/2285 train_time:41577ms step_avg:60.08ms
+step:693/2285 train_time:41638ms step_avg:60.08ms
+step:694/2285 train_time:41697ms step_avg:60.08ms
+step:695/2285 train_time:41758ms step_avg:60.08ms
+step:696/2285 train_time:41817ms step_avg:60.08ms
+step:697/2285 train_time:41879ms step_avg:60.08ms
+step:698/2285 train_time:41938ms step_avg:60.08ms
+step:699/2285 train_time:41999ms step_avg:60.08ms
+step:700/2285 train_time:42058ms step_avg:60.08ms
+step:701/2285 train_time:42120ms step_avg:60.08ms
+step:702/2285 train_time:42178ms step_avg:60.08ms
+step:703/2285 train_time:42239ms step_avg:60.08ms
+step:704/2285 train_time:42298ms step_avg:60.08ms
+step:705/2285 train_time:42360ms step_avg:60.08ms
+step:706/2285 train_time:42418ms step_avg:60.08ms
+step:707/2285 train_time:42480ms step_avg:60.08ms
+step:708/2285 train_time:42538ms step_avg:60.08ms
+step:709/2285 train_time:42600ms step_avg:60.08ms
+step:710/2285 train_time:42659ms step_avg:60.08ms
+step:711/2285 train_time:42720ms step_avg:60.08ms
+step:712/2285 train_time:42779ms step_avg:60.08ms
+step:713/2285 train_time:42840ms step_avg:60.08ms
+step:714/2285 train_time:42899ms step_avg:60.08ms
+step:715/2285 train_time:42961ms step_avg:60.09ms
+step:716/2285 train_time:43020ms step_avg:60.08ms
+step:717/2285 train_time:43081ms step_avg:60.09ms
+step:718/2285 train_time:43140ms step_avg:60.08ms
+step:719/2285 train_time:43201ms step_avg:60.09ms
+step:720/2285 train_time:43260ms step_avg:60.08ms
+step:721/2285 train_time:43322ms step_avg:60.09ms
+step:722/2285 train_time:43381ms step_avg:60.08ms
+step:723/2285 train_time:43443ms step_avg:60.09ms
+step:724/2285 train_time:43503ms step_avg:60.09ms
+step:725/2285 train_time:43564ms step_avg:60.09ms
+step:726/2285 train_time:43624ms step_avg:60.09ms
+step:727/2285 train_time:43685ms step_avg:60.09ms
+step:728/2285 train_time:43744ms step_avg:60.09ms
+step:729/2285 train_time:43806ms step_avg:60.09ms
+step:730/2285 train_time:43865ms step_avg:60.09ms
+step:731/2285 train_time:43927ms step_avg:60.09ms
+step:732/2285 train_time:43986ms step_avg:60.09ms
+step:733/2285 train_time:44048ms step_avg:60.09ms
+step:734/2285 train_time:44107ms step_avg:60.09ms
+step:735/2285 train_time:44168ms step_avg:60.09ms
+step:736/2285 train_time:44228ms step_avg:60.09ms
+step:737/2285 train_time:44289ms step_avg:60.09ms
+step:738/2285 train_time:44349ms step_avg:60.09ms
+step:739/2285 train_time:44411ms step_avg:60.10ms
+step:740/2285 train_time:44470ms step_avg:60.09ms
+step:741/2285 train_time:44532ms step_avg:60.10ms
+step:742/2285 train_time:44591ms step_avg:60.10ms
+step:743/2285 train_time:44654ms step_avg:60.10ms
+step:744/2285 train_time:44713ms step_avg:60.10ms
+step:745/2285 train_time:44775ms step_avg:60.10ms
+step:746/2285 train_time:44834ms step_avg:60.10ms
+step:747/2285 train_time:44895ms step_avg:60.10ms
+step:748/2285 train_time:44954ms step_avg:60.10ms
+step:749/2285 train_time:45015ms step_avg:60.10ms
+step:750/2285 train_time:45074ms step_avg:60.10ms
+step:750/2285 val_loss:3.6583 train_time:45136ms step_avg:60.18ms
+step:751/2285 train_time:45155ms step_avg:60.13ms
+step:752/2285 train_time:45196ms step_avg:60.10ms
+step:753/2285 train_time:45259ms step_avg:60.10ms
+step:754/2285 train_time:45319ms step_avg:60.10ms
+step:755/2285 train_time:45380ms step_avg:60.11ms
+step:756/2285 train_time:45439ms step_avg:60.10ms
+step:757/2285 train_time:45499ms step_avg:60.10ms
+step:758/2285 train_time:45558ms step_avg:60.10ms
+step:759/2285 train_time:45619ms step_avg:60.10ms
+step:760/2285 train_time:45678ms step_avg:60.10ms
+step:761/2285 train_time:45739ms step_avg:60.10ms
+step:762/2285 train_time:45797ms step_avg:60.10ms
+step:763/2285 train_time:45859ms step_avg:60.10ms
+step:764/2285 train_time:45919ms step_avg:60.10ms
+step:765/2285 train_time:45981ms step_avg:60.11ms
+step:766/2285 train_time:46040ms step_avg:60.10ms
+step:767/2285 train_time:46104ms step_avg:60.11ms
+step:768/2285 train_time:46166ms step_avg:60.11ms
+step:769/2285 train_time:46229ms step_avg:60.12ms
+step:770/2285 train_time:46288ms step_avg:60.11ms
+step:771/2285 train_time:46350ms step_avg:60.12ms
+step:772/2285 train_time:46409ms step_avg:60.12ms
+step:773/2285 train_time:46470ms step_avg:60.12ms
+step:774/2285 train_time:46530ms step_avg:60.12ms
+step:775/2285 train_time:46591ms step_avg:60.12ms
+step:776/2285 train_time:46650ms step_avg:60.12ms
+step:777/2285 train_time:46711ms step_avg:60.12ms
+step:778/2285 train_time:46770ms step_avg:60.12ms
+step:779/2285 train_time:46832ms step_avg:60.12ms
+step:780/2285 train_time:46891ms step_avg:60.12ms
+step:781/2285 train_time:46952ms step_avg:60.12ms
+step:782/2285 train_time:47013ms step_avg:60.12ms
+step:783/2285 train_time:47076ms step_avg:60.12ms
+step:784/2285 train_time:47137ms step_avg:60.12ms
+step:785/2285 train_time:47200ms step_avg:60.13ms
+step:786/2285 train_time:47260ms step_avg:60.13ms
+step:787/2285 train_time:47322ms step_avg:60.13ms
+step:788/2285 train_time:47382ms step_avg:60.13ms
+step:789/2285 train_time:47443ms step_avg:60.13ms
+step:790/2285 train_time:47502ms step_avg:60.13ms
+step:791/2285 train_time:47564ms step_avg:60.13ms
+step:792/2285 train_time:47624ms step_avg:60.13ms
+step:793/2285 train_time:47684ms step_avg:60.13ms
+step:794/2285 train_time:47744ms step_avg:60.13ms
+step:795/2285 train_time:47805ms step_avg:60.13ms
+step:796/2285 train_time:47865ms step_avg:60.13ms
+step:797/2285 train_time:47927ms step_avg:60.13ms
+step:798/2285 train_time:47987ms step_avg:60.13ms
+step:799/2285 train_time:48049ms step_avg:60.14ms
+step:800/2285 train_time:48109ms step_avg:60.14ms
+step:801/2285 train_time:48171ms step_avg:60.14ms
+step:802/2285 train_time:48230ms step_avg:60.14ms
+step:803/2285 train_time:48292ms step_avg:60.14ms
+step:804/2285 train_time:48351ms step_avg:60.14ms
+step:805/2285 train_time:48413ms step_avg:60.14ms
+step:806/2285 train_time:48472ms step_avg:60.14ms
+step:807/2285 train_time:48533ms step_avg:60.14ms
+step:808/2285 train_time:48593ms step_avg:60.14ms
+step:809/2285 train_time:48655ms step_avg:60.14ms
+step:810/2285 train_time:48715ms step_avg:60.14ms
+step:811/2285 train_time:48777ms step_avg:60.14ms
+step:812/2285 train_time:48837ms step_avg:60.14ms
+step:813/2285 train_time:48899ms step_avg:60.15ms
+step:814/2285 train_time:48958ms step_avg:60.15ms
+step:815/2285 train_time:49021ms step_avg:60.15ms
+step:816/2285 train_time:49080ms step_avg:60.15ms
+step:817/2285 train_time:49142ms step_avg:60.15ms
+step:818/2285 train_time:49201ms step_avg:60.15ms
+step:819/2285 train_time:49264ms step_avg:60.15ms
+step:820/2285 train_time:49324ms step_avg:60.15ms
+step:821/2285 train_time:49385ms step_avg:60.15ms
+step:822/2285 train_time:49445ms step_avg:60.15ms
+step:823/2285 train_time:49507ms step_avg:60.15ms
+step:824/2285 train_time:49567ms step_avg:60.15ms
+step:825/2285 train_time:49629ms step_avg:60.16ms
+step:826/2285 train_time:49688ms step_avg:60.15ms
+step:827/2285 train_time:49750ms step_avg:60.16ms
+step:828/2285 train_time:49809ms step_avg:60.16ms
+step:829/2285 train_time:49870ms step_avg:60.16ms
+step:830/2285 train_time:49930ms step_avg:60.16ms
+step:831/2285 train_time:49991ms step_avg:60.16ms
+step:832/2285 train_time:50050ms step_avg:60.16ms
+step:833/2285 train_time:50112ms step_avg:60.16ms
+step:834/2285 train_time:50171ms step_avg:60.16ms
+step:835/2285 train_time:50234ms step_avg:60.16ms
+step:836/2285 train_time:50294ms step_avg:60.16ms
+step:837/2285 train_time:50356ms step_avg:60.16ms
+step:838/2285 train_time:50415ms step_avg:60.16ms
+step:839/2285 train_time:50478ms step_avg:60.16ms
+step:840/2285 train_time:50538ms step_avg:60.16ms
+step:841/2285 train_time:50600ms step_avg:60.17ms
+step:842/2285 train_time:50659ms step_avg:60.17ms
+step:843/2285 train_time:50722ms step_avg:60.17ms
+step:844/2285 train_time:50781ms step_avg:60.17ms
+step:845/2285 train_time:50843ms step_avg:60.17ms
+step:846/2285 train_time:50903ms step_avg:60.17ms
+step:847/2285 train_time:50965ms step_avg:60.17ms
+step:848/2285 train_time:51024ms step_avg:60.17ms
+step:849/2285 train_time:51086ms step_avg:60.17ms
+step:850/2285 train_time:51145ms step_avg:60.17ms
+step:851/2285 train_time:51208ms step_avg:60.17ms
+step:852/2285 train_time:51268ms step_avg:60.17ms
+step:853/2285 train_time:51329ms step_avg:60.18ms
+step:854/2285 train_time:51388ms step_avg:60.17ms
+step:855/2285 train_time:51450ms step_avg:60.18ms
+step:856/2285 train_time:51509ms step_avg:60.17ms
+step:857/2285 train_time:51571ms step_avg:60.18ms
+step:858/2285 train_time:51631ms step_avg:60.18ms
+step:859/2285 train_time:51693ms step_avg:60.18ms
+step:860/2285 train_time:51752ms step_avg:60.18ms
+step:861/2285 train_time:51815ms step_avg:60.18ms
+step:862/2285 train_time:51874ms step_avg:60.18ms
+step:863/2285 train_time:51936ms step_avg:60.18ms
+step:864/2285 train_time:51996ms step_avg:60.18ms
+step:865/2285 train_time:52058ms step_avg:60.18ms
+step:866/2285 train_time:52117ms step_avg:60.18ms
+step:867/2285 train_time:52179ms step_avg:60.18ms
+step:868/2285 train_time:52239ms step_avg:60.18ms
+step:869/2285 train_time:52301ms step_avg:60.19ms
+step:870/2285 train_time:52360ms step_avg:60.18ms
+step:871/2285 train_time:52422ms step_avg:60.19ms
+step:872/2285 train_time:52482ms step_avg:60.19ms
+step:873/2285 train_time:52544ms step_avg:60.19ms
+step:874/2285 train_time:52604ms step_avg:60.19ms
+step:875/2285 train_time:52666ms step_avg:60.19ms
+step:876/2285 train_time:52725ms step_avg:60.19ms
+step:877/2285 train_time:52787ms step_avg:60.19ms
+step:878/2285 train_time:52846ms step_avg:60.19ms
+step:879/2285 train_time:52908ms step_avg:60.19ms
+step:880/2285 train_time:52967ms step_avg:60.19ms
+step:881/2285 train_time:53029ms step_avg:60.19ms
+step:882/2285 train_time:53089ms step_avg:60.19ms
+step:883/2285 train_time:53151ms step_avg:60.19ms
+step:884/2285 train_time:53211ms step_avg:60.19ms
+step:885/2285 train_time:53272ms step_avg:60.19ms
+step:886/2285 train_time:53331ms step_avg:60.19ms
+step:887/2285 train_time:53393ms step_avg:60.19ms
+step:888/2285 train_time:53452ms step_avg:60.19ms
+step:889/2285 train_time:53514ms step_avg:60.20ms
+step:890/2285 train_time:53573ms step_avg:60.19ms
+step:891/2285 train_time:53635ms step_avg:60.20ms
+step:892/2285 train_time:53695ms step_avg:60.20ms
+step:893/2285 train_time:53757ms step_avg:60.20ms
+step:894/2285 train_time:53817ms step_avg:60.20ms
+step:895/2285 train_time:53880ms step_avg:60.20ms
+step:896/2285 train_time:53939ms step_avg:60.20ms
+step:897/2285 train_time:54001ms step_avg:60.20ms
+step:898/2285 train_time:54061ms step_avg:60.20ms
+step:899/2285 train_time:54123ms step_avg:60.20ms
+step:900/2285 train_time:54182ms step_avg:60.20ms
+step:901/2285 train_time:54243ms step_avg:60.20ms
+step:902/2285 train_time:54302ms step_avg:60.20ms
+step:903/2285 train_time:54364ms step_avg:60.20ms
+step:904/2285 train_time:54424ms step_avg:60.20ms
+step:905/2285 train_time:54486ms step_avg:60.21ms
+step:906/2285 train_time:54546ms step_avg:60.21ms
+step:907/2285 train_time:54608ms step_avg:60.21ms
+step:908/2285 train_time:54668ms step_avg:60.21ms
+step:909/2285 train_time:54730ms step_avg:60.21ms
+step:910/2285 train_time:54789ms step_avg:60.21ms
+step:911/2285 train_time:54850ms step_avg:60.21ms
+step:912/2285 train_time:54910ms step_avg:60.21ms
+step:913/2285 train_time:54972ms step_avg:60.21ms
+step:914/2285 train_time:55031ms step_avg:60.21ms
+step:915/2285 train_time:55093ms step_avg:60.21ms
+step:916/2285 train_time:55153ms step_avg:60.21ms
+step:917/2285 train_time:55215ms step_avg:60.21ms
+step:918/2285 train_time:55275ms step_avg:60.21ms
+step:919/2285 train_time:55337ms step_avg:60.21ms
+step:920/2285 train_time:55397ms step_avg:60.21ms
+step:921/2285 train_time:55459ms step_avg:60.22ms
+step:922/2285 train_time:55519ms step_avg:60.22ms
+step:923/2285 train_time:55581ms step_avg:60.22ms
+step:924/2285 train_time:55640ms step_avg:60.22ms
+step:925/2285 train_time:55702ms step_avg:60.22ms
+step:926/2285 train_time:55762ms step_avg:60.22ms
+step:927/2285 train_time:55824ms step_avg:60.22ms
+step:928/2285 train_time:55883ms step_avg:60.22ms
+step:929/2285 train_time:55944ms step_avg:60.22ms
+step:930/2285 train_time:56004ms step_avg:60.22ms
+step:931/2285 train_time:56066ms step_avg:60.22ms
+step:932/2285 train_time:56126ms step_avg:60.22ms
+step:933/2285 train_time:56187ms step_avg:60.22ms
+step:934/2285 train_time:56247ms step_avg:60.22ms
+step:935/2285 train_time:56308ms step_avg:60.22ms
+step:936/2285 train_time:56368ms step_avg:60.22ms
+step:937/2285 train_time:56429ms step_avg:60.22ms
+step:938/2285 train_time:56489ms step_avg:60.22ms
+step:939/2285 train_time:56551ms step_avg:60.23ms
+step:940/2285 train_time:56611ms step_avg:60.22ms
+step:941/2285 train_time:56673ms step_avg:60.23ms
+step:942/2285 train_time:56732ms step_avg:60.22ms
+step:943/2285 train_time:56794ms step_avg:60.23ms
+step:944/2285 train_time:56853ms step_avg:60.23ms
+step:945/2285 train_time:56915ms step_avg:60.23ms
+step:946/2285 train_time:56974ms step_avg:60.23ms
+step:947/2285 train_time:57036ms step_avg:60.23ms
+step:948/2285 train_time:57096ms step_avg:60.23ms
+step:949/2285 train_time:57159ms step_avg:60.23ms
+step:950/2285 train_time:57219ms step_avg:60.23ms
+step:951/2285 train_time:57281ms step_avg:60.23ms
+step:952/2285 train_time:57340ms step_avg:60.23ms
+step:953/2285 train_time:57402ms step_avg:60.23ms
+step:954/2285 train_time:57463ms step_avg:60.23ms
+step:955/2285 train_time:57524ms step_avg:60.23ms
+step:956/2285 train_time:57584ms step_avg:60.23ms
+step:957/2285 train_time:57645ms step_avg:60.24ms
+step:958/2285 train_time:57705ms step_avg:60.23ms
+step:959/2285 train_time:57767ms step_avg:60.24ms
+step:960/2285 train_time:57827ms step_avg:60.24ms
+step:961/2285 train_time:57889ms step_avg:60.24ms
+step:962/2285 train_time:57948ms step_avg:60.24ms
+step:963/2285 train_time:58009ms step_avg:60.24ms
+step:964/2285 train_time:58069ms step_avg:60.24ms
+step:965/2285 train_time:58131ms step_avg:60.24ms
+step:966/2285 train_time:58190ms step_avg:60.24ms
+step:967/2285 train_time:58252ms step_avg:60.24ms
+step:968/2285 train_time:58311ms step_avg:60.24ms
+step:969/2285 train_time:58374ms step_avg:60.24ms
+step:970/2285 train_time:58433ms step_avg:60.24ms
+step:971/2285 train_time:58495ms step_avg:60.24ms
+step:972/2285 train_time:58555ms step_avg:60.24ms
+step:973/2285 train_time:58617ms step_avg:60.24ms
+step:974/2285 train_time:58676ms step_avg:60.24ms
+step:975/2285 train_time:58739ms step_avg:60.24ms
+step:976/2285 train_time:58798ms step_avg:60.24ms
+step:977/2285 train_time:58860ms step_avg:60.25ms
+step:978/2285 train_time:58920ms step_avg:60.25ms
+step:979/2285 train_time:58982ms step_avg:60.25ms
+step:980/2285 train_time:59041ms step_avg:60.25ms
+step:981/2285 train_time:59103ms step_avg:60.25ms
+step:982/2285 train_time:59163ms step_avg:60.25ms
+step:983/2285 train_time:59225ms step_avg:60.25ms
+step:984/2285 train_time:59285ms step_avg:60.25ms
+step:985/2285 train_time:59347ms step_avg:60.25ms
+step:986/2285 train_time:59406ms step_avg:60.25ms
+step:987/2285 train_time:59469ms step_avg:60.25ms
+step:988/2285 train_time:59528ms step_avg:60.25ms
+step:989/2285 train_time:59590ms step_avg:60.25ms
+step:990/2285 train_time:59649ms step_avg:60.25ms
+step:991/2285 train_time:59711ms step_avg:60.25ms
+step:992/2285 train_time:59770ms step_avg:60.25ms
+step:993/2285 train_time:59832ms step_avg:60.25ms
+step:994/2285 train_time:59890ms step_avg:60.25ms
+step:995/2285 train_time:59953ms step_avg:60.25ms
+step:996/2285 train_time:60012ms step_avg:60.25ms
+step:997/2285 train_time:60074ms step_avg:60.25ms
+step:998/2285 train_time:60133ms step_avg:60.25ms
+step:999/2285 train_time:60195ms step_avg:60.26ms
+step:1000/2285 train_time:60255ms step_avg:60.25ms
+step:1000/2285 val_loss:3.5692 train_time:60318ms step_avg:60.32ms
+step:1001/2285 train_time:60337ms step_avg:60.28ms
+step:1002/2285 train_time:60380ms step_avg:60.26ms
+step:1003/2285 train_time:60442ms step_avg:60.26ms
+step:1004/2285 train_time:60503ms step_avg:60.26ms
+step:1005/2285 train_time:60567ms step_avg:60.27ms
+step:1006/2285 train_time:60626ms step_avg:60.26ms
+step:1007/2285 train_time:60687ms step_avg:60.27ms
+step:1008/2285 train_time:60745ms step_avg:60.26ms
+step:1009/2285 train_time:60806ms step_avg:60.26ms
+step:1010/2285 train_time:60865ms step_avg:60.26ms
+step:1011/2285 train_time:60925ms step_avg:60.26ms
+step:1012/2285 train_time:60984ms step_avg:60.26ms
+step:1013/2285 train_time:61045ms step_avg:60.26ms
+step:1014/2285 train_time:61105ms step_avg:60.26ms
+step:1015/2285 train_time:61166ms step_avg:60.26ms
+step:1016/2285 train_time:61229ms step_avg:60.26ms
+step:1017/2285 train_time:61296ms step_avg:60.27ms
+step:1018/2285 train_time:61356ms step_avg:60.27ms
+step:1019/2285 train_time:61418ms step_avg:60.27ms
+step:1020/2285 train_time:61478ms step_avg:60.27ms
+step:1021/2285 train_time:61539ms step_avg:60.27ms
+step:1022/2285 train_time:61599ms step_avg:60.27ms
+step:1023/2285 train_time:61661ms step_avg:60.27ms
+step:1024/2285 train_time:61720ms step_avg:60.27ms
+step:1025/2285 train_time:61782ms step_avg:60.27ms
+step:1026/2285 train_time:61841ms step_avg:60.27ms
+step:1027/2285 train_time:61902ms step_avg:60.27ms
+step:1028/2285 train_time:61961ms step_avg:60.27ms
+step:1029/2285 train_time:62023ms step_avg:60.27ms
+step:1030/2285 train_time:62083ms step_avg:60.27ms
+step:1031/2285 train_time:62146ms step_avg:60.28ms
+step:1032/2285 train_time:62207ms step_avg:60.28ms
+step:1033/2285 train_time:62270ms step_avg:60.28ms
+step:1034/2285 train_time:62330ms step_avg:60.28ms
+step:1035/2285 train_time:62393ms step_avg:60.28ms
+step:1036/2285 train_time:62453ms step_avg:60.28ms
+step:1037/2285 train_time:62515ms step_avg:60.28ms
+step:1038/2285 train_time:62574ms step_avg:60.28ms
+step:1039/2285 train_time:62636ms step_avg:60.28ms
+step:1040/2285 train_time:62695ms step_avg:60.28ms
+step:1041/2285 train_time:62757ms step_avg:60.28ms
+step:1042/2285 train_time:62816ms step_avg:60.28ms
+step:1043/2285 train_time:62877ms step_avg:60.28ms
+step:1044/2285 train_time:62936ms step_avg:60.28ms
+step:1045/2285 train_time:62997ms step_avg:60.28ms
+step:1046/2285 train_time:63057ms step_avg:60.28ms
+step:1047/2285 train_time:63119ms step_avg:60.29ms
+step:1048/2285 train_time:63179ms step_avg:60.29ms
+step:1049/2285 train_time:63243ms step_avg:60.29ms
+step:1050/2285 train_time:63304ms step_avg:60.29ms
+step:1051/2285 train_time:63367ms step_avg:60.29ms
+step:1052/2285 train_time:63427ms step_avg:60.29ms
+step:1053/2285 train_time:63489ms step_avg:60.29ms
+step:1054/2285 train_time:63548ms step_avg:60.29ms
+step:1055/2285 train_time:63610ms step_avg:60.29ms
+step:1056/2285 train_time:63669ms step_avg:60.29ms
+step:1057/2285 train_time:63732ms step_avg:60.29ms
+step:1058/2285 train_time:63791ms step_avg:60.29ms
+step:1059/2285 train_time:63853ms step_avg:60.30ms
+step:1060/2285 train_time:63912ms step_avg:60.29ms
+step:1061/2285 train_time:63973ms step_avg:60.30ms
+step:1062/2285 train_time:64032ms step_avg:60.29ms
+step:1063/2285 train_time:64095ms step_avg:60.30ms
+step:1064/2285 train_time:64155ms step_avg:60.30ms
+step:1065/2285 train_time:64218ms step_avg:60.30ms
+step:1066/2285 train_time:64277ms step_avg:60.30ms
+step:1067/2285 train_time:64339ms step_avg:60.30ms
+step:1068/2285 train_time:64399ms step_avg:60.30ms
+step:1069/2285 train_time:64461ms step_avg:60.30ms
+step:1070/2285 train_time:64521ms step_avg:60.30ms
+step:1071/2285 train_time:64583ms step_avg:60.30ms
+step:1072/2285 train_time:64643ms step_avg:60.30ms
+step:1073/2285 train_time:64705ms step_avg:60.30ms
+step:1074/2285 train_time:64764ms step_avg:60.30ms
+step:1075/2285 train_time:64826ms step_avg:60.30ms
+step:1076/2285 train_time:64885ms step_avg:60.30ms
+step:1077/2285 train_time:64947ms step_avg:60.30ms
+step:1078/2285 train_time:65006ms step_avg:60.30ms
+step:1079/2285 train_time:65068ms step_avg:60.30ms
+step:1080/2285 train_time:65128ms step_avg:60.30ms
+step:1081/2285 train_time:65191ms step_avg:60.31ms
+step:1082/2285 train_time:65250ms step_avg:60.31ms
+step:1083/2285 train_time:65312ms step_avg:60.31ms
+step:1084/2285 train_time:65372ms step_avg:60.31ms
+step:1085/2285 train_time:65433ms step_avg:60.31ms
+step:1086/2285 train_time:65493ms step_avg:60.31ms
+step:1087/2285 train_time:65555ms step_avg:60.31ms
+step:1088/2285 train_time:65614ms step_avg:60.31ms
+step:1089/2285 train_time:65676ms step_avg:60.31ms
+step:1090/2285 train_time:65735ms step_avg:60.31ms
+step:1091/2285 train_time:65796ms step_avg:60.31ms
+step:1092/2285 train_time:65856ms step_avg:60.31ms
+step:1093/2285 train_time:65917ms step_avg:60.31ms
+step:1094/2285 train_time:65977ms step_avg:60.31ms
+step:1095/2285 train_time:66039ms step_avg:60.31ms
+step:1096/2285 train_time:66099ms step_avg:60.31ms
+step:1097/2285 train_time:66161ms step_avg:60.31ms
+step:1098/2285 train_time:66220ms step_avg:60.31ms
+step:1099/2285 train_time:66283ms step_avg:60.31ms
+step:1100/2285 train_time:66343ms step_avg:60.31ms
+step:1101/2285 train_time:66406ms step_avg:60.31ms
+step:1102/2285 train_time:66465ms step_avg:60.31ms
+step:1103/2285 train_time:66527ms step_avg:60.31ms
+step:1104/2285 train_time:66587ms step_avg:60.31ms
+step:1105/2285 train_time:66649ms step_avg:60.32ms
+step:1106/2285 train_time:66709ms step_avg:60.32ms
+step:1107/2285 train_time:66771ms step_avg:60.32ms
+step:1108/2285 train_time:66830ms step_avg:60.32ms
+step:1109/2285 train_time:66892ms step_avg:60.32ms
+step:1110/2285 train_time:66951ms step_avg:60.32ms
+step:1111/2285 train_time:67013ms step_avg:60.32ms
+step:1112/2285 train_time:67072ms step_avg:60.32ms
+step:1113/2285 train_time:67133ms step_avg:60.32ms
+step:1114/2285 train_time:67193ms step_avg:60.32ms
+step:1115/2285 train_time:67255ms step_avg:60.32ms
+step:1116/2285 train_time:67314ms step_avg:60.32ms
+step:1117/2285 train_time:67376ms step_avg:60.32ms
+step:1118/2285 train_time:67436ms step_avg:60.32ms
+step:1119/2285 train_time:67498ms step_avg:60.32ms
+step:1120/2285 train_time:67557ms step_avg:60.32ms
+step:1121/2285 train_time:67619ms step_avg:60.32ms
+step:1122/2285 train_time:67679ms step_avg:60.32ms
+step:1123/2285 train_time:67741ms step_avg:60.32ms
+step:1124/2285 train_time:67801ms step_avg:60.32ms
+step:1125/2285 train_time:67863ms step_avg:60.32ms
+step:1126/2285 train_time:67923ms step_avg:60.32ms
+step:1127/2285 train_time:67986ms step_avg:60.32ms
+step:1128/2285 train_time:68045ms step_avg:60.32ms
+step:1129/2285 train_time:68106ms step_avg:60.32ms
+step:1130/2285 train_time:68165ms step_avg:60.32ms
+step:1131/2285 train_time:68227ms step_avg:60.32ms
+step:1132/2285 train_time:68287ms step_avg:60.32ms
+step:1133/2285 train_time:68349ms step_avg:60.33ms
+step:1134/2285 train_time:68409ms step_avg:60.33ms
+step:1135/2285 train_time:68470ms step_avg:60.33ms
+step:1136/2285 train_time:68531ms step_avg:60.33ms
+step:1137/2285 train_time:68593ms step_avg:60.33ms
+step:1138/2285 train_time:68652ms step_avg:60.33ms
+step:1139/2285 train_time:68714ms step_avg:60.33ms
+step:1140/2285 train_time:68774ms step_avg:60.33ms
+step:1141/2285 train_time:68835ms step_avg:60.33ms
+step:1142/2285 train_time:68895ms step_avg:60.33ms
+step:1143/2285 train_time:68957ms step_avg:60.33ms
+step:1144/2285 train_time:69016ms step_avg:60.33ms
+step:1145/2285 train_time:69078ms step_avg:60.33ms
+step:1146/2285 train_time:69137ms step_avg:60.33ms
+step:1147/2285 train_time:69199ms step_avg:60.33ms
+step:1148/2285 train_time:69259ms step_avg:60.33ms
+step:1149/2285 train_time:69322ms step_avg:60.33ms
+step:1150/2285 train_time:69383ms step_avg:60.33ms
+step:1151/2285 train_time:69445ms step_avg:60.33ms
+step:1152/2285 train_time:69505ms step_avg:60.33ms
+step:1153/2285 train_time:69568ms step_avg:60.34ms
+step:1154/2285 train_time:69628ms step_avg:60.34ms
+step:1155/2285 train_time:69690ms step_avg:60.34ms
+step:1156/2285 train_time:69750ms step_avg:60.34ms
+step:1157/2285 train_time:69812ms step_avg:60.34ms
+step:1158/2285 train_time:69871ms step_avg:60.34ms
+step:1159/2285 train_time:69933ms step_avg:60.34ms
+step:1160/2285 train_time:69993ms step_avg:60.34ms
+step:1161/2285 train_time:70054ms step_avg:60.34ms
+step:1162/2285 train_time:70114ms step_avg:60.34ms
+step:1163/2285 train_time:70176ms step_avg:60.34ms
+step:1164/2285 train_time:70236ms step_avg:60.34ms
+step:1165/2285 train_time:70298ms step_avg:60.34ms
+step:1166/2285 train_time:70357ms step_avg:60.34ms
+step:1167/2285 train_time:70419ms step_avg:60.34ms
+step:1168/2285 train_time:70479ms step_avg:60.34ms
+step:1169/2285 train_time:70542ms step_avg:60.34ms
+step:1170/2285 train_time:70603ms step_avg:60.34ms
+step:1171/2285 train_time:70666ms step_avg:60.35ms
+step:1172/2285 train_time:70725ms step_avg:60.35ms
+step:1173/2285 train_time:70787ms step_avg:60.35ms
+step:1174/2285 train_time:70846ms step_avg:60.35ms
+step:1175/2285 train_time:70908ms step_avg:60.35ms
+step:1176/2285 train_time:70968ms step_avg:60.35ms
+step:1177/2285 train_time:71030ms step_avg:60.35ms
+step:1178/2285 train_time:71090ms step_avg:60.35ms
+step:1179/2285 train_time:71152ms step_avg:60.35ms
+step:1180/2285 train_time:71212ms step_avg:60.35ms
+step:1181/2285 train_time:71274ms step_avg:60.35ms
+step:1182/2285 train_time:71334ms step_avg:60.35ms
+step:1183/2285 train_time:71395ms step_avg:60.35ms
+step:1184/2285 train_time:71455ms step_avg:60.35ms
+step:1185/2285 train_time:71517ms step_avg:60.35ms
+step:1186/2285 train_time:71577ms step_avg:60.35ms
+step:1187/2285 train_time:71639ms step_avg:60.35ms
+step:1188/2285 train_time:71699ms step_avg:60.35ms
+step:1189/2285 train_time:71762ms step_avg:60.35ms
+step:1190/2285 train_time:71822ms step_avg:60.35ms
+step:1191/2285 train_time:71885ms step_avg:60.36ms
+step:1192/2285 train_time:71945ms step_avg:60.36ms
+step:1193/2285 train_time:72007ms step_avg:60.36ms
+step:1194/2285 train_time:72066ms step_avg:60.36ms
+step:1195/2285 train_time:72128ms step_avg:60.36ms
+step:1196/2285 train_time:72188ms step_avg:60.36ms
+step:1197/2285 train_time:72250ms step_avg:60.36ms
+step:1198/2285 train_time:72311ms step_avg:60.36ms
+step:1199/2285 train_time:72373ms step_avg:60.36ms
+step:1200/2285 train_time:72433ms step_avg:60.36ms
+step:1201/2285 train_time:72495ms step_avg:60.36ms
+step:1202/2285 train_time:72555ms step_avg:60.36ms
+step:1203/2285 train_time:72617ms step_avg:60.36ms
+step:1204/2285 train_time:72676ms step_avg:60.36ms
+step:1205/2285 train_time:72739ms step_avg:60.36ms
+step:1206/2285 train_time:72799ms step_avg:60.36ms
+step:1207/2285 train_time:72862ms step_avg:60.37ms
+step:1208/2285 train_time:72922ms step_avg:60.37ms
+step:1209/2285 train_time:72986ms step_avg:60.37ms
+step:1210/2285 train_time:73045ms step_avg:60.37ms
+step:1211/2285 train_time:73108ms step_avg:60.37ms
+step:1212/2285 train_time:73167ms step_avg:60.37ms
+step:1213/2285 train_time:73229ms step_avg:60.37ms
+step:1214/2285 train_time:73289ms step_avg:60.37ms
+step:1215/2285 train_time:73352ms step_avg:60.37ms
+step:1216/2285 train_time:73412ms step_avg:60.37ms
+step:1217/2285 train_time:73474ms step_avg:60.37ms
+step:1218/2285 train_time:73533ms step_avg:60.37ms
+step:1219/2285 train_time:73596ms step_avg:60.37ms
+step:1220/2285 train_time:73655ms step_avg:60.37ms
+step:1221/2285 train_time:73717ms step_avg:60.37ms
+step:1222/2285 train_time:73777ms step_avg:60.37ms
+step:1223/2285 train_time:73839ms step_avg:60.38ms
+step:1224/2285 train_time:73899ms step_avg:60.38ms
+step:1225/2285 train_time:73961ms step_avg:60.38ms
+step:1226/2285 train_time:74022ms step_avg:60.38ms
+step:1227/2285 train_time:74085ms step_avg:60.38ms
+step:1228/2285 train_time:74145ms step_avg:60.38ms
+step:1229/2285 train_time:74208ms step_avg:60.38ms
+step:1230/2285 train_time:74267ms step_avg:60.38ms
+step:1231/2285 train_time:74329ms step_avg:60.38ms
+step:1232/2285 train_time:74389ms step_avg:60.38ms
+step:1233/2285 train_time:74451ms step_avg:60.38ms
+step:1234/2285 train_time:74511ms step_avg:60.38ms
+step:1235/2285 train_time:74573ms step_avg:60.38ms
+step:1236/2285 train_time:74633ms step_avg:60.38ms
+step:1237/2285 train_time:74695ms step_avg:60.38ms
+step:1238/2285 train_time:74755ms step_avg:60.38ms
+step:1239/2285 train_time:74817ms step_avg:60.38ms
+step:1240/2285 train_time:74876ms step_avg:60.38ms
+step:1241/2285 train_time:74939ms step_avg:60.39ms
+step:1242/2285 train_time:74998ms step_avg:60.39ms
+step:1243/2285 train_time:75061ms step_avg:60.39ms
+step:1244/2285 train_time:75121ms step_avg:60.39ms
+step:1245/2285 train_time:75183ms step_avg:60.39ms
+step:1246/2285 train_time:75244ms step_avg:60.39ms
+step:1247/2285 train_time:75306ms step_avg:60.39ms
+step:1248/2285 train_time:75366ms step_avg:60.39ms
+step:1249/2285 train_time:75428ms step_avg:60.39ms
+step:1250/2285 train_time:75487ms step_avg:60.39ms
+step:1250/2285 val_loss:3.4960 train_time:75551ms step_avg:60.44ms
+step:1251/2285 train_time:75570ms step_avg:60.41ms
+step:1252/2285 train_time:75610ms step_avg:60.39ms
+step:1253/2285 train_time:75673ms step_avg:60.39ms
+step:1254/2285 train_time:75734ms step_avg:60.39ms
+step:1255/2285 train_time:75798ms step_avg:60.40ms
+step:1256/2285 train_time:75859ms step_avg:60.40ms
+step:1257/2285 train_time:75920ms step_avg:60.40ms
+step:1258/2285 train_time:75979ms step_avg:60.40ms
+step:1259/2285 train_time:76040ms step_avg:60.40ms
+step:1260/2285 train_time:76098ms step_avg:60.40ms
+step:1261/2285 train_time:76159ms step_avg:60.40ms
+step:1262/2285 train_time:76218ms step_avg:60.39ms
+step:1263/2285 train_time:76279ms step_avg:60.40ms
+step:1264/2285 train_time:76337ms step_avg:60.39ms
+step:1265/2285 train_time:76398ms step_avg:60.39ms
+step:1266/2285 train_time:76462ms step_avg:60.40ms
+step:1267/2285 train_time:76529ms step_avg:60.40ms
+step:1268/2285 train_time:76591ms step_avg:60.40ms
+step:1269/2285 train_time:76654ms step_avg:60.40ms
+step:1270/2285 train_time:76714ms step_avg:60.40ms
+step:1271/2285 train_time:76776ms step_avg:60.41ms
+step:1272/2285 train_time:76836ms step_avg:60.41ms
+step:1273/2285 train_time:76899ms step_avg:60.41ms
+step:1274/2285 train_time:76958ms step_avg:60.41ms
+step:1275/2285 train_time:77020ms step_avg:60.41ms
+step:1276/2285 train_time:77079ms step_avg:60.41ms
+step:1277/2285 train_time:77140ms step_avg:60.41ms
+step:1278/2285 train_time:77198ms step_avg:60.41ms
+step:1279/2285 train_time:77260ms step_avg:60.41ms
+step:1280/2285 train_time:77319ms step_avg:60.41ms
+step:1281/2285 train_time:77382ms step_avg:60.41ms
+step:1282/2285 train_time:77443ms step_avg:60.41ms
+step:1283/2285 train_time:77506ms step_avg:60.41ms
+step:1284/2285 train_time:77568ms step_avg:60.41ms
+step:1285/2285 train_time:77630ms step_avg:60.41ms
+step:1286/2285 train_time:77690ms step_avg:60.41ms
+step:1287/2285 train_time:77752ms step_avg:60.41ms
+step:1288/2285 train_time:77811ms step_avg:60.41ms
+step:1289/2285 train_time:77873ms step_avg:60.41ms
+step:1290/2285 train_time:77933ms step_avg:60.41ms
+step:1291/2285 train_time:77995ms step_avg:60.41ms
+step:1292/2285 train_time:78054ms step_avg:60.41ms
+step:1293/2285 train_time:78115ms step_avg:60.41ms
+step:1294/2285 train_time:78175ms step_avg:60.41ms
+step:1295/2285 train_time:78236ms step_avg:60.41ms
+step:1296/2285 train_time:78295ms step_avg:60.41ms
+step:1297/2285 train_time:78357ms step_avg:60.41ms
+step:1298/2285 train_time:78417ms step_avg:60.41ms
+step:1299/2285 train_time:78481ms step_avg:60.42ms
+step:1300/2285 train_time:78541ms step_avg:60.42ms
+step:1301/2285 train_time:78604ms step_avg:60.42ms
+step:1302/2285 train_time:78664ms step_avg:60.42ms
+step:1303/2285 train_time:78726ms step_avg:60.42ms
+step:1304/2285 train_time:78786ms step_avg:60.42ms
+step:1305/2285 train_time:78848ms step_avg:60.42ms
+step:1306/2285 train_time:78908ms step_avg:60.42ms
+step:1307/2285 train_time:78970ms step_avg:60.42ms
+step:1308/2285 train_time:79030ms step_avg:60.42ms
+step:1309/2285 train_time:79091ms step_avg:60.42ms
+step:1310/2285 train_time:79150ms step_avg:60.42ms
+step:1311/2285 train_time:79212ms step_avg:60.42ms
+step:1312/2285 train_time:79272ms step_avg:60.42ms
+step:1313/2285 train_time:79333ms step_avg:60.42ms
+step:1314/2285 train_time:79392ms step_avg:60.42ms
+step:1315/2285 train_time:79455ms step_avg:60.42ms
+step:1316/2285 train_time:79514ms step_avg:60.42ms
+step:1317/2285 train_time:79577ms step_avg:60.42ms
+step:1318/2285 train_time:79638ms step_avg:60.42ms
+step:1319/2285 train_time:79701ms step_avg:60.43ms
+step:1320/2285 train_time:79761ms step_avg:60.42ms
+step:1321/2285 train_time:79823ms step_avg:60.43ms
+step:1322/2285 train_time:79882ms step_avg:60.43ms
+step:1323/2285 train_time:79945ms step_avg:60.43ms
+step:1324/2285 train_time:80004ms step_avg:60.43ms
+step:1325/2285 train_time:80066ms step_avg:60.43ms
+step:1326/2285 train_time:80127ms step_avg:60.43ms
+step:1327/2285 train_time:80189ms step_avg:60.43ms
+step:1328/2285 train_time:80248ms step_avg:60.43ms
+step:1329/2285 train_time:80310ms step_avg:60.43ms
+step:1330/2285 train_time:80371ms step_avg:60.43ms
+step:1331/2285 train_time:80433ms
step_avg:60.43ms +step:1332/2285 train_time:80492ms step_avg:60.43ms +step:1333/2285 train_time:80555ms step_avg:60.43ms +step:1334/2285 train_time:80614ms step_avg:60.43ms +step:1335/2285 train_time:80677ms step_avg:60.43ms +step:1336/2285 train_time:80737ms step_avg:60.43ms +step:1337/2285 train_time:80800ms step_avg:60.43ms +step:1338/2285 train_time:80860ms step_avg:60.43ms +step:1339/2285 train_time:80922ms step_avg:60.43ms +step:1340/2285 train_time:80981ms step_avg:60.43ms +step:1341/2285 train_time:81043ms step_avg:60.43ms +step:1342/2285 train_time:81103ms step_avg:60.43ms +step:1343/2285 train_time:81165ms step_avg:60.44ms +step:1344/2285 train_time:81225ms step_avg:60.44ms +step:1345/2285 train_time:81288ms step_avg:60.44ms +step:1346/2285 train_time:81348ms step_avg:60.44ms +step:1347/2285 train_time:81410ms step_avg:60.44ms +step:1348/2285 train_time:81469ms step_avg:60.44ms +step:1349/2285 train_time:81531ms step_avg:60.44ms +step:1350/2285 train_time:81590ms step_avg:60.44ms +step:1351/2285 train_time:81653ms step_avg:60.44ms +step:1352/2285 train_time:81713ms step_avg:60.44ms +step:1353/2285 train_time:81774ms step_avg:60.44ms +step:1354/2285 train_time:81834ms step_avg:60.44ms +step:1355/2285 train_time:81896ms step_avg:60.44ms +step:1356/2285 train_time:81957ms step_avg:60.44ms +step:1357/2285 train_time:82019ms step_avg:60.44ms +step:1358/2285 train_time:82079ms step_avg:60.44ms +step:1359/2285 train_time:82141ms step_avg:60.44ms +step:1360/2285 train_time:82201ms step_avg:60.44ms +step:1361/2285 train_time:82263ms step_avg:60.44ms +step:1362/2285 train_time:82323ms step_avg:60.44ms +step:1363/2285 train_time:82384ms step_avg:60.44ms +step:1364/2285 train_time:82444ms step_avg:60.44ms +step:1365/2285 train_time:82506ms step_avg:60.44ms +step:1366/2285 train_time:82566ms step_avg:60.44ms +step:1367/2285 train_time:82628ms step_avg:60.45ms +step:1368/2285 train_time:82689ms step_avg:60.44ms +step:1369/2285 train_time:82751ms step_avg:60.45ms +step:1370/2285 train_time:82811ms step_avg:60.45ms +step:1371/2285 train_time:82873ms step_avg:60.45ms +step:1372/2285 train_time:82933ms step_avg:60.45ms +step:1373/2285 train_time:82995ms step_avg:60.45ms +step:1374/2285 train_time:83055ms step_avg:60.45ms +step:1375/2285 train_time:83117ms step_avg:60.45ms +step:1376/2285 train_time:83177ms step_avg:60.45ms +step:1377/2285 train_time:83240ms step_avg:60.45ms +step:1378/2285 train_time:83299ms step_avg:60.45ms +step:1379/2285 train_time:83362ms step_avg:60.45ms +step:1380/2285 train_time:83422ms step_avg:60.45ms +step:1381/2285 train_time:83483ms step_avg:60.45ms +step:1382/2285 train_time:83543ms step_avg:60.45ms +step:1383/2285 train_time:83606ms step_avg:60.45ms +step:1384/2285 train_time:83667ms step_avg:60.45ms +step:1385/2285 train_time:83729ms step_avg:60.45ms +step:1386/2285 train_time:83789ms step_avg:60.45ms +step:1387/2285 train_time:83851ms step_avg:60.45ms +step:1388/2285 train_time:83910ms step_avg:60.45ms +step:1389/2285 train_time:83972ms step_avg:60.46ms +step:1390/2285 train_time:84032ms step_avg:60.45ms +step:1391/2285 train_time:84094ms step_avg:60.46ms +step:1392/2285 train_time:84154ms step_avg:60.46ms +step:1393/2285 train_time:84216ms step_avg:60.46ms +step:1394/2285 train_time:84276ms step_avg:60.46ms +step:1395/2285 train_time:84339ms step_avg:60.46ms +step:1396/2285 train_time:84398ms step_avg:60.46ms +step:1397/2285 train_time:84461ms step_avg:60.46ms +step:1398/2285 train_time:84521ms step_avg:60.46ms +step:1399/2285 train_time:84583ms step_avg:60.46ms 
+step:1400/2285 train_time:84643ms step_avg:60.46ms +step:1401/2285 train_time:84705ms step_avg:60.46ms +step:1402/2285 train_time:84765ms step_avg:60.46ms +step:1403/2285 train_time:84827ms step_avg:60.46ms +step:1404/2285 train_time:84887ms step_avg:60.46ms +step:1405/2285 train_time:84949ms step_avg:60.46ms +step:1406/2285 train_time:85009ms step_avg:60.46ms +step:1407/2285 train_time:85071ms step_avg:60.46ms +step:1408/2285 train_time:85131ms step_avg:60.46ms +step:1409/2285 train_time:85193ms step_avg:60.46ms +step:1410/2285 train_time:85252ms step_avg:60.46ms +step:1411/2285 train_time:85314ms step_avg:60.46ms +step:1412/2285 train_time:85374ms step_avg:60.46ms +step:1413/2285 train_time:85437ms step_avg:60.46ms +step:1414/2285 train_time:85497ms step_avg:60.46ms +step:1415/2285 train_time:85559ms step_avg:60.47ms +step:1416/2285 train_time:85620ms step_avg:60.47ms +step:1417/2285 train_time:85682ms step_avg:60.47ms +step:1418/2285 train_time:85742ms step_avg:60.47ms +step:1419/2285 train_time:85803ms step_avg:60.47ms +step:1420/2285 train_time:85863ms step_avg:60.47ms +step:1421/2285 train_time:85925ms step_avg:60.47ms +step:1422/2285 train_time:85985ms step_avg:60.47ms +step:1423/2285 train_time:86047ms step_avg:60.47ms +step:1424/2285 train_time:86107ms step_avg:60.47ms +step:1425/2285 train_time:86169ms step_avg:60.47ms +step:1426/2285 train_time:86229ms step_avg:60.47ms +step:1427/2285 train_time:86292ms step_avg:60.47ms +step:1428/2285 train_time:86351ms step_avg:60.47ms +step:1429/2285 train_time:86414ms step_avg:60.47ms +step:1430/2285 train_time:86473ms step_avg:60.47ms +step:1431/2285 train_time:86535ms step_avg:60.47ms +step:1432/2285 train_time:86595ms step_avg:60.47ms +step:1433/2285 train_time:86658ms step_avg:60.47ms +step:1434/2285 train_time:86718ms step_avg:60.47ms +step:1435/2285 train_time:86781ms step_avg:60.47ms +step:1436/2285 train_time:86841ms step_avg:60.47ms +step:1437/2285 train_time:86903ms step_avg:60.48ms +step:1438/2285 train_time:86962ms step_avg:60.47ms +step:1439/2285 train_time:87024ms step_avg:60.48ms +step:1440/2285 train_time:87084ms step_avg:60.47ms +step:1441/2285 train_time:87145ms step_avg:60.48ms +step:1442/2285 train_time:87205ms step_avg:60.48ms +step:1443/2285 train_time:87267ms step_avg:60.48ms +step:1444/2285 train_time:87327ms step_avg:60.48ms +step:1445/2285 train_time:87389ms step_avg:60.48ms +step:1446/2285 train_time:87449ms step_avg:60.48ms +step:1447/2285 train_time:87512ms step_avg:60.48ms +step:1448/2285 train_time:87571ms step_avg:60.48ms +step:1449/2285 train_time:87634ms step_avg:60.48ms +step:1450/2285 train_time:87693ms step_avg:60.48ms +step:1451/2285 train_time:87755ms step_avg:60.48ms +step:1452/2285 train_time:87815ms step_avg:60.48ms +step:1453/2285 train_time:87877ms step_avg:60.48ms +step:1454/2285 train_time:87937ms step_avg:60.48ms +step:1455/2285 train_time:88000ms step_avg:60.48ms +step:1456/2285 train_time:88060ms step_avg:60.48ms +step:1457/2285 train_time:88122ms step_avg:60.48ms +step:1458/2285 train_time:88182ms step_avg:60.48ms +step:1459/2285 train_time:88244ms step_avg:60.48ms +step:1460/2285 train_time:88305ms step_avg:60.48ms +step:1461/2285 train_time:88367ms step_avg:60.48ms +step:1462/2285 train_time:88427ms step_avg:60.48ms +step:1463/2285 train_time:88489ms step_avg:60.48ms +step:1464/2285 train_time:88549ms step_avg:60.48ms +step:1465/2285 train_time:88611ms step_avg:60.49ms +step:1466/2285 train_time:88671ms step_avg:60.48ms +step:1467/2285 train_time:88733ms step_avg:60.49ms +step:1468/2285 
train_time:88792ms step_avg:60.48ms +step:1469/2285 train_time:88854ms step_avg:60.49ms +step:1470/2285 train_time:88914ms step_avg:60.49ms +step:1471/2285 train_time:88977ms step_avg:60.49ms +step:1472/2285 train_time:89037ms step_avg:60.49ms +step:1473/2285 train_time:89099ms step_avg:60.49ms +step:1474/2285 train_time:89159ms step_avg:60.49ms +step:1475/2285 train_time:89221ms step_avg:60.49ms +step:1476/2285 train_time:89281ms step_avg:60.49ms +step:1477/2285 train_time:89343ms step_avg:60.49ms +step:1478/2285 train_time:89403ms step_avg:60.49ms +step:1479/2285 train_time:89465ms step_avg:60.49ms +step:1480/2285 train_time:89525ms step_avg:60.49ms +step:1481/2285 train_time:89588ms step_avg:60.49ms +step:1482/2285 train_time:89647ms step_avg:60.49ms +step:1483/2285 train_time:89709ms step_avg:60.49ms +step:1484/2285 train_time:89769ms step_avg:60.49ms +step:1485/2285 train_time:89831ms step_avg:60.49ms +step:1486/2285 train_time:89891ms step_avg:60.49ms +step:1487/2285 train_time:89952ms step_avg:60.49ms +step:1488/2285 train_time:90012ms step_avg:60.49ms +step:1489/2285 train_time:90074ms step_avg:60.49ms +step:1490/2285 train_time:90134ms step_avg:60.49ms +step:1491/2285 train_time:90196ms step_avg:60.49ms +step:1492/2285 train_time:90256ms step_avg:60.49ms +step:1493/2285 train_time:90319ms step_avg:60.49ms +step:1494/2285 train_time:90378ms step_avg:60.49ms +step:1495/2285 train_time:90441ms step_avg:60.50ms +step:1496/2285 train_time:90501ms step_avg:60.50ms +step:1497/2285 train_time:90564ms step_avg:60.50ms +step:1498/2285 train_time:90624ms step_avg:60.50ms +step:1499/2285 train_time:90687ms step_avg:60.50ms +step:1500/2285 train_time:90746ms step_avg:60.50ms +step:1500/2285 val_loss:3.4273 train_time:90810ms step_avg:60.54ms +step:1501/2285 train_time:90828ms step_avg:60.51ms +step:1502/2285 train_time:90870ms step_avg:60.50ms +step:1503/2285 train_time:90935ms step_avg:60.50ms +step:1504/2285 train_time:90996ms step_avg:60.50ms +step:1505/2285 train_time:91058ms step_avg:60.50ms +step:1506/2285 train_time:91119ms step_avg:60.50ms +step:1507/2285 train_time:91180ms step_avg:60.50ms +step:1508/2285 train_time:91238ms step_avg:60.50ms +step:1509/2285 train_time:91300ms step_avg:60.50ms +step:1510/2285 train_time:91359ms step_avg:60.50ms +step:1511/2285 train_time:91420ms step_avg:60.50ms +step:1512/2285 train_time:91479ms step_avg:60.50ms +step:1513/2285 train_time:91540ms step_avg:60.50ms +step:1514/2285 train_time:91601ms step_avg:60.50ms +step:1515/2285 train_time:91664ms step_avg:60.50ms +step:1516/2285 train_time:91724ms step_avg:60.50ms +step:1517/2285 train_time:91787ms step_avg:60.51ms +step:1518/2285 train_time:91848ms step_avg:60.51ms +step:1519/2285 train_time:91911ms step_avg:60.51ms +step:1520/2285 train_time:91972ms step_avg:60.51ms +step:1521/2285 train_time:92034ms step_avg:60.51ms +step:1522/2285 train_time:92094ms step_avg:60.51ms +step:1523/2285 train_time:92156ms step_avg:60.51ms +step:1524/2285 train_time:92215ms step_avg:60.51ms +step:1525/2285 train_time:92277ms step_avg:60.51ms +step:1526/2285 train_time:92336ms step_avg:60.51ms +step:1527/2285 train_time:92398ms step_avg:60.51ms +step:1528/2285 train_time:92457ms step_avg:60.51ms +step:1529/2285 train_time:92519ms step_avg:60.51ms +step:1530/2285 train_time:92578ms step_avg:60.51ms +step:1531/2285 train_time:92641ms step_avg:60.51ms +step:1532/2285 train_time:92701ms step_avg:60.51ms +step:1533/2285 train_time:92763ms step_avg:60.51ms +step:1534/2285 train_time:92824ms step_avg:60.51ms +step:1535/2285 
train_time:92888ms step_avg:60.51ms +step:1536/2285 train_time:92948ms step_avg:60.51ms +step:1537/2285 train_time:93011ms step_avg:60.51ms +step:1538/2285 train_time:93071ms step_avg:60.51ms +step:1539/2285 train_time:93133ms step_avg:60.52ms +step:1540/2285 train_time:93194ms step_avg:60.52ms +step:1541/2285 train_time:93256ms step_avg:60.52ms +step:1542/2285 train_time:93315ms step_avg:60.52ms +step:1543/2285 train_time:93377ms step_avg:60.52ms +step:1544/2285 train_time:93436ms step_avg:60.52ms +step:1545/2285 train_time:93498ms step_avg:60.52ms +step:1546/2285 train_time:93558ms step_avg:60.52ms +step:1547/2285 train_time:93620ms step_avg:60.52ms +step:1548/2285 train_time:93680ms step_avg:60.52ms +step:1549/2285 train_time:93743ms step_avg:60.52ms +step:1550/2285 train_time:93803ms step_avg:60.52ms +step:1551/2285 train_time:93866ms step_avg:60.52ms +step:1552/2285 train_time:93927ms step_avg:60.52ms +step:1553/2285 train_time:93990ms step_avg:60.52ms +step:1554/2285 train_time:94050ms step_avg:60.52ms +step:1555/2285 train_time:94112ms step_avg:60.52ms +step:1556/2285 train_time:94172ms step_avg:60.52ms +step:1557/2285 train_time:94234ms step_avg:60.52ms +step:1558/2285 train_time:94293ms step_avg:60.52ms +step:1559/2285 train_time:94356ms step_avg:60.52ms +step:1560/2285 train_time:94415ms step_avg:60.52ms +step:1561/2285 train_time:94477ms step_avg:60.52ms +step:1562/2285 train_time:94537ms step_avg:60.52ms +step:1563/2285 train_time:94599ms step_avg:60.52ms +step:1564/2285 train_time:94659ms step_avg:60.52ms +step:1565/2285 train_time:94721ms step_avg:60.52ms +step:1566/2285 train_time:94781ms step_avg:60.52ms +step:1567/2285 train_time:94844ms step_avg:60.53ms +step:1568/2285 train_time:94906ms step_avg:60.53ms +step:1569/2285 train_time:94969ms step_avg:60.53ms +step:1570/2285 train_time:95029ms step_avg:60.53ms +step:1571/2285 train_time:95091ms step_avg:60.53ms +step:1572/2285 train_time:95151ms step_avg:60.53ms +step:1573/2285 train_time:95213ms step_avg:60.53ms +step:1574/2285 train_time:95273ms step_avg:60.53ms +step:1575/2285 train_time:95335ms step_avg:60.53ms +step:1576/2285 train_time:95395ms step_avg:60.53ms +step:1577/2285 train_time:95457ms step_avg:60.53ms +step:1578/2285 train_time:95517ms step_avg:60.53ms +step:1579/2285 train_time:95579ms step_avg:60.53ms +step:1580/2285 train_time:95639ms step_avg:60.53ms +step:1581/2285 train_time:95701ms step_avg:60.53ms +step:1582/2285 train_time:95761ms step_avg:60.53ms +step:1583/2285 train_time:95824ms step_avg:60.53ms +step:1584/2285 train_time:95884ms step_avg:60.53ms +step:1585/2285 train_time:95948ms step_avg:60.53ms +step:1586/2285 train_time:96008ms step_avg:60.53ms +step:1587/2285 train_time:96070ms step_avg:60.54ms +step:1588/2285 train_time:96130ms step_avg:60.54ms +step:1589/2285 train_time:96192ms step_avg:60.54ms +step:1590/2285 train_time:96253ms step_avg:60.54ms +step:1591/2285 train_time:96316ms step_avg:60.54ms +step:1592/2285 train_time:96375ms step_avg:60.54ms +step:1593/2285 train_time:96437ms step_avg:60.54ms +step:1594/2285 train_time:96497ms step_avg:60.54ms +step:1595/2285 train_time:96560ms step_avg:60.54ms +step:1596/2285 train_time:96620ms step_avg:60.54ms +step:1597/2285 train_time:96681ms step_avg:60.54ms +step:1598/2285 train_time:96741ms step_avg:60.54ms +step:1599/2285 train_time:96803ms step_avg:60.54ms +step:1600/2285 train_time:96864ms step_avg:60.54ms +step:1601/2285 train_time:96926ms step_avg:60.54ms +step:1602/2285 train_time:96986ms step_avg:60.54ms +step:1603/2285 train_time:97049ms 
step_avg:60.54ms +step:1604/2285 train_time:97109ms step_avg:60.54ms +step:1605/2285 train_time:97171ms step_avg:60.54ms +step:1606/2285 train_time:97231ms step_avg:60.54ms +step:1607/2285 train_time:97294ms step_avg:60.54ms +step:1608/2285 train_time:97354ms step_avg:60.54ms +step:1609/2285 train_time:97417ms step_avg:60.54ms +step:1610/2285 train_time:97476ms step_avg:60.54ms +step:1611/2285 train_time:97538ms step_avg:60.55ms +step:1612/2285 train_time:97598ms step_avg:60.54ms +step:1613/2285 train_time:97661ms step_avg:60.55ms +step:1614/2285 train_time:97721ms step_avg:60.55ms +step:1615/2285 train_time:97783ms step_avg:60.55ms +step:1616/2285 train_time:97843ms step_avg:60.55ms +step:1617/2285 train_time:97907ms step_avg:60.55ms +step:1618/2285 train_time:97967ms step_avg:60.55ms +step:1619/2285 train_time:98030ms step_avg:60.55ms +step:1620/2285 train_time:98090ms step_avg:60.55ms +step:1621/2285 train_time:98152ms step_avg:60.55ms +step:1622/2285 train_time:98213ms step_avg:60.55ms +step:1623/2285 train_time:98275ms step_avg:60.55ms +step:1624/2285 train_time:98335ms step_avg:60.55ms +step:1625/2285 train_time:98397ms step_avg:60.55ms +step:1626/2285 train_time:98456ms step_avg:60.55ms +step:1627/2285 train_time:98518ms step_avg:60.55ms +step:1628/2285 train_time:98577ms step_avg:60.55ms +step:1629/2285 train_time:98640ms step_avg:60.55ms +step:1630/2285 train_time:98700ms step_avg:60.55ms +step:1631/2285 train_time:98762ms step_avg:60.55ms +step:1632/2285 train_time:98822ms step_avg:60.55ms +step:1633/2285 train_time:98884ms step_avg:60.55ms +step:1634/2285 train_time:98944ms step_avg:60.55ms +step:1635/2285 train_time:99007ms step_avg:60.55ms +step:1636/2285 train_time:99067ms step_avg:60.55ms +step:1637/2285 train_time:99130ms step_avg:60.56ms +step:1638/2285 train_time:99190ms step_avg:60.56ms +step:1639/2285 train_time:99254ms step_avg:60.56ms +step:1640/2285 train_time:99313ms step_avg:60.56ms +step:1641/2285 train_time:99376ms step_avg:60.56ms +step:1642/2285 train_time:99435ms step_avg:60.56ms +step:1643/2285 train_time:99497ms step_avg:60.56ms +step:1644/2285 train_time:99557ms step_avg:60.56ms +step:1645/2285 train_time:99619ms step_avg:60.56ms +step:1646/2285 train_time:99678ms step_avg:60.56ms +step:1647/2285 train_time:99740ms step_avg:60.56ms +step:1648/2285 train_time:99800ms step_avg:60.56ms +step:1649/2285 train_time:99862ms step_avg:60.56ms +step:1650/2285 train_time:99923ms step_avg:60.56ms +step:1651/2285 train_time:99986ms step_avg:60.56ms +step:1652/2285 train_time:100046ms step_avg:60.56ms +step:1653/2285 train_time:100109ms step_avg:60.56ms +step:1654/2285 train_time:100169ms step_avg:60.56ms +step:1655/2285 train_time:100232ms step_avg:60.56ms +step:1656/2285 train_time:100292ms step_avg:60.56ms +step:1657/2285 train_time:100354ms step_avg:60.56ms +step:1658/2285 train_time:100415ms step_avg:60.56ms +step:1659/2285 train_time:100476ms step_avg:60.56ms +step:1660/2285 train_time:100536ms step_avg:60.56ms +step:1661/2285 train_time:100598ms step_avg:60.56ms +step:1662/2285 train_time:100658ms step_avg:60.56ms +step:1663/2285 train_time:100720ms step_avg:60.57ms +step:1664/2285 train_time:100780ms step_avg:60.56ms +step:1665/2285 train_time:100842ms step_avg:60.57ms +step:1666/2285 train_time:100902ms step_avg:60.57ms +step:1667/2285 train_time:100965ms step_avg:60.57ms +step:1668/2285 train_time:101026ms step_avg:60.57ms +step:1669/2285 train_time:101088ms step_avg:60.57ms +step:1670/2285 train_time:101149ms step_avg:60.57ms +step:1671/2285 
train_time:101212ms step_avg:60.57ms +step:1672/2285 train_time:101272ms step_avg:60.57ms +step:1673/2285 train_time:101334ms step_avg:60.57ms +step:1674/2285 train_time:101393ms step_avg:60.57ms +step:1675/2285 train_time:101456ms step_avg:60.57ms +step:1676/2285 train_time:101515ms step_avg:60.57ms +step:1677/2285 train_time:101577ms step_avg:60.57ms +step:1678/2285 train_time:101637ms step_avg:60.57ms +step:1679/2285 train_time:101699ms step_avg:60.57ms +step:1680/2285 train_time:101759ms step_avg:60.57ms +step:1681/2285 train_time:101822ms step_avg:60.57ms +step:1682/2285 train_time:101881ms step_avg:60.57ms +step:1683/2285 train_time:101944ms step_avg:60.57ms +step:1684/2285 train_time:102004ms step_avg:60.57ms +step:1685/2285 train_time:102067ms step_avg:60.57ms +step:1686/2285 train_time:102127ms step_avg:60.57ms +step:1687/2285 train_time:102190ms step_avg:60.58ms +step:1688/2285 train_time:102250ms step_avg:60.57ms +step:1689/2285 train_time:102313ms step_avg:60.58ms +step:1690/2285 train_time:102372ms step_avg:60.58ms +step:1691/2285 train_time:102436ms step_avg:60.58ms +step:1692/2285 train_time:102496ms step_avg:60.58ms +step:1693/2285 train_time:102557ms step_avg:60.58ms +step:1694/2285 train_time:102617ms step_avg:60.58ms +step:1695/2285 train_time:102679ms step_avg:60.58ms +step:1696/2285 train_time:102738ms step_avg:60.58ms +step:1697/2285 train_time:102800ms step_avg:60.58ms +step:1698/2285 train_time:102860ms step_avg:60.58ms +step:1699/2285 train_time:102922ms step_avg:60.58ms +step:1700/2285 train_time:102983ms step_avg:60.58ms +step:1701/2285 train_time:103046ms step_avg:60.58ms +step:1702/2285 train_time:103107ms step_avg:60.58ms +step:1703/2285 train_time:103169ms step_avg:60.58ms +step:1704/2285 train_time:103230ms step_avg:60.58ms +step:1705/2285 train_time:103292ms step_avg:60.58ms +step:1706/2285 train_time:103352ms step_avg:60.58ms +step:1707/2285 train_time:103415ms step_avg:60.58ms +step:1708/2285 train_time:103475ms step_avg:60.58ms +step:1709/2285 train_time:103537ms step_avg:60.58ms +step:1710/2285 train_time:103596ms step_avg:60.58ms +step:1711/2285 train_time:103658ms step_avg:60.58ms +step:1712/2285 train_time:103718ms step_avg:60.58ms +step:1713/2285 train_time:103779ms step_avg:60.58ms +step:1714/2285 train_time:103839ms step_avg:60.58ms +step:1715/2285 train_time:103902ms step_avg:60.58ms +step:1716/2285 train_time:103962ms step_avg:60.58ms +step:1717/2285 train_time:104025ms step_avg:60.59ms +step:1718/2285 train_time:104085ms step_avg:60.58ms +step:1719/2285 train_time:104148ms step_avg:60.59ms +step:1720/2285 train_time:104208ms step_avg:60.59ms +step:1721/2285 train_time:104271ms step_avg:60.59ms +step:1722/2285 train_time:104331ms step_avg:60.59ms +step:1723/2285 train_time:104393ms step_avg:60.59ms +step:1724/2285 train_time:104453ms step_avg:60.59ms +step:1725/2285 train_time:104516ms step_avg:60.59ms +step:1726/2285 train_time:104575ms step_avg:60.59ms +step:1727/2285 train_time:104637ms step_avg:60.59ms +step:1728/2285 train_time:104697ms step_avg:60.59ms +step:1729/2285 train_time:104759ms step_avg:60.59ms +step:1730/2285 train_time:104818ms step_avg:60.59ms +step:1731/2285 train_time:104881ms step_avg:60.59ms +step:1732/2285 train_time:104941ms step_avg:60.59ms +step:1733/2285 train_time:105003ms step_avg:60.59ms +step:1734/2285 train_time:105064ms step_avg:60.59ms +step:1735/2285 train_time:105127ms step_avg:60.59ms +step:1736/2285 train_time:105188ms step_avg:60.59ms +step:1737/2285 train_time:105251ms step_avg:60.59ms +step:1738/2285 
train_time:105311ms step_avg:60.59ms +step:1739/2285 train_time:105373ms step_avg:60.59ms +step:1740/2285 train_time:105433ms step_avg:60.59ms +step:1741/2285 train_time:105495ms step_avg:60.59ms +step:1742/2285 train_time:105555ms step_avg:60.59ms +step:1743/2285 train_time:105617ms step_avg:60.59ms +step:1744/2285 train_time:105676ms step_avg:60.59ms +step:1745/2285 train_time:105739ms step_avg:60.60ms +step:1746/2285 train_time:105799ms step_avg:60.59ms +step:1747/2285 train_time:105861ms step_avg:60.60ms +step:1748/2285 train_time:105921ms step_avg:60.60ms +step:1749/2285 train_time:105982ms step_avg:60.60ms +step:1750/2285 train_time:106043ms step_avg:60.60ms +step:1750/2285 val_loss:3.3663 train_time:106108ms step_avg:60.63ms +step:1751/2285 train_time:106129ms step_avg:60.61ms +step:1752/2285 train_time:106167ms step_avg:60.60ms +step:1753/2285 train_time:106230ms step_avg:60.60ms +step:1754/2285 train_time:106291ms step_avg:60.60ms +step:1755/2285 train_time:106355ms step_avg:60.60ms +step:1756/2285 train_time:106416ms step_avg:60.60ms +step:1757/2285 train_time:106478ms step_avg:60.60ms +step:1758/2285 train_time:106537ms step_avg:60.60ms +step:1759/2285 train_time:106599ms step_avg:60.60ms +step:1760/2285 train_time:106658ms step_avg:60.60ms +step:1761/2285 train_time:106719ms step_avg:60.60ms +step:1762/2285 train_time:106778ms step_avg:60.60ms +step:1763/2285 train_time:106841ms step_avg:60.60ms +step:1764/2285 train_time:106901ms step_avg:60.60ms +step:1765/2285 train_time:106963ms step_avg:60.60ms +step:1766/2285 train_time:107025ms step_avg:60.60ms +step:1767/2285 train_time:107090ms step_avg:60.61ms +step:1768/2285 train_time:107150ms step_avg:60.61ms +step:1769/2285 train_time:107212ms step_avg:60.61ms +step:1770/2285 train_time:107272ms step_avg:60.61ms +step:1771/2285 train_time:107335ms step_avg:60.61ms +step:1772/2285 train_time:107395ms step_avg:60.61ms +step:1773/2285 train_time:107458ms step_avg:60.61ms +step:1774/2285 train_time:107517ms step_avg:60.61ms +step:1775/2285 train_time:107579ms step_avg:60.61ms +step:1776/2285 train_time:107638ms step_avg:60.61ms +step:1777/2285 train_time:107700ms step_avg:60.61ms +step:1778/2285 train_time:107759ms step_avg:60.61ms +step:1779/2285 train_time:107821ms step_avg:60.61ms +step:1780/2285 train_time:107880ms step_avg:60.61ms +step:1781/2285 train_time:107943ms step_avg:60.61ms +step:1782/2285 train_time:108004ms step_avg:60.61ms +step:1783/2285 train_time:108068ms step_avg:60.61ms +step:1784/2285 train_time:108127ms step_avg:60.61ms +step:1785/2285 train_time:108190ms step_avg:60.61ms +step:1786/2285 train_time:108251ms step_avg:60.61ms +step:1787/2285 train_time:108314ms step_avg:60.61ms +step:1788/2285 train_time:108374ms step_avg:60.61ms +step:1789/2285 train_time:108436ms step_avg:60.61ms +step:1790/2285 train_time:108495ms step_avg:60.61ms +step:1791/2285 train_time:108557ms step_avg:60.61ms +step:1792/2285 train_time:108617ms step_avg:60.61ms +step:1793/2285 train_time:108678ms step_avg:60.61ms +step:1794/2285 train_time:108738ms step_avg:60.61ms +step:1795/2285 train_time:108800ms step_avg:60.61ms +step:1796/2285 train_time:108859ms step_avg:60.61ms +step:1797/2285 train_time:108921ms step_avg:60.61ms +step:1798/2285 train_time:108981ms step_avg:60.61ms +step:1799/2285 train_time:109044ms step_avg:60.61ms +step:1800/2285 train_time:109104ms step_avg:60.61ms +step:1801/2285 train_time:109168ms step_avg:60.61ms +step:1802/2285 train_time:109228ms step_avg:60.61ms +step:1803/2285 train_time:109290ms step_avg:60.62ms 
+step:1804/2285 train_time:109350ms step_avg:60.62ms +step:1805/2285 train_time:109413ms step_avg:60.62ms +step:1806/2285 train_time:109473ms step_avg:60.62ms +step:1807/2285 train_time:109534ms step_avg:60.62ms +step:1808/2285 train_time:109594ms step_avg:60.62ms +step:1809/2285 train_time:109655ms step_avg:60.62ms +step:1810/2285 train_time:109716ms step_avg:60.62ms +step:1811/2285 train_time:109778ms step_avg:60.62ms +step:1812/2285 train_time:109837ms step_avg:60.62ms +step:1813/2285 train_time:109900ms step_avg:60.62ms +step:1814/2285 train_time:109959ms step_avg:60.62ms +step:1815/2285 train_time:110022ms step_avg:60.62ms +step:1816/2285 train_time:110083ms step_avg:60.62ms +step:1817/2285 train_time:110146ms step_avg:60.62ms +step:1818/2285 train_time:110207ms step_avg:60.62ms +step:1819/2285 train_time:110270ms step_avg:60.62ms +step:1820/2285 train_time:110329ms step_avg:60.62ms +step:1821/2285 train_time:110392ms step_avg:60.62ms +step:1822/2285 train_time:110452ms step_avg:60.62ms +step:1823/2285 train_time:110514ms step_avg:60.62ms +step:1824/2285 train_time:110573ms step_avg:60.62ms +step:1825/2285 train_time:110635ms step_avg:60.62ms +step:1826/2285 train_time:110695ms step_avg:60.62ms +step:1827/2285 train_time:110757ms step_avg:60.62ms +step:1828/2285 train_time:110817ms step_avg:60.62ms +step:1829/2285 train_time:110879ms step_avg:60.62ms +step:1830/2285 train_time:110939ms step_avg:60.62ms +step:1831/2285 train_time:111002ms step_avg:60.62ms +step:1832/2285 train_time:111062ms step_avg:60.62ms +step:1833/2285 train_time:111124ms step_avg:60.62ms +step:1834/2285 train_time:111185ms step_avg:60.62ms +step:1835/2285 train_time:111248ms step_avg:60.63ms +step:1836/2285 train_time:111307ms step_avg:60.62ms +step:1837/2285 train_time:111370ms step_avg:60.63ms +step:1838/2285 train_time:111430ms step_avg:60.63ms +step:1839/2285 train_time:111492ms step_avg:60.63ms +step:1840/2285 train_time:111553ms step_avg:60.63ms +step:1841/2285 train_time:111614ms step_avg:60.63ms +step:1842/2285 train_time:111674ms step_avg:60.63ms +step:1843/2285 train_time:111736ms step_avg:60.63ms +step:1844/2285 train_time:111795ms step_avg:60.63ms +step:1845/2285 train_time:111858ms step_avg:60.63ms +step:1846/2285 train_time:111918ms step_avg:60.63ms +step:1847/2285 train_time:111980ms step_avg:60.63ms +step:1848/2285 train_time:112040ms step_avg:60.63ms +step:1849/2285 train_time:112103ms step_avg:60.63ms +step:1850/2285 train_time:112163ms step_avg:60.63ms +step:1851/2285 train_time:112226ms step_avg:60.63ms +step:1852/2285 train_time:112286ms step_avg:60.63ms +step:1853/2285 train_time:112349ms step_avg:60.63ms +step:1854/2285 train_time:112408ms step_avg:60.63ms +step:1855/2285 train_time:112471ms step_avg:60.63ms +step:1856/2285 train_time:112531ms step_avg:60.63ms +step:1857/2285 train_time:112593ms step_avg:60.63ms +step:1858/2285 train_time:112653ms step_avg:60.63ms +step:1859/2285 train_time:112715ms step_avg:60.63ms +step:1860/2285 train_time:112774ms step_avg:60.63ms +step:1861/2285 train_time:112837ms step_avg:60.63ms +step:1862/2285 train_time:112896ms step_avg:60.63ms +step:1863/2285 train_time:112959ms step_avg:60.63ms +step:1864/2285 train_time:113018ms step_avg:60.63ms +step:1865/2285 train_time:113080ms step_avg:60.63ms +step:1866/2285 train_time:113140ms step_avg:60.63ms +step:1867/2285 train_time:113203ms step_avg:60.63ms +step:1868/2285 train_time:113263ms step_avg:60.63ms +step:1869/2285 train_time:113326ms step_avg:60.63ms +step:1870/2285 train_time:113386ms step_avg:60.63ms 
+step:1871/2285 train_time:113449ms step_avg:60.64ms +step:1872/2285 train_time:113508ms step_avg:60.63ms +step:1873/2285 train_time:113570ms step_avg:60.64ms +step:1874/2285 train_time:113630ms step_avg:60.63ms +step:1875/2285 train_time:113692ms step_avg:60.64ms +step:1876/2285 train_time:113753ms step_avg:60.64ms +step:1877/2285 train_time:113815ms step_avg:60.64ms +step:1878/2285 train_time:113875ms step_avg:60.64ms +step:1879/2285 train_time:113937ms step_avg:60.64ms +step:1880/2285 train_time:113997ms step_avg:60.64ms +step:1881/2285 train_time:114060ms step_avg:60.64ms +step:1882/2285 train_time:114120ms step_avg:60.64ms +step:1883/2285 train_time:114181ms step_avg:60.64ms +step:1884/2285 train_time:114241ms step_avg:60.64ms +step:1885/2285 train_time:114304ms step_avg:60.64ms +step:1886/2285 train_time:114364ms step_avg:60.64ms +step:1887/2285 train_time:114427ms step_avg:60.64ms +step:1888/2285 train_time:114486ms step_avg:60.64ms +step:1889/2285 train_time:114549ms step_avg:60.64ms +step:1890/2285 train_time:114609ms step_avg:60.64ms +step:1891/2285 train_time:114671ms step_avg:60.64ms +step:1892/2285 train_time:114731ms step_avg:60.64ms +step:1893/2285 train_time:114794ms step_avg:60.64ms +step:1894/2285 train_time:114854ms step_avg:60.64ms +step:1895/2285 train_time:114916ms step_avg:60.64ms +step:1896/2285 train_time:114976ms step_avg:60.64ms +step:1897/2285 train_time:115039ms step_avg:60.64ms +step:1898/2285 train_time:115099ms step_avg:60.64ms +step:1899/2285 train_time:115161ms step_avg:60.64ms +step:1900/2285 train_time:115220ms step_avg:60.64ms +step:1901/2285 train_time:115283ms step_avg:60.64ms +step:1902/2285 train_time:115344ms step_avg:60.64ms +step:1903/2285 train_time:115407ms step_avg:60.64ms +step:1904/2285 train_time:115466ms step_avg:60.64ms +step:1905/2285 train_time:115529ms step_avg:60.65ms +step:1906/2285 train_time:115589ms step_avg:60.64ms +step:1907/2285 train_time:115651ms step_avg:60.65ms +step:1908/2285 train_time:115711ms step_avg:60.65ms +step:1909/2285 train_time:115773ms step_avg:60.65ms +step:1910/2285 train_time:115833ms step_avg:60.65ms +step:1911/2285 train_time:115896ms step_avg:60.65ms +step:1912/2285 train_time:115956ms step_avg:60.65ms +step:1913/2285 train_time:116018ms step_avg:60.65ms +step:1914/2285 train_time:116078ms step_avg:60.65ms +step:1915/2285 train_time:116140ms step_avg:60.65ms +step:1916/2285 train_time:116201ms step_avg:60.65ms +step:1917/2285 train_time:116264ms step_avg:60.65ms +step:1918/2285 train_time:116324ms step_avg:60.65ms +step:1919/2285 train_time:116387ms step_avg:60.65ms +step:1920/2285 train_time:116447ms step_avg:60.65ms +step:1921/2285 train_time:116510ms step_avg:60.65ms +step:1922/2285 train_time:116570ms step_avg:60.65ms +step:1923/2285 train_time:116633ms step_avg:60.65ms +step:1924/2285 train_time:116693ms step_avg:60.65ms +step:1925/2285 train_time:116755ms step_avg:60.65ms +step:1926/2285 train_time:116815ms step_avg:60.65ms +step:1927/2285 train_time:116878ms step_avg:60.65ms +step:1928/2285 train_time:116938ms step_avg:60.65ms +step:1929/2285 train_time:117000ms step_avg:60.65ms +step:1930/2285 train_time:117060ms step_avg:60.65ms +step:1931/2285 train_time:117122ms step_avg:60.65ms +step:1932/2285 train_time:117182ms step_avg:60.65ms +step:1933/2285 train_time:117245ms step_avg:60.65ms +step:1934/2285 train_time:117305ms step_avg:60.65ms +step:1935/2285 train_time:117367ms step_avg:60.65ms +step:1936/2285 train_time:117427ms step_avg:60.65ms +step:1937/2285 train_time:117489ms step_avg:60.66ms 
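The step_avg column above is just cumulative train_time divided by the step index, e.g. at step 1900 just above: 115220 ms / 1900 ≈ 60.64 ms. A minimal sketch of that invariant as a self-check; the check_step_avg helper and its regex are illustrative, not part of the training script:

import re

_STEP_RE = re.compile(r"step:(\d+)/\d+ .*?train_time:(\d+)ms step_avg:([\d.]+)ms")

def check_step_avg(entry: str) -> bool:
    # Parse one log entry and confirm step_avg == train_time / step,
    # to within the two-decimal rounding used by the log.
    m = _STEP_RE.search(entry)
    assert m is not None, "not a step entry"
    step, train_time_ms, step_avg_ms = int(m[1]), int(m[2]), float(m[3])
    return abs(train_time_ms / step - step_avg_ms) < 0.01

assert check_step_avg("step:1900/2285 train_time:115220ms step_avg:60.64ms")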
+step:1938/2285 train_time:117549ms step_avg:60.65ms +step:1939/2285 train_time:117612ms step_avg:60.66ms +step:1940/2285 train_time:117671ms step_avg:60.66ms +step:1941/2285 train_time:117734ms step_avg:60.66ms +step:1942/2285 train_time:117794ms step_avg:60.66ms +step:1943/2285 train_time:117856ms step_avg:60.66ms +step:1944/2285 train_time:117916ms step_avg:60.66ms +step:1945/2285 train_time:117978ms step_avg:60.66ms +step:1946/2285 train_time:118038ms step_avg:60.66ms +step:1947/2285 train_time:118100ms step_avg:60.66ms +step:1948/2285 train_time:118161ms step_avg:60.66ms +step:1949/2285 train_time:118223ms step_avg:60.66ms +step:1950/2285 train_time:118284ms step_avg:60.66ms +step:1951/2285 train_time:118346ms step_avg:60.66ms +step:1952/2285 train_time:118406ms step_avg:60.66ms +step:1953/2285 train_time:118468ms step_avg:60.66ms +step:1954/2285 train_time:118528ms step_avg:60.66ms +step:1955/2285 train_time:118591ms step_avg:60.66ms +step:1956/2285 train_time:118651ms step_avg:60.66ms +step:1957/2285 train_time:118714ms step_avg:60.66ms +step:1958/2285 train_time:118774ms step_avg:60.66ms +step:1959/2285 train_time:118836ms step_avg:60.66ms +step:1960/2285 train_time:118897ms step_avg:60.66ms +step:1961/2285 train_time:118959ms step_avg:60.66ms +step:1962/2285 train_time:119018ms step_avg:60.66ms +step:1963/2285 train_time:119081ms step_avg:60.66ms +step:1964/2285 train_time:119141ms step_avg:60.66ms +step:1965/2285 train_time:119203ms step_avg:60.66ms +step:1966/2285 train_time:119263ms step_avg:60.66ms +step:1967/2285 train_time:119326ms step_avg:60.66ms +step:1968/2285 train_time:119386ms step_avg:60.66ms +step:1969/2285 train_time:119448ms step_avg:60.66ms +step:1970/2285 train_time:119508ms step_avg:60.66ms +step:1971/2285 train_time:119570ms step_avg:60.66ms +step:1972/2285 train_time:119631ms step_avg:60.66ms +step:1973/2285 train_time:119693ms step_avg:60.67ms +step:1974/2285 train_time:119754ms step_avg:60.67ms +step:1975/2285 train_time:119816ms step_avg:60.67ms +step:1976/2285 train_time:119876ms step_avg:60.67ms +step:1977/2285 train_time:119938ms step_avg:60.67ms +step:1978/2285 train_time:119999ms step_avg:60.67ms +step:1979/2285 train_time:120061ms step_avg:60.67ms +step:1980/2285 train_time:120121ms step_avg:60.67ms +step:1981/2285 train_time:120183ms step_avg:60.67ms +step:1982/2285 train_time:120244ms step_avg:60.67ms +step:1983/2285 train_time:120306ms step_avg:60.67ms +step:1984/2285 train_time:120367ms step_avg:60.67ms +step:1985/2285 train_time:120429ms step_avg:60.67ms +step:1986/2285 train_time:120489ms step_avg:60.67ms +step:1987/2285 train_time:120551ms step_avg:60.67ms +step:1988/2285 train_time:120611ms step_avg:60.67ms +step:1989/2285 train_time:120674ms step_avg:60.67ms +step:1990/2285 train_time:120734ms step_avg:60.67ms +step:1991/2285 train_time:120796ms step_avg:60.67ms +step:1992/2285 train_time:120857ms step_avg:60.67ms +step:1993/2285 train_time:120919ms step_avg:60.67ms +step:1994/2285 train_time:120979ms step_avg:60.67ms +step:1995/2285 train_time:121041ms step_avg:60.67ms +step:1996/2285 train_time:121101ms step_avg:60.67ms +step:1997/2285 train_time:121163ms step_avg:60.67ms +step:1998/2285 train_time:121223ms step_avg:60.67ms +step:1999/2285 train_time:121285ms step_avg:60.67ms +step:2000/2285 train_time:121345ms step_avg:60.67ms +step:2000/2285 val_loss:3.3174 train_time:121410ms step_avg:60.70ms +step:2001/2285 train_time:121428ms step_avg:60.68ms +step:2002/2285 train_time:121470ms step_avg:60.67ms +step:2003/2285 train_time:121533ms 
step_avg:60.68ms +step:2004/2285 train_time:121595ms step_avg:60.68ms +step:2005/2285 train_time:121658ms step_avg:60.68ms +step:2006/2285 train_time:121718ms step_avg:60.68ms +step:2007/2285 train_time:121780ms step_avg:60.68ms +step:2008/2285 train_time:121839ms step_avg:60.68ms +step:2009/2285 train_time:121901ms step_avg:60.68ms +step:2010/2285 train_time:121960ms step_avg:60.68ms +step:2011/2285 train_time:122021ms step_avg:60.68ms +step:2012/2285 train_time:122081ms step_avg:60.68ms +step:2013/2285 train_time:122142ms step_avg:60.68ms +step:2014/2285 train_time:122202ms step_avg:60.68ms +step:2015/2285 train_time:122263ms step_avg:60.68ms +step:2016/2285 train_time:122325ms step_avg:60.68ms +step:2017/2285 train_time:122390ms step_avg:60.68ms +step:2018/2285 train_time:122451ms step_avg:60.68ms +step:2019/2285 train_time:122513ms step_avg:60.68ms +step:2020/2285 train_time:122574ms step_avg:60.68ms +step:2021/2285 train_time:122638ms step_avg:60.68ms +step:2022/2285 train_time:122698ms step_avg:60.68ms +step:2023/2285 train_time:122760ms step_avg:60.68ms +step:2024/2285 train_time:122820ms step_avg:60.68ms +step:2025/2285 train_time:122882ms step_avg:60.68ms +step:2026/2285 train_time:122942ms step_avg:60.68ms +step:2027/2285 train_time:123003ms step_avg:60.68ms +step:2028/2285 train_time:123063ms step_avg:60.68ms +step:2029/2285 train_time:123124ms step_avg:60.68ms +step:2030/2285 train_time:123184ms step_avg:60.68ms +step:2031/2285 train_time:123245ms step_avg:60.68ms +step:2032/2285 train_time:123306ms step_avg:60.68ms +step:2033/2285 train_time:123370ms step_avg:60.68ms +step:2034/2285 train_time:123430ms step_avg:60.68ms +step:2035/2285 train_time:123493ms step_avg:60.68ms +step:2036/2285 train_time:123553ms step_avg:60.68ms +step:2037/2285 train_time:123617ms step_avg:60.69ms +step:2038/2285 train_time:123677ms step_avg:60.69ms +step:2039/2285 train_time:123739ms step_avg:60.69ms +step:2040/2285 train_time:123799ms step_avg:60.69ms +step:2041/2285 train_time:123861ms step_avg:60.69ms +step:2042/2285 train_time:123921ms step_avg:60.69ms +step:2043/2285 train_time:123983ms step_avg:60.69ms +step:2044/2285 train_time:124043ms step_avg:60.69ms +step:2045/2285 train_time:124105ms step_avg:60.69ms +step:2046/2285 train_time:124165ms step_avg:60.69ms +step:2047/2285 train_time:124227ms step_avg:60.69ms +step:2048/2285 train_time:124287ms step_avg:60.69ms +step:2049/2285 train_time:124349ms step_avg:60.69ms +step:2050/2285 train_time:124410ms step_avg:60.69ms +step:2051/2285 train_time:124473ms step_avg:60.69ms +step:2052/2285 train_time:124533ms step_avg:60.69ms +step:2053/2285 train_time:124596ms step_avg:60.69ms +step:2054/2285 train_time:124656ms step_avg:60.69ms +step:2055/2285 train_time:124719ms step_avg:60.69ms +step:2056/2285 train_time:124779ms step_avg:60.69ms +step:2057/2285 train_time:124841ms step_avg:60.69ms +step:2058/2285 train_time:124901ms step_avg:60.69ms +step:2059/2285 train_time:124963ms step_avg:60.69ms +step:2060/2285 train_time:125023ms step_avg:60.69ms +step:2061/2285 train_time:125085ms step_avg:60.69ms +step:2062/2285 train_time:125145ms step_avg:60.69ms +step:2063/2285 train_time:125207ms step_avg:60.69ms +step:2064/2285 train_time:125267ms step_avg:60.69ms +step:2065/2285 train_time:125330ms step_avg:60.69ms +step:2066/2285 train_time:125391ms step_avg:60.69ms +step:2067/2285 train_time:125453ms step_avg:60.69ms +step:2068/2285 train_time:125514ms step_avg:60.69ms +step:2069/2285 train_time:125576ms step_avg:60.69ms +step:2070/2285 train_time:125637ms 
step_avg:60.69ms +step:2071/2285 train_time:125699ms step_avg:60.69ms +step:2072/2285 train_time:125759ms step_avg:60.69ms +step:2073/2285 train_time:125821ms step_avg:60.70ms +step:2074/2285 train_time:125881ms step_avg:60.69ms +step:2075/2285 train_time:125943ms step_avg:60.70ms +step:2076/2285 train_time:126003ms step_avg:60.69ms +step:2077/2285 train_time:126065ms step_avg:60.70ms +step:2078/2285 train_time:126125ms step_avg:60.70ms +step:2079/2285 train_time:126187ms step_avg:60.70ms +step:2080/2285 train_time:126247ms step_avg:60.70ms +step:2081/2285 train_time:126309ms step_avg:60.70ms +step:2082/2285 train_time:126369ms step_avg:60.70ms +step:2083/2285 train_time:126432ms step_avg:60.70ms +step:2084/2285 train_time:126492ms step_avg:60.70ms +step:2085/2285 train_time:126555ms step_avg:60.70ms +step:2086/2285 train_time:126615ms step_avg:60.70ms +step:2087/2285 train_time:126678ms step_avg:60.70ms +step:2088/2285 train_time:126738ms step_avg:60.70ms +step:2089/2285 train_time:126801ms step_avg:60.70ms +step:2090/2285 train_time:126860ms step_avg:60.70ms +step:2091/2285 train_time:126923ms step_avg:60.70ms +step:2092/2285 train_time:126983ms step_avg:60.70ms +step:2093/2285 train_time:127045ms step_avg:60.70ms +step:2094/2285 train_time:127105ms step_avg:60.70ms +step:2095/2285 train_time:127167ms step_avg:60.70ms +step:2096/2285 train_time:127227ms step_avg:60.70ms +step:2097/2285 train_time:127289ms step_avg:60.70ms +step:2098/2285 train_time:127349ms step_avg:60.70ms +step:2099/2285 train_time:127411ms step_avg:60.70ms +step:2100/2285 train_time:127471ms step_avg:60.70ms +step:2101/2285 train_time:127534ms step_avg:60.70ms +step:2102/2285 train_time:127594ms step_avg:60.70ms +step:2103/2285 train_time:127657ms step_avg:60.70ms +step:2104/2285 train_time:127716ms step_avg:60.70ms +step:2105/2285 train_time:127779ms step_avg:60.70ms +step:2106/2285 train_time:127839ms step_avg:60.70ms +step:2107/2285 train_time:127901ms step_avg:60.70ms +step:2108/2285 train_time:127962ms step_avg:60.70ms +step:2109/2285 train_time:128024ms step_avg:60.70ms +step:2110/2285 train_time:128085ms step_avg:60.70ms +step:2111/2285 train_time:128147ms step_avg:60.70ms +step:2112/2285 train_time:128207ms step_avg:60.70ms +step:2113/2285 train_time:128269ms step_avg:60.70ms +step:2114/2285 train_time:128329ms step_avg:60.70ms +step:2115/2285 train_time:128392ms step_avg:60.71ms +step:2116/2285 train_time:128452ms step_avg:60.70ms +step:2117/2285 train_time:128514ms step_avg:60.71ms +step:2118/2285 train_time:128575ms step_avg:60.71ms +step:2119/2285 train_time:128637ms step_avg:60.71ms +step:2120/2285 train_time:128697ms step_avg:60.71ms +step:2121/2285 train_time:128760ms step_avg:60.71ms +step:2122/2285 train_time:128820ms step_avg:60.71ms +step:2123/2285 train_time:128883ms step_avg:60.71ms +step:2124/2285 train_time:128943ms step_avg:60.71ms +step:2125/2285 train_time:129005ms step_avg:60.71ms +step:2126/2285 train_time:129065ms step_avg:60.71ms +step:2127/2285 train_time:129128ms step_avg:60.71ms +step:2128/2285 train_time:129188ms step_avg:60.71ms +step:2129/2285 train_time:129250ms step_avg:60.71ms +step:2130/2285 train_time:129310ms step_avg:60.71ms +step:2131/2285 train_time:129372ms step_avg:60.71ms +step:2132/2285 train_time:129432ms step_avg:60.71ms +step:2133/2285 train_time:129495ms step_avg:60.71ms +step:2134/2285 train_time:129555ms step_avg:60.71ms +step:2135/2285 train_time:129618ms step_avg:60.71ms +step:2136/2285 train_time:129678ms step_avg:60.71ms +step:2137/2285 train_time:129740ms 
step_avg:60.71ms +step:2138/2285 train_time:129800ms step_avg:60.71ms +step:2139/2285 train_time:129863ms step_avg:60.71ms +step:2140/2285 train_time:129923ms step_avg:60.71ms +step:2141/2285 train_time:129985ms step_avg:60.71ms +step:2142/2285 train_time:130045ms step_avg:60.71ms +step:2143/2285 train_time:130108ms step_avg:60.71ms +step:2144/2285 train_time:130168ms step_avg:60.71ms +step:2145/2285 train_time:130230ms step_avg:60.71ms +step:2146/2285 train_time:130290ms step_avg:60.71ms +step:2147/2285 train_time:130352ms step_avg:60.71ms +step:2148/2285 train_time:130412ms step_avg:60.71ms +step:2149/2285 train_time:130475ms step_avg:60.71ms +step:2150/2285 train_time:130535ms step_avg:60.71ms +step:2151/2285 train_time:130599ms step_avg:60.72ms +step:2152/2285 train_time:130658ms step_avg:60.71ms +step:2153/2285 train_time:130720ms step_avg:60.72ms +step:2154/2285 train_time:130781ms step_avg:60.72ms +step:2155/2285 train_time:130844ms step_avg:60.72ms +step:2156/2285 train_time:130904ms step_avg:60.72ms +step:2157/2285 train_time:130966ms step_avg:60.72ms +step:2158/2285 train_time:131026ms step_avg:60.72ms +step:2159/2285 train_time:131088ms step_avg:60.72ms +step:2160/2285 train_time:131148ms step_avg:60.72ms +step:2161/2285 train_time:131210ms step_avg:60.72ms +step:2162/2285 train_time:131270ms step_avg:60.72ms +step:2163/2285 train_time:131333ms step_avg:60.72ms +step:2164/2285 train_time:131393ms step_avg:60.72ms +step:2165/2285 train_time:131455ms step_avg:60.72ms +step:2166/2285 train_time:131515ms step_avg:60.72ms +step:2167/2285 train_time:131579ms step_avg:60.72ms +step:2168/2285 train_time:131638ms step_avg:60.72ms +step:2169/2285 train_time:131701ms step_avg:60.72ms +step:2170/2285 train_time:131761ms step_avg:60.72ms +step:2171/2285 train_time:131824ms step_avg:60.72ms +step:2172/2285 train_time:131884ms step_avg:60.72ms +step:2173/2285 train_time:131946ms step_avg:60.72ms +step:2174/2285 train_time:132006ms step_avg:60.72ms +step:2175/2285 train_time:132068ms step_avg:60.72ms +step:2176/2285 train_time:132128ms step_avg:60.72ms +step:2177/2285 train_time:132190ms step_avg:60.72ms +step:2178/2285 train_time:132250ms step_avg:60.72ms +step:2179/2285 train_time:132313ms step_avg:60.72ms +step:2180/2285 train_time:132375ms step_avg:60.72ms +step:2181/2285 train_time:132437ms step_avg:60.72ms +step:2182/2285 train_time:132497ms step_avg:60.72ms +step:2183/2285 train_time:132559ms step_avg:60.72ms +step:2184/2285 train_time:132619ms step_avg:60.72ms +step:2185/2285 train_time:132682ms step_avg:60.72ms +step:2186/2285 train_time:132741ms step_avg:60.72ms +step:2187/2285 train_time:132804ms step_avg:60.72ms +step:2188/2285 train_time:132864ms step_avg:60.72ms +step:2189/2285 train_time:132926ms step_avg:60.72ms +step:2190/2285 train_time:132987ms step_avg:60.72ms +step:2191/2285 train_time:133049ms step_avg:60.73ms +step:2192/2285 train_time:133109ms step_avg:60.72ms +step:2193/2285 train_time:133171ms step_avg:60.73ms +step:2194/2285 train_time:133231ms step_avg:60.73ms +step:2195/2285 train_time:133293ms step_avg:60.73ms +step:2196/2285 train_time:133354ms step_avg:60.73ms +step:2197/2285 train_time:133417ms step_avg:60.73ms +step:2198/2285 train_time:133477ms step_avg:60.73ms +step:2199/2285 train_time:133539ms step_avg:60.73ms +step:2200/2285 train_time:133599ms step_avg:60.73ms +step:2201/2285 train_time:133661ms step_avg:60.73ms +step:2202/2285 train_time:133721ms step_avg:60.73ms +step:2203/2285 train_time:133784ms step_avg:60.73ms +step:2204/2285 train_time:133844ms 
step_avg:60.73ms +step:2205/2285 train_time:133907ms step_avg:60.73ms +step:2206/2285 train_time:133967ms step_avg:60.73ms +step:2207/2285 train_time:134029ms step_avg:60.73ms +step:2208/2285 train_time:134089ms step_avg:60.73ms +step:2209/2285 train_time:134151ms step_avg:60.73ms +step:2210/2285 train_time:134212ms step_avg:60.73ms +step:2211/2285 train_time:134276ms step_avg:60.73ms +step:2212/2285 train_time:134336ms step_avg:60.73ms +step:2213/2285 train_time:134398ms step_avg:60.73ms +step:2214/2285 train_time:134458ms step_avg:60.73ms +step:2215/2285 train_time:134520ms step_avg:60.73ms +step:2216/2285 train_time:134581ms step_avg:60.73ms +step:2217/2285 train_time:134643ms step_avg:60.73ms +step:2218/2285 train_time:134703ms step_avg:60.73ms +step:2219/2285 train_time:134765ms step_avg:60.73ms +step:2220/2285 train_time:134825ms step_avg:60.73ms +step:2221/2285 train_time:134888ms step_avg:60.73ms +step:2222/2285 train_time:134947ms step_avg:60.73ms +step:2223/2285 train_time:135009ms step_avg:60.73ms +step:2224/2285 train_time:135070ms step_avg:60.73ms +step:2225/2285 train_time:135132ms step_avg:60.73ms +step:2226/2285 train_time:135192ms step_avg:60.73ms +step:2227/2285 train_time:135255ms step_avg:60.73ms +step:2228/2285 train_time:135316ms step_avg:60.73ms +step:2229/2285 train_time:135378ms step_avg:60.74ms +step:2230/2285 train_time:135438ms step_avg:60.73ms +step:2231/2285 train_time:135501ms step_avg:60.74ms +step:2232/2285 train_time:135561ms step_avg:60.74ms +step:2233/2285 train_time:135624ms step_avg:60.74ms +step:2234/2285 train_time:135684ms step_avg:60.74ms +step:2235/2285 train_time:135746ms step_avg:60.74ms +step:2236/2285 train_time:135806ms step_avg:60.74ms +step:2237/2285 train_time:135868ms step_avg:60.74ms +step:2238/2285 train_time:135927ms step_avg:60.74ms +step:2239/2285 train_time:135990ms step_avg:60.74ms +step:2240/2285 train_time:136050ms step_avg:60.74ms +step:2241/2285 train_time:136112ms step_avg:60.74ms +step:2242/2285 train_time:136172ms step_avg:60.74ms +step:2243/2285 train_time:136235ms step_avg:60.74ms +step:2244/2285 train_time:136296ms step_avg:60.74ms +step:2245/2285 train_time:136359ms step_avg:60.74ms +step:2246/2285 train_time:136420ms step_avg:60.74ms +step:2247/2285 train_time:136482ms step_avg:60.74ms +step:2248/2285 train_time:136542ms step_avg:60.74ms +step:2249/2285 train_time:136605ms step_avg:60.74ms +step:2250/2285 train_time:136665ms step_avg:60.74ms +step:2250/2285 val_loss:3.2822 train_time:136728ms step_avg:60.77ms +step:2251/2285 train_time:136747ms step_avg:60.75ms +step:2252/2285 train_time:136789ms step_avg:60.74ms +step:2253/2285 train_time:136855ms step_avg:60.74ms +step:2254/2285 train_time:136917ms step_avg:60.74ms +step:2255/2285 train_time:136980ms step_avg:60.74ms +step:2256/2285 train_time:137040ms step_avg:60.74ms +step:2257/2285 train_time:137102ms step_avg:60.75ms +step:2258/2285 train_time:137162ms step_avg:60.74ms +step:2259/2285 train_time:137224ms step_avg:60.75ms +step:2260/2285 train_time:137283ms step_avg:60.74ms +step:2261/2285 train_time:137345ms step_avg:60.75ms +step:2262/2285 train_time:137404ms step_avg:60.74ms +step:2263/2285 train_time:137466ms step_avg:60.75ms +step:2264/2285 train_time:137527ms step_avg:60.74ms +step:2265/2285 train_time:137589ms step_avg:60.75ms +step:2266/2285 train_time:137649ms step_avg:60.75ms +step:2267/2285 train_time:137713ms step_avg:60.75ms +step:2268/2285 train_time:137773ms step_avg:60.75ms +step:2269/2285 train_time:137837ms step_avg:60.75ms +step:2270/2285 
train_time:137897ms step_avg:60.75ms +step:2271/2285 train_time:137960ms step_avg:60.75ms +step:2272/2285 train_time:138020ms step_avg:60.75ms +step:2273/2285 train_time:138082ms step_avg:60.75ms +step:2274/2285 train_time:138142ms step_avg:60.75ms +step:2275/2285 train_time:138204ms step_avg:60.75ms +step:2276/2285 train_time:138263ms step_avg:60.75ms +step:2277/2285 train_time:138325ms step_avg:60.75ms +step:2278/2285 train_time:138384ms step_avg:60.75ms +step:2279/2285 train_time:138446ms step_avg:60.75ms +step:2280/2285 train_time:138506ms step_avg:60.75ms +step:2281/2285 train_time:138569ms step_avg:60.75ms +step:2282/2285 train_time:138630ms step_avg:60.75ms +step:2283/2285 train_time:138693ms step_avg:60.75ms +step:2284/2285 train_time:138753ms step_avg:60.75ms +step:2285/2285 train_time:138816ms step_avg:60.75ms +step:2285/2285 val_loss:3.2770 train_time:138877ms step_avg:60.78ms +peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt b/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt new file mode 100644 index 000000000..558ac579d --- /dev/null +++ b/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt @@ -0,0 +1,3814 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
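+
+    Each iteration applies X <- a*X + b*(X @ X.T) @ X + c*(X @ X.T)^2 @ X with the
+    precomputed coefficients above, pushing every singular value of X toward 1, so the
+    result approximates the polar factor (nearest semi-orthogonal matrix) of G.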
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad + This hyper-optimized class has faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn grads from [hdim, dim*4] to [4, hdim, dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine effective LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            elif params[module_idx].label == "smear_gate":
+                # dividing by the norm is equivalent to orthogonalization for 1-D tensors
+                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks the squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+            # Apply weight decay directly to the buffer.
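+            # p <- (1 - eff_wd) * p - eff_lr * v; the decay is decoupled, and note that
+            # eff_wd is not additionally scaled by the lr here.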
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
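+        # (Muon scales per-matrix LR by max(1, d_out/d_in)**0.5; c_fc is stored as
+        # (dim, hdim) but applied transposed, so that factor would read 1 instead of 2.
+        # lr_mul = 2 restores the intended scale.)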
+
+        std = 0.5 * (dim ** -0.5)
+        bound = (3 ** 0.5) * std  # improved init scale by @YouJiacheng
+        with torch.no_grad():
+            self.c_fc.uniform_(-bound, bound)
+            self.c_proj.zero_()  # zero init suggested by @Grad62304977
+
+    def forward(self, x: Tensor):
+        x = F.linear(x, self.c_fc.T.type_as(x))
+        x = F.relu(x).square()  # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
+        x = F.linear(x, self.c_proj.type_as(x))
+        return x
+
+class Block(nn.Module):
+    def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int):
+        super().__init__()
+        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng; layer 0's attention is skipped as well
+        self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None
+        # skip the MLP of the first layer by @EmelyanenkoK
+        self.mlp = MLP(dim) if layer_idx != 0 else None
+
+    def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs):
+        x = lambdas[0] * x + lambdas[1] * x0
+        if self.attn is not None:
+            x = x + self.attn(norm(x), attn_args)
+        if self.mlp is not None:
+            x = x + self.mlp(norm(x))
+        return x
+
+# -----------------------------------------------------------------------------
+# The main model
+
+def next_multiple_of_n(v: float | int, *, n: int):
+    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
+
+class GPT(nn.Module):
+    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int):
+        super().__init__()
+        vocab_size = next_multiple_of_n(vocab_size, n=128)
+        self.embed = nn.Embedding(vocab_size, model_dim)
+        self.smear_gate = CastedLinear(12, 1)
+        # label modules to enable custom optimizer sizing
+        self.smear_gate.weight.label = 'smear_gate'
+        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
+        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
+        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
+        self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)])
+        self.yarn = Yarn(head_dim, max_seq_len)
+        # there are only 50257 unique GPT-2 tokens; we extend to the nearest multiple of 128 for efficiency.
+        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
+        use_fp8 = not os.environ.get("DISABLE_FP8", False)
+        self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448)
+        # Add learnable skip connection weights for decoder layers
+        assert num_layers % 2 == 0
+        pad = (-num_layers * 5 - 2) % dist.get_world_size()
+        self.scalars = nn.Parameter(
+            torch.cat(
+                [
+                    -1.5
+                    * torch.ones(num_layers),  # skip_weights -> σ(-1.5) ≈ 0.18
+                    *[
+                        torch.tensor([1.0, 0.0]) for _ in range(num_layers)
+                    ],  # block lambdas
+                    *[
+                        torch.tensor([0.5, 0.5]) for _ in range(num_layers)
+                    ],  # SA lambdas
+                    torch.zeros(1),  # smear_lambda
+                    0.5*torch.ones(1),  # backout_lambda
+                    torch.ones(pad),
+                ]
+            )
+        )
+        # set learning rates
+        for param in self.embed.parameters():
+            param.lr_mul = 75.
+        for param in self.value_embeds.parameters():
+            param.lr_mul = 75.
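+        # lr_mul / wd_mul are per-parameter multipliers consumed by both DistAdam and
+        # Muon via getattr(param, "lr_mul", 1.0) / getattr(param, "wd_mul", 1.0)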
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents, by @varunneal based on work by @classiclarryd
+    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+        # Precompute BOS positions once per shard
+        self.tokens = tokens
+        self.size = tokens.numel()
+        self.quickload = quickload
+        if quickload:
+            # only scan the first 4 million tokens, then kick off an async thread to scan the rest
+            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+            self.thread = None
+            self.ready = threading.Event()
+            self.start()
+        else:
+            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.i = 0
+        self.world_size = world_size
+        self.batch_iter = 0
+
+    def _load(self):
+        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+            self.bos_idx = self.bos_idx_async
+
+    def next_batch(self, num_tokens_local: int, max_seq_len: int):
+        # if quickload was used, repoint to the full dataset after 5 batches
+        if self.quickload and self.batch_iter == 5:
+            self.get()
+        n = len(self.bos_idx)
+        starts = [[] for _ in range(self.world_size)]
+        ends = [[] for _ in range(self.world_size)]
+
+        idx = self.i
+        for r in range(self.world_size):
+            cur_len = 0
+            while cur_len <= num_tokens_local:
+                if idx >= n:
+                    # report idx here: cur is not yet bound on the first pass through this loop
+                    raise StopIteration(f"Insufficient BOS tokens ahead of index {idx}; hit tail of shard.")
+                cur = self.bos_idx[idx]
+                starts[r].append(cur)
+                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+                          cur + max_seq_len,
+                          cur + num_tokens_local - cur_len + 1)
+                ends[r].append(end)
+                cur_len += end - cur
+                idx += 1
+
+            assert cur_len == num_tokens_local + 1
+        self.i = idx
+        self.batch_iter += 1
+        return starts, ends
+
+class DataPreloader:
+    # Helper for asynchronously loading the next shard and indexing BOS tokens
+    def __init__(self, file_iter, world_size: int = 1):
+        self.file_iter = file_iter
+        self.world_size = world_size
+        self.thread = None
+        self.data = None
+        self.ready = threading.Event()
+
+    def _load(self):
+        tokens = _load_data_shard(next(self.file_iter))
+        self.data = (tokens, BOSFinder(tokens, self.world_size))
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+        return self.data
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a Beginning-of-Sequence token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 02:04:31 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 40C P0 129W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 37C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 37C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 115W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.04ms +step:1/2285 train_time:119ms step_avg:119.32ms +step:2/2285 train_time:141ms step_avg:70.40ms +step:3/2285 train_time:178ms step_avg:59.44ms +step:4/2285 train_time:234ms step_avg:58.60ms +step:5/2285 train_time:294ms step_avg:58.77ms +step:6/2285 train_time:352ms step_avg:58.61ms +step:7/2285 train_time:412ms step_avg:58.87ms +step:8/2285 train_time:471ms step_avg:58.83ms +step:9/2285 train_time:531ms step_avg:59.02ms +step:10/2285 train_time:590ms step_avg:59.00ms +step:11/2285 train_time:651ms step_avg:59.15ms +step:12/2285 train_time:709ms step_avg:59.08ms +step:13/2285 train_time:770ms step_avg:59.20ms +step:14/2285 train_time:828ms step_avg:59.17ms +step:15/2285 train_time:889ms step_avg:59.27ms +step:16/2285 train_time:948ms step_avg:59.23ms +step:17/2285 
train_time:1012ms step_avg:59.55ms +step:18/2285 train_time:1076ms step_avg:59.78ms +step:19/2285 train_time:1140ms step_avg:60.03ms +step:20/2285 train_time:1201ms step_avg:60.03ms +step:21/2285 train_time:1262ms step_avg:60.07ms +step:22/2285 train_time:1321ms step_avg:60.03ms +step:23/2285 train_time:1381ms step_avg:60.06ms +step:24/2285 train_time:1440ms step_avg:60.01ms +step:25/2285 train_time:1501ms step_avg:60.06ms +step:26/2285 train_time:1561ms step_avg:60.03ms +step:27/2285 train_time:1622ms step_avg:60.07ms +step:28/2285 train_time:1681ms step_avg:60.05ms +step:29/2285 train_time:1743ms step_avg:60.09ms +step:30/2285 train_time:1802ms step_avg:60.07ms +step:31/2285 train_time:1863ms step_avg:60.09ms +step:32/2285 train_time:1922ms step_avg:60.06ms +step:33/2285 train_time:1985ms step_avg:60.14ms +step:34/2285 train_time:2044ms step_avg:60.13ms +step:35/2285 train_time:2106ms step_avg:60.18ms +step:36/2285 train_time:2165ms step_avg:60.15ms +step:37/2285 train_time:2227ms step_avg:60.18ms +step:38/2285 train_time:2286ms step_avg:60.15ms +step:39/2285 train_time:2347ms step_avg:60.18ms +step:40/2285 train_time:2406ms step_avg:60.15ms +step:41/2285 train_time:2467ms step_avg:60.18ms +step:42/2285 train_time:2526ms step_avg:60.15ms +step:43/2285 train_time:2587ms step_avg:60.17ms +step:44/2285 train_time:2647ms step_avg:60.16ms +step:45/2285 train_time:2709ms step_avg:60.20ms +step:46/2285 train_time:2768ms step_avg:60.17ms +step:47/2285 train_time:2830ms step_avg:60.21ms +step:48/2285 train_time:2889ms step_avg:60.19ms +step:49/2285 train_time:2951ms step_avg:60.23ms +step:50/2285 train_time:3011ms step_avg:60.22ms +step:51/2285 train_time:3073ms step_avg:60.25ms +step:52/2285 train_time:3132ms step_avg:60.24ms +step:53/2285 train_time:3194ms step_avg:60.27ms +step:54/2285 train_time:3253ms step_avg:60.24ms +step:55/2285 train_time:3315ms step_avg:60.27ms +step:56/2285 train_time:3373ms step_avg:60.24ms +step:57/2285 train_time:3435ms step_avg:60.26ms +step:58/2285 train_time:3494ms step_avg:60.24ms +step:59/2285 train_time:3555ms step_avg:60.26ms +step:60/2285 train_time:3615ms step_avg:60.25ms +step:61/2285 train_time:3677ms step_avg:60.27ms +step:62/2285 train_time:3736ms step_avg:60.25ms +step:63/2285 train_time:3797ms step_avg:60.27ms +step:64/2285 train_time:3856ms step_avg:60.25ms +step:65/2285 train_time:3918ms step_avg:60.27ms +step:66/2285 train_time:3977ms step_avg:60.26ms +step:67/2285 train_time:4038ms step_avg:60.28ms +step:68/2285 train_time:4098ms step_avg:60.26ms +step:69/2285 train_time:4159ms step_avg:60.28ms +step:70/2285 train_time:4218ms step_avg:60.26ms +step:71/2285 train_time:4279ms step_avg:60.27ms +step:72/2285 train_time:4338ms step_avg:60.25ms +step:73/2285 train_time:4399ms step_avg:60.26ms +step:74/2285 train_time:4458ms step_avg:60.24ms +step:75/2285 train_time:4519ms step_avg:60.25ms +step:76/2285 train_time:4578ms step_avg:60.24ms +step:77/2285 train_time:4641ms step_avg:60.27ms +step:78/2285 train_time:4700ms step_avg:60.25ms +step:79/2285 train_time:4761ms step_avg:60.26ms +step:80/2285 train_time:4820ms step_avg:60.25ms +step:81/2285 train_time:4881ms step_avg:60.26ms +step:82/2285 train_time:4940ms step_avg:60.24ms +step:83/2285 train_time:5002ms step_avg:60.26ms +step:84/2285 train_time:5060ms step_avg:60.24ms +step:85/2285 train_time:5122ms step_avg:60.25ms +step:86/2285 train_time:5180ms step_avg:60.24ms +step:87/2285 train_time:5242ms step_avg:60.25ms +step:88/2285 train_time:5301ms step_avg:60.23ms +step:89/2285 train_time:5362ms 
step_avg:60.24ms +step:90/2285 train_time:5421ms step_avg:60.23ms +step:91/2285 train_time:5482ms step_avg:60.24ms +step:92/2285 train_time:5541ms step_avg:60.23ms +step:93/2285 train_time:5602ms step_avg:60.24ms +step:94/2285 train_time:5661ms step_avg:60.22ms +step:95/2285 train_time:5722ms step_avg:60.23ms +step:96/2285 train_time:5780ms step_avg:60.21ms +step:97/2285 train_time:5841ms step_avg:60.22ms +step:98/2285 train_time:5900ms step_avg:60.20ms +step:99/2285 train_time:5961ms step_avg:60.22ms +step:100/2285 train_time:6020ms step_avg:60.20ms +step:101/2285 train_time:6081ms step_avg:60.21ms +step:102/2285 train_time:6140ms step_avg:60.19ms +step:103/2285 train_time:6201ms step_avg:60.20ms +step:104/2285 train_time:6259ms step_avg:60.19ms +step:105/2285 train_time:6320ms step_avg:60.19ms +step:106/2285 train_time:6379ms step_avg:60.18ms +step:107/2285 train_time:6441ms step_avg:60.19ms +step:108/2285 train_time:6500ms step_avg:60.18ms +step:109/2285 train_time:6561ms step_avg:60.19ms +step:110/2285 train_time:6619ms step_avg:60.18ms +step:111/2285 train_time:6681ms step_avg:60.19ms +step:112/2285 train_time:6739ms step_avg:60.17ms +step:113/2285 train_time:6800ms step_avg:60.18ms +step:114/2285 train_time:6859ms step_avg:60.16ms +step:115/2285 train_time:6920ms step_avg:60.17ms +step:116/2285 train_time:6979ms step_avg:60.16ms +step:117/2285 train_time:7040ms step_avg:60.17ms +step:118/2285 train_time:7100ms step_avg:60.17ms +step:119/2285 train_time:7161ms step_avg:60.17ms +step:120/2285 train_time:7219ms step_avg:60.16ms +step:121/2285 train_time:7280ms step_avg:60.17ms +step:122/2285 train_time:7339ms step_avg:60.15ms +step:123/2285 train_time:7400ms step_avg:60.16ms +step:124/2285 train_time:7459ms step_avg:60.15ms +step:125/2285 train_time:7519ms step_avg:60.15ms +step:126/2285 train_time:7578ms step_avg:60.14ms +step:127/2285 train_time:7639ms step_avg:60.15ms +step:128/2285 train_time:7698ms step_avg:60.14ms +step:129/2285 train_time:7759ms step_avg:60.15ms +step:130/2285 train_time:7818ms step_avg:60.14ms +step:131/2285 train_time:7879ms step_avg:60.14ms +step:132/2285 train_time:7937ms step_avg:60.13ms +step:133/2285 train_time:7998ms step_avg:60.13ms +step:134/2285 train_time:8057ms step_avg:60.12ms +step:135/2285 train_time:8118ms step_avg:60.13ms +step:136/2285 train_time:8177ms step_avg:60.12ms +step:137/2285 train_time:8238ms step_avg:60.13ms +step:138/2285 train_time:8297ms step_avg:60.13ms +step:139/2285 train_time:8358ms step_avg:60.13ms +step:140/2285 train_time:8417ms step_avg:60.12ms +step:141/2285 train_time:8478ms step_avg:60.13ms +step:142/2285 train_time:8537ms step_avg:60.12ms +step:143/2285 train_time:8598ms step_avg:60.13ms +step:144/2285 train_time:8657ms step_avg:60.12ms +step:145/2285 train_time:8717ms step_avg:60.12ms +step:146/2285 train_time:8776ms step_avg:60.11ms +step:147/2285 train_time:8838ms step_avg:60.12ms +step:148/2285 train_time:8896ms step_avg:60.11ms +step:149/2285 train_time:8958ms step_avg:60.12ms +step:150/2285 train_time:9016ms step_avg:60.11ms +step:151/2285 train_time:9078ms step_avg:60.12ms +step:152/2285 train_time:9136ms step_avg:60.11ms +step:153/2285 train_time:9197ms step_avg:60.11ms +step:154/2285 train_time:9256ms step_avg:60.10ms +step:155/2285 train_time:9317ms step_avg:60.11ms +step:156/2285 train_time:9376ms step_avg:60.10ms +step:157/2285 train_time:9437ms step_avg:60.11ms +step:158/2285 train_time:9496ms step_avg:60.10ms +step:159/2285 train_time:9557ms step_avg:60.11ms +step:160/2285 train_time:9615ms 
+step:161/2285 train_time:9677ms step_avg:60.10ms
+step:162/2285 train_time:9736ms step_avg:60.10ms
+step:163/2285 train_time:9797ms step_avg:60.10ms
+step:164/2285 train_time:9855ms step_avg:60.09ms
+step:165/2285 train_time:9916ms step_avg:60.10ms
+step:166/2285 train_time:9975ms step_avg:60.09ms
+step:167/2285 train_time:10036ms step_avg:60.10ms
+step:168/2285 train_time:10095ms step_avg:60.09ms
+step:169/2285 train_time:10156ms step_avg:60.09ms
+step:170/2285 train_time:10215ms step_avg:60.09ms
+step:171/2285 train_time:10276ms step_avg:60.09ms
+step:172/2285 train_time:10334ms step_avg:60.08ms
+step:173/2285 train_time:10395ms step_avg:60.09ms
+step:174/2285 train_time:10454ms step_avg:60.08ms
+step:175/2285 train_time:10515ms step_avg:60.09ms
+step:176/2285 train_time:10574ms step_avg:60.08ms
+step:177/2285 train_time:10635ms step_avg:60.09ms
+step:178/2285 train_time:10694ms step_avg:60.08ms
+step:179/2285 train_time:10755ms step_avg:60.08ms
+step:180/2285 train_time:10814ms step_avg:60.08ms
+step:181/2285 train_time:10875ms step_avg:60.08ms
+step:182/2285 train_time:10934ms step_avg:60.08ms
+step:183/2285 train_time:10996ms step_avg:60.09ms
+step:184/2285 train_time:11055ms step_avg:60.08ms
+step:185/2285 train_time:11116ms step_avg:60.09ms
+step:186/2285 train_time:11175ms step_avg:60.08ms
+step:187/2285 train_time:11235ms step_avg:60.08ms
+step:188/2285 train_time:11294ms step_avg:60.08ms
+step:189/2285 train_time:11355ms step_avg:60.08ms
+step:190/2285 train_time:11414ms step_avg:60.07ms
+step:191/2285 train_time:11474ms step_avg:60.08ms
+step:192/2285 train_time:11533ms step_avg:60.07ms
+step:193/2285 train_time:11595ms step_avg:60.08ms
+step:194/2285 train_time:11654ms step_avg:60.07ms
+step:195/2285 train_time:11715ms step_avg:60.08ms
+step:196/2285 train_time:11773ms step_avg:60.07ms
+step:197/2285 train_time:11834ms step_avg:60.07ms
+step:198/2285 train_time:11893ms step_avg:60.07ms
+step:199/2285 train_time:11954ms step_avg:60.07ms
+step:200/2285 train_time:12013ms step_avg:60.07ms
+step:201/2285 train_time:12074ms step_avg:60.07ms
+step:202/2285 train_time:12133ms step_avg:60.06ms
+step:203/2285 train_time:12195ms step_avg:60.07ms
+step:204/2285 train_time:12253ms step_avg:60.06ms
+step:205/2285 train_time:12315ms step_avg:60.07ms
+step:206/2285 train_time:12373ms step_avg:60.07ms
+step:207/2285 train_time:12434ms step_avg:60.07ms
+step:208/2285 train_time:12493ms step_avg:60.06ms
+step:209/2285 train_time:12555ms step_avg:60.07ms
+step:210/2285 train_time:12614ms step_avg:60.06ms
+step:211/2285 train_time:12674ms step_avg:60.07ms
+step:212/2285 train_time:12733ms step_avg:60.06ms
+step:213/2285 train_time:12794ms step_avg:60.07ms
+step:214/2285 train_time:12853ms step_avg:60.06ms
+step:215/2285 train_time:12914ms step_avg:60.06ms
+step:216/2285 train_time:12973ms step_avg:60.06ms
+step:217/2285 train_time:13034ms step_avg:60.06ms
+step:218/2285 train_time:13094ms step_avg:60.06ms
+step:219/2285 train_time:13155ms step_avg:60.07ms
+step:220/2285 train_time:13214ms step_avg:60.06ms
+step:221/2285 train_time:13275ms step_avg:60.07ms
+step:222/2285 train_time:13334ms step_avg:60.06ms
+step:223/2285 train_time:13395ms step_avg:60.07ms
+step:224/2285 train_time:13454ms step_avg:60.06ms
+step:225/2285 train_time:13515ms step_avg:60.07ms
+step:226/2285 train_time:13573ms step_avg:60.06ms
+step:227/2285 train_time:13634ms step_avg:60.06ms
+step:228/2285 train_time:13694ms step_avg:60.06ms
+step:229/2285 train_time:13755ms step_avg:60.07ms
+step:230/2285 train_time:13814ms step_avg:60.06ms
+step:231/2285 train_time:13875ms step_avg:60.07ms
+step:232/2285 train_time:13934ms step_avg:60.06ms
+step:233/2285 train_time:13994ms step_avg:60.06ms
+step:234/2285 train_time:14053ms step_avg:60.05ms
+step:235/2285 train_time:14114ms step_avg:60.06ms
+step:236/2285 train_time:14172ms step_avg:60.05ms
+step:237/2285 train_time:14233ms step_avg:60.06ms
+step:238/2285 train_time:14292ms step_avg:60.05ms
+step:239/2285 train_time:14353ms step_avg:60.05ms
+step:240/2285 train_time:14412ms step_avg:60.05ms
+step:241/2285 train_time:14472ms step_avg:60.05ms
+step:242/2285 train_time:14531ms step_avg:60.05ms
+step:243/2285 train_time:14592ms step_avg:60.05ms
+step:244/2285 train_time:14651ms step_avg:60.04ms
+step:245/2285 train_time:14711ms step_avg:60.05ms
+step:246/2285 train_time:14770ms step_avg:60.04ms
+step:247/2285 train_time:14831ms step_avg:60.04ms
+step:248/2285 train_time:14890ms step_avg:60.04ms
+step:249/2285 train_time:14951ms step_avg:60.04ms
+step:250/2285 train_time:15009ms step_avg:60.04ms
+step:250/2285 val_loss:4.0723 train_time:15071ms step_avg:60.29ms
+step:251/2285 train_time:15089ms step_avg:60.12ms
+step:252/2285 train_time:15130ms step_avg:60.04ms
+step:253/2285 train_time:15197ms step_avg:60.07ms
+step:254/2285 train_time:15259ms step_avg:60.08ms
+step:255/2285 train_time:15321ms step_avg:60.08ms
+step:256/2285 train_time:15380ms step_avg:60.08ms
+step:257/2285 train_time:15440ms step_avg:60.08ms
+step:258/2285 train_time:15499ms step_avg:60.07ms
+step:259/2285 train_time:15559ms step_avg:60.07ms
+step:260/2285 train_time:15618ms step_avg:60.07ms
+step:261/2285 train_time:15678ms step_avg:60.07ms
+step:262/2285 train_time:15735ms step_avg:60.06ms
+step:263/2285 train_time:15796ms step_avg:60.06ms
+step:264/2285 train_time:15853ms step_avg:60.05ms
+step:265/2285 train_time:15913ms step_avg:60.05ms
+step:266/2285 train_time:15971ms step_avg:60.04ms
+step:267/2285 train_time:16031ms step_avg:60.04ms
+step:268/2285 train_time:16090ms step_avg:60.04ms
+step:269/2285 train_time:16152ms step_avg:60.04ms
+step:270/2285 train_time:16212ms step_avg:60.04ms
+step:271/2285 train_time:16273ms step_avg:60.05ms
+step:272/2285 train_time:16333ms step_avg:60.05ms
+step:273/2285 train_time:16394ms step_avg:60.05ms
+step:274/2285 train_time:16453ms step_avg:60.05ms
+step:275/2285 train_time:16514ms step_avg:60.05ms
+step:276/2285 train_time:16572ms step_avg:60.04ms
+step:277/2285 train_time:16633ms step_avg:60.05ms
+step:278/2285 train_time:16692ms step_avg:60.04ms
+step:279/2285 train_time:16752ms step_avg:60.04ms
+step:280/2285 train_time:16810ms step_avg:60.04ms
+step:281/2285 train_time:16870ms step_avg:60.04ms
+step:282/2285 train_time:16928ms step_avg:60.03ms
+step:283/2285 train_time:16988ms step_avg:60.03ms
+step:284/2285 train_time:17046ms step_avg:60.02ms
+step:285/2285 train_time:17107ms step_avg:60.03ms
+step:286/2285 train_time:17166ms step_avg:60.02ms
+step:287/2285 train_time:17228ms step_avg:60.03ms
+step:288/2285 train_time:17287ms step_avg:60.02ms
+step:289/2285 train_time:17349ms step_avg:60.03ms
+step:290/2285 train_time:17407ms step_avg:60.03ms
+step:291/2285 train_time:17469ms step_avg:60.03ms
+step:292/2285 train_time:17528ms step_avg:60.03ms
+step:293/2285 train_time:17589ms step_avg:60.03ms
+step:294/2285 train_time:17648ms step_avg:60.03ms
+step:295/2285 train_time:17709ms step_avg:60.03ms
+step:296/2285 train_time:17767ms step_avg:60.02ms
+step:297/2285 train_time:17828ms step_avg:60.03ms
+step:298/2285 train_time:17886ms step_avg:60.02ms
+step:299/2285 train_time:17946ms step_avg:60.02ms
+step:300/2285 train_time:18005ms step_avg:60.02ms
+step:301/2285 train_time:18065ms step_avg:60.02ms
+step:302/2285 train_time:18124ms step_avg:60.01ms
+step:303/2285 train_time:18185ms step_avg:60.02ms
+step:304/2285 train_time:18244ms step_avg:60.01ms
+step:305/2285 train_time:18306ms step_avg:60.02ms
+step:306/2285 train_time:18365ms step_avg:60.02ms
+step:307/2285 train_time:18427ms step_avg:60.02ms
+step:308/2285 train_time:18486ms step_avg:60.02ms
+step:309/2285 train_time:18547ms step_avg:60.02ms
+step:310/2285 train_time:18606ms step_avg:60.02ms
+step:311/2285 train_time:18666ms step_avg:60.02ms
+step:312/2285 train_time:18724ms step_avg:60.01ms
+step:313/2285 train_time:18785ms step_avg:60.02ms
+step:314/2285 train_time:18843ms step_avg:60.01ms
+step:315/2285 train_time:18904ms step_avg:60.01ms
+step:316/2285 train_time:18962ms step_avg:60.01ms
+step:317/2285 train_time:19023ms step_avg:60.01ms
+step:318/2285 train_time:19081ms step_avg:60.00ms
+step:319/2285 train_time:19142ms step_avg:60.01ms
+step:320/2285 train_time:19200ms step_avg:60.00ms
+step:321/2285 train_time:19262ms step_avg:60.01ms
+step:322/2285 train_time:19320ms step_avg:60.00ms
+step:323/2285 train_time:19381ms step_avg:60.00ms
+step:324/2285 train_time:19440ms step_avg:60.00ms
+step:325/2285 train_time:19502ms step_avg:60.01ms
+step:326/2285 train_time:19560ms step_avg:60.00ms
+step:327/2285 train_time:19621ms step_avg:60.00ms
+step:328/2285 train_time:19680ms step_avg:60.00ms
+step:329/2285 train_time:19741ms step_avg:60.00ms
+step:330/2285 train_time:19800ms step_avg:60.00ms
+step:331/2285 train_time:19860ms step_avg:60.00ms
+step:332/2285 train_time:19918ms step_avg:59.99ms
+step:333/2285 train_time:19979ms step_avg:60.00ms
+step:334/2285 train_time:20037ms step_avg:59.99ms
+step:335/2285 train_time:20097ms step_avg:59.99ms
+step:336/2285 train_time:20156ms step_avg:59.99ms
+step:337/2285 train_time:20216ms step_avg:59.99ms
+step:338/2285 train_time:20275ms step_avg:59.98ms
+step:339/2285 train_time:20336ms step_avg:59.99ms
+step:340/2285 train_time:20394ms step_avg:59.98ms
+step:341/2285 train_time:20456ms step_avg:59.99ms
+step:342/2285 train_time:20514ms step_avg:59.98ms
+step:343/2285 train_time:20575ms step_avg:59.99ms
+step:344/2285 train_time:20634ms step_avg:59.98ms
+step:345/2285 train_time:20695ms step_avg:59.98ms
+step:346/2285 train_time:20753ms step_avg:59.98ms
+step:347/2285 train_time:20814ms step_avg:59.98ms
+step:348/2285 train_time:20872ms step_avg:59.98ms
+step:349/2285 train_time:20933ms step_avg:59.98ms
+step:350/2285 train_time:20991ms step_avg:59.98ms
+step:351/2285 train_time:21052ms step_avg:59.98ms
+step:352/2285 train_time:21110ms step_avg:59.97ms
+step:353/2285 train_time:21171ms step_avg:59.97ms
+step:354/2285 train_time:21229ms step_avg:59.97ms
+step:355/2285 train_time:21290ms step_avg:59.97ms
+step:356/2285 train_time:21349ms step_avg:59.97ms
+step:357/2285 train_time:21410ms step_avg:59.97ms
+step:358/2285 train_time:21469ms step_avg:59.97ms
+step:359/2285 train_time:21529ms step_avg:59.97ms
+step:360/2285 train_time:21588ms step_avg:59.97ms
+step:361/2285 train_time:21649ms step_avg:59.97ms
+step:362/2285 train_time:21707ms step_avg:59.97ms
+step:363/2285 train_time:21768ms step_avg:59.97ms
+step:364/2285 train_time:21827ms step_avg:59.96ms
+step:365/2285 train_time:21888ms step_avg:59.97ms
+step:366/2285 train_time:21946ms step_avg:59.96ms
+step:367/2285 train_time:22007ms step_avg:59.96ms
+step:368/2285 train_time:22065ms step_avg:59.96ms
+step:369/2285 train_time:22126ms step_avg:59.96ms
+step:370/2285 train_time:22185ms step_avg:59.96ms
+step:371/2285 train_time:22247ms step_avg:59.97ms
+step:372/2285 train_time:22306ms step_avg:59.96ms
+step:373/2285 train_time:22367ms step_avg:59.97ms
+step:374/2285 train_time:22426ms step_avg:59.96ms
+step:375/2285 train_time:22487ms step_avg:59.96ms
+step:376/2285 train_time:22546ms step_avg:59.96ms
+step:377/2285 train_time:22607ms step_avg:59.97ms
+step:378/2285 train_time:22666ms step_avg:59.96ms
+step:379/2285 train_time:22727ms step_avg:59.97ms
+step:380/2285 train_time:22786ms step_avg:59.96ms
+step:381/2285 train_time:22847ms step_avg:59.97ms
+step:382/2285 train_time:22906ms step_avg:59.96ms
+step:383/2285 train_time:22967ms step_avg:59.97ms
+step:384/2285 train_time:23026ms step_avg:59.96ms
+step:385/2285 train_time:23087ms step_avg:59.97ms
+step:386/2285 train_time:23146ms step_avg:59.96ms
+step:387/2285 train_time:23208ms step_avg:59.97ms
+step:388/2285 train_time:23267ms step_avg:59.97ms
+step:389/2285 train_time:23328ms step_avg:59.97ms
+step:390/2285 train_time:23387ms step_avg:59.97ms
+step:391/2285 train_time:23449ms step_avg:59.97ms
+step:392/2285 train_time:23508ms step_avg:59.97ms
+step:393/2285 train_time:23570ms step_avg:59.97ms
+step:394/2285 train_time:23628ms step_avg:59.97ms
+step:395/2285 train_time:23690ms step_avg:59.97ms
+step:396/2285 train_time:23749ms step_avg:59.97ms
+step:397/2285 train_time:23810ms step_avg:59.98ms
+step:398/2285 train_time:23869ms step_avg:59.97ms
+step:399/2285 train_time:23930ms step_avg:59.98ms
+step:400/2285 train_time:23989ms step_avg:59.97ms
+step:401/2285 train_time:24051ms step_avg:59.98ms
+step:402/2285 train_time:24110ms step_avg:59.98ms
+step:403/2285 train_time:24172ms step_avg:59.98ms
+step:404/2285 train_time:24231ms step_avg:59.98ms
+step:405/2285 train_time:24292ms step_avg:59.98ms
+step:406/2285 train_time:24350ms step_avg:59.98ms
+step:407/2285 train_time:24411ms step_avg:59.98ms
+step:408/2285 train_time:24470ms step_avg:59.98ms
+step:409/2285 train_time:24532ms step_avg:59.98ms
+step:410/2285 train_time:24592ms step_avg:59.98ms
+step:411/2285 train_time:24653ms step_avg:59.98ms
+step:412/2285 train_time:24712ms step_avg:59.98ms
+step:413/2285 train_time:24773ms step_avg:59.98ms
+step:414/2285 train_time:24832ms step_avg:59.98ms
+step:415/2285 train_time:24893ms step_avg:59.98ms
+step:416/2285 train_time:24952ms step_avg:59.98ms
+step:417/2285 train_time:25014ms step_avg:59.98ms
+step:418/2285 train_time:25073ms step_avg:59.98ms
+step:419/2285 train_time:25133ms step_avg:59.98ms
+step:420/2285 train_time:25192ms step_avg:59.98ms
+step:421/2285 train_time:25253ms step_avg:59.98ms
+step:422/2285 train_time:25313ms step_avg:59.98ms
+step:423/2285 train_time:25374ms step_avg:59.99ms
+step:424/2285 train_time:25433ms step_avg:59.98ms
+step:425/2285 train_time:25494ms step_avg:59.99ms
+step:426/2285 train_time:25553ms step_avg:59.98ms
+step:427/2285 train_time:25614ms step_avg:59.99ms
+step:428/2285 train_time:25673ms step_avg:59.98ms
+step:429/2285 train_time:25734ms step_avg:59.99ms
+step:430/2285 train_time:25794ms step_avg:59.99ms
+step:431/2285 train_time:25856ms step_avg:59.99ms
+step:432/2285 train_time:25915ms step_avg:59.99ms
+step:433/2285 train_time:25976ms step_avg:59.99ms
+step:434/2285 train_time:26035ms step_avg:59.99ms
+step:435/2285 train_time:26096ms step_avg:59.99ms
+step:436/2285 train_time:26155ms step_avg:59.99ms
+step:437/2285 train_time:26217ms step_avg:59.99ms
+step:438/2285 train_time:26276ms step_avg:59.99ms
+step:439/2285 train_time:26337ms step_avg:59.99ms
+step:440/2285 train_time:26396ms step_avg:59.99ms
+step:441/2285 train_time:26458ms step_avg:59.99ms
+step:442/2285 train_time:26517ms step_avg:59.99ms
+step:443/2285 train_time:26578ms step_avg:59.99ms
+step:444/2285 train_time:26636ms step_avg:59.99ms
+step:445/2285 train_time:26697ms step_avg:59.99ms
+step:446/2285 train_time:26756ms step_avg:59.99ms
+step:447/2285 train_time:26818ms step_avg:59.99ms
+step:448/2285 train_time:26876ms step_avg:59.99ms
+step:449/2285 train_time:26937ms step_avg:59.99ms
+step:450/2285 train_time:26996ms step_avg:59.99ms
+step:451/2285 train_time:27057ms step_avg:59.99ms
+step:452/2285 train_time:27117ms step_avg:59.99ms
+step:453/2285 train_time:27178ms step_avg:59.99ms
+step:454/2285 train_time:27236ms step_avg:59.99ms
+step:455/2285 train_time:27297ms step_avg:59.99ms
+step:456/2285 train_time:27357ms step_avg:59.99ms
+step:457/2285 train_time:27418ms step_avg:60.00ms
+step:458/2285 train_time:27477ms step_avg:59.99ms
+step:459/2285 train_time:27538ms step_avg:60.00ms
+step:460/2285 train_time:27596ms step_avg:59.99ms
+step:461/2285 train_time:27658ms step_avg:59.99ms
+step:462/2285 train_time:27717ms step_avg:59.99ms
+step:463/2285 train_time:27778ms step_avg:60.00ms
+step:464/2285 train_time:27837ms step_avg:59.99ms
+step:465/2285 train_time:27898ms step_avg:59.99ms
+step:466/2285 train_time:27957ms step_avg:59.99ms
+step:467/2285 train_time:28018ms step_avg:60.00ms
+step:468/2285 train_time:28077ms step_avg:59.99ms
+step:469/2285 train_time:28138ms step_avg:60.00ms
+step:470/2285 train_time:28197ms step_avg:59.99ms
+step:471/2285 train_time:28258ms step_avg:60.00ms
+step:472/2285 train_time:28318ms step_avg:59.99ms
+step:473/2285 train_time:28378ms step_avg:60.00ms
+step:474/2285 train_time:28437ms step_avg:59.99ms
+step:475/2285 train_time:28498ms step_avg:60.00ms
+step:476/2285 train_time:28557ms step_avg:59.99ms
+step:477/2285 train_time:28618ms step_avg:60.00ms
+step:478/2285 train_time:28677ms step_avg:59.99ms
+step:479/2285 train_time:28738ms step_avg:60.00ms
+step:480/2285 train_time:28798ms step_avg:60.00ms
+step:481/2285 train_time:28859ms step_avg:60.00ms
+step:482/2285 train_time:28918ms step_avg:60.00ms
+step:483/2285 train_time:28979ms step_avg:60.00ms
+step:484/2285 train_time:29037ms step_avg:59.99ms
+step:485/2285 train_time:29099ms step_avg:60.00ms
+step:486/2285 train_time:29158ms step_avg:60.00ms
+step:487/2285 train_time:29219ms step_avg:60.00ms
+step:488/2285 train_time:29277ms step_avg:59.99ms
+step:489/2285 train_time:29338ms step_avg:60.00ms
+step:490/2285 train_time:29397ms step_avg:59.99ms
+step:491/2285 train_time:29458ms step_avg:60.00ms
+step:492/2285 train_time:29517ms step_avg:59.99ms
+step:493/2285 train_time:29578ms step_avg:60.00ms
+step:494/2285 train_time:29637ms step_avg:59.99ms
+step:495/2285 train_time:29699ms step_avg:60.00ms
+step:496/2285 train_time:29758ms step_avg:60.00ms
+step:497/2285 train_time:29819ms step_avg:60.00ms
+step:498/2285 train_time:29877ms step_avg:59.99ms
+step:499/2285 train_time:29938ms step_avg:60.00ms
+step:500/2285 train_time:29997ms step_avg:59.99ms
+step:500/2285 val_loss:3.7842 train_time:30060ms step_avg:60.12ms
+step:501/2285 train_time:30083ms step_avg:60.05ms
+step:502/2285 train_time:30120ms step_avg:60.00ms
+step:503/2285 train_time:30180ms step_avg:60.00ms
+step:504/2285 train_time:30238ms step_avg:60.00ms
+step:505/2285 train_time:30299ms step_avg:60.00ms
+step:506/2285 train_time:30358ms step_avg:60.00ms
+step:507/2285 train_time:30418ms step_avg:60.00ms
+step:508/2285 train_time:30476ms step_avg:59.99ms
+step:509/2285 train_time:30537ms step_avg:59.99ms
+step:510/2285 train_time:30595ms step_avg:59.99ms
+step:511/2285 train_time:30656ms step_avg:59.99ms
+step:512/2285 train_time:30714ms step_avg:59.99ms
+step:513/2285 train_time:30774ms step_avg:59.99ms
+step:514/2285 train_time:30833ms step_avg:59.99ms
+step:515/2285 train_time:30893ms step_avg:59.99ms
+step:516/2285 train_time:30955ms step_avg:59.99ms
+step:517/2285 train_time:31023ms step_avg:60.01ms
+step:518/2285 train_time:31085ms step_avg:60.01ms
+step:519/2285 train_time:31146ms step_avg:60.01ms
+step:520/2285 train_time:31205ms step_avg:60.01ms
+step:521/2285 train_time:31266ms step_avg:60.01ms
+step:522/2285 train_time:31325ms step_avg:60.01ms
+step:523/2285 train_time:31386ms step_avg:60.01ms
+step:524/2285 train_time:31445ms step_avg:60.01ms
+step:525/2285 train_time:31506ms step_avg:60.01ms
+step:526/2285 train_time:31565ms step_avg:60.01ms
+step:527/2285 train_time:31627ms step_avg:60.01ms
+step:528/2285 train_time:31686ms step_avg:60.01ms
+step:529/2285 train_time:31747ms step_avg:60.01ms
+step:530/2285 train_time:31806ms step_avg:60.01ms
+step:531/2285 train_time:31867ms step_avg:60.01ms
+step:532/2285 train_time:31927ms step_avg:60.01ms
+step:533/2285 train_time:31990ms step_avg:60.02ms
+step:534/2285 train_time:32049ms step_avg:60.02ms
+step:535/2285 train_time:32111ms step_avg:60.02ms
+step:536/2285 train_time:32171ms step_avg:60.02ms
+step:537/2285 train_time:32233ms step_avg:60.02ms
+step:538/2285 train_time:32292ms step_avg:60.02ms
+step:539/2285 train_time:32353ms step_avg:60.02ms
+step:540/2285 train_time:32412ms step_avg:60.02ms
+step:541/2285 train_time:32473ms step_avg:60.02ms
+step:542/2285 train_time:32533ms step_avg:60.02ms
+step:543/2285 train_time:32594ms step_avg:60.03ms
+step:544/2285 train_time:32653ms step_avg:60.02ms
+step:545/2285 train_time:32714ms step_avg:60.03ms
+step:546/2285 train_time:32773ms step_avg:60.02ms
+step:547/2285 train_time:32834ms step_avg:60.03ms
+step:548/2285 train_time:32893ms step_avg:60.02ms
+step:549/2285 train_time:32955ms step_avg:60.03ms
+step:550/2285 train_time:33014ms step_avg:60.02ms
+step:551/2285 train_time:33076ms step_avg:60.03ms
+step:552/2285 train_time:33135ms step_avg:60.03ms
+step:553/2285 train_time:33197ms step_avg:60.03ms
+step:554/2285 train_time:33257ms step_avg:60.03ms
+step:555/2285 train_time:33319ms step_avg:60.03ms
+step:556/2285 train_time:33378ms step_avg:60.03ms
+step:557/2285 train_time:33439ms step_avg:60.03ms
+step:558/2285 train_time:33498ms step_avg:60.03ms
+step:559/2285 train_time:33560ms step_avg:60.04ms
+step:560/2285 train_time:33619ms step_avg:60.03ms
+step:561/2285 train_time:33679ms step_avg:60.03ms
+step:562/2285 train_time:33739ms step_avg:60.03ms
+step:563/2285 train_time:33800ms step_avg:60.04ms
+step:564/2285 train_time:33859ms step_avg:60.03ms
+step:565/2285 train_time:33920ms step_avg:60.04ms
+step:566/2285 train_time:33980ms step_avg:60.04ms
+step:567/2285 train_time:34042ms step_avg:60.04ms
+step:568/2285 train_time:34101ms step_avg:60.04ms
+step:569/2285 train_time:34163ms step_avg:60.04ms
+step:570/2285 train_time:34222ms step_avg:60.04ms
+step:571/2285 train_time:34284ms step_avg:60.04ms
+step:572/2285 train_time:34342ms step_avg:60.04ms
+step:573/2285 train_time:34404ms step_avg:60.04ms
+step:574/2285 train_time:34463ms step_avg:60.04ms
+step:575/2285 train_time:34525ms step_avg:60.04ms
+step:576/2285 train_time:34584ms step_avg:60.04ms
+step:577/2285 train_time:34645ms step_avg:60.04ms
+step:578/2285 train_time:34704ms step_avg:60.04ms
+step:579/2285 train_time:34765ms step_avg:60.04ms
+step:580/2285 train_time:34824ms step_avg:60.04ms
+step:581/2285 train_time:34886ms step_avg:60.04ms
+step:582/2285 train_time:34945ms step_avg:60.04ms
+step:583/2285 train_time:35006ms step_avg:60.04ms
+step:584/2285 train_time:35065ms step_avg:60.04ms
+step:585/2285 train_time:35126ms step_avg:60.04ms
+step:586/2285 train_time:35185ms step_avg:60.04ms
+step:587/2285 train_time:35246ms step_avg:60.04ms
+step:588/2285 train_time:35305ms step_avg:60.04ms
+step:589/2285 train_time:35367ms step_avg:60.05ms
+step:590/2285 train_time:35426ms step_avg:60.04ms
+step:591/2285 train_time:35487ms step_avg:60.05ms
+step:592/2285 train_time:35546ms step_avg:60.04ms
+step:593/2285 train_time:35607ms step_avg:60.05ms
+step:594/2285 train_time:35667ms step_avg:60.04ms
+step:595/2285 train_time:35728ms step_avg:60.05ms
+step:596/2285 train_time:35787ms step_avg:60.05ms
+step:597/2285 train_time:35849ms step_avg:60.05ms
+step:598/2285 train_time:35908ms step_avg:60.05ms
+step:599/2285 train_time:35970ms step_avg:60.05ms
+step:600/2285 train_time:36029ms step_avg:60.05ms
+step:601/2285 train_time:36091ms step_avg:60.05ms
+step:602/2285 train_time:36150ms step_avg:60.05ms
+step:603/2285 train_time:36211ms step_avg:60.05ms
+step:604/2285 train_time:36270ms step_avg:60.05ms
+step:605/2285 train_time:36331ms step_avg:60.05ms
+step:606/2285 train_time:36391ms step_avg:60.05ms
+step:607/2285 train_time:36452ms step_avg:60.05ms
+step:608/2285 train_time:36511ms step_avg:60.05ms
+step:609/2285 train_time:36572ms step_avg:60.05ms
+step:610/2285 train_time:36631ms step_avg:60.05ms
+step:611/2285 train_time:36692ms step_avg:60.05ms
+step:612/2285 train_time:36751ms step_avg:60.05ms
+step:613/2285 train_time:36813ms step_avg:60.05ms
+step:614/2285 train_time:36871ms step_avg:60.05ms
+step:615/2285 train_time:36933ms step_avg:60.05ms
+step:616/2285 train_time:36992ms step_avg:60.05ms
+step:617/2285 train_time:37054ms step_avg:60.05ms
+step:618/2285 train_time:37113ms step_avg:60.05ms
+step:619/2285 train_time:37175ms step_avg:60.06ms
+step:620/2285 train_time:37234ms step_avg:60.06ms
+step:621/2285 train_time:37296ms step_avg:60.06ms
+step:622/2285 train_time:37355ms step_avg:60.06ms
+step:623/2285 train_time:37416ms step_avg:60.06ms
+step:624/2285 train_time:37476ms step_avg:60.06ms
+step:625/2285 train_time:37538ms step_avg:60.06ms
+step:626/2285 train_time:37597ms step_avg:60.06ms
+step:627/2285 train_time:37659ms step_avg:60.06ms
+step:628/2285 train_time:37718ms step_avg:60.06ms
+step:629/2285 train_time:37780ms step_avg:60.06ms
+step:630/2285 train_time:37839ms step_avg:60.06ms
+step:631/2285 train_time:37901ms step_avg:60.07ms
+step:632/2285 train_time:37960ms step_avg:60.06ms
+step:633/2285 train_time:38022ms step_avg:60.07ms
+step:634/2285 train_time:38081ms step_avg:60.06ms
+step:635/2285 train_time:38143ms step_avg:60.07ms
+step:636/2285 train_time:38202ms step_avg:60.07ms
+step:637/2285 train_time:38263ms step_avg:60.07ms
+step:638/2285 train_time:38322ms step_avg:60.07ms
+step:639/2285 train_time:38384ms step_avg:60.07ms
+step:640/2285 train_time:38444ms step_avg:60.07ms
+step:641/2285 train_time:38505ms step_avg:60.07ms
+step:642/2285 train_time:38564ms step_avg:60.07ms
+step:643/2285 train_time:38626ms step_avg:60.07ms
+step:644/2285 train_time:38685ms step_avg:60.07ms
+step:645/2285 train_time:38747ms step_avg:60.07ms
+step:646/2285 train_time:38806ms step_avg:60.07ms
+step:647/2285 train_time:38867ms step_avg:60.07ms
+step:648/2285 train_time:38926ms step_avg:60.07ms
+step:649/2285 train_time:38988ms step_avg:60.07ms
+step:650/2285 train_time:39047ms step_avg:60.07ms
+step:651/2285 train_time:39108ms step_avg:60.07ms
+step:652/2285 train_time:39167ms step_avg:60.07ms
+step:653/2285 train_time:39229ms step_avg:60.07ms
+step:654/2285 train_time:39288ms step_avg:60.07ms
+step:655/2285 train_time:39350ms step_avg:60.08ms
+step:656/2285 train_time:39409ms step_avg:60.07ms
+step:657/2285 train_time:39470ms step_avg:60.08ms
+step:658/2285 train_time:39529ms step_avg:60.07ms
+step:659/2285 train_time:39591ms step_avg:60.08ms
+step:660/2285 train_time:39650ms step_avg:60.08ms
+step:661/2285 train_time:39712ms step_avg:60.08ms
+step:662/2285 train_time:39770ms step_avg:60.08ms
+step:663/2285 train_time:39832ms step_avg:60.08ms
+step:664/2285 train_time:39891ms step_avg:60.08ms
+step:665/2285 train_time:39952ms step_avg:60.08ms
+step:666/2285 train_time:40011ms step_avg:60.08ms
+step:667/2285 train_time:40072ms step_avg:60.08ms
+step:668/2285 train_time:40131ms step_avg:60.08ms
+step:669/2285 train_time:40193ms step_avg:60.08ms
+step:670/2285 train_time:40252ms step_avg:60.08ms
+step:671/2285 train_time:40314ms step_avg:60.08ms
+step:672/2285 train_time:40373ms step_avg:60.08ms
+step:673/2285 train_time:40434ms step_avg:60.08ms
+step:674/2285 train_time:40493ms step_avg:60.08ms
+step:675/2285 train_time:40555ms step_avg:60.08ms
+step:676/2285 train_time:40614ms step_avg:60.08ms
+step:677/2285 train_time:40675ms step_avg:60.08ms
+step:678/2285 train_time:40734ms step_avg:60.08ms
+step:679/2285 train_time:40796ms step_avg:60.08ms
+step:680/2285 train_time:40855ms step_avg:60.08ms
+step:681/2285 train_time:40916ms step_avg:60.08ms
+step:682/2285 train_time:40975ms step_avg:60.08ms
+step:683/2285 train_time:41037ms step_avg:60.08ms
+step:684/2285 train_time:41097ms step_avg:60.08ms
+step:685/2285 train_time:41159ms step_avg:60.09ms
+step:686/2285 train_time:41218ms step_avg:60.08ms
+step:687/2285 train_time:41280ms step_avg:60.09ms
+step:688/2285 train_time:41339ms step_avg:60.09ms
+step:689/2285 train_time:41400ms step_avg:60.09ms
+step:690/2285 train_time:41460ms step_avg:60.09ms
+step:691/2285 train_time:41521ms step_avg:60.09ms
+step:692/2285 train_time:41580ms step_avg:60.09ms
+step:693/2285 train_time:41642ms step_avg:60.09ms
+step:694/2285 train_time:41701ms step_avg:60.09ms
+step:695/2285 train_time:41762ms step_avg:60.09ms
+step:696/2285 train_time:41822ms step_avg:60.09ms
+step:697/2285 train_time:41883ms step_avg:60.09ms
+step:698/2285 train_time:41942ms step_avg:60.09ms
+step:699/2285 train_time:42004ms step_avg:60.09ms
+step:700/2285 train_time:42063ms step_avg:60.09ms
+step:701/2285 train_time:42124ms step_avg:60.09ms
+step:702/2285 train_time:42183ms step_avg:60.09ms
+step:703/2285 train_time:42245ms step_avg:60.09ms
+step:704/2285 train_time:42304ms step_avg:60.09ms
+step:705/2285 train_time:42365ms step_avg:60.09ms
+step:706/2285 train_time:42424ms step_avg:60.09ms
+step:707/2285 train_time:42486ms step_avg:60.09ms
+step:708/2285 train_time:42545ms step_avg:60.09ms
+step:709/2285 train_time:42607ms step_avg:60.09ms
+step:710/2285 train_time:42667ms step_avg:60.09ms
+step:711/2285 train_time:42728ms step_avg:60.10ms
+step:712/2285 train_time:42787ms step_avg:60.09ms
+step:713/2285 train_time:42849ms step_avg:60.10ms
+step:714/2285 train_time:42908ms step_avg:60.10ms
+step:715/2285 train_time:42969ms step_avg:60.10ms
+step:716/2285 train_time:43028ms step_avg:60.10ms
+step:717/2285 train_time:43090ms step_avg:60.10ms
+step:718/2285 train_time:43149ms step_avg:60.10ms
+step:719/2285 train_time:43210ms step_avg:60.10ms
+step:720/2285 train_time:43270ms step_avg:60.10ms
+step:721/2285 train_time:43331ms step_avg:60.10ms
+step:722/2285 train_time:43391ms step_avg:60.10ms
+step:723/2285 train_time:43453ms step_avg:60.10ms
+step:724/2285 train_time:43512ms step_avg:60.10ms
+step:725/2285 train_time:43574ms step_avg:60.10ms
+step:726/2285 train_time:43633ms step_avg:60.10ms
+step:727/2285 train_time:43695ms step_avg:60.10ms
+step:728/2285 train_time:43753ms step_avg:60.10ms
+step:729/2285 train_time:43815ms step_avg:60.10ms
+step:730/2285 train_time:43874ms step_avg:60.10ms
+step:731/2285 train_time:43936ms step_avg:60.10ms
+step:732/2285 train_time:43995ms step_avg:60.10ms
+step:733/2285 train_time:44057ms step_avg:60.10ms
+step:734/2285 train_time:44116ms step_avg:60.10ms
+step:735/2285 train_time:44178ms step_avg:60.11ms
+step:736/2285 train_time:44237ms step_avg:60.10ms
+step:737/2285 train_time:44299ms step_avg:60.11ms
+step:738/2285 train_time:44358ms step_avg:60.11ms
+step:739/2285 train_time:44420ms step_avg:60.11ms
+step:740/2285 train_time:44479ms step_avg:60.11ms
+step:741/2285 train_time:44540ms step_avg:60.11ms
+step:742/2285 train_time:44599ms step_avg:60.11ms
+step:743/2285 train_time:44661ms step_avg:60.11ms
+step:744/2285 train_time:44720ms step_avg:60.11ms
+step:745/2285 train_time:44782ms step_avg:60.11ms
+step:746/2285 train_time:44841ms step_avg:60.11ms
+step:747/2285 train_time:44902ms step_avg:60.11ms
+step:748/2285 train_time:44961ms step_avg:60.11ms
+step:749/2285 train_time:45023ms step_avg:60.11ms
+step:750/2285 train_time:45082ms step_avg:60.11ms
+step:750/2285 val_loss:3.6546 train_time:45145ms step_avg:60.19ms
+step:751/2285 train_time:45163ms step_avg:60.14ms
+step:752/2285 train_time:45206ms step_avg:60.11ms
+step:753/2285 train_time:45270ms step_avg:60.12ms
+step:754/2285 train_time:45331ms step_avg:60.12ms
+step:755/2285 train_time:45393ms step_avg:60.12ms
+step:756/2285 train_time:45452ms step_avg:60.12ms
+step:757/2285 train_time:45512ms step_avg:60.12ms
+step:758/2285 train_time:45571ms step_avg:60.12ms
+step:759/2285 train_time:45631ms step_avg:60.12ms
+step:760/2285 train_time:45690ms step_avg:60.12ms
+step:761/2285 train_time:45750ms step_avg:60.12ms
+step:762/2285 train_time:45808ms step_avg:60.12ms
+step:763/2285 train_time:45869ms step_avg:60.12ms
+step:764/2285 train_time:45929ms step_avg:60.12ms
+step:765/2285 train_time:45989ms step_avg:60.12ms
+step:766/2285 train_time:46049ms step_avg:60.12ms
+step:767/2285 train_time:46113ms step_avg:60.12ms
+step:768/2285 train_time:46174ms step_avg:60.12ms
+step:769/2285 train_time:46238ms step_avg:60.13ms
+step:770/2285 train_time:46297ms step_avg:60.13ms
+step:771/2285 train_time:46359ms step_avg:60.13ms
+step:772/2285 train_time:46419ms step_avg:60.13ms
+step:773/2285 train_time:46481ms step_avg:60.13ms
+step:774/2285 train_time:46540ms step_avg:60.13ms
+step:775/2285 train_time:46602ms step_avg:60.13ms
+step:776/2285 train_time:46661ms step_avg:60.13ms
+step:777/2285 train_time:46723ms step_avg:60.13ms
+step:778/2285 train_time:46783ms step_avg:60.13ms
+step:779/2285 train_time:46844ms step_avg:60.13ms
+step:780/2285 train_time:46903ms step_avg:60.13ms
+step:781/2285 train_time:46965ms step_avg:60.13ms
+step:782/2285 train_time:47025ms step_avg:60.13ms
+step:783/2285 train_time:47087ms step_avg:60.14ms
+step:784/2285 train_time:47147ms step_avg:60.14ms
+step:785/2285 train_time:47209ms step_avg:60.14ms
+step:786/2285 train_time:47268ms step_avg:60.14ms
+step:787/2285 train_time:47331ms step_avg:60.14ms
+step:788/2285 train_time:47391ms step_avg:60.14ms
+step:789/2285 train_time:47453ms step_avg:60.14ms
+step:790/2285 train_time:47513ms step_avg:60.14ms
+step:791/2285 train_time:47575ms step_avg:60.15ms
+step:792/2285 train_time:47634ms step_avg:60.14ms
+step:793/2285 train_time:47696ms step_avg:60.15ms
+step:794/2285 train_time:47755ms step_avg:60.15ms
+step:795/2285 train_time:47816ms step_avg:60.15ms
+step:796/2285 train_time:47876ms step_avg:60.15ms
+step:797/2285 train_time:47938ms step_avg:60.15ms
+step:798/2285 train_time:47997ms step_avg:60.15ms
+step:799/2285 train_time:48059ms step_avg:60.15ms
+step:800/2285 train_time:48119ms step_avg:60.15ms
+step:801/2285 train_time:48181ms step_avg:60.15ms
+step:802/2285 train_time:48241ms step_avg:60.15ms
+step:803/2285 train_time:48304ms step_avg:60.15ms
+step:804/2285 train_time:48364ms step_avg:60.15ms
+step:805/2285 train_time:48427ms step_avg:60.16ms
+step:806/2285 train_time:48487ms step_avg:60.16ms
+step:807/2285 train_time:48549ms step_avg:60.16ms
+step:808/2285 train_time:48608ms step_avg:60.16ms
+step:809/2285 train_time:48669ms step_avg:60.16ms
+step:810/2285 train_time:48728ms step_avg:60.16ms
+step:811/2285 train_time:48790ms step_avg:60.16ms
+step:812/2285 train_time:48850ms step_avg:60.16ms
+step:813/2285 train_time:48912ms step_avg:60.16ms
+step:814/2285 train_time:48972ms step_avg:60.16ms
+step:815/2285 train_time:49033ms step_avg:60.16ms
+step:816/2285 train_time:49093ms step_avg:60.16ms
+step:817/2285 train_time:49155ms step_avg:60.16ms
+step:818/2285 train_time:49214ms step_avg:60.16ms
+step:819/2285 train_time:49276ms step_avg:60.17ms
+step:820/2285 train_time:49336ms step_avg:60.17ms
+step:821/2285 train_time:49399ms step_avg:60.17ms
+step:822/2285 train_time:49458ms step_avg:60.17ms
+step:823/2285 train_time:49520ms step_avg:60.17ms
+step:824/2285 train_time:49579ms step_avg:60.17ms
+step:825/2285 train_time:49642ms step_avg:60.17ms
+step:826/2285 train_time:49701ms step_avg:60.17ms
+step:827/2285 train_time:49763ms step_avg:60.17ms
+step:828/2285 train_time:49823ms step_avg:60.17ms
+step:829/2285 train_time:49885ms step_avg:60.17ms
+step:830/2285 train_time:49944ms step_avg:60.17ms
+step:831/2285 train_time:50006ms step_avg:60.18ms
+step:832/2285 train_time:50066ms step_avg:60.17ms
+step:833/2285 train_time:50127ms step_avg:60.18ms
+step:834/2285 train_time:50187ms step_avg:60.18ms
+step:835/2285 train_time:50249ms step_avg:60.18ms
+step:836/2285 train_time:50309ms step_avg:60.18ms
+step:837/2285 train_time:50371ms step_avg:60.18ms
+step:838/2285 train_time:50432ms step_avg:60.18ms
+step:839/2285 train_time:50494ms step_avg:60.18ms
+step:840/2285 train_time:50553ms step_avg:60.18ms
+step:841/2285 train_time:50614ms step_avg:60.18ms
+step:842/2285 train_time:50674ms step_avg:60.18ms
+step:843/2285 train_time:50736ms step_avg:60.19ms
+step:844/2285 train_time:50795ms step_avg:60.18ms
+step:845/2285 train_time:50857ms step_avg:60.19ms
+step:846/2285 train_time:50916ms step_avg:60.18ms
+step:847/2285 train_time:50978ms step_avg:60.19ms
+step:848/2285 train_time:51037ms step_avg:60.18ms
+step:849/2285 train_time:51098ms step_avg:60.19ms
+step:850/2285 train_time:51158ms step_avg:60.19ms
+step:851/2285 train_time:51220ms step_avg:60.19ms
+step:852/2285 train_time:51280ms step_avg:60.19ms
+step:853/2285 train_time:51343ms step_avg:60.19ms
+step:854/2285 train_time:51403ms step_avg:60.19ms
+step:855/2285 train_time:51466ms step_avg:60.19ms
+step:856/2285 train_time:51525ms step_avg:60.19ms
+step:857/2285 train_time:51587ms step_avg:60.19ms
+step:858/2285 train_time:51646ms step_avg:60.19ms
+step:859/2285 train_time:51708ms step_avg:60.20ms
+step:860/2285 train_time:51767ms step_avg:60.19ms
+step:861/2285 train_time:51829ms step_avg:60.20ms
+step:862/2285 train_time:51889ms step_avg:60.20ms
+step:863/2285 train_time:51951ms step_avg:60.20ms
+step:864/2285 train_time:52010ms step_avg:60.20ms
+step:865/2285 train_time:52072ms step_avg:60.20ms
+step:866/2285 train_time:52132ms step_avg:60.20ms
+step:867/2285 train_time:52194ms step_avg:60.20ms
+step:868/2285 train_time:52253ms step_avg:60.20ms
+step:869/2285 train_time:52315ms step_avg:60.20ms
+step:870/2285 train_time:52375ms step_avg:60.20ms
+step:871/2285 train_time:52438ms step_avg:60.20ms
+step:872/2285 train_time:52497ms step_avg:60.20ms
+step:873/2285 train_time:52558ms step_avg:60.20ms
+step:874/2285 train_time:52617ms step_avg:60.20ms
+step:875/2285 train_time:52679ms step_avg:60.21ms
+step:876/2285 train_time:52739ms step_avg:60.20ms
+step:877/2285 train_time:52801ms step_avg:60.21ms
+step:878/2285 train_time:52861ms step_avg:60.21ms
+step:879/2285 train_time:52923ms step_avg:60.21ms
+step:880/2285 train_time:52983ms step_avg:60.21ms
+step:881/2285 train_time:53045ms step_avg:60.21ms
+step:882/2285 train_time:53104ms step_avg:60.21ms
+step:883/2285 train_time:53166ms step_avg:60.21ms
+step:884/2285 train_time:53226ms step_avg:60.21ms
+step:885/2285 train_time:53288ms step_avg:60.21ms
+step:886/2285 train_time:53347ms step_avg:60.21ms
+step:887/2285 train_time:53409ms step_avg:60.21ms
+step:888/2285 train_time:53469ms step_avg:60.21ms
+step:889/2285 train_time:53531ms step_avg:60.21ms
+step:890/2285 train_time:53591ms step_avg:60.21ms
+step:891/2285 train_time:53652ms step_avg:60.22ms
+step:892/2285 train_time:53712ms step_avg:60.22ms
+step:893/2285 train_time:53774ms step_avg:60.22ms
+step:894/2285 train_time:53834ms step_avg:60.22ms
+step:895/2285 train_time:53895ms step_avg:60.22ms
+step:896/2285 train_time:53955ms step_avg:60.22ms
+step:897/2285 train_time:54016ms step_avg:60.22ms
+step:898/2285 train_time:54075ms step_avg:60.22ms
+step:899/2285 train_time:54137ms step_avg:60.22ms
+step:900/2285 train_time:54196ms step_avg:60.22ms
+step:901/2285 train_time:54258ms step_avg:60.22ms
+step:902/2285 train_time:54317ms step_avg:60.22ms
+step:903/2285 train_time:54379ms step_avg:60.22ms
+step:904/2285 train_time:54439ms step_avg:60.22ms
+step:905/2285 train_time:54501ms step_avg:60.22ms
+step:906/2285 train_time:54560ms step_avg:60.22ms
+step:907/2285 train_time:54622ms step_avg:60.22ms
+step:908/2285 train_time:54682ms step_avg:60.22ms
+step:909/2285 train_time:54744ms step_avg:60.22ms
+step:910/2285 train_time:54804ms step_avg:60.22ms
+step:911/2285 train_time:54866ms step_avg:60.23ms
+step:912/2285 train_time:54925ms step_avg:60.23ms
+step:913/2285 train_time:54987ms step_avg:60.23ms
+step:914/2285 train_time:55046ms step_avg:60.23ms
+step:915/2285 train_time:55108ms step_avg:60.23ms
+step:916/2285 train_time:55167ms step_avg:60.23ms
+step:917/2285 train_time:55229ms step_avg:60.23ms
+step:918/2285 train_time:55288ms step_avg:60.23ms
+step:919/2285 train_time:55351ms step_avg:60.23ms
+step:920/2285 train_time:55410ms step_avg:60.23ms
+step:921/2285 train_time:55472ms step_avg:60.23ms
+step:922/2285 train_time:55532ms step_avg:60.23ms
+step:923/2285 train_time:55593ms step_avg:60.23ms
+step:924/2285 train_time:55653ms step_avg:60.23ms
+step:925/2285 train_time:55715ms step_avg:60.23ms
+step:926/2285 train_time:55775ms step_avg:60.23ms
+step:927/2285 train_time:55837ms step_avg:60.23ms
+step:928/2285 train_time:55897ms step_avg:60.23ms
+step:929/2285 train_time:55958ms step_avg:60.23ms
+step:930/2285 train_time:56017ms step_avg:60.23ms
+step:931/2285 train_time:56079ms step_avg:60.23ms
+step:932/2285 train_time:56138ms step_avg:60.23ms
+step:933/2285 train_time:56200ms step_avg:60.24ms
+step:934/2285 train_time:56259ms step_avg:60.23ms
+step:935/2285 train_time:56322ms step_avg:60.24ms
+step:936/2285 train_time:56382ms step_avg:60.24ms
+step:937/2285 train_time:56444ms step_avg:60.24ms
+step:938/2285 train_time:56504ms step_avg:60.24ms
+step:939/2285 train_time:56566ms step_avg:60.24ms
+step:940/2285 train_time:56625ms step_avg:60.24ms
+step:941/2285 train_time:56688ms step_avg:60.24ms
+step:942/2285 train_time:56747ms step_avg:60.24ms
+step:943/2285 train_time:56808ms step_avg:60.24ms
+step:944/2285 train_time:56868ms step_avg:60.24ms
+step:945/2285 train_time:56931ms step_avg:60.24ms
+step:946/2285 train_time:56991ms step_avg:60.24ms
+step:947/2285 train_time:57052ms step_avg:60.25ms
+step:948/2285 train_time:57112ms step_avg:60.24ms
+step:949/2285 train_time:57174ms step_avg:60.25ms
+step:950/2285 train_time:57234ms step_avg:60.25ms
+step:951/2285 train_time:57295ms step_avg:60.25ms
+step:952/2285 train_time:57355ms step_avg:60.25ms
+step:953/2285 train_time:57417ms step_avg:60.25ms
+step:954/2285 train_time:57476ms step_avg:60.25ms
+step:955/2285 train_time:57538ms step_avg:60.25ms
+step:956/2285 train_time:57597ms step_avg:60.25ms
+step:957/2285 train_time:57659ms step_avg:60.25ms
+step:958/2285 train_time:57718ms step_avg:60.25ms
+step:959/2285 train_time:57781ms step_avg:60.25ms
+step:960/2285 train_time:57841ms step_avg:60.25ms
+step:961/2285 train_time:57902ms step_avg:60.25ms
+step:962/2285 train_time:57962ms step_avg:60.25ms
+step:963/2285 train_time:58025ms step_avg:60.25ms
+step:964/2285 train_time:58085ms step_avg:60.25ms
+step:965/2285 train_time:58147ms step_avg:60.26ms
+step:966/2285 train_time:58206ms step_avg:60.25ms
+step:967/2285 train_time:58268ms step_avg:60.26ms
+step:968/2285 train_time:58327ms step_avg:60.26ms
+step:969/2285 train_time:58390ms step_avg:60.26ms
+step:970/2285 train_time:58449ms step_avg:60.26ms
+step:971/2285 train_time:58511ms step_avg:60.26ms
+step:972/2285 train_time:58571ms step_avg:60.26ms
+step:973/2285 train_time:58633ms step_avg:60.26ms
+step:974/2285 train_time:58693ms step_avg:60.26ms
+step:975/2285 train_time:58755ms step_avg:60.26ms
+step:976/2285 train_time:58815ms step_avg:60.26ms
+step:977/2285 train_time:58877ms step_avg:60.26ms
+step:978/2285 train_time:58936ms step_avg:60.26ms
+step:979/2285 train_time:58997ms step_avg:60.26ms
+step:980/2285 train_time:59057ms step_avg:60.26ms
+step:981/2285 train_time:59119ms step_avg:60.26ms
+step:982/2285 train_time:59179ms step_avg:60.26ms
+step:983/2285 train_time:59241ms step_avg:60.27ms
+step:984/2285 train_time:59301ms step_avg:60.27ms
+step:985/2285 train_time:59363ms step_avg:60.27ms
+step:986/2285 train_time:59423ms step_avg:60.27ms
+step:987/2285 train_time:59485ms step_avg:60.27ms
+step:988/2285 train_time:59545ms step_avg:60.27ms
+step:989/2285 train_time:59607ms step_avg:60.27ms
+step:990/2285 train_time:59666ms step_avg:60.27ms
+step:991/2285 train_time:59728ms step_avg:60.27ms
+step:992/2285 train_time:59787ms step_avg:60.27ms
+step:993/2285 train_time:59849ms step_avg:60.27ms
+step:994/2285 train_time:59909ms step_avg:60.27ms
+step:995/2285 train_time:59971ms step_avg:60.27ms
+step:996/2285 train_time:60031ms step_avg:60.27ms
+step:997/2285 train_time:60093ms step_avg:60.27ms
+step:998/2285 train_time:60152ms step_avg:60.27ms
+step:999/2285 train_time:60214ms step_avg:60.27ms
+step:1000/2285 train_time:60274ms step_avg:60.27ms
+step:1000/2285 val_loss:3.5659 train_time:60338ms step_avg:60.34ms
+step:1001/2285 train_time:60357ms step_avg:60.30ms
+step:1002/2285 train_time:60400ms step_avg:60.28ms
+step:1003/2285 train_time:60461ms step_avg:60.28ms
+step:1004/2285 train_time:60521ms step_avg:60.28ms
+step:1005/2285 train_time:60583ms step_avg:60.28ms
+step:1006/2285 train_time:60643ms step_avg:60.28ms
+step:1007/2285 train_time:60704ms step_avg:60.28ms
+step:1008/2285 train_time:60763ms step_avg:60.28ms
+step:1009/2285 train_time:60824ms step_avg:60.28ms
+step:1010/2285 train_time:60883ms step_avg:60.28ms
+step:1011/2285 train_time:60944ms step_avg:60.28ms
+step:1012/2285 train_time:61003ms step_avg:60.28ms
+step:1013/2285 train_time:61064ms step_avg:60.28ms
+step:1014/2285 train_time:61123ms step_avg:60.28ms
+step:1015/2285 train_time:61184ms step_avg:60.28ms
+step:1016/2285 train_time:61243ms step_avg:60.28ms
+step:1017/2285 train_time:61309ms step_avg:60.28ms
+step:1018/2285 train_time:61371ms step_avg:60.29ms
+step:1019/2285 train_time:61433ms step_avg:60.29ms
+step:1020/2285 train_time:61492ms step_avg:60.29ms
+step:1021/2285 train_time:61554ms step_avg:60.29ms
+step:1022/2285 train_time:61614ms step_avg:60.29ms
+step:1023/2285 train_time:61676ms step_avg:60.29ms
+step:1024/2285 train_time:61735ms step_avg:60.29ms
+step:1025/2285 train_time:61797ms step_avg:60.29ms
+step:1026/2285 train_time:61857ms step_avg:60.29ms
+step:1027/2285 train_time:61918ms step_avg:60.29ms
+step:1028/2285 train_time:61977ms step_avg:60.29ms
+step:1029/2285 train_time:62038ms step_avg:60.29ms
+step:1030/2285 train_time:62097ms step_avg:60.29ms
+step:1031/2285 train_time:62159ms step_avg:60.29ms
+step:1032/2285 train_time:62219ms step_avg:60.29ms
+step:1033/2285 train_time:62281ms step_avg:60.29ms
+step:1034/2285 train_time:62341ms step_avg:60.29ms
+step:1035/2285 train_time:62403ms step_avg:60.29ms
+step:1036/2285 train_time:62462ms step_avg:60.29ms
+step:1037/2285 train_time:62524ms step_avg:60.29ms
+step:1038/2285 train_time:62585ms step_avg:60.29ms
+step:1039/2285 train_time:62647ms step_avg:60.30ms
+step:1040/2285 train_time:62706ms step_avg:60.29ms
+step:1041/2285 train_time:62768ms step_avg:60.30ms
+step:1042/2285 train_time:62828ms step_avg:60.30ms
+step:1043/2285 train_time:62891ms step_avg:60.30ms
+step:1044/2285 train_time:62950ms step_avg:60.30ms
+step:1045/2285 train_time:63012ms step_avg:60.30ms
+step:1046/2285 train_time:63071ms step_avg:60.30ms
+step:1047/2285 train_time:63133ms step_avg:60.30ms
+step:1048/2285 train_time:63192ms step_avg:60.30ms
+step:1049/2285 train_time:63254ms step_avg:60.30ms
+step:1050/2285 train_time:63314ms step_avg:60.30ms
+step:1051/2285 train_time:63376ms step_avg:60.30ms
+step:1052/2285 train_time:63435ms step_avg:60.30ms
+step:1053/2285 train_time:63497ms step_avg:60.30ms
+step:1054/2285 train_time:63557ms step_avg:60.30ms
+step:1055/2285 train_time:63619ms step_avg:60.30ms
+step:1056/2285 train_time:63678ms step_avg:60.30ms
+step:1057/2285 train_time:63740ms step_avg:60.30ms
+step:1058/2285 train_time:63799ms step_avg:60.30ms
+step:1059/2285 train_time:63860ms step_avg:60.30ms
+step:1060/2285 train_time:63920ms step_avg:60.30ms
+step:1061/2285 train_time:63982ms step_avg:60.30ms
+step:1062/2285 train_time:64041ms step_avg:60.30ms
+step:1063/2285 train_time:64102ms step_avg:60.30ms
+step:1064/2285 train_time:64162ms step_avg:60.30ms
+step:1065/2285 train_time:64224ms step_avg:60.30ms
+step:1066/2285 train_time:64284ms step_avg:60.30ms
+step:1067/2285 train_time:64345ms step_avg:60.30ms
+step:1068/2285 train_time:64405ms step_avg:60.30ms
+step:1069/2285 train_time:64467ms step_avg:60.31ms
+step:1070/2285 train_time:64527ms step_avg:60.31ms
+step:1071/2285 train_time:64589ms step_avg:60.31ms
+step:1072/2285 train_time:64649ms step_avg:60.31ms
+step:1073/2285 train_time:64710ms step_avg:60.31ms
+step:1074/2285 train_time:64770ms step_avg:60.31ms
+step:1075/2285 train_time:64831ms step_avg:60.31ms
+step:1076/2285 train_time:64891ms step_avg:60.31ms
+step:1077/2285 train_time:64953ms step_avg:60.31ms
+step:1078/2285 train_time:65012ms step_avg:60.31ms
+step:1079/2285 train_time:65074ms step_avg:60.31ms
+step:1080/2285 train_time:65134ms step_avg:60.31ms
+step:1081/2285 train_time:65195ms step_avg:60.31ms
+step:1082/2285 train_time:65255ms step_avg:60.31ms
+step:1083/2285 train_time:65317ms step_avg:60.31ms
+step:1084/2285 train_time:65377ms step_avg:60.31ms
+step:1085/2285 train_time:65440ms step_avg:60.31ms
+step:1086/2285 train_time:65499ms step_avg:60.31ms
+step:1087/2285 train_time:65561ms step_avg:60.31ms
+step:1088/2285 train_time:65620ms step_avg:60.31ms
+step:1089/2285 train_time:65682ms step_avg:60.31ms
+step:1090/2285 train_time:65742ms step_avg:60.31ms
+step:1091/2285 train_time:65804ms step_avg:60.31ms
+step:1092/2285 train_time:65863ms step_avg:60.31ms
+step:1093/2285 train_time:65925ms step_avg:60.32ms
+step:1094/2285 train_time:65985ms step_avg:60.32ms
+step:1095/2285 train_time:66047ms step_avg:60.32ms
+step:1096/2285 train_time:66106ms step_avg:60.32ms
+step:1097/2285 train_time:66169ms step_avg:60.32ms
+step:1098/2285 train_time:66228ms step_avg:60.32ms
+step:1099/2285 train_time:66290ms step_avg:60.32ms
+step:1100/2285 train_time:66350ms step_avg:60.32ms
+step:1101/2285 train_time:66412ms step_avg:60.32ms
+step:1102/2285 train_time:66471ms step_avg:60.32ms
+step:1103/2285 train_time:66533ms step_avg:60.32ms
+step:1104/2285 train_time:66593ms step_avg:60.32ms
+step:1105/2285 train_time:66654ms step_avg:60.32ms
+step:1106/2285 train_time:66714ms step_avg:60.32ms
+step:1107/2285 train_time:66776ms step_avg:60.32ms
+step:1108/2285 train_time:66836ms step_avg:60.32ms
+step:1109/2285 train_time:66898ms step_avg:60.32ms
+step:1110/2285 train_time:66958ms step_avg:60.32ms
+step:1111/2285 train_time:67020ms step_avg:60.32ms
+step:1112/2285 train_time:67079ms step_avg:60.32ms
+step:1113/2285 train_time:67141ms step_avg:60.32ms
+step:1114/2285 train_time:67201ms step_avg:60.32ms
+step:1115/2285 train_time:67263ms step_avg:60.33ms
+step:1116/2285 train_time:67322ms step_avg:60.32ms
+step:1117/2285 train_time:67384ms step_avg:60.33ms
+step:1118/2285 train_time:67443ms step_avg:60.33ms
+step:1119/2285 train_time:67506ms step_avg:60.33ms
+step:1120/2285 train_time:67565ms step_avg:60.33ms
+step:1121/2285 train_time:67627ms step_avg:60.33ms
+step:1122/2285 train_time:67687ms step_avg:60.33ms
+step:1123/2285 train_time:67750ms step_avg:60.33ms
+step:1124/2285 train_time:67809ms step_avg:60.33ms
+step:1125/2285 train_time:67870ms step_avg:60.33ms
+step:1126/2285 train_time:67930ms step_avg:60.33ms
+step:1127/2285 train_time:67992ms step_avg:60.33ms
+step:1128/2285 train_time:68052ms step_avg:60.33ms
+step:1129/2285 train_time:68115ms step_avg:60.33ms
+step:1130/2285 train_time:68174ms step_avg:60.33ms
+step:1131/2285 train_time:68236ms step_avg:60.33ms
+step:1132/2285 train_time:68295ms step_avg:60.33ms
+step:1133/2285 train_time:68357ms step_avg:60.33ms
+step:1134/2285 train_time:68417ms step_avg:60.33ms
+step:1135/2285 train_time:68480ms step_avg:60.34ms
+step:1136/2285 train_time:68539ms step_avg:60.33ms
+step:1137/2285 train_time:68601ms step_avg:60.34ms
+step:1138/2285 train_time:68660ms step_avg:60.33ms
+step:1139/2285 train_time:68722ms step_avg:60.34ms
+step:1140/2285 train_time:68781ms step_avg:60.33ms
+step:1141/2285 train_time:68843ms step_avg:60.34ms
+step:1142/2285 train_time:68902ms step_avg:60.33ms
+step:1143/2285 train_time:68964ms step_avg:60.34ms
+step:1144/2285 train_time:69024ms step_avg:60.34ms
+step:1145/2285 train_time:69086ms step_avg:60.34ms
+step:1146/2285 train_time:69146ms step_avg:60.34ms
+step:1147/2285 train_time:69209ms step_avg:60.34ms
+step:1148/2285 train_time:69270ms step_avg:60.34ms
+step:1149/2285 train_time:69332ms step_avg:60.34ms
+step:1150/2285 train_time:69392ms step_avg:60.34ms
+step:1151/2285 train_time:69454ms step_avg:60.34ms
+step:1152/2285 train_time:69513ms step_avg:60.34ms
+step:1153/2285 train_time:69576ms step_avg:60.34ms
+step:1154/2285 train_time:69635ms step_avg:60.34ms
+step:1155/2285 train_time:69698ms step_avg:60.34ms
+step:1156/2285 train_time:69758ms step_avg:60.34ms
+step:1157/2285 train_time:69820ms step_avg:60.35ms
+step:1158/2285 train_time:69880ms step_avg:60.35ms
+step:1159/2285 train_time:69942ms step_avg:60.35ms
+step:1160/2285 train_time:70002ms step_avg:60.35ms
+step:1161/2285 train_time:70064ms step_avg:60.35ms
+step:1162/2285 train_time:70123ms step_avg:60.35ms
+step:1163/2285 train_time:70185ms step_avg:60.35ms
+step:1164/2285 train_time:70245ms step_avg:60.35ms
+step:1165/2285 train_time:70307ms step_avg:60.35ms
+step:1166/2285 train_time:70368ms step_avg:60.35ms
+step:1167/2285 train_time:70431ms step_avg:60.35ms
+step:1168/2285 train_time:70491ms step_avg:60.35ms
+step:1169/2285 train_time:70553ms step_avg:60.35ms
+step:1170/2285 train_time:70613ms step_avg:60.35ms
+step:1171/2285 train_time:70674ms step_avg:60.35ms
+step:1172/2285 train_time:70734ms step_avg:60.35ms
+step:1173/2285 train_time:70796ms step_avg:60.35ms
+step:1174/2285 train_time:70856ms step_avg:60.35ms
+step:1175/2285 train_time:70918ms step_avg:60.36ms
+step:1176/2285 train_time:70978ms step_avg:60.36ms
+step:1177/2285 train_time:71040ms step_avg:60.36ms
+step:1178/2285 train_time:71099ms step_avg:60.36ms
+step:1179/2285 train_time:71161ms step_avg:60.36ms
+step:1180/2285 train_time:71221ms step_avg:60.36ms
+step:1181/2285 train_time:71284ms step_avg:60.36ms
+step:1182/2285 train_time:71344ms step_avg:60.36ms
+step:1183/2285 train_time:71406ms step_avg:60.36ms
+step:1184/2285 train_time:71467ms step_avg:60.36ms
+step:1185/2285 train_time:71530ms step_avg:60.36ms
+step:1186/2285 train_time:71591ms step_avg:60.36ms
+step:1187/2285 train_time:71653ms step_avg:60.36ms
+step:1188/2285 train_time:71712ms step_avg:60.36ms
+step:1189/2285 train_time:71775ms step_avg:60.37ms
+step:1190/2285 train_time:71834ms step_avg:60.36ms
+step:1191/2285 train_time:71896ms step_avg:60.37ms
+step:1192/2285 train_time:71956ms step_avg:60.37ms
+step:1193/2285 train_time:72018ms step_avg:60.37ms
+step:1194/2285 train_time:72078ms step_avg:60.37ms
+step:1195/2285 train_time:72141ms step_avg:60.37ms
+step:1196/2285 train_time:72200ms step_avg:60.37ms
+step:1197/2285 train_time:72262ms step_avg:60.37ms
+step:1198/2285 train_time:72323ms step_avg:60.37ms
+step:1199/2285 train_time:72385ms step_avg:60.37ms
+step:1200/2285 train_time:72444ms step_avg:60.37ms
+step:1201/2285 train_time:72506ms step_avg:60.37ms
+step:1202/2285 train_time:72566ms step_avg:60.37ms
+step:1203/2285 train_time:72629ms step_avg:60.37ms
+step:1204/2285 train_time:72689ms step_avg:60.37ms
+step:1205/2285 train_time:72751ms step_avg:60.37ms
+step:1206/2285 train_time:72811ms step_avg:60.37ms
+step:1207/2285 train_time:72873ms step_avg:60.38ms
+step:1208/2285 train_time:72933ms step_avg:60.37ms
+step:1209/2285 train_time:72994ms step_avg:60.38ms
+step:1210/2285 train_time:73054ms step_avg:60.38ms
+step:1211/2285 train_time:73117ms step_avg:60.38ms
+step:1212/2285 train_time:73177ms step_avg:60.38ms
+step:1213/2285 train_time:73240ms step_avg:60.38ms
+step:1214/2285 train_time:73300ms step_avg:60.38ms
+step:1215/2285 train_time:73362ms step_avg:60.38ms
+step:1216/2285 train_time:73422ms step_avg:60.38ms
+step:1217/2285 train_time:73484ms step_avg:60.38ms
+step:1218/2285 train_time:73544ms step_avg:60.38ms
+step:1219/2285 train_time:73606ms step_avg:60.38ms
+step:1220/2285 train_time:73666ms step_avg:60.38ms
+step:1221/2285 train_time:73728ms step_avg:60.38ms
+step:1222/2285 train_time:73789ms step_avg:60.38ms
+step:1223/2285 train_time:73851ms step_avg:60.39ms
+step:1224/2285 train_time:73911ms step_avg:60.38ms
+step:1225/2285 train_time:73973ms step_avg:60.39ms
+step:1226/2285 train_time:74032ms step_avg:60.39ms
+step:1227/2285 train_time:74094ms step_avg:60.39ms
+step:1228/2285 train_time:74154ms step_avg:60.39ms
+step:1229/2285 train_time:74216ms step_avg:60.39ms
+step:1230/2285 train_time:74277ms step_avg:60.39ms
+step:1231/2285 train_time:74340ms step_avg:60.39ms
+step:1232/2285 train_time:74399ms step_avg:60.39ms
+step:1233/2285 train_time:74461ms step_avg:60.39ms
+step:1234/2285 train_time:74521ms step_avg:60.39ms
+step:1235/2285 train_time:74584ms step_avg:60.39ms
+step:1236/2285 train_time:74644ms step_avg:60.39ms
+step:1237/2285 train_time:74706ms step_avg:60.39ms
+step:1238/2285 train_time:74766ms step_avg:60.39ms
+step:1239/2285 train_time:74829ms step_avg:60.39ms
+step:1240/2285 train_time:74888ms step_avg:60.39ms
+step:1241/2285 train_time:74950ms step_avg:60.40ms
+step:1242/2285 train_time:75010ms step_avg:60.39ms
+step:1243/2285 train_time:75073ms step_avg:60.40ms
+step:1244/2285 train_time:75133ms step_avg:60.40ms
+step:1245/2285 train_time:75195ms step_avg:60.40ms
+step:1246/2285 train_time:75254ms step_avg:60.40ms
+step:1247/2285 train_time:75316ms step_avg:60.40ms
+step:1248/2285 train_time:75376ms step_avg:60.40ms
+step:1249/2285 train_time:75439ms step_avg:60.40ms
+step:1250/2285 train_time:75500ms step_avg:60.40ms
+step:1250/2285 val_loss:3.4966 train_time:75563ms step_avg:60.45ms
+step:1251/2285 train_time:75586ms step_avg:60.42ms
+step:1252/2285 train_time:75626ms step_avg:60.40ms
+step:1253/2285 train_time:75687ms step_avg:60.40ms
+step:1254/2285 train_time:75745ms step_avg:60.40ms
+step:1255/2285 train_time:75807ms step_avg:60.40ms
+step:1256/2285 train_time:75866ms step_avg:60.40ms
+step:1257/2285 train_time:75927ms step_avg:60.40ms
+step:1258/2285 train_time:75986ms step_avg:60.40ms
+step:1259/2285 train_time:76047ms step_avg:60.40ms
+step:1260/2285 train_time:76106ms step_avg:60.40ms
+step:1261/2285 train_time:76167ms step_avg:60.40ms
+step:1262/2285 train_time:76226ms step_avg:60.40ms
+step:1263/2285 train_time:76287ms step_avg:60.40ms
+step:1264/2285 train_time:76346ms step_avg:60.40ms
+step:1265/2285 train_time:76407ms step_avg:60.40ms
+step:1266/2285 train_time:76476ms step_avg:60.41ms
+step:1267/2285 train_time:76544ms step_avg:60.41ms
+step:1268/2285 train_time:76604ms step_avg:60.41ms
+step:1269/2285 train_time:76667ms step_avg:60.41ms
+step:1270/2285 train_time:76726ms step_avg:60.41ms
+step:1271/2285 train_time:76787ms step_avg:60.42ms
+step:1272/2285 train_time:76847ms step_avg:60.41ms
+step:1273/2285 train_time:76908ms step_avg:60.42ms
+step:1274/2285 train_time:76967ms step_avg:60.41ms
+step:1275/2285 train_time:77028ms step_avg:60.41ms
+step:1276/2285 train_time:77087ms step_avg:60.41ms
+step:1277/2285 train_time:77148ms step_avg:60.41ms
+step:1278/2285 train_time:77208ms step_avg:60.41ms
+step:1279/2285 train_time:77269ms step_avg:60.41ms
+step:1280/2285 train_time:77328ms step_avg:60.41ms
+step:1281/2285 train_time:77391ms step_avg:60.41ms
+step:1282/2285 train_time:77454ms step_avg:60.42ms
+step:1283/2285 train_time:77518ms step_avg:60.42ms
+step:1284/2285 train_time:77579ms step_avg:60.42ms
+step:1285/2285 train_time:77641ms step_avg:60.42ms
+step:1286/2285 train_time:77701ms step_avg:60.42ms
+step:1287/2285 train_time:77763ms step_avg:60.42ms
+step:1288/2285 train_time:77822ms step_avg:60.42ms
+step:1289/2285 train_time:77884ms step_avg:60.42ms
+step:1290/2285 train_time:77943ms step_avg:60.42ms
+step:1291/2285 train_time:78004ms step_avg:60.42ms
+step:1292/2285 train_time:78063ms step_avg:60.42ms
+step:1293/2285 train_time:78125ms step_avg:60.42ms
+step:1294/2285 train_time:78185ms step_avg:60.42ms
+step:1295/2285 train_time:78246ms step_avg:60.42ms
+step:1296/2285 train_time:78305ms step_avg:60.42ms
+step:1297/2285 train_time:78367ms step_avg:60.42ms
+step:1298/2285 train_time:78428ms step_avg:60.42ms
+step:1299/2285 train_time:78491ms step_avg:60.42ms
+step:1300/2285 train_time:78553ms step_avg:60.43ms
+step:1301/2285 train_time:78617ms step_avg:60.43ms
+step:1302/2285 train_time:78676ms step_avg:60.43ms
+step:1303/2285 train_time:78738ms step_avg:60.43ms
+step:1304/2285 train_time:78798ms step_avg:60.43ms
+step:1305/2285 train_time:78859ms step_avg:60.43ms
+step:1306/2285 train_time:78919ms step_avg:60.43ms
+step:1307/2285 train_time:78982ms step_avg:60.43ms
+step:1308/2285 train_time:79041ms step_avg:60.43ms
+step:1309/2285 train_time:79103ms step_avg:60.43ms
+step:1310/2285 train_time:79162ms step_avg:60.43ms
+step:1311/2285 train_time:79224ms step_avg:60.43ms
+step:1312/2285 train_time:79283ms step_avg:60.43ms
+step:1313/2285 train_time:79345ms step_avg:60.43ms
+step:1314/2285 train_time:79405ms step_avg:60.43ms
+step:1315/2285 train_time:79468ms step_avg:60.43ms
+step:1316/2285 train_time:79529ms step_avg:60.43ms
+step:1317/2285 train_time:79592ms step_avg:60.43ms
+step:1318/2285 train_time:79653ms step_avg:60.44ms
+step:1319/2285 train_time:79716ms step_avg:60.44ms
+step:1320/2285 train_time:79775ms step_avg:60.44ms
+step:1321/2285 train_time:79837ms step_avg:60.44ms
+step:1322/2285 train_time:79897ms step_avg:60.44ms
+step:1323/2285 train_time:79959ms step_avg:60.44ms
+step:1324/2285 train_time:80019ms step_avg:60.44ms
+step:1325/2285 train_time:80081ms step_avg:60.44ms
+step:1326/2285 train_time:80141ms step_avg:60.44ms
+step:1327/2285 train_time:80203ms step_avg:60.44ms
+step:1328/2285 train_time:80263ms step_avg:60.44ms
+step:1329/2285 train_time:80324ms step_avg:60.44ms
+step:1330/2285 train_time:80384ms step_avg:60.44ms
+step:1331/2285 train_time:80446ms step_avg:60.44ms
step_avg:60.44ms +step:1332/2285 train_time:80506ms step_avg:60.44ms +step:1333/2285 train_time:80569ms step_avg:60.44ms +step:1334/2285 train_time:80628ms step_avg:60.44ms +step:1335/2285 train_time:80691ms step_avg:60.44ms +step:1336/2285 train_time:80752ms step_avg:60.44ms +step:1337/2285 train_time:80815ms step_avg:60.44ms +step:1338/2285 train_time:80874ms step_avg:60.44ms +step:1339/2285 train_time:80936ms step_avg:60.45ms +step:1340/2285 train_time:80996ms step_avg:60.45ms +step:1341/2285 train_time:81058ms step_avg:60.45ms +step:1342/2285 train_time:81117ms step_avg:60.44ms +step:1343/2285 train_time:81179ms step_avg:60.45ms +step:1344/2285 train_time:81238ms step_avg:60.45ms +step:1345/2285 train_time:81300ms step_avg:60.45ms +step:1346/2285 train_time:81362ms step_avg:60.45ms +step:1347/2285 train_time:81424ms step_avg:60.45ms +step:1348/2285 train_time:81483ms step_avg:60.45ms +step:1349/2285 train_time:81546ms step_avg:60.45ms +step:1350/2285 train_time:81606ms step_avg:60.45ms +step:1351/2285 train_time:81668ms step_avg:60.45ms +step:1352/2285 train_time:81727ms step_avg:60.45ms +step:1353/2285 train_time:81790ms step_avg:60.45ms +step:1354/2285 train_time:81850ms step_avg:60.45ms +step:1355/2285 train_time:81913ms step_avg:60.45ms +step:1356/2285 train_time:81973ms step_avg:60.45ms +step:1357/2285 train_time:82035ms step_avg:60.45ms +step:1358/2285 train_time:82094ms step_avg:60.45ms +step:1359/2285 train_time:82157ms step_avg:60.45ms +step:1360/2285 train_time:82216ms step_avg:60.45ms +step:1361/2285 train_time:82278ms step_avg:60.45ms +step:1362/2285 train_time:82338ms step_avg:60.45ms +step:1363/2285 train_time:82400ms step_avg:60.45ms +step:1364/2285 train_time:82460ms step_avg:60.45ms +step:1365/2285 train_time:82523ms step_avg:60.46ms +step:1366/2285 train_time:82583ms step_avg:60.46ms +step:1367/2285 train_time:82645ms step_avg:60.46ms +step:1368/2285 train_time:82705ms step_avg:60.46ms +step:1369/2285 train_time:82767ms step_avg:60.46ms +step:1370/2285 train_time:82827ms step_avg:60.46ms +step:1371/2285 train_time:82889ms step_avg:60.46ms +step:1372/2285 train_time:82949ms step_avg:60.46ms +step:1373/2285 train_time:83011ms step_avg:60.46ms +step:1374/2285 train_time:83071ms step_avg:60.46ms +step:1375/2285 train_time:83133ms step_avg:60.46ms +step:1376/2285 train_time:83193ms step_avg:60.46ms +step:1377/2285 train_time:83255ms step_avg:60.46ms +step:1378/2285 train_time:83315ms step_avg:60.46ms +step:1379/2285 train_time:83377ms step_avg:60.46ms +step:1380/2285 train_time:83436ms step_avg:60.46ms +step:1381/2285 train_time:83499ms step_avg:60.46ms +step:1382/2285 train_time:83559ms step_avg:60.46ms +step:1383/2285 train_time:83621ms step_avg:60.46ms +step:1384/2285 train_time:83681ms step_avg:60.46ms +step:1385/2285 train_time:83744ms step_avg:60.46ms +step:1386/2285 train_time:83804ms step_avg:60.46ms +step:1387/2285 train_time:83866ms step_avg:60.47ms +step:1388/2285 train_time:83926ms step_avg:60.47ms +step:1389/2285 train_time:83987ms step_avg:60.47ms +step:1390/2285 train_time:84047ms step_avg:60.47ms +step:1391/2285 train_time:84110ms step_avg:60.47ms +step:1392/2285 train_time:84170ms step_avg:60.47ms +step:1393/2285 train_time:84232ms step_avg:60.47ms +step:1394/2285 train_time:84292ms step_avg:60.47ms +step:1395/2285 train_time:84355ms step_avg:60.47ms +step:1396/2285 train_time:84415ms step_avg:60.47ms +step:1397/2285 train_time:84477ms step_avg:60.47ms +step:1398/2285 train_time:84536ms step_avg:60.47ms +step:1399/2285 train_time:84599ms step_avg:60.47ms 
+step:1400/2285 train_time:84659ms step_avg:60.47ms +step:1401/2285 train_time:84721ms step_avg:60.47ms +step:1402/2285 train_time:84781ms step_avg:60.47ms +step:1403/2285 train_time:84843ms step_avg:60.47ms +step:1404/2285 train_time:84903ms step_avg:60.47ms +step:1405/2285 train_time:84965ms step_avg:60.47ms +step:1406/2285 train_time:85025ms step_avg:60.47ms +step:1407/2285 train_time:85087ms step_avg:60.47ms +step:1408/2285 train_time:85146ms step_avg:60.47ms +step:1409/2285 train_time:85208ms step_avg:60.47ms +step:1410/2285 train_time:85268ms step_avg:60.47ms +step:1411/2285 train_time:85330ms step_avg:60.47ms +step:1412/2285 train_time:85390ms step_avg:60.47ms +step:1413/2285 train_time:85454ms step_avg:60.48ms +step:1414/2285 train_time:85514ms step_avg:60.48ms +step:1415/2285 train_time:85576ms step_avg:60.48ms +step:1416/2285 train_time:85636ms step_avg:60.48ms +step:1417/2285 train_time:85698ms step_avg:60.48ms +step:1418/2285 train_time:85758ms step_avg:60.48ms +step:1419/2285 train_time:85820ms step_avg:60.48ms +step:1420/2285 train_time:85880ms step_avg:60.48ms +step:1421/2285 train_time:85943ms step_avg:60.48ms +step:1422/2285 train_time:86003ms step_avg:60.48ms +step:1423/2285 train_time:86064ms step_avg:60.48ms +step:1424/2285 train_time:86124ms step_avg:60.48ms +step:1425/2285 train_time:86186ms step_avg:60.48ms +step:1426/2285 train_time:86245ms step_avg:60.48ms +step:1427/2285 train_time:86307ms step_avg:60.48ms +step:1428/2285 train_time:86367ms step_avg:60.48ms +step:1429/2285 train_time:86430ms step_avg:60.48ms +step:1430/2285 train_time:86491ms step_avg:60.48ms +step:1431/2285 train_time:86554ms step_avg:60.49ms +step:1432/2285 train_time:86614ms step_avg:60.48ms +step:1433/2285 train_time:86676ms step_avg:60.49ms +step:1434/2285 train_time:86736ms step_avg:60.49ms +step:1435/2285 train_time:86798ms step_avg:60.49ms +step:1436/2285 train_time:86858ms step_avg:60.49ms +step:1437/2285 train_time:86920ms step_avg:60.49ms +step:1438/2285 train_time:86979ms step_avg:60.49ms +step:1439/2285 train_time:87041ms step_avg:60.49ms +step:1440/2285 train_time:87101ms step_avg:60.49ms +step:1441/2285 train_time:87163ms step_avg:60.49ms +step:1442/2285 train_time:87223ms step_avg:60.49ms +step:1443/2285 train_time:87285ms step_avg:60.49ms +step:1444/2285 train_time:87345ms step_avg:60.49ms +step:1445/2285 train_time:87407ms step_avg:60.49ms +step:1446/2285 train_time:87467ms step_avg:60.49ms +step:1447/2285 train_time:87529ms step_avg:60.49ms +step:1448/2285 train_time:87590ms step_avg:60.49ms +step:1449/2285 train_time:87654ms step_avg:60.49ms +step:1450/2285 train_time:87714ms step_avg:60.49ms +step:1451/2285 train_time:87776ms step_avg:60.49ms +step:1452/2285 train_time:87835ms step_avg:60.49ms +step:1453/2285 train_time:87897ms step_avg:60.49ms +step:1454/2285 train_time:87957ms step_avg:60.49ms +step:1455/2285 train_time:88019ms step_avg:60.49ms +step:1456/2285 train_time:88079ms step_avg:60.49ms +step:1457/2285 train_time:88141ms step_avg:60.49ms +step:1458/2285 train_time:88201ms step_avg:60.49ms +step:1459/2285 train_time:88263ms step_avg:60.50ms +step:1460/2285 train_time:88323ms step_avg:60.50ms +step:1461/2285 train_time:88385ms step_avg:60.50ms +step:1462/2285 train_time:88445ms step_avg:60.50ms +step:1463/2285 train_time:88508ms step_avg:60.50ms +step:1464/2285 train_time:88568ms step_avg:60.50ms +step:1465/2285 train_time:88630ms step_avg:60.50ms +step:1466/2285 train_time:88690ms step_avg:60.50ms +step:1467/2285 train_time:88753ms step_avg:60.50ms +step:1468/2285 
train_time:88813ms step_avg:60.50ms +step:1469/2285 train_time:88875ms step_avg:60.50ms +step:1470/2285 train_time:88935ms step_avg:60.50ms +step:1471/2285 train_time:88997ms step_avg:60.50ms +step:1472/2285 train_time:89057ms step_avg:60.50ms +step:1473/2285 train_time:89119ms step_avg:60.50ms +step:1474/2285 train_time:89179ms step_avg:60.50ms +step:1475/2285 train_time:89241ms step_avg:60.50ms +step:1476/2285 train_time:89301ms step_avg:60.50ms +step:1477/2285 train_time:89364ms step_avg:60.50ms +step:1478/2285 train_time:89424ms step_avg:60.50ms +step:1479/2285 train_time:89486ms step_avg:60.50ms +step:1480/2285 train_time:89545ms step_avg:60.50ms +step:1481/2285 train_time:89608ms step_avg:60.50ms +step:1482/2285 train_time:89667ms step_avg:60.50ms +step:1483/2285 train_time:89729ms step_avg:60.51ms +step:1484/2285 train_time:89789ms step_avg:60.50ms +step:1485/2285 train_time:89853ms step_avg:60.51ms +step:1486/2285 train_time:89913ms step_avg:60.51ms +step:1487/2285 train_time:89975ms step_avg:60.51ms +step:1488/2285 train_time:90035ms step_avg:60.51ms +step:1489/2285 train_time:90097ms step_avg:60.51ms +step:1490/2285 train_time:90156ms step_avg:60.51ms +step:1491/2285 train_time:90218ms step_avg:60.51ms +step:1492/2285 train_time:90278ms step_avg:60.51ms +step:1493/2285 train_time:90340ms step_avg:60.51ms +step:1494/2285 train_time:90401ms step_avg:60.51ms +step:1495/2285 train_time:90465ms step_avg:60.51ms +step:1496/2285 train_time:90524ms step_avg:60.51ms +step:1497/2285 train_time:90586ms step_avg:60.51ms +step:1498/2285 train_time:90645ms step_avg:60.51ms +step:1499/2285 train_time:90707ms step_avg:60.51ms +step:1500/2285 train_time:90767ms step_avg:60.51ms +step:1500/2285 val_loss:3.4283 train_time:90830ms step_avg:60.55ms +step:1501/2285 train_time:90848ms step_avg:60.53ms +step:1502/2285 train_time:90892ms step_avg:60.51ms +step:1503/2285 train_time:90957ms step_avg:60.52ms +step:1504/2285 train_time:91017ms step_avg:60.52ms +step:1505/2285 train_time:91079ms step_avg:60.52ms +step:1506/2285 train_time:91138ms step_avg:60.52ms +step:1507/2285 train_time:91200ms step_avg:60.52ms +step:1508/2285 train_time:91259ms step_avg:60.52ms +step:1509/2285 train_time:91320ms step_avg:60.52ms +step:1510/2285 train_time:91380ms step_avg:60.52ms +step:1511/2285 train_time:91443ms step_avg:60.52ms +step:1512/2285 train_time:91502ms step_avg:60.52ms +step:1513/2285 train_time:91563ms step_avg:60.52ms +step:1514/2285 train_time:91623ms step_avg:60.52ms +step:1515/2285 train_time:91684ms step_avg:60.52ms +step:1516/2285 train_time:91745ms step_avg:60.52ms +step:1517/2285 train_time:91809ms step_avg:60.52ms +step:1518/2285 train_time:91870ms step_avg:60.52ms +step:1519/2285 train_time:91933ms step_avg:60.52ms +step:1520/2285 train_time:91994ms step_avg:60.52ms +step:1521/2285 train_time:92056ms step_avg:60.52ms +step:1522/2285 train_time:92116ms step_avg:60.52ms +step:1523/2285 train_time:92177ms step_avg:60.52ms +step:1524/2285 train_time:92236ms step_avg:60.52ms +step:1525/2285 train_time:92298ms step_avg:60.52ms +step:1526/2285 train_time:92358ms step_avg:60.52ms +step:1527/2285 train_time:92421ms step_avg:60.52ms +step:1528/2285 train_time:92480ms step_avg:60.52ms +step:1529/2285 train_time:92542ms step_avg:60.52ms +step:1530/2285 train_time:92601ms step_avg:60.52ms +step:1531/2285 train_time:92664ms step_avg:60.52ms +step:1532/2285 train_time:92724ms step_avg:60.52ms +step:1533/2285 train_time:92788ms step_avg:60.53ms +step:1534/2285 train_time:92849ms step_avg:60.53ms +step:1535/2285 
train_time:92913ms step_avg:60.53ms +step:1536/2285 train_time:92973ms step_avg:60.53ms +step:1537/2285 train_time:93035ms step_avg:60.53ms +step:1538/2285 train_time:93095ms step_avg:60.53ms +step:1539/2285 train_time:93157ms step_avg:60.53ms +step:1540/2285 train_time:93217ms step_avg:60.53ms +step:1541/2285 train_time:93279ms step_avg:60.53ms +step:1542/2285 train_time:93339ms step_avg:60.53ms +step:1543/2285 train_time:93401ms step_avg:60.53ms +step:1544/2285 train_time:93461ms step_avg:60.53ms +step:1545/2285 train_time:93522ms step_avg:60.53ms +step:1546/2285 train_time:93582ms step_avg:60.53ms +step:1547/2285 train_time:93643ms step_avg:60.53ms +step:1548/2285 train_time:93704ms step_avg:60.53ms +step:1549/2285 train_time:93767ms step_avg:60.53ms +step:1550/2285 train_time:93828ms step_avg:60.53ms +step:1551/2285 train_time:93891ms step_avg:60.54ms +step:1552/2285 train_time:93950ms step_avg:60.53ms +step:1553/2285 train_time:94013ms step_avg:60.54ms +step:1554/2285 train_time:94073ms step_avg:60.54ms +step:1555/2285 train_time:94135ms step_avg:60.54ms +step:1556/2285 train_time:94196ms step_avg:60.54ms +step:1557/2285 train_time:94258ms step_avg:60.54ms +step:1558/2285 train_time:94317ms step_avg:60.54ms +step:1559/2285 train_time:94380ms step_avg:60.54ms +step:1560/2285 train_time:94439ms step_avg:60.54ms +step:1561/2285 train_time:94501ms step_avg:60.54ms +step:1562/2285 train_time:94560ms step_avg:60.54ms +step:1563/2285 train_time:94622ms step_avg:60.54ms +step:1564/2285 train_time:94682ms step_avg:60.54ms +step:1565/2285 train_time:94745ms step_avg:60.54ms +step:1566/2285 train_time:94807ms step_avg:60.54ms +step:1567/2285 train_time:94870ms step_avg:60.54ms +step:1568/2285 train_time:94930ms step_avg:60.54ms +step:1569/2285 train_time:94993ms step_avg:60.54ms +step:1570/2285 train_time:95053ms step_avg:60.54ms +step:1571/2285 train_time:95116ms step_avg:60.54ms +step:1572/2285 train_time:95176ms step_avg:60.54ms +step:1573/2285 train_time:95238ms step_avg:60.55ms +step:1574/2285 train_time:95298ms step_avg:60.55ms +step:1575/2285 train_time:95360ms step_avg:60.55ms +step:1576/2285 train_time:95419ms step_avg:60.55ms +step:1577/2285 train_time:95482ms step_avg:60.55ms +step:1578/2285 train_time:95542ms step_avg:60.55ms +step:1579/2285 train_time:95603ms step_avg:60.55ms +step:1580/2285 train_time:95663ms step_avg:60.55ms +step:1581/2285 train_time:95726ms step_avg:60.55ms +step:1582/2285 train_time:95788ms step_avg:60.55ms +step:1583/2285 train_time:95850ms step_avg:60.55ms +step:1584/2285 train_time:95910ms step_avg:60.55ms +step:1585/2285 train_time:95973ms step_avg:60.55ms +step:1586/2285 train_time:96033ms step_avg:60.55ms +step:1587/2285 train_time:96095ms step_avg:60.55ms +step:1588/2285 train_time:96155ms step_avg:60.55ms +step:1589/2285 train_time:96217ms step_avg:60.55ms +step:1590/2285 train_time:96277ms step_avg:60.55ms +step:1591/2285 train_time:96339ms step_avg:60.55ms +step:1592/2285 train_time:96398ms step_avg:60.55ms +step:1593/2285 train_time:96460ms step_avg:60.55ms +step:1594/2285 train_time:96520ms step_avg:60.55ms +step:1595/2285 train_time:96583ms step_avg:60.55ms +step:1596/2285 train_time:96642ms step_avg:60.55ms +step:1597/2285 train_time:96705ms step_avg:60.55ms +step:1598/2285 train_time:96766ms step_avg:60.55ms +step:1599/2285 train_time:96830ms step_avg:60.56ms +step:1600/2285 train_time:96890ms step_avg:60.56ms +step:1601/2285 train_time:96951ms step_avg:60.56ms +step:1602/2285 train_time:97011ms step_avg:60.56ms +step:1603/2285 train_time:97073ms 
step_avg:60.56ms +step:1604/2285 train_time:97133ms step_avg:60.56ms +step:1605/2285 train_time:97195ms step_avg:60.56ms +step:1606/2285 train_time:97255ms step_avg:60.56ms +step:1607/2285 train_time:97317ms step_avg:60.56ms +step:1608/2285 train_time:97377ms step_avg:60.56ms +step:1609/2285 train_time:97440ms step_avg:60.56ms +step:1610/2285 train_time:97500ms step_avg:60.56ms +step:1611/2285 train_time:97561ms step_avg:60.56ms +step:1612/2285 train_time:97621ms step_avg:60.56ms +step:1613/2285 train_time:97683ms step_avg:60.56ms +step:1614/2285 train_time:97744ms step_avg:60.56ms +step:1615/2285 train_time:97808ms step_avg:60.56ms +step:1616/2285 train_time:97869ms step_avg:60.56ms +step:1617/2285 train_time:97931ms step_avg:60.56ms +step:1618/2285 train_time:97992ms step_avg:60.56ms +step:1619/2285 train_time:98054ms step_avg:60.56ms +step:1620/2285 train_time:98114ms step_avg:60.56ms +step:1621/2285 train_time:98176ms step_avg:60.56ms +step:1622/2285 train_time:98235ms step_avg:60.56ms +step:1623/2285 train_time:98297ms step_avg:60.57ms +step:1624/2285 train_time:98357ms step_avg:60.56ms +step:1625/2285 train_time:98419ms step_avg:60.57ms +step:1626/2285 train_time:98479ms step_avg:60.57ms +step:1627/2285 train_time:98541ms step_avg:60.57ms +step:1628/2285 train_time:98601ms step_avg:60.57ms +step:1629/2285 train_time:98663ms step_avg:60.57ms +step:1630/2285 train_time:98723ms step_avg:60.57ms +step:1631/2285 train_time:98786ms step_avg:60.57ms +step:1632/2285 train_time:98848ms step_avg:60.57ms +step:1633/2285 train_time:98911ms step_avg:60.57ms +step:1634/2285 train_time:98970ms step_avg:60.57ms +step:1635/2285 train_time:99033ms step_avg:60.57ms +step:1636/2285 train_time:99093ms step_avg:60.57ms +step:1637/2285 train_time:99156ms step_avg:60.57ms +step:1638/2285 train_time:99216ms step_avg:60.57ms +step:1639/2285 train_time:99278ms step_avg:60.57ms +step:1640/2285 train_time:99338ms step_avg:60.57ms +step:1641/2285 train_time:99400ms step_avg:60.57ms +step:1642/2285 train_time:99460ms step_avg:60.57ms +step:1643/2285 train_time:99522ms step_avg:60.57ms +step:1644/2285 train_time:99582ms step_avg:60.57ms +step:1645/2285 train_time:99643ms step_avg:60.57ms +step:1646/2285 train_time:99703ms step_avg:60.57ms +step:1647/2285 train_time:99766ms step_avg:60.57ms +step:1648/2285 train_time:99827ms step_avg:60.57ms +step:1649/2285 train_time:99890ms step_avg:60.58ms +step:1650/2285 train_time:99950ms step_avg:60.58ms +step:1651/2285 train_time:100013ms step_avg:60.58ms +step:1652/2285 train_time:100073ms step_avg:60.58ms +step:1653/2285 train_time:100135ms step_avg:60.58ms +step:1654/2285 train_time:100195ms step_avg:60.58ms +step:1655/2285 train_time:100257ms step_avg:60.58ms +step:1656/2285 train_time:100317ms step_avg:60.58ms +step:1657/2285 train_time:100379ms step_avg:60.58ms +step:1658/2285 train_time:100439ms step_avg:60.58ms +step:1659/2285 train_time:100501ms step_avg:60.58ms +step:1660/2285 train_time:100561ms step_avg:60.58ms +step:1661/2285 train_time:100624ms step_avg:60.58ms +step:1662/2285 train_time:100684ms step_avg:60.58ms +step:1663/2285 train_time:100747ms step_avg:60.58ms +step:1664/2285 train_time:100807ms step_avg:60.58ms +step:1665/2285 train_time:100870ms step_avg:60.58ms +step:1666/2285 train_time:100930ms step_avg:60.58ms +step:1667/2285 train_time:100992ms step_avg:60.58ms +step:1668/2285 train_time:101052ms step_avg:60.58ms +step:1669/2285 train_time:101114ms step_avg:60.58ms +step:1670/2285 train_time:101174ms step_avg:60.58ms +step:1671/2285 
train_time:101236ms step_avg:60.58ms +step:1672/2285 train_time:101297ms step_avg:60.58ms +step:1673/2285 train_time:101359ms step_avg:60.59ms +step:1674/2285 train_time:101419ms step_avg:60.58ms +step:1675/2285 train_time:101481ms step_avg:60.59ms +step:1676/2285 train_time:101541ms step_avg:60.59ms +step:1677/2285 train_time:101603ms step_avg:60.59ms +step:1678/2285 train_time:101664ms step_avg:60.59ms +step:1679/2285 train_time:101726ms step_avg:60.59ms +step:1680/2285 train_time:101787ms step_avg:60.59ms +step:1681/2285 train_time:101849ms step_avg:60.59ms +step:1682/2285 train_time:101909ms step_avg:60.59ms +step:1683/2285 train_time:101971ms step_avg:60.59ms +step:1684/2285 train_time:102031ms step_avg:60.59ms +step:1685/2285 train_time:102093ms step_avg:60.59ms +step:1686/2285 train_time:102153ms step_avg:60.59ms +step:1687/2285 train_time:102216ms step_avg:60.59ms +step:1688/2285 train_time:102276ms step_avg:60.59ms +step:1689/2285 train_time:102338ms step_avg:60.59ms +step:1690/2285 train_time:102398ms step_avg:60.59ms +step:1691/2285 train_time:102460ms step_avg:60.59ms +step:1692/2285 train_time:102520ms step_avg:60.59ms +step:1693/2285 train_time:102582ms step_avg:60.59ms +step:1694/2285 train_time:102642ms step_avg:60.59ms +step:1695/2285 train_time:102705ms step_avg:60.59ms +step:1696/2285 train_time:102765ms step_avg:60.59ms +step:1697/2285 train_time:102828ms step_avg:60.59ms +step:1698/2285 train_time:102888ms step_avg:60.59ms +step:1699/2285 train_time:102950ms step_avg:60.59ms +step:1700/2285 train_time:103011ms step_avg:60.59ms +step:1701/2285 train_time:103073ms step_avg:60.60ms +step:1702/2285 train_time:103133ms step_avg:60.60ms +step:1703/2285 train_time:103195ms step_avg:60.60ms +step:1704/2285 train_time:103255ms step_avg:60.60ms +step:1705/2285 train_time:103317ms step_avg:60.60ms +step:1706/2285 train_time:103377ms step_avg:60.60ms +step:1707/2285 train_time:103440ms step_avg:60.60ms +step:1708/2285 train_time:103500ms step_avg:60.60ms +step:1709/2285 train_time:103562ms step_avg:60.60ms +step:1710/2285 train_time:103621ms step_avg:60.60ms +step:1711/2285 train_time:103684ms step_avg:60.60ms +step:1712/2285 train_time:103743ms step_avg:60.60ms +step:1713/2285 train_time:103805ms step_avg:60.60ms +step:1714/2285 train_time:103866ms step_avg:60.60ms +step:1715/2285 train_time:103929ms step_avg:60.60ms +step:1716/2285 train_time:103990ms step_avg:60.60ms +step:1717/2285 train_time:104052ms step_avg:60.60ms +step:1718/2285 train_time:104112ms step_avg:60.60ms +step:1719/2285 train_time:104175ms step_avg:60.60ms +step:1720/2285 train_time:104235ms step_avg:60.60ms +step:1721/2285 train_time:104297ms step_avg:60.60ms +step:1722/2285 train_time:104356ms step_avg:60.60ms +step:1723/2285 train_time:104419ms step_avg:60.60ms +step:1724/2285 train_time:104478ms step_avg:60.60ms +step:1725/2285 train_time:104540ms step_avg:60.60ms +step:1726/2285 train_time:104601ms step_avg:60.60ms +step:1727/2285 train_time:104663ms step_avg:60.60ms +step:1728/2285 train_time:104723ms step_avg:60.60ms +step:1729/2285 train_time:104786ms step_avg:60.60ms +step:1730/2285 train_time:104846ms step_avg:60.60ms +step:1731/2285 train_time:104909ms step_avg:60.61ms +step:1732/2285 train_time:104969ms step_avg:60.61ms +step:1733/2285 train_time:105031ms step_avg:60.61ms +step:1734/2285 train_time:105092ms step_avg:60.61ms +step:1735/2285 train_time:105154ms step_avg:60.61ms +step:1736/2285 train_time:105215ms step_avg:60.61ms +step:1737/2285 train_time:105277ms step_avg:60.61ms +step:1738/2285 
train_time:105336ms step_avg:60.61ms +step:1739/2285 train_time:105399ms step_avg:60.61ms +step:1740/2285 train_time:105459ms step_avg:60.61ms +step:1741/2285 train_time:105521ms step_avg:60.61ms +step:1742/2285 train_time:105580ms step_avg:60.61ms +step:1743/2285 train_time:105642ms step_avg:60.61ms +step:1744/2285 train_time:105703ms step_avg:60.61ms +step:1745/2285 train_time:105765ms step_avg:60.61ms +step:1746/2285 train_time:105826ms step_avg:60.61ms +step:1747/2285 train_time:105889ms step_avg:60.61ms +step:1748/2285 train_time:105949ms step_avg:60.61ms +step:1749/2285 train_time:106011ms step_avg:60.61ms +step:1750/2285 train_time:106071ms step_avg:60.61ms +step:1750/2285 val_loss:3.3684 train_time:106135ms step_avg:60.65ms +step:1751/2285 train_time:106157ms step_avg:60.63ms +step:1752/2285 train_time:106196ms step_avg:60.61ms +step:1753/2285 train_time:106259ms step_avg:60.62ms +step:1754/2285 train_time:106321ms step_avg:60.62ms +step:1755/2285 train_time:106386ms step_avg:60.62ms +step:1756/2285 train_time:106447ms step_avg:60.62ms +step:1757/2285 train_time:106508ms step_avg:60.62ms +step:1758/2285 train_time:106567ms step_avg:60.62ms +step:1759/2285 train_time:106628ms step_avg:60.62ms +step:1760/2285 train_time:106687ms step_avg:60.62ms +step:1761/2285 train_time:106749ms step_avg:60.62ms +step:1762/2285 train_time:106808ms step_avg:60.62ms +step:1763/2285 train_time:106870ms step_avg:60.62ms +step:1764/2285 train_time:106930ms step_avg:60.62ms +step:1765/2285 train_time:106991ms step_avg:60.62ms +step:1766/2285 train_time:107052ms step_avg:60.62ms +step:1767/2285 train_time:107115ms step_avg:60.62ms +step:1768/2285 train_time:107176ms step_avg:60.62ms +step:1769/2285 train_time:107239ms step_avg:60.62ms +step:1770/2285 train_time:107300ms step_avg:60.62ms +step:1771/2285 train_time:107362ms step_avg:60.62ms +step:1772/2285 train_time:107423ms step_avg:60.62ms +step:1773/2285 train_time:107485ms step_avg:60.62ms +step:1774/2285 train_time:107545ms step_avg:60.62ms +step:1775/2285 train_time:107607ms step_avg:60.62ms +step:1776/2285 train_time:107666ms step_avg:60.62ms +step:1777/2285 train_time:107727ms step_avg:60.62ms +step:1778/2285 train_time:107786ms step_avg:60.62ms +step:1779/2285 train_time:107848ms step_avg:60.62ms +step:1780/2285 train_time:107907ms step_avg:60.62ms +step:1781/2285 train_time:107969ms step_avg:60.62ms +step:1782/2285 train_time:108029ms step_avg:60.62ms +step:1783/2285 train_time:108093ms step_avg:60.62ms +step:1784/2285 train_time:108153ms step_avg:60.62ms +step:1785/2285 train_time:108216ms step_avg:60.63ms +step:1786/2285 train_time:108276ms step_avg:60.63ms +step:1787/2285 train_time:108339ms step_avg:60.63ms +step:1788/2285 train_time:108399ms step_avg:60.63ms +step:1789/2285 train_time:108462ms step_avg:60.63ms +step:1790/2285 train_time:108522ms step_avg:60.63ms +step:1791/2285 train_time:108584ms step_avg:60.63ms +step:1792/2285 train_time:108644ms step_avg:60.63ms +step:1793/2285 train_time:108706ms step_avg:60.63ms +step:1794/2285 train_time:108766ms step_avg:60.63ms +step:1795/2285 train_time:108828ms step_avg:60.63ms +step:1796/2285 train_time:108887ms step_avg:60.63ms +step:1797/2285 train_time:108949ms step_avg:60.63ms +step:1798/2285 train_time:109008ms step_avg:60.63ms +step:1799/2285 train_time:109072ms step_avg:60.63ms +step:1800/2285 train_time:109133ms step_avg:60.63ms +step:1801/2285 train_time:109196ms step_avg:60.63ms +step:1802/2285 train_time:109256ms step_avg:60.63ms +step:1803/2285 train_time:109319ms step_avg:60.63ms 
+step:1804/2285 train_time:109378ms step_avg:60.63ms +step:1805/2285 train_time:109441ms step_avg:60.63ms +step:1806/2285 train_time:109500ms step_avg:60.63ms +step:1807/2285 train_time:109562ms step_avg:60.63ms +step:1808/2285 train_time:109622ms step_avg:60.63ms +step:1809/2285 train_time:109684ms step_avg:60.63ms +step:1810/2285 train_time:109744ms step_avg:60.63ms +step:1811/2285 train_time:109806ms step_avg:60.63ms +step:1812/2285 train_time:109866ms step_avg:60.63ms +step:1813/2285 train_time:109929ms step_avg:60.63ms +step:1814/2285 train_time:109988ms step_avg:60.63ms +step:1815/2285 train_time:110051ms step_avg:60.63ms +step:1816/2285 train_time:110111ms step_avg:60.63ms +step:1817/2285 train_time:110174ms step_avg:60.63ms +step:1818/2285 train_time:110235ms step_avg:60.64ms +step:1819/2285 train_time:110297ms step_avg:60.64ms +step:1820/2285 train_time:110357ms step_avg:60.64ms +step:1821/2285 train_time:110420ms step_avg:60.64ms +step:1822/2285 train_time:110479ms step_avg:60.64ms +step:1823/2285 train_time:110541ms step_avg:60.64ms +step:1824/2285 train_time:110601ms step_avg:60.64ms +step:1825/2285 train_time:110662ms step_avg:60.64ms +step:1826/2285 train_time:110722ms step_avg:60.64ms +step:1827/2285 train_time:110784ms step_avg:60.64ms +step:1828/2285 train_time:110844ms step_avg:60.64ms +step:1829/2285 train_time:110907ms step_avg:60.64ms +step:1830/2285 train_time:110967ms step_avg:60.64ms +step:1831/2285 train_time:111029ms step_avg:60.64ms +step:1832/2285 train_time:111089ms step_avg:60.64ms +step:1833/2285 train_time:111151ms step_avg:60.64ms +step:1834/2285 train_time:111211ms step_avg:60.64ms +step:1835/2285 train_time:111274ms step_avg:60.64ms +step:1836/2285 train_time:111335ms step_avg:60.64ms +step:1837/2285 train_time:111398ms step_avg:60.64ms +step:1838/2285 train_time:111458ms step_avg:60.64ms +step:1839/2285 train_time:111520ms step_avg:60.64ms +step:1840/2285 train_time:111580ms step_avg:60.64ms +step:1841/2285 train_time:111642ms step_avg:60.64ms +step:1842/2285 train_time:111702ms step_avg:60.64ms +step:1843/2285 train_time:111764ms step_avg:60.64ms +step:1844/2285 train_time:111825ms step_avg:60.64ms +step:1845/2285 train_time:111888ms step_avg:60.64ms +step:1846/2285 train_time:111948ms step_avg:60.64ms +step:1847/2285 train_time:112010ms step_avg:60.64ms +step:1848/2285 train_time:112069ms step_avg:60.64ms +step:1849/2285 train_time:112132ms step_avg:60.64ms +step:1850/2285 train_time:112192ms step_avg:60.64ms +step:1851/2285 train_time:112254ms step_avg:60.65ms +step:1852/2285 train_time:112313ms step_avg:60.64ms +step:1853/2285 train_time:112376ms step_avg:60.65ms +step:1854/2285 train_time:112436ms step_avg:60.64ms +step:1855/2285 train_time:112498ms step_avg:60.65ms +step:1856/2285 train_time:112558ms step_avg:60.65ms +step:1857/2285 train_time:112620ms step_avg:60.65ms +step:1858/2285 train_time:112680ms step_avg:60.65ms +step:1859/2285 train_time:112742ms step_avg:60.65ms +step:1860/2285 train_time:112801ms step_avg:60.65ms +step:1861/2285 train_time:112865ms step_avg:60.65ms +step:1862/2285 train_time:112925ms step_avg:60.65ms +step:1863/2285 train_time:112987ms step_avg:60.65ms +step:1864/2285 train_time:113048ms step_avg:60.65ms +step:1865/2285 train_time:113110ms step_avg:60.65ms +step:1866/2285 train_time:113170ms step_avg:60.65ms +step:1867/2285 train_time:113232ms step_avg:60.65ms +step:1868/2285 train_time:113291ms step_avg:60.65ms +step:1869/2285 train_time:113354ms step_avg:60.65ms +step:1870/2285 train_time:113414ms step_avg:60.65ms 
+step:1871/2285 train_time:113476ms step_avg:60.65ms +step:1872/2285 train_time:113536ms step_avg:60.65ms +step:1873/2285 train_time:113598ms step_avg:60.65ms +step:1874/2285 train_time:113657ms step_avg:60.65ms +step:1875/2285 train_time:113720ms step_avg:60.65ms +step:1876/2285 train_time:113779ms step_avg:60.65ms +step:1877/2285 train_time:113842ms step_avg:60.65ms +step:1878/2285 train_time:113901ms step_avg:60.65ms +step:1879/2285 train_time:113964ms step_avg:60.65ms +step:1880/2285 train_time:114025ms step_avg:60.65ms +step:1881/2285 train_time:114088ms step_avg:60.65ms +step:1882/2285 train_time:114148ms step_avg:60.65ms +step:1883/2285 train_time:114210ms step_avg:60.65ms +step:1884/2285 train_time:114270ms step_avg:60.65ms +step:1885/2285 train_time:114332ms step_avg:60.65ms +step:1886/2285 train_time:114392ms step_avg:60.65ms +step:1887/2285 train_time:114454ms step_avg:60.65ms +step:1888/2285 train_time:114515ms step_avg:60.65ms +step:1889/2285 train_time:114577ms step_avg:60.65ms +step:1890/2285 train_time:114637ms step_avg:60.65ms +step:1891/2285 train_time:114699ms step_avg:60.66ms +step:1892/2285 train_time:114759ms step_avg:60.65ms +step:1893/2285 train_time:114821ms step_avg:60.66ms +step:1894/2285 train_time:114881ms step_avg:60.66ms +step:1895/2285 train_time:114943ms step_avg:60.66ms +step:1896/2285 train_time:115004ms step_avg:60.66ms +step:1897/2285 train_time:115066ms step_avg:60.66ms +step:1898/2285 train_time:115126ms step_avg:60.66ms +step:1899/2285 train_time:115189ms step_avg:60.66ms +step:1900/2285 train_time:115249ms step_avg:60.66ms +step:1901/2285 train_time:115312ms step_avg:60.66ms +step:1902/2285 train_time:115372ms step_avg:60.66ms +step:1903/2285 train_time:115434ms step_avg:60.66ms +step:1904/2285 train_time:115495ms step_avg:60.66ms +step:1905/2285 train_time:115557ms step_avg:60.66ms +step:1906/2285 train_time:115618ms step_avg:60.66ms +step:1907/2285 train_time:115679ms step_avg:60.66ms +step:1908/2285 train_time:115739ms step_avg:60.66ms +step:1909/2285 train_time:115801ms step_avg:60.66ms +step:1910/2285 train_time:115862ms step_avg:60.66ms +step:1911/2285 train_time:115924ms step_avg:60.66ms +step:1912/2285 train_time:115984ms step_avg:60.66ms +step:1913/2285 train_time:116047ms step_avg:60.66ms +step:1914/2285 train_time:116107ms step_avg:60.66ms +step:1915/2285 train_time:116170ms step_avg:60.66ms +step:1916/2285 train_time:116230ms step_avg:60.66ms +step:1917/2285 train_time:116293ms step_avg:60.66ms +step:1918/2285 train_time:116352ms step_avg:60.66ms +step:1919/2285 train_time:116414ms step_avg:60.66ms +step:1920/2285 train_time:116475ms step_avg:60.66ms +step:1921/2285 train_time:116537ms step_avg:60.66ms +step:1922/2285 train_time:116597ms step_avg:60.66ms +step:1923/2285 train_time:116659ms step_avg:60.67ms +step:1924/2285 train_time:116719ms step_avg:60.66ms +step:1925/2285 train_time:116781ms step_avg:60.67ms +step:1926/2285 train_time:116841ms step_avg:60.67ms +step:1927/2285 train_time:116904ms step_avg:60.67ms +step:1928/2285 train_time:116964ms step_avg:60.67ms +step:1929/2285 train_time:117027ms step_avg:60.67ms +step:1930/2285 train_time:117087ms step_avg:60.67ms +step:1931/2285 train_time:117150ms step_avg:60.67ms +step:1932/2285 train_time:117209ms step_avg:60.67ms +step:1933/2285 train_time:117272ms step_avg:60.67ms +step:1934/2285 train_time:117331ms step_avg:60.67ms +step:1935/2285 train_time:117394ms step_avg:60.67ms +step:1936/2285 train_time:117454ms step_avg:60.67ms +step:1937/2285 train_time:117517ms step_avg:60.67ms 
+step:1938/2285 train_time:117577ms step_avg:60.67ms +step:1939/2285 train_time:117639ms step_avg:60.67ms +step:1940/2285 train_time:117699ms step_avg:60.67ms +step:1941/2285 train_time:117762ms step_avg:60.67ms +step:1942/2285 train_time:117822ms step_avg:60.67ms +step:1943/2285 train_time:117884ms step_avg:60.67ms +step:1944/2285 train_time:117944ms step_avg:60.67ms +step:1945/2285 train_time:118006ms step_avg:60.67ms +step:1946/2285 train_time:118066ms step_avg:60.67ms +step:1947/2285 train_time:118129ms step_avg:60.67ms +step:1948/2285 train_time:118189ms step_avg:60.67ms +step:1949/2285 train_time:118252ms step_avg:60.67ms +step:1950/2285 train_time:118312ms step_avg:60.67ms +step:1951/2285 train_time:118374ms step_avg:60.67ms +step:1952/2285 train_time:118434ms step_avg:60.67ms +step:1953/2285 train_time:118496ms step_avg:60.67ms +step:1954/2285 train_time:118557ms step_avg:60.67ms +step:1955/2285 train_time:118619ms step_avg:60.67ms +step:1956/2285 train_time:118679ms step_avg:60.67ms +step:1957/2285 train_time:118742ms step_avg:60.68ms +step:1958/2285 train_time:118802ms step_avg:60.68ms +step:1959/2285 train_time:118864ms step_avg:60.68ms +step:1960/2285 train_time:118924ms step_avg:60.68ms +step:1961/2285 train_time:118987ms step_avg:60.68ms +step:1962/2285 train_time:119047ms step_avg:60.68ms +step:1963/2285 train_time:119109ms step_avg:60.68ms +step:1964/2285 train_time:119169ms step_avg:60.68ms +step:1965/2285 train_time:119232ms step_avg:60.68ms +step:1966/2285 train_time:119292ms step_avg:60.68ms +step:1967/2285 train_time:119355ms step_avg:60.68ms +step:1968/2285 train_time:119415ms step_avg:60.68ms +step:1969/2285 train_time:119477ms step_avg:60.68ms +step:1970/2285 train_time:119538ms step_avg:60.68ms +step:1971/2285 train_time:119600ms step_avg:60.68ms +step:1972/2285 train_time:119659ms step_avg:60.68ms +step:1973/2285 train_time:119722ms step_avg:60.68ms +step:1974/2285 train_time:119782ms step_avg:60.68ms +step:1975/2285 train_time:119844ms step_avg:60.68ms +step:1976/2285 train_time:119903ms step_avg:60.68ms +step:1977/2285 train_time:119966ms step_avg:60.68ms +step:1978/2285 train_time:120026ms step_avg:60.68ms +step:1979/2285 train_time:120088ms step_avg:60.68ms +step:1980/2285 train_time:120148ms step_avg:60.68ms +step:1981/2285 train_time:120210ms step_avg:60.68ms +step:1982/2285 train_time:120271ms step_avg:60.68ms +step:1983/2285 train_time:120334ms step_avg:60.68ms +step:1984/2285 train_time:120394ms step_avg:60.68ms +step:1985/2285 train_time:120456ms step_avg:60.68ms +step:1986/2285 train_time:120516ms step_avg:60.68ms +step:1987/2285 train_time:120578ms step_avg:60.68ms +step:1988/2285 train_time:120638ms step_avg:60.68ms +step:1989/2285 train_time:120701ms step_avg:60.68ms +step:1990/2285 train_time:120761ms step_avg:60.68ms +step:1991/2285 train_time:120823ms step_avg:60.68ms +step:1992/2285 train_time:120883ms step_avg:60.68ms +step:1993/2285 train_time:120946ms step_avg:60.69ms +step:1994/2285 train_time:121006ms step_avg:60.69ms +step:1995/2285 train_time:121069ms step_avg:60.69ms +step:1996/2285 train_time:121129ms step_avg:60.69ms +step:1997/2285 train_time:121191ms step_avg:60.69ms +step:1998/2285 train_time:121251ms step_avg:60.69ms +step:1999/2285 train_time:121314ms step_avg:60.69ms +step:2000/2285 train_time:121374ms step_avg:60.69ms +step:2000/2285 val_loss:3.3190 train_time:121438ms step_avg:60.72ms +step:2001/2285 train_time:121458ms step_avg:60.70ms +step:2002/2285 train_time:121499ms step_avg:60.69ms +step:2003/2285 train_time:121562ms 
step_avg:60.69ms +step:2004/2285 train_time:121624ms step_avg:60.69ms +step:2005/2285 train_time:121688ms step_avg:60.69ms +step:2006/2285 train_time:121749ms step_avg:60.69ms +step:2007/2285 train_time:121810ms step_avg:60.69ms +step:2008/2285 train_time:121870ms step_avg:60.69ms +step:2009/2285 train_time:121932ms step_avg:60.69ms +step:2010/2285 train_time:121991ms step_avg:60.69ms +step:2011/2285 train_time:122053ms step_avg:60.69ms +step:2012/2285 train_time:122112ms step_avg:60.69ms +step:2013/2285 train_time:122174ms step_avg:60.69ms +step:2014/2285 train_time:122233ms step_avg:60.69ms +step:2015/2285 train_time:122295ms step_avg:60.69ms +step:2016/2285 train_time:122356ms step_avg:60.69ms +step:2017/2285 train_time:122420ms step_avg:60.69ms +step:2018/2285 train_time:122481ms step_avg:60.69ms +step:2019/2285 train_time:122545ms step_avg:60.70ms +step:2020/2285 train_time:122606ms step_avg:60.70ms +step:2021/2285 train_time:122669ms step_avg:60.70ms +step:2022/2285 train_time:122730ms step_avg:60.70ms +step:2023/2285 train_time:122792ms step_avg:60.70ms +step:2024/2285 train_time:122852ms step_avg:60.70ms +step:2025/2285 train_time:122914ms step_avg:60.70ms +step:2026/2285 train_time:122973ms step_avg:60.70ms +step:2027/2285 train_time:123035ms step_avg:60.70ms +step:2028/2285 train_time:123095ms step_avg:60.70ms +step:2029/2285 train_time:123156ms step_avg:60.70ms +step:2030/2285 train_time:123216ms step_avg:60.70ms +step:2031/2285 train_time:123278ms step_avg:60.70ms +step:2032/2285 train_time:123338ms step_avg:60.70ms +step:2033/2285 train_time:123401ms step_avg:60.70ms +step:2034/2285 train_time:123461ms step_avg:60.70ms +step:2035/2285 train_time:123524ms step_avg:60.70ms +step:2036/2285 train_time:123584ms step_avg:60.70ms +step:2037/2285 train_time:123647ms step_avg:60.70ms +step:2038/2285 train_time:123707ms step_avg:60.70ms +step:2039/2285 train_time:123770ms step_avg:60.70ms +step:2040/2285 train_time:123830ms step_avg:60.70ms +step:2041/2285 train_time:123893ms step_avg:60.70ms +step:2042/2285 train_time:123952ms step_avg:60.70ms +step:2043/2285 train_time:124014ms step_avg:60.70ms +step:2044/2285 train_time:124075ms step_avg:60.70ms +step:2045/2285 train_time:124137ms step_avg:60.70ms +step:2046/2285 train_time:124196ms step_avg:60.70ms +step:2047/2285 train_time:124258ms step_avg:60.70ms +step:2048/2285 train_time:124319ms step_avg:60.70ms +step:2049/2285 train_time:124382ms step_avg:60.70ms +step:2050/2285 train_time:124442ms step_avg:60.70ms +step:2051/2285 train_time:124505ms step_avg:60.70ms +step:2052/2285 train_time:124566ms step_avg:60.70ms +step:2053/2285 train_time:124629ms step_avg:60.71ms +step:2054/2285 train_time:124689ms step_avg:60.71ms +step:2055/2285 train_time:124752ms step_avg:60.71ms +step:2056/2285 train_time:124812ms step_avg:60.71ms +step:2057/2285 train_time:124875ms step_avg:60.71ms +step:2058/2285 train_time:124934ms step_avg:60.71ms +step:2059/2285 train_time:124996ms step_avg:60.71ms +step:2060/2285 train_time:125056ms step_avg:60.71ms +step:2061/2285 train_time:125118ms step_avg:60.71ms +step:2062/2285 train_time:125178ms step_avg:60.71ms +step:2063/2285 train_time:125240ms step_avg:60.71ms +step:2064/2285 train_time:125300ms step_avg:60.71ms +step:2065/2285 train_time:125363ms step_avg:60.71ms +step:2066/2285 train_time:125423ms step_avg:60.71ms +step:2067/2285 train_time:125486ms step_avg:60.71ms +step:2068/2285 train_time:125546ms step_avg:60.71ms +step:2069/2285 train_time:125610ms step_avg:60.71ms +step:2070/2285 train_time:125670ms 
step_avg:60.71ms +step:2071/2285 train_time:125732ms step_avg:60.71ms +step:2072/2285 train_time:125792ms step_avg:60.71ms +step:2073/2285 train_time:125854ms step_avg:60.71ms +step:2074/2285 train_time:125914ms step_avg:60.71ms +step:2075/2285 train_time:125977ms step_avg:60.71ms +step:2076/2285 train_time:126037ms step_avg:60.71ms +step:2077/2285 train_time:126099ms step_avg:60.71ms +step:2078/2285 train_time:126159ms step_avg:60.71ms +step:2079/2285 train_time:126221ms step_avg:60.71ms +step:2080/2285 train_time:126282ms step_avg:60.71ms +step:2081/2285 train_time:126344ms step_avg:60.71ms +step:2082/2285 train_time:126404ms step_avg:60.71ms +step:2083/2285 train_time:126466ms step_avg:60.71ms +step:2084/2285 train_time:126526ms step_avg:60.71ms +step:2085/2285 train_time:126589ms step_avg:60.71ms +step:2086/2285 train_time:126649ms step_avg:60.71ms +step:2087/2285 train_time:126712ms step_avg:60.71ms +step:2088/2285 train_time:126772ms step_avg:60.71ms +step:2089/2285 train_time:126835ms step_avg:60.72ms +step:2090/2285 train_time:126894ms step_avg:60.71ms +step:2091/2285 train_time:126956ms step_avg:60.72ms +step:2092/2285 train_time:127016ms step_avg:60.72ms +step:2093/2285 train_time:127079ms step_avg:60.72ms +step:2094/2285 train_time:127138ms step_avg:60.72ms +step:2095/2285 train_time:127201ms step_avg:60.72ms +step:2096/2285 train_time:127261ms step_avg:60.72ms +step:2097/2285 train_time:127324ms step_avg:60.72ms +step:2098/2285 train_time:127384ms step_avg:60.72ms +step:2099/2285 train_time:127447ms step_avg:60.72ms +step:2100/2285 train_time:127507ms step_avg:60.72ms +step:2101/2285 train_time:127569ms step_avg:60.72ms +step:2102/2285 train_time:127630ms step_avg:60.72ms +step:2103/2285 train_time:127692ms step_avg:60.72ms +step:2104/2285 train_time:127752ms step_avg:60.72ms +step:2105/2285 train_time:127814ms step_avg:60.72ms +step:2106/2285 train_time:127875ms step_avg:60.72ms +step:2107/2285 train_time:127937ms step_avg:60.72ms +step:2108/2285 train_time:127997ms step_avg:60.72ms +step:2109/2285 train_time:128059ms step_avg:60.72ms +step:2110/2285 train_time:128120ms step_avg:60.72ms +step:2111/2285 train_time:128182ms step_avg:60.72ms +step:2112/2285 train_time:128241ms step_avg:60.72ms +step:2113/2285 train_time:128304ms step_avg:60.72ms +step:2114/2285 train_time:128364ms step_avg:60.72ms +step:2115/2285 train_time:128426ms step_avg:60.72ms +step:2116/2285 train_time:128486ms step_avg:60.72ms +step:2117/2285 train_time:128549ms step_avg:60.72ms +step:2118/2285 train_time:128609ms step_avg:60.72ms +step:2119/2285 train_time:128671ms step_avg:60.72ms +step:2120/2285 train_time:128731ms step_avg:60.72ms +step:2121/2285 train_time:128794ms step_avg:60.72ms +step:2122/2285 train_time:128854ms step_avg:60.72ms +step:2123/2285 train_time:128917ms step_avg:60.72ms +step:2124/2285 train_time:128977ms step_avg:60.72ms +step:2125/2285 train_time:129039ms step_avg:60.72ms +step:2126/2285 train_time:129100ms step_avg:60.72ms +step:2127/2285 train_time:129163ms step_avg:60.73ms +step:2128/2285 train_time:129223ms step_avg:60.72ms +step:2129/2285 train_time:129285ms step_avg:60.73ms +step:2130/2285 train_time:129344ms step_avg:60.72ms +step:2131/2285 train_time:129407ms step_avg:60.73ms +step:2132/2285 train_time:129467ms step_avg:60.73ms +step:2133/2285 train_time:129530ms step_avg:60.73ms +step:2134/2285 train_time:129591ms step_avg:60.73ms +step:2135/2285 train_time:129652ms step_avg:60.73ms +step:2136/2285 train_time:129713ms step_avg:60.73ms +step:2137/2285 train_time:129775ms 
step_avg:60.73ms +step:2138/2285 train_time:129835ms step_avg:60.73ms +step:2139/2285 train_time:129898ms step_avg:60.73ms +step:2140/2285 train_time:129958ms step_avg:60.73ms +step:2141/2285 train_time:130020ms step_avg:60.73ms +step:2142/2285 train_time:130080ms step_avg:60.73ms +step:2143/2285 train_time:130142ms step_avg:60.73ms +step:2144/2285 train_time:130202ms step_avg:60.73ms +step:2145/2285 train_time:130265ms step_avg:60.73ms +step:2146/2285 train_time:130325ms step_avg:60.73ms +step:2147/2285 train_time:130387ms step_avg:60.73ms +step:2148/2285 train_time:130447ms step_avg:60.73ms +step:2149/2285 train_time:130509ms step_avg:60.73ms +step:2150/2285 train_time:130569ms step_avg:60.73ms +step:2151/2285 train_time:130632ms step_avg:60.73ms +step:2152/2285 train_time:130692ms step_avg:60.73ms +step:2153/2285 train_time:130754ms step_avg:60.73ms +step:2154/2285 train_time:130815ms step_avg:60.73ms +step:2155/2285 train_time:130877ms step_avg:60.73ms +step:2156/2285 train_time:130937ms step_avg:60.73ms +step:2157/2285 train_time:130999ms step_avg:60.73ms +step:2158/2285 train_time:131060ms step_avg:60.73ms +step:2159/2285 train_time:131123ms step_avg:60.73ms +step:2160/2285 train_time:131182ms step_avg:60.73ms +step:2161/2285 train_time:131245ms step_avg:60.73ms +step:2162/2285 train_time:131305ms step_avg:60.73ms +step:2163/2285 train_time:131368ms step_avg:60.73ms +step:2164/2285 train_time:131428ms step_avg:60.73ms +step:2165/2285 train_time:131490ms step_avg:60.73ms +step:2166/2285 train_time:131550ms step_avg:60.73ms +step:2167/2285 train_time:131612ms step_avg:60.73ms +step:2168/2285 train_time:131672ms step_avg:60.73ms +step:2169/2285 train_time:131734ms step_avg:60.73ms +step:2170/2285 train_time:131794ms step_avg:60.73ms +step:2171/2285 train_time:131856ms step_avg:60.74ms +step:2172/2285 train_time:131917ms step_avg:60.74ms +step:2173/2285 train_time:131979ms step_avg:60.74ms +step:2174/2285 train_time:132039ms step_avg:60.74ms +step:2175/2285 train_time:132102ms step_avg:60.74ms +step:2176/2285 train_time:132163ms step_avg:60.74ms +step:2177/2285 train_time:132225ms step_avg:60.74ms +step:2178/2285 train_time:132285ms step_avg:60.74ms +step:2179/2285 train_time:132347ms step_avg:60.74ms +step:2180/2285 train_time:132407ms step_avg:60.74ms +step:2181/2285 train_time:132470ms step_avg:60.74ms +step:2182/2285 train_time:132530ms step_avg:60.74ms +step:2183/2285 train_time:132592ms step_avg:60.74ms +step:2184/2285 train_time:132652ms step_avg:60.74ms +step:2185/2285 train_time:132715ms step_avg:60.74ms +step:2186/2285 train_time:132775ms step_avg:60.74ms +step:2187/2285 train_time:132837ms step_avg:60.74ms +step:2188/2285 train_time:132898ms step_avg:60.74ms +step:2189/2285 train_time:132960ms step_avg:60.74ms +step:2190/2285 train_time:133020ms step_avg:60.74ms +step:2191/2285 train_time:133083ms step_avg:60.74ms +step:2192/2285 train_time:133143ms step_avg:60.74ms +step:2193/2285 train_time:133205ms step_avg:60.74ms +step:2194/2285 train_time:133266ms step_avg:60.74ms +step:2195/2285 train_time:133328ms step_avg:60.74ms +step:2196/2285 train_time:133388ms step_avg:60.74ms +step:2197/2285 train_time:133450ms step_avg:60.74ms +step:2198/2285 train_time:133510ms step_avg:60.74ms +step:2199/2285 train_time:133573ms step_avg:60.74ms +step:2200/2285 train_time:133632ms step_avg:60.74ms +step:2201/2285 train_time:133694ms step_avg:60.74ms +step:2202/2285 train_time:133754ms step_avg:60.74ms +step:2203/2285 train_time:133817ms step_avg:60.74ms +step:2204/2285 train_time:133878ms 
step_avg:60.74ms +step:2205/2285 train_time:133940ms step_avg:60.74ms +step:2206/2285 train_time:134000ms step_avg:60.74ms +step:2207/2285 train_time:134063ms step_avg:60.74ms +step:2208/2285 train_time:134123ms step_avg:60.74ms +step:2209/2285 train_time:134186ms step_avg:60.75ms +step:2210/2285 train_time:134246ms step_avg:60.74ms +step:2211/2285 train_time:134309ms step_avg:60.75ms +step:2212/2285 train_time:134369ms step_avg:60.75ms +step:2213/2285 train_time:134432ms step_avg:60.75ms +step:2214/2285 train_time:134492ms step_avg:60.75ms +step:2215/2285 train_time:134555ms step_avg:60.75ms +step:2216/2285 train_time:134614ms step_avg:60.75ms +step:2217/2285 train_time:134677ms step_avg:60.75ms +step:2218/2285 train_time:134737ms step_avg:60.75ms +step:2219/2285 train_time:134799ms step_avg:60.75ms +step:2220/2285 train_time:134859ms step_avg:60.75ms +step:2221/2285 train_time:134922ms step_avg:60.75ms +step:2222/2285 train_time:134982ms step_avg:60.75ms +step:2223/2285 train_time:135044ms step_avg:60.75ms +step:2224/2285 train_time:135105ms step_avg:60.75ms +step:2225/2285 train_time:135167ms step_avg:60.75ms +step:2226/2285 train_time:135227ms step_avg:60.75ms +step:2227/2285 train_time:135289ms step_avg:60.75ms +step:2228/2285 train_time:135349ms step_avg:60.75ms +step:2229/2285 train_time:135411ms step_avg:60.75ms +step:2230/2285 train_time:135472ms step_avg:60.75ms +step:2231/2285 train_time:135534ms step_avg:60.75ms +step:2232/2285 train_time:135594ms step_avg:60.75ms +step:2233/2285 train_time:135656ms step_avg:60.75ms +step:2234/2285 train_time:135716ms step_avg:60.75ms +step:2235/2285 train_time:135778ms step_avg:60.75ms +step:2236/2285 train_time:135838ms step_avg:60.75ms +step:2237/2285 train_time:135901ms step_avg:60.75ms +step:2238/2285 train_time:135961ms step_avg:60.75ms +step:2239/2285 train_time:136023ms step_avg:60.75ms +step:2240/2285 train_time:136083ms step_avg:60.75ms +step:2241/2285 train_time:136145ms step_avg:60.75ms +step:2242/2285 train_time:136206ms step_avg:60.75ms +step:2243/2285 train_time:136268ms step_avg:60.75ms +step:2244/2285 train_time:136328ms step_avg:60.75ms +step:2245/2285 train_time:136391ms step_avg:60.75ms +step:2246/2285 train_time:136450ms step_avg:60.75ms +step:2247/2285 train_time:136513ms step_avg:60.75ms +step:2248/2285 train_time:136573ms step_avg:60.75ms +step:2249/2285 train_time:136635ms step_avg:60.75ms +step:2250/2285 train_time:136695ms step_avg:60.75ms +step:2250/2285 val_loss:3.2836 train_time:136759ms step_avg:60.78ms +step:2251/2285 train_time:136778ms step_avg:60.76ms +step:2252/2285 train_time:136819ms step_avg:60.75ms +step:2253/2285 train_time:136884ms step_avg:60.76ms +step:2254/2285 train_time:136945ms step_avg:60.76ms +step:2255/2285 train_time:137007ms step_avg:60.76ms +step:2256/2285 train_time:137067ms step_avg:60.76ms +step:2257/2285 train_time:137128ms step_avg:60.76ms +step:2258/2285 train_time:137188ms step_avg:60.76ms +step:2259/2285 train_time:137250ms step_avg:60.76ms +step:2260/2285 train_time:137309ms step_avg:60.76ms +step:2261/2285 train_time:137372ms step_avg:60.76ms +step:2262/2285 train_time:137432ms step_avg:60.76ms +step:2263/2285 train_time:137494ms step_avg:60.76ms +step:2264/2285 train_time:137553ms step_avg:60.76ms +step:2265/2285 train_time:137615ms step_avg:60.76ms +step:2266/2285 train_time:137675ms step_avg:60.76ms +step:2267/2285 train_time:137740ms step_avg:60.76ms +step:2268/2285 train_time:137802ms step_avg:60.76ms +step:2269/2285 train_time:137864ms step_avg:60.76ms +step:2270/2285 
train_time:137925ms step_avg:60.76ms +step:2271/2285 train_time:137988ms step_avg:60.76ms +step:2272/2285 train_time:138048ms step_avg:60.76ms +step:2273/2285 train_time:138110ms step_avg:60.76ms +step:2274/2285 train_time:138169ms step_avg:60.76ms +step:2275/2285 train_time:138231ms step_avg:60.76ms +step:2276/2285 train_time:138290ms step_avg:60.76ms +step:2277/2285 train_time:138352ms step_avg:60.76ms +step:2278/2285 train_time:138412ms step_avg:60.76ms +step:2279/2285 train_time:138474ms step_avg:60.76ms +step:2280/2285 train_time:138534ms step_avg:60.76ms +step:2281/2285 train_time:138596ms step_avg:60.76ms +step:2282/2285 train_time:138656ms step_avg:60.76ms +step:2283/2285 train_time:138720ms step_avg:60.76ms +step:2284/2285 train_time:138780ms step_avg:60.76ms +step:2285/2285 train_time:138843ms step_avg:60.76ms +step:2285/2285 val_loss:3.2776 train_time:138905ms step_avg:60.79ms +peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt b/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt new file mode 100644 index 000000000..628b9ca0e --- /dev/null +++ b/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt @@ -0,0 +1,3814 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
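+
+    Each iteration applies the odd matrix polynomial p(X) = a*X + b*(X@X.T)@X + c*((X@X.T)^2)@X,
+    whose coefficients push every singular value of X toward 1. The fused kernels below compute
+    it as A = X @ X.T, then B = b*A + c*A@A, then X <- a*X + B@X.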
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad + This hyper-optimized class has faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+ for group, info in zip(self.param_groups, group_infos):
+ info["reduce_future"].wait()
+
+ params = group["params"]
+ grad_chunk = info["grad_chunk"]
+ chunk_size = group["chunk_size"]
+ padded_num_params = chunk_size * self.world_size
+
+ start_idx = rank * chunk_size
+ module_idx = start_idx if start_idx < len(params) else 0
+
+ num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
+
+ if "momentum_buffer" not in group:
+ group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+ momentum_buffer = group["momentum_buffer"]
+ # Apply momentum update to the persistent momentum buffer in-place
+ momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+ updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+ grad_shape = updated_grads.shape
+ if params[module_idx].label == 'attn':
+ # Reshape each attn param from [hdim, dim*4] into four [hdim, dim] matrices
+ for p in params[module_idx:module_idx + num_params]:
+ assert p.label == 'attn'
+ updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+ ref_param = params[module_idx]
+ param_shape = ref_param.shape
+
+ if "second_momentum_buffer" not in group:
+ group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+ if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+ )
+ second_momentum_buffer = group["second_momentum_buffer"]
+
+ if "param_lr" not in group:
+ group["param_lr"] = (
+ max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+ * ref_param.new_tensor(
+ [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+ ).view(-1, 1, 1)
+ )
+
+ group["param_wd"] = ref_param.new_tensor(
+ [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+ ).view(-1, 1, 1)
+
+ # Determine effective LR and WD
+ eff_lr = group["lr"] * group["param_lr"]
+ eff_wd = group["weight_decay"] * group["param_wd"]
+
+ # Compute zeropower for the entire chunk in a single, batched call.
+ if num_params == 0:
+ v_chunk = updated_grads
+ elif params[module_idx].label == "smear_gate":
+ # dividing by the magnitude is equivalent to orthogonalization (NS / polar factor) for 1d tensors
+ v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+ else:
+ v_chunk = polar_express(updated_grads)
+
+ # NorMuon: second_momentum_buffer tracks the mean squared entries of the orthogonalized update along one dim (https://arxiv.org/pdf/2510.05491)
+ v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+ v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+ second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+ step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+ v_chunk.mul_(step_size)
+ v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+ v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+ v_chunk = v_chunk.view(grad_shape)
+
+ updated_params = torch.empty_like(grad_chunk)
+ param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+ # Apply weight decay directly to the buffer.
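+ # Decoupled (AdamW-style) decay: p <- (1 - wd*wd_mul) * p, applied before the -eff_lr * v step.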
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
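+ # (Muon scales lr by max(1, rows/cols)**0.5: stored as (dim, 4*dim) this factor is 1,
+ # while the logical (4*dim, dim) up-projection would get sqrt(4) = 2, hence lr_mul = 2.)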
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
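+ # lr_mul / wd_mul are read by DistAdam and Muon via getattr(param, "lr_mul", 1.0) /
+ # getattr(param, "wd_mul", 1.0) and scale that param's base lr / weight decay.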
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
+ def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+ # Precompute BOS positions once per shard
+ self.tokens = tokens
+ self.size = tokens.numel()
+ self.quickload = quickload
+ if quickload:
+ # only scan first 4 million tokens, then kick off async thread to scan rest
+ self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+ self.thread = None
+ self.ready = threading.Event()
+ self.start()
+ else:
+ self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+ self.i = 0
+ self.world_size = world_size
+ self.batch_iter = 0
+
+ def _load(self):
+ self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+ self.ready.set()
+
+ def start(self):
+ self.ready.clear()
+ self.thread = threading.Thread(target=self._load)
+ self.thread.start()
+
+ def get(self):
+ if self.thread:
+ self.ready.wait()
+ self.thread.join()
+ self.bos_idx = self.bos_idx_async
+
+ def next_batch(self, num_tokens_local: int, max_seq_len: int):
+ # if quickload was used, repoint to the full dataset after 5 batches
+ if self.quickload and self.batch_iter == 5:
+ self.get()
+ n = len(self.bos_idx)
+ starts = [[] for _ in range(self.world_size)]
+ ends = [[] for _ in range(self.world_size)]
+
+ idx = self.i
+ for r in range(self.world_size):
+ cur_len = 0
+ while cur_len <= num_tokens_local:
+ if idx >= n:
+ raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
+ cur = self.bos_idx[idx]
+ starts[r].append(cur)
+ end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+ cur + max_seq_len,
+ cur + num_tokens_local - cur_len + 1)
+ ends[r].append(end)
+ cur_len += end - cur
+ idx += 1
+
+ assert cur_len == num_tokens_local + 1
+ self.i = idx
+ self.batch_iter += 1
+ return starts, ends
+
+class DataPreloader:
+ # Helper for asynchronously loading next shard and indexing bos tokens
+ def __init__(self, file_iter, world_size: int = 1):
+ self.file_iter = file_iter
+ self.world_size = world_size
+ self.thread = None
+ self.data = None
+ self.ready = threading.Event()
+
+ def _load(self):
+ tokens = _load_data_shard(next(self.file_iter))
+ self.data = (tokens, BOSFinder(tokens, self.world_size))
+ self.ready.set()
+
+ def start(self):
+ self.ready.clear()
+ self.thread = threading.Thread(target=self._load)
+ self.thread.start()
+
+ def get(self):
+ if self.thread:
+ self.ready.wait()
+ self.thread.join()
+ return self.data
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+ # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
+ rank = dist.get_rank() if dist.is_initialized() else 0
+ world_size = dist.get_world_size() if dist.is_initialized() else 1
+ assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+ num_tokens = num_tokens // grad_accum_steps
+
+ files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+ if not files:
+ raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+ file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training
+ tokens = _load_data_shard(next(file_iter))
+ if align_to_bos:
+ finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+ preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 02:13:13 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 40C P0 128W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 127W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 38C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 39C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 38C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 114W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2285 train_time:120ms step_avg:119.52ms +step:2/2285 train_time:140ms step_avg:70.10ms +step:3/2285 train_time:179ms step_avg:59.70ms +step:4/2285 train_time:235ms step_avg:58.79ms +step:5/2285 train_time:294ms step_avg:58.87ms +step:6/2285 train_time:353ms step_avg:58.79ms +step:7/2285 train_time:414ms step_avg:59.12ms +step:8/2285 train_time:472ms step_avg:59.00ms +step:9/2285 train_time:532ms step_avg:59.16ms +step:10/2285 train_time:591ms step_avg:59.06ms +step:11/2285 train_time:651ms step_avg:59.21ms +step:12/2285 train_time:710ms step_avg:59.14ms +step:13/2285 train_time:770ms step_avg:59.22ms +step:14/2285 train_time:828ms step_avg:59.16ms +step:15/2285 train_time:889ms step_avg:59.24ms +step:16/2285 train_time:947ms step_avg:59.19ms +step:17/2285 
train_time:1009ms step_avg:59.33ms +step:18/2285 train_time:1071ms step_avg:59.49ms +step:19/2285 train_time:1137ms step_avg:59.82ms +step:20/2285 train_time:1198ms step_avg:59.89ms +step:21/2285 train_time:1259ms step_avg:59.96ms +step:22/2285 train_time:1318ms step_avg:59.91ms +step:23/2285 train_time:1379ms step_avg:59.97ms +step:24/2285 train_time:1438ms step_avg:59.91ms +step:25/2285 train_time:1499ms step_avg:59.95ms +step:26/2285 train_time:1558ms step_avg:59.92ms +step:27/2285 train_time:1619ms step_avg:59.95ms +step:28/2285 train_time:1678ms step_avg:59.91ms +step:29/2285 train_time:1739ms step_avg:59.95ms +step:30/2285 train_time:1797ms step_avg:59.91ms +step:31/2285 train_time:1858ms step_avg:59.95ms +step:32/2285 train_time:1917ms step_avg:59.90ms +step:33/2285 train_time:1978ms step_avg:59.94ms +step:34/2285 train_time:2038ms step_avg:59.93ms +step:35/2285 train_time:2101ms step_avg:60.03ms +step:36/2285 train_time:2162ms step_avg:60.04ms +step:37/2285 train_time:2223ms step_avg:60.09ms +step:38/2285 train_time:2283ms step_avg:60.08ms +step:39/2285 train_time:2345ms step_avg:60.12ms +step:40/2285 train_time:2403ms step_avg:60.08ms +step:41/2285 train_time:2465ms step_avg:60.13ms +step:42/2285 train_time:2524ms step_avg:60.10ms +step:43/2285 train_time:2586ms step_avg:60.13ms +step:44/2285 train_time:2645ms step_avg:60.11ms +step:45/2285 train_time:2706ms step_avg:60.13ms +step:46/2285 train_time:2765ms step_avg:60.11ms +step:47/2285 train_time:2826ms step_avg:60.14ms +step:48/2285 train_time:2886ms step_avg:60.12ms +step:49/2285 train_time:2948ms step_avg:60.16ms +step:50/2285 train_time:3007ms step_avg:60.14ms +step:51/2285 train_time:3070ms step_avg:60.20ms +step:52/2285 train_time:3130ms step_avg:60.19ms +step:53/2285 train_time:3191ms step_avg:60.21ms +step:54/2285 train_time:3250ms step_avg:60.19ms +step:55/2285 train_time:3312ms step_avg:60.21ms +step:56/2285 train_time:3371ms step_avg:60.19ms +step:57/2285 train_time:3432ms step_avg:60.21ms +step:58/2285 train_time:3491ms step_avg:60.19ms +step:59/2285 train_time:3553ms step_avg:60.23ms +step:60/2285 train_time:3612ms step_avg:60.20ms +step:61/2285 train_time:3673ms step_avg:60.21ms +step:62/2285 train_time:3731ms step_avg:60.18ms +step:63/2285 train_time:3792ms step_avg:60.20ms +step:64/2285 train_time:3852ms step_avg:60.18ms +step:65/2285 train_time:3913ms step_avg:60.20ms +step:66/2285 train_time:3972ms step_avg:60.18ms +step:67/2285 train_time:4033ms step_avg:60.19ms +step:68/2285 train_time:4092ms step_avg:60.18ms +step:69/2285 train_time:4154ms step_avg:60.20ms +step:70/2285 train_time:4213ms step_avg:60.18ms +step:71/2285 train_time:4274ms step_avg:60.20ms +step:72/2285 train_time:4332ms step_avg:60.17ms +step:73/2285 train_time:4394ms step_avg:60.19ms +step:74/2285 train_time:4454ms step_avg:60.19ms +step:75/2285 train_time:4514ms step_avg:60.18ms +step:76/2285 train_time:4572ms step_avg:60.16ms +step:77/2285 train_time:4634ms step_avg:60.18ms +step:78/2285 train_time:4693ms step_avg:60.17ms +step:79/2285 train_time:4755ms step_avg:60.19ms +step:80/2285 train_time:4814ms step_avg:60.17ms +step:81/2285 train_time:4876ms step_avg:60.20ms +step:82/2285 train_time:4935ms step_avg:60.18ms +step:83/2285 train_time:4996ms step_avg:60.19ms +step:84/2285 train_time:5055ms step_avg:60.18ms +step:85/2285 train_time:5116ms step_avg:60.19ms +step:86/2285 train_time:5174ms step_avg:60.17ms +step:87/2285 train_time:5236ms step_avg:60.18ms +step:88/2285 train_time:5294ms step_avg:60.16ms +step:89/2285 train_time:5355ms 
step_avg:60.17ms
+step:90/2285 train_time:5414ms step_avg:60.16ms
[steps 91-249: train_time 5475ms→14948ms, step_avg 60.02-60.16ms]
+step:250/2285 train_time:15007ms step_avg:60.03ms
+step:250/2285 val_loss:4.0735 train_time:15069ms step_avg:60.28ms
[steps 251-499: train_time 15089ms→29945ms, step_avg 59.97-60.11ms]
+step:500/2285 train_time:30004ms step_avg:60.01ms
+step:500/2285 val_loss:3.7835 train_time:30067ms step_avg:60.13ms
[steps 501-749: train_time 30085ms→45035ms, step_avg 60.01-60.13ms]
+step:750/2285 train_time:45095ms step_avg:60.13ms
+step:750/2285 val_loss:3.6604 train_time:45158ms step_avg:60.21ms
[steps 751-999: train_time 45177ms→60222ms, step_avg 60.13-60.28ms]
+step:1000/2285 train_time:60281ms step_avg:60.28ms
+step:1000/2285 val_loss:3.5649 train_time:60345ms step_avg:60.35ms
[steps 1001-1249: train_time 60364ms→75458ms, step_avg 60.29-60.41ms]
+step:1250/2285 train_time:75517ms step_avg:60.41ms
+step:1250/2285 val_loss:3.4939 train_time:75581ms step_avg:60.46ms
[steps 1251-1499: train_time 75600ms→90718ms, step_avg 60.41-60.52ms]
+step:1500/2285 train_time:90778ms step_avg:60.52ms
+step:1500/2285 val_loss:3.4262 train_time:90841ms step_avg:60.56ms
[steps 1501-1534: train_time 90860ms→92859ms, step_avg 60.52-60.53ms]
+step:1535/2285
train_time:92922ms step_avg:60.54ms +step:1536/2285 train_time:92983ms step_avg:60.54ms +step:1537/2285 train_time:93047ms step_avg:60.54ms +step:1538/2285 train_time:93107ms step_avg:60.54ms +step:1539/2285 train_time:93168ms step_avg:60.54ms +step:1540/2285 train_time:93228ms step_avg:60.54ms +step:1541/2285 train_time:93290ms step_avg:60.54ms +step:1542/2285 train_time:93350ms step_avg:60.54ms +step:1543/2285 train_time:93412ms step_avg:60.54ms +step:1544/2285 train_time:93471ms step_avg:60.54ms +step:1545/2285 train_time:93533ms step_avg:60.54ms +step:1546/2285 train_time:93593ms step_avg:60.54ms +step:1547/2285 train_time:93655ms step_avg:60.54ms +step:1548/2285 train_time:93715ms step_avg:60.54ms +step:1549/2285 train_time:93778ms step_avg:60.54ms +step:1550/2285 train_time:93838ms step_avg:60.54ms +step:1551/2285 train_time:93901ms step_avg:60.54ms +step:1552/2285 train_time:93962ms step_avg:60.54ms +step:1553/2285 train_time:94026ms step_avg:60.54ms +step:1554/2285 train_time:94086ms step_avg:60.54ms +step:1555/2285 train_time:94148ms step_avg:60.55ms +step:1556/2285 train_time:94207ms step_avg:60.54ms +step:1557/2285 train_time:94270ms step_avg:60.55ms +step:1558/2285 train_time:94330ms step_avg:60.55ms +step:1559/2285 train_time:94391ms step_avg:60.55ms +step:1560/2285 train_time:94451ms step_avg:60.55ms +step:1561/2285 train_time:94512ms step_avg:60.55ms +step:1562/2285 train_time:94572ms step_avg:60.55ms +step:1563/2285 train_time:94634ms step_avg:60.55ms +step:1564/2285 train_time:94694ms step_avg:60.55ms +step:1565/2285 train_time:94756ms step_avg:60.55ms +step:1566/2285 train_time:94816ms step_avg:60.55ms +step:1567/2285 train_time:94880ms step_avg:60.55ms +step:1568/2285 train_time:94940ms step_avg:60.55ms +step:1569/2285 train_time:95003ms step_avg:60.55ms +step:1570/2285 train_time:95064ms step_avg:60.55ms +step:1571/2285 train_time:95126ms step_avg:60.55ms +step:1572/2285 train_time:95186ms step_avg:60.55ms +step:1573/2285 train_time:95248ms step_avg:60.55ms +step:1574/2285 train_time:95308ms step_avg:60.55ms +step:1575/2285 train_time:95371ms step_avg:60.55ms +step:1576/2285 train_time:95430ms step_avg:60.55ms +step:1577/2285 train_time:95492ms step_avg:60.55ms +step:1578/2285 train_time:95552ms step_avg:60.55ms +step:1579/2285 train_time:95614ms step_avg:60.55ms +step:1580/2285 train_time:95674ms step_avg:60.55ms +step:1581/2285 train_time:95736ms step_avg:60.55ms +step:1582/2285 train_time:95796ms step_avg:60.55ms +step:1583/2285 train_time:95859ms step_avg:60.56ms +step:1584/2285 train_time:95919ms step_avg:60.56ms +step:1585/2285 train_time:95982ms step_avg:60.56ms +step:1586/2285 train_time:96043ms step_avg:60.56ms +step:1587/2285 train_time:96105ms step_avg:60.56ms +step:1588/2285 train_time:96164ms step_avg:60.56ms +step:1589/2285 train_time:96227ms step_avg:60.56ms +step:1590/2285 train_time:96286ms step_avg:60.56ms +step:1591/2285 train_time:96348ms step_avg:60.56ms +step:1592/2285 train_time:96408ms step_avg:60.56ms +step:1593/2285 train_time:96471ms step_avg:60.56ms +step:1594/2285 train_time:96530ms step_avg:60.56ms +step:1595/2285 train_time:96593ms step_avg:60.56ms +step:1596/2285 train_time:96652ms step_avg:60.56ms +step:1597/2285 train_time:96714ms step_avg:60.56ms +step:1598/2285 train_time:96774ms step_avg:60.56ms +step:1599/2285 train_time:96837ms step_avg:60.56ms +step:1600/2285 train_time:96898ms step_avg:60.56ms +step:1601/2285 train_time:96961ms step_avg:60.56ms +step:1602/2285 train_time:97020ms step_avg:60.56ms +step:1603/2285 train_time:97083ms 
step_avg:60.56ms +step:1604/2285 train_time:97144ms step_avg:60.56ms +step:1605/2285 train_time:97207ms step_avg:60.56ms +step:1606/2285 train_time:97266ms step_avg:60.56ms +step:1607/2285 train_time:97329ms step_avg:60.57ms +step:1608/2285 train_time:97388ms step_avg:60.56ms +step:1609/2285 train_time:97450ms step_avg:60.57ms +step:1610/2285 train_time:97510ms step_avg:60.57ms +step:1611/2285 train_time:97572ms step_avg:60.57ms +step:1612/2285 train_time:97632ms step_avg:60.57ms +step:1613/2285 train_time:97694ms step_avg:60.57ms +step:1614/2285 train_time:97754ms step_avg:60.57ms +step:1615/2285 train_time:97817ms step_avg:60.57ms +step:1616/2285 train_time:97877ms step_avg:60.57ms +step:1617/2285 train_time:97939ms step_avg:60.57ms +step:1618/2285 train_time:97999ms step_avg:60.57ms +step:1619/2285 train_time:98062ms step_avg:60.57ms +step:1620/2285 train_time:98122ms step_avg:60.57ms +step:1621/2285 train_time:98185ms step_avg:60.57ms +step:1622/2285 train_time:98245ms step_avg:60.57ms +step:1623/2285 train_time:98307ms step_avg:60.57ms +step:1624/2285 train_time:98367ms step_avg:60.57ms +step:1625/2285 train_time:98430ms step_avg:60.57ms +step:1626/2285 train_time:98490ms step_avg:60.57ms +step:1627/2285 train_time:98552ms step_avg:60.57ms +step:1628/2285 train_time:98611ms step_avg:60.57ms +step:1629/2285 train_time:98673ms step_avg:60.57ms +step:1630/2285 train_time:98733ms step_avg:60.57ms +step:1631/2285 train_time:98795ms step_avg:60.57ms +step:1632/2285 train_time:98855ms step_avg:60.57ms +step:1633/2285 train_time:98918ms step_avg:60.57ms +step:1634/2285 train_time:98977ms step_avg:60.57ms +step:1635/2285 train_time:99039ms step_avg:60.57ms +step:1636/2285 train_time:99100ms step_avg:60.57ms +step:1637/2285 train_time:99162ms step_avg:60.58ms +step:1638/2285 train_time:99224ms step_avg:60.58ms +step:1639/2285 train_time:99287ms step_avg:60.58ms +step:1640/2285 train_time:99346ms step_avg:60.58ms +step:1641/2285 train_time:99408ms step_avg:60.58ms +step:1642/2285 train_time:99468ms step_avg:60.58ms +step:1643/2285 train_time:99531ms step_avg:60.58ms +step:1644/2285 train_time:99591ms step_avg:60.58ms +step:1645/2285 train_time:99653ms step_avg:60.58ms +step:1646/2285 train_time:99712ms step_avg:60.58ms +step:1647/2285 train_time:99774ms step_avg:60.58ms +step:1648/2285 train_time:99834ms step_avg:60.58ms +step:1649/2285 train_time:99896ms step_avg:60.58ms +step:1650/2285 train_time:99957ms step_avg:60.58ms +step:1651/2285 train_time:100019ms step_avg:60.58ms +step:1652/2285 train_time:100079ms step_avg:60.58ms +step:1653/2285 train_time:100142ms step_avg:60.58ms +step:1654/2285 train_time:100202ms step_avg:60.58ms +step:1655/2285 train_time:100265ms step_avg:60.58ms +step:1656/2285 train_time:100325ms step_avg:60.58ms +step:1657/2285 train_time:100389ms step_avg:60.58ms +step:1658/2285 train_time:100449ms step_avg:60.58ms +step:1659/2285 train_time:100511ms step_avg:60.59ms +step:1660/2285 train_time:100571ms step_avg:60.59ms +step:1661/2285 train_time:100633ms step_avg:60.59ms +step:1662/2285 train_time:100693ms step_avg:60.59ms +step:1663/2285 train_time:100755ms step_avg:60.59ms +step:1664/2285 train_time:100816ms step_avg:60.59ms +step:1665/2285 train_time:100877ms step_avg:60.59ms +step:1666/2285 train_time:100937ms step_avg:60.59ms +step:1667/2285 train_time:101000ms step_avg:60.59ms +step:1668/2285 train_time:101059ms step_avg:60.59ms +step:1669/2285 train_time:101122ms step_avg:60.59ms +step:1670/2285 train_time:101182ms step_avg:60.59ms +step:1671/2285 
train_time:101245ms step_avg:60.59ms +step:1672/2285 train_time:101304ms step_avg:60.59ms +step:1673/2285 train_time:101367ms step_avg:60.59ms +step:1674/2285 train_time:101427ms step_avg:60.59ms +step:1675/2285 train_time:101490ms step_avg:60.59ms +step:1676/2285 train_time:101550ms step_avg:60.59ms +step:1677/2285 train_time:101612ms step_avg:60.59ms +step:1678/2285 train_time:101672ms step_avg:60.59ms +step:1679/2285 train_time:101734ms step_avg:60.59ms +step:1680/2285 train_time:101793ms step_avg:60.59ms +step:1681/2285 train_time:101856ms step_avg:60.59ms +step:1682/2285 train_time:101916ms step_avg:60.59ms +step:1683/2285 train_time:101978ms step_avg:60.59ms +step:1684/2285 train_time:102038ms step_avg:60.59ms +step:1685/2285 train_time:102102ms step_avg:60.59ms +step:1686/2285 train_time:102162ms step_avg:60.59ms +step:1687/2285 train_time:102224ms step_avg:60.60ms +step:1688/2285 train_time:102285ms step_avg:60.60ms +step:1689/2285 train_time:102348ms step_avg:60.60ms +step:1690/2285 train_time:102407ms step_avg:60.60ms +step:1691/2285 train_time:102470ms step_avg:60.60ms +step:1692/2285 train_time:102530ms step_avg:60.60ms +step:1693/2285 train_time:102592ms step_avg:60.60ms +step:1694/2285 train_time:102651ms step_avg:60.60ms +step:1695/2285 train_time:102714ms step_avg:60.60ms +step:1696/2285 train_time:102773ms step_avg:60.60ms +step:1697/2285 train_time:102835ms step_avg:60.60ms +step:1698/2285 train_time:102895ms step_avg:60.60ms +step:1699/2285 train_time:102957ms step_avg:60.60ms +step:1700/2285 train_time:103019ms step_avg:60.60ms +step:1701/2285 train_time:103080ms step_avg:60.60ms +step:1702/2285 train_time:103140ms step_avg:60.60ms +step:1703/2285 train_time:103203ms step_avg:60.60ms +step:1704/2285 train_time:103263ms step_avg:60.60ms +step:1705/2285 train_time:103327ms step_avg:60.60ms +step:1706/2285 train_time:103387ms step_avg:60.60ms +step:1707/2285 train_time:103450ms step_avg:60.60ms +step:1708/2285 train_time:103510ms step_avg:60.60ms +step:1709/2285 train_time:103573ms step_avg:60.60ms +step:1710/2285 train_time:103633ms step_avg:60.60ms +step:1711/2285 train_time:103695ms step_avg:60.60ms +step:1712/2285 train_time:103754ms step_avg:60.60ms +step:1713/2285 train_time:103817ms step_avg:60.61ms +step:1714/2285 train_time:103877ms step_avg:60.60ms +step:1715/2285 train_time:103939ms step_avg:60.61ms +step:1716/2285 train_time:104000ms step_avg:60.61ms +step:1717/2285 train_time:104061ms step_avg:60.61ms +step:1718/2285 train_time:104121ms step_avg:60.61ms +step:1719/2285 train_time:104184ms step_avg:60.61ms +step:1720/2285 train_time:104244ms step_avg:60.61ms +step:1721/2285 train_time:104307ms step_avg:60.61ms +step:1722/2285 train_time:104367ms step_avg:60.61ms +step:1723/2285 train_time:104429ms step_avg:60.61ms +step:1724/2285 train_time:104490ms step_avg:60.61ms +step:1725/2285 train_time:104552ms step_avg:60.61ms +step:1726/2285 train_time:104612ms step_avg:60.61ms +step:1727/2285 train_time:104674ms step_avg:60.61ms +step:1728/2285 train_time:104733ms step_avg:60.61ms +step:1729/2285 train_time:104795ms step_avg:60.61ms +step:1730/2285 train_time:104854ms step_avg:60.61ms +step:1731/2285 train_time:104917ms step_avg:60.61ms +step:1732/2285 train_time:104977ms step_avg:60.61ms +step:1733/2285 train_time:105039ms step_avg:60.61ms +step:1734/2285 train_time:105100ms step_avg:60.61ms +step:1735/2285 train_time:105162ms step_avg:60.61ms +step:1736/2285 train_time:105222ms step_avg:60.61ms +step:1737/2285 train_time:105286ms step_avg:60.61ms +step:1738/2285 
train_time:105346ms step_avg:60.61ms +step:1739/2285 train_time:105408ms step_avg:60.61ms +step:1740/2285 train_time:105468ms step_avg:60.61ms +step:1741/2285 train_time:105530ms step_avg:60.61ms +step:1742/2285 train_time:105591ms step_avg:60.61ms +step:1743/2285 train_time:105653ms step_avg:60.62ms +step:1744/2285 train_time:105712ms step_avg:60.61ms +step:1745/2285 train_time:105774ms step_avg:60.62ms +step:1746/2285 train_time:105834ms step_avg:60.61ms +step:1747/2285 train_time:105896ms step_avg:60.62ms +step:1748/2285 train_time:105956ms step_avg:60.62ms +step:1749/2285 train_time:106019ms step_avg:60.62ms +step:1750/2285 train_time:106079ms step_avg:60.62ms +step:1750/2285 val_loss:3.3662 train_time:106143ms step_avg:60.65ms +step:1751/2285 train_time:106162ms step_avg:60.63ms +step:1752/2285 train_time:106206ms step_avg:60.62ms +step:1753/2285 train_time:106269ms step_avg:60.62ms +step:1754/2285 train_time:106330ms step_avg:60.62ms +step:1755/2285 train_time:106393ms step_avg:60.62ms +step:1756/2285 train_time:106453ms step_avg:60.62ms +step:1757/2285 train_time:106514ms step_avg:60.62ms +step:1758/2285 train_time:106574ms step_avg:60.62ms +step:1759/2285 train_time:106635ms step_avg:60.62ms +step:1760/2285 train_time:106694ms step_avg:60.62ms +step:1761/2285 train_time:106756ms step_avg:60.62ms +step:1762/2285 train_time:106815ms step_avg:60.62ms +step:1763/2285 train_time:106876ms step_avg:60.62ms +step:1764/2285 train_time:106936ms step_avg:60.62ms +step:1765/2285 train_time:106998ms step_avg:60.62ms +step:1766/2285 train_time:107060ms step_avg:60.62ms +step:1767/2285 train_time:107125ms step_avg:60.63ms +step:1768/2285 train_time:107185ms step_avg:60.63ms +step:1769/2285 train_time:107247ms step_avg:60.63ms +step:1770/2285 train_time:107307ms step_avg:60.63ms +step:1771/2285 train_time:107370ms step_avg:60.63ms +step:1772/2285 train_time:107430ms step_avg:60.63ms +step:1773/2285 train_time:107492ms step_avg:60.63ms +step:1774/2285 train_time:107551ms step_avg:60.63ms +step:1775/2285 train_time:107614ms step_avg:60.63ms +step:1776/2285 train_time:107673ms step_avg:60.63ms +step:1777/2285 train_time:107735ms step_avg:60.63ms +step:1778/2285 train_time:107794ms step_avg:60.63ms +step:1779/2285 train_time:107856ms step_avg:60.63ms +step:1780/2285 train_time:107915ms step_avg:60.63ms +step:1781/2285 train_time:107977ms step_avg:60.63ms +step:1782/2285 train_time:108038ms step_avg:60.63ms +step:1783/2285 train_time:108102ms step_avg:60.63ms +step:1784/2285 train_time:108162ms step_avg:60.63ms +step:1785/2285 train_time:108225ms step_avg:60.63ms +step:1786/2285 train_time:108285ms step_avg:60.63ms +step:1787/2285 train_time:108347ms step_avg:60.63ms +step:1788/2285 train_time:108407ms step_avg:60.63ms +step:1789/2285 train_time:108469ms step_avg:60.63ms +step:1790/2285 train_time:108529ms step_avg:60.63ms +step:1791/2285 train_time:108591ms step_avg:60.63ms +step:1792/2285 train_time:108651ms step_avg:60.63ms +step:1793/2285 train_time:108713ms step_avg:60.63ms +step:1794/2285 train_time:108772ms step_avg:60.63ms +step:1795/2285 train_time:108834ms step_avg:60.63ms +step:1796/2285 train_time:108894ms step_avg:60.63ms +step:1797/2285 train_time:108956ms step_avg:60.63ms +step:1798/2285 train_time:109016ms step_avg:60.63ms +step:1799/2285 train_time:109079ms step_avg:60.63ms +step:1800/2285 train_time:109139ms step_avg:60.63ms +step:1801/2285 train_time:109203ms step_avg:60.63ms +step:1802/2285 train_time:109263ms step_avg:60.63ms +step:1803/2285 train_time:109325ms step_avg:60.63ms 
+step:1804/2285 train_time:109385ms step_avg:60.63ms +step:1805/2285 train_time:109446ms step_avg:60.64ms +step:1806/2285 train_time:109506ms step_avg:60.63ms +step:1807/2285 train_time:109568ms step_avg:60.64ms +step:1808/2285 train_time:109628ms step_avg:60.63ms +step:1809/2285 train_time:109690ms step_avg:60.64ms +step:1810/2285 train_time:109750ms step_avg:60.64ms +step:1811/2285 train_time:109813ms step_avg:60.64ms +step:1812/2285 train_time:109873ms step_avg:60.64ms +step:1813/2285 train_time:109936ms step_avg:60.64ms +step:1814/2285 train_time:109995ms step_avg:60.64ms +step:1815/2285 train_time:110058ms step_avg:60.64ms +step:1816/2285 train_time:110119ms step_avg:60.64ms +step:1817/2285 train_time:110182ms step_avg:60.64ms +step:1818/2285 train_time:110241ms step_avg:60.64ms +step:1819/2285 train_time:110304ms step_avg:60.64ms +step:1820/2285 train_time:110363ms step_avg:60.64ms +step:1821/2285 train_time:110425ms step_avg:60.64ms +step:1822/2285 train_time:110485ms step_avg:60.64ms +step:1823/2285 train_time:110547ms step_avg:60.64ms +step:1824/2285 train_time:110607ms step_avg:60.64ms +step:1825/2285 train_time:110669ms step_avg:60.64ms +step:1826/2285 train_time:110730ms step_avg:60.64ms +step:1827/2285 train_time:110792ms step_avg:60.64ms +step:1828/2285 train_time:110852ms step_avg:60.64ms +step:1829/2285 train_time:110915ms step_avg:60.64ms +step:1830/2285 train_time:110975ms step_avg:60.64ms +step:1831/2285 train_time:111037ms step_avg:60.64ms +step:1832/2285 train_time:111097ms step_avg:60.64ms +step:1833/2285 train_time:111160ms step_avg:60.64ms +step:1834/2285 train_time:111220ms step_avg:60.64ms +step:1835/2285 train_time:111282ms step_avg:60.64ms +step:1836/2285 train_time:111342ms step_avg:60.64ms +step:1837/2285 train_time:111404ms step_avg:60.64ms +step:1838/2285 train_time:111464ms step_avg:60.64ms +step:1839/2285 train_time:111526ms step_avg:60.64ms +step:1840/2285 train_time:111586ms step_avg:60.64ms +step:1841/2285 train_time:111648ms step_avg:60.65ms +step:1842/2285 train_time:111709ms step_avg:60.65ms +step:1843/2285 train_time:111771ms step_avg:60.65ms +step:1844/2285 train_time:111832ms step_avg:60.65ms +step:1845/2285 train_time:111894ms step_avg:60.65ms +step:1846/2285 train_time:111954ms step_avg:60.65ms +step:1847/2285 train_time:112017ms step_avg:60.65ms +step:1848/2285 train_time:112077ms step_avg:60.65ms +step:1849/2285 train_time:112140ms step_avg:60.65ms +step:1850/2285 train_time:112199ms step_avg:60.65ms +step:1851/2285 train_time:112262ms step_avg:60.65ms +step:1852/2285 train_time:112321ms step_avg:60.65ms +step:1853/2285 train_time:112384ms step_avg:60.65ms +step:1854/2285 train_time:112443ms step_avg:60.65ms +step:1855/2285 train_time:112505ms step_avg:60.65ms +step:1856/2285 train_time:112565ms step_avg:60.65ms +step:1857/2285 train_time:112627ms step_avg:60.65ms +step:1858/2285 train_time:112687ms step_avg:60.65ms +step:1859/2285 train_time:112750ms step_avg:60.65ms +step:1860/2285 train_time:112810ms step_avg:60.65ms +step:1861/2285 train_time:112872ms step_avg:60.65ms +step:1862/2285 train_time:112933ms step_avg:60.65ms +step:1863/2285 train_time:112995ms step_avg:60.65ms +step:1864/2285 train_time:113055ms step_avg:60.65ms +step:1865/2285 train_time:113117ms step_avg:60.65ms +step:1866/2285 train_time:113177ms step_avg:60.65ms +step:1867/2285 train_time:113240ms step_avg:60.65ms +step:1868/2285 train_time:113300ms step_avg:60.65ms +step:1869/2285 train_time:113362ms step_avg:60.65ms +step:1870/2285 train_time:113422ms step_avg:60.65ms 
+step:1871/2285 train_time:113484ms step_avg:60.65ms +step:1872/2285 train_time:113543ms step_avg:60.65ms +step:1873/2285 train_time:113605ms step_avg:60.65ms +step:1874/2285 train_time:113666ms step_avg:60.65ms +step:1875/2285 train_time:113728ms step_avg:60.66ms +step:1876/2285 train_time:113788ms step_avg:60.65ms +step:1877/2285 train_time:113850ms step_avg:60.66ms +step:1878/2285 train_time:113911ms step_avg:60.66ms +step:1879/2285 train_time:113975ms step_avg:60.66ms +step:1880/2285 train_time:114035ms step_avg:60.66ms +step:1881/2285 train_time:114097ms step_avg:60.66ms +step:1882/2285 train_time:114157ms step_avg:60.66ms +step:1883/2285 train_time:114220ms step_avg:60.66ms +step:1884/2285 train_time:114280ms step_avg:60.66ms +step:1885/2285 train_time:114342ms step_avg:60.66ms +step:1886/2285 train_time:114401ms step_avg:60.66ms +step:1887/2285 train_time:114463ms step_avg:60.66ms +step:1888/2285 train_time:114523ms step_avg:60.66ms +step:1889/2285 train_time:114586ms step_avg:60.66ms +step:1890/2285 train_time:114646ms step_avg:60.66ms +step:1891/2285 train_time:114708ms step_avg:60.66ms +step:1892/2285 train_time:114769ms step_avg:60.66ms +step:1893/2285 train_time:114831ms step_avg:60.66ms +step:1894/2285 train_time:114891ms step_avg:60.66ms +step:1895/2285 train_time:114954ms step_avg:60.66ms +step:1896/2285 train_time:115015ms step_avg:60.66ms +step:1897/2285 train_time:115077ms step_avg:60.66ms +step:1898/2285 train_time:115137ms step_avg:60.66ms +step:1899/2285 train_time:115200ms step_avg:60.66ms +step:1900/2285 train_time:115260ms step_avg:60.66ms +step:1901/2285 train_time:115322ms step_avg:60.66ms +step:1902/2285 train_time:115382ms step_avg:60.66ms +step:1903/2285 train_time:115443ms step_avg:60.66ms +step:1904/2285 train_time:115503ms step_avg:60.66ms +step:1905/2285 train_time:115565ms step_avg:60.66ms +step:1906/2285 train_time:115625ms step_avg:60.66ms +step:1907/2285 train_time:115687ms step_avg:60.66ms +step:1908/2285 train_time:115748ms step_avg:60.66ms +step:1909/2285 train_time:115810ms step_avg:60.67ms +step:1910/2285 train_time:115871ms step_avg:60.67ms +step:1911/2285 train_time:115933ms step_avg:60.67ms +step:1912/2285 train_time:115993ms step_avg:60.67ms +step:1913/2285 train_time:116056ms step_avg:60.67ms +step:1914/2285 train_time:116116ms step_avg:60.67ms +step:1915/2285 train_time:116178ms step_avg:60.67ms +step:1916/2285 train_time:116238ms step_avg:60.67ms +step:1917/2285 train_time:116301ms step_avg:60.67ms +step:1918/2285 train_time:116361ms step_avg:60.67ms +step:1919/2285 train_time:116423ms step_avg:60.67ms +step:1920/2285 train_time:116483ms step_avg:60.67ms +step:1921/2285 train_time:116545ms step_avg:60.67ms +step:1922/2285 train_time:116605ms step_avg:60.67ms +step:1923/2285 train_time:116667ms step_avg:60.67ms +step:1924/2285 train_time:116728ms step_avg:60.67ms +step:1925/2285 train_time:116790ms step_avg:60.67ms +step:1926/2285 train_time:116850ms step_avg:60.67ms +step:1927/2285 train_time:116913ms step_avg:60.67ms +step:1928/2285 train_time:116974ms step_avg:60.67ms +step:1929/2285 train_time:117037ms step_avg:60.67ms +step:1930/2285 train_time:117097ms step_avg:60.67ms +step:1931/2285 train_time:117159ms step_avg:60.67ms +step:1932/2285 train_time:117220ms step_avg:60.67ms +step:1933/2285 train_time:117282ms step_avg:60.67ms +step:1934/2285 train_time:117342ms step_avg:60.67ms +step:1935/2285 train_time:117404ms step_avg:60.67ms +step:1936/2285 train_time:117464ms step_avg:60.67ms +step:1937/2285 train_time:117526ms step_avg:60.67ms 
+step:1938/2285 train_time:117585ms step_avg:60.67ms +step:1939/2285 train_time:117648ms step_avg:60.67ms +step:1940/2285 train_time:117708ms step_avg:60.67ms +step:1941/2285 train_time:117771ms step_avg:60.68ms +step:1942/2285 train_time:117831ms step_avg:60.67ms +step:1943/2285 train_time:117893ms step_avg:60.68ms +step:1944/2285 train_time:117953ms step_avg:60.68ms +step:1945/2285 train_time:118015ms step_avg:60.68ms +step:1946/2285 train_time:118075ms step_avg:60.68ms +step:1947/2285 train_time:118138ms step_avg:60.68ms +step:1948/2285 train_time:118198ms step_avg:60.68ms +step:1949/2285 train_time:118262ms step_avg:60.68ms +step:1950/2285 train_time:118322ms step_avg:60.68ms +step:1951/2285 train_time:118383ms step_avg:60.68ms +step:1952/2285 train_time:118443ms step_avg:60.68ms +step:1953/2285 train_time:118505ms step_avg:60.68ms +step:1954/2285 train_time:118565ms step_avg:60.68ms +step:1955/2285 train_time:118627ms step_avg:60.68ms +step:1956/2285 train_time:118687ms step_avg:60.68ms +step:1957/2285 train_time:118750ms step_avg:60.68ms +step:1958/2285 train_time:118810ms step_avg:60.68ms +step:1959/2285 train_time:118873ms step_avg:60.68ms +step:1960/2285 train_time:118933ms step_avg:60.68ms +step:1961/2285 train_time:118996ms step_avg:60.68ms +step:1962/2285 train_time:119056ms step_avg:60.68ms +step:1963/2285 train_time:119119ms step_avg:60.68ms +step:1964/2285 train_time:119179ms step_avg:60.68ms +step:1965/2285 train_time:119242ms step_avg:60.68ms +step:1966/2285 train_time:119302ms step_avg:60.68ms +step:1967/2285 train_time:119364ms step_avg:60.68ms +step:1968/2285 train_time:119425ms step_avg:60.68ms +step:1969/2285 train_time:119486ms step_avg:60.68ms +step:1970/2285 train_time:119546ms step_avg:60.68ms +step:1971/2285 train_time:119609ms step_avg:60.68ms +step:1972/2285 train_time:119669ms step_avg:60.68ms +step:1973/2285 train_time:119731ms step_avg:60.68ms +step:1974/2285 train_time:119791ms step_avg:60.68ms +step:1975/2285 train_time:119853ms step_avg:60.69ms +step:1976/2285 train_time:119914ms step_avg:60.69ms +step:1977/2285 train_time:119976ms step_avg:60.69ms +step:1978/2285 train_time:120036ms step_avg:60.69ms +step:1979/2285 train_time:120098ms step_avg:60.69ms +step:1980/2285 train_time:120158ms step_avg:60.69ms +step:1981/2285 train_time:120220ms step_avg:60.69ms +step:1982/2285 train_time:120280ms step_avg:60.69ms +step:1983/2285 train_time:120343ms step_avg:60.69ms +step:1984/2285 train_time:120402ms step_avg:60.69ms +step:1985/2285 train_time:120464ms step_avg:60.69ms +step:1986/2285 train_time:120525ms step_avg:60.69ms +step:1987/2285 train_time:120587ms step_avg:60.69ms +step:1988/2285 train_time:120647ms step_avg:60.69ms +step:1989/2285 train_time:120709ms step_avg:60.69ms +step:1990/2285 train_time:120769ms step_avg:60.69ms +step:1991/2285 train_time:120832ms step_avg:60.69ms +step:1992/2285 train_time:120892ms step_avg:60.69ms +step:1993/2285 train_time:120955ms step_avg:60.69ms +step:1994/2285 train_time:121015ms step_avg:60.69ms +step:1995/2285 train_time:121078ms step_avg:60.69ms +step:1996/2285 train_time:121137ms step_avg:60.69ms +step:1997/2285 train_time:121200ms step_avg:60.69ms +step:1998/2285 train_time:121260ms step_avg:60.69ms +step:1999/2285 train_time:121323ms step_avg:60.69ms +step:2000/2285 train_time:121382ms step_avg:60.69ms +step:2000/2285 val_loss:3.3172 train_time:121446ms step_avg:60.72ms +step:2001/2285 train_time:121464ms step_avg:60.70ms +step:2002/2285 train_time:121507ms step_avg:60.69ms +step:2003/2285 train_time:121569ms 
step_avg:60.69ms +step:2004/2285 train_time:121630ms step_avg:60.69ms +step:2005/2285 train_time:121693ms step_avg:60.69ms +step:2006/2285 train_time:121754ms step_avg:60.69ms +step:2007/2285 train_time:121815ms step_avg:60.70ms +step:2008/2285 train_time:121874ms step_avg:60.69ms +step:2009/2285 train_time:121936ms step_avg:60.69ms +step:2010/2285 train_time:121995ms step_avg:60.69ms +step:2011/2285 train_time:122057ms step_avg:60.69ms +step:2012/2285 train_time:122116ms step_avg:60.69ms +step:2013/2285 train_time:122178ms step_avg:60.69ms +step:2014/2285 train_time:122238ms step_avg:60.69ms +step:2015/2285 train_time:122300ms step_avg:60.69ms +step:2016/2285 train_time:122362ms step_avg:60.70ms +step:2017/2285 train_time:122426ms step_avg:60.70ms +step:2018/2285 train_time:122488ms step_avg:60.70ms +step:2019/2285 train_time:122551ms step_avg:60.70ms +step:2020/2285 train_time:122611ms step_avg:60.70ms +step:2021/2285 train_time:122674ms step_avg:60.70ms +step:2022/2285 train_time:122734ms step_avg:60.70ms +step:2023/2285 train_time:122795ms step_avg:60.70ms +step:2024/2285 train_time:122855ms step_avg:60.70ms +step:2025/2285 train_time:122916ms step_avg:60.70ms +step:2026/2285 train_time:122979ms step_avg:60.70ms +step:2027/2285 train_time:123038ms step_avg:60.70ms +step:2028/2285 train_time:123098ms step_avg:60.70ms +step:2029/2285 train_time:123159ms step_avg:60.70ms +step:2030/2285 train_time:123219ms step_avg:60.70ms +step:2031/2285 train_time:123281ms step_avg:60.70ms +step:2032/2285 train_time:123342ms step_avg:60.70ms +step:2033/2285 train_time:123406ms step_avg:60.70ms +step:2034/2285 train_time:123467ms step_avg:60.70ms +step:2035/2285 train_time:123530ms step_avg:60.70ms +step:2036/2285 train_time:123590ms step_avg:60.70ms +step:2037/2285 train_time:123654ms step_avg:60.70ms +step:2038/2285 train_time:123713ms step_avg:60.70ms +step:2039/2285 train_time:123776ms step_avg:60.70ms +step:2040/2285 train_time:123836ms step_avg:60.70ms +step:2041/2285 train_time:123897ms step_avg:60.70ms +step:2042/2285 train_time:123957ms step_avg:60.70ms +step:2043/2285 train_time:124019ms step_avg:60.70ms +step:2044/2285 train_time:124079ms step_avg:60.70ms +step:2045/2285 train_time:124141ms step_avg:60.70ms +step:2046/2285 train_time:124201ms step_avg:60.70ms +step:2047/2285 train_time:124264ms step_avg:60.71ms +step:2048/2285 train_time:124325ms step_avg:60.71ms +step:2049/2285 train_time:124388ms step_avg:60.71ms +step:2050/2285 train_time:124448ms step_avg:60.71ms +step:2051/2285 train_time:124511ms step_avg:60.71ms +step:2052/2285 train_time:124571ms step_avg:60.71ms +step:2053/2285 train_time:124634ms step_avg:60.71ms +step:2054/2285 train_time:124694ms step_avg:60.71ms +step:2055/2285 train_time:124756ms step_avg:60.71ms +step:2056/2285 train_time:124816ms step_avg:60.71ms +step:2057/2285 train_time:124878ms step_avg:60.71ms +step:2058/2285 train_time:124937ms step_avg:60.71ms +step:2059/2285 train_time:124999ms step_avg:60.71ms +step:2060/2285 train_time:125059ms step_avg:60.71ms +step:2061/2285 train_time:125121ms step_avg:60.71ms +step:2062/2285 train_time:125182ms step_avg:60.71ms +step:2063/2285 train_time:125245ms step_avg:60.71ms +step:2064/2285 train_time:125305ms step_avg:60.71ms +step:2065/2285 train_time:125368ms step_avg:60.71ms +step:2066/2285 train_time:125428ms step_avg:60.71ms +step:2067/2285 train_time:125491ms step_avg:60.71ms +step:2068/2285 train_time:125551ms step_avg:60.71ms +step:2069/2285 train_time:125613ms step_avg:60.71ms +step:2070/2285 train_time:125673ms 
step_avg:60.71ms +step:2071/2285 train_time:125735ms step_avg:60.71ms +step:2072/2285 train_time:125796ms step_avg:60.71ms +step:2073/2285 train_time:125858ms step_avg:60.71ms +step:2074/2285 train_time:125918ms step_avg:60.71ms +step:2075/2285 train_time:125980ms step_avg:60.71ms +step:2076/2285 train_time:126039ms step_avg:60.71ms +step:2077/2285 train_time:126102ms step_avg:60.71ms +step:2078/2285 train_time:126162ms step_avg:60.71ms +step:2079/2285 train_time:126225ms step_avg:60.71ms +step:2080/2285 train_time:126286ms step_avg:60.71ms +step:2081/2285 train_time:126348ms step_avg:60.71ms +step:2082/2285 train_time:126408ms step_avg:60.71ms +step:2083/2285 train_time:126471ms step_avg:60.72ms +step:2084/2285 train_time:126531ms step_avg:60.72ms +step:2085/2285 train_time:126593ms step_avg:60.72ms +step:2086/2285 train_time:126653ms step_avg:60.72ms +step:2087/2285 train_time:126715ms step_avg:60.72ms +step:2088/2285 train_time:126775ms step_avg:60.72ms +step:2089/2285 train_time:126837ms step_avg:60.72ms +step:2090/2285 train_time:126897ms step_avg:60.72ms +step:2091/2285 train_time:126960ms step_avg:60.72ms +step:2092/2285 train_time:127019ms step_avg:60.72ms +step:2093/2285 train_time:127081ms step_avg:60.72ms +step:2094/2285 train_time:127142ms step_avg:60.72ms +step:2095/2285 train_time:127205ms step_avg:60.72ms +step:2096/2285 train_time:127265ms step_avg:60.72ms +step:2097/2285 train_time:127328ms step_avg:60.72ms +step:2098/2285 train_time:127387ms step_avg:60.72ms +step:2099/2285 train_time:127449ms step_avg:60.72ms +step:2100/2285 train_time:127509ms step_avg:60.72ms +step:2101/2285 train_time:127572ms step_avg:60.72ms +step:2102/2285 train_time:127632ms step_avg:60.72ms +step:2103/2285 train_time:127694ms step_avg:60.72ms +step:2104/2285 train_time:127754ms step_avg:60.72ms +step:2105/2285 train_time:127817ms step_avg:60.72ms +step:2106/2285 train_time:127876ms step_avg:60.72ms +step:2107/2285 train_time:127939ms step_avg:60.72ms +step:2108/2285 train_time:127999ms step_avg:60.72ms +step:2109/2285 train_time:128061ms step_avg:60.72ms +step:2110/2285 train_time:128120ms step_avg:60.72ms +step:2111/2285 train_time:128184ms step_avg:60.72ms +step:2112/2285 train_time:128244ms step_avg:60.72ms +step:2113/2285 train_time:128307ms step_avg:60.72ms +step:2114/2285 train_time:128367ms step_avg:60.72ms +step:2115/2285 train_time:128430ms step_avg:60.72ms +step:2116/2285 train_time:128490ms step_avg:60.72ms +step:2117/2285 train_time:128552ms step_avg:60.72ms +step:2118/2285 train_time:128612ms step_avg:60.72ms +step:2119/2285 train_time:128677ms step_avg:60.73ms +step:2120/2285 train_time:128735ms step_avg:60.72ms +step:2121/2285 train_time:128797ms step_avg:60.72ms +step:2122/2285 train_time:128857ms step_avg:60.72ms +step:2123/2285 train_time:128920ms step_avg:60.73ms +step:2124/2285 train_time:128979ms step_avg:60.72ms +step:2125/2285 train_time:129041ms step_avg:60.73ms +step:2126/2285 train_time:129102ms step_avg:60.73ms +step:2127/2285 train_time:129164ms step_avg:60.73ms +step:2128/2285 train_time:129224ms step_avg:60.73ms +step:2129/2285 train_time:129287ms step_avg:60.73ms +step:2130/2285 train_time:129348ms step_avg:60.73ms +step:2131/2285 train_time:129410ms step_avg:60.73ms +step:2132/2285 train_time:129471ms step_avg:60.73ms +step:2133/2285 train_time:129533ms step_avg:60.73ms +step:2134/2285 train_time:129593ms step_avg:60.73ms +step:2135/2285 train_time:129655ms step_avg:60.73ms +step:2136/2285 train_time:129715ms step_avg:60.73ms +step:2137/2285 train_time:129777ms 
step_avg:60.73ms +step:2138/2285 train_time:129837ms step_avg:60.73ms +step:2139/2285 train_time:129900ms step_avg:60.73ms +step:2140/2285 train_time:129960ms step_avg:60.73ms +step:2141/2285 train_time:130022ms step_avg:60.73ms +step:2142/2285 train_time:130082ms step_avg:60.73ms +step:2143/2285 train_time:130145ms step_avg:60.73ms +step:2144/2285 train_time:130205ms step_avg:60.73ms +step:2145/2285 train_time:130268ms step_avg:60.73ms +step:2146/2285 train_time:130328ms step_avg:60.73ms +step:2147/2285 train_time:130390ms step_avg:60.73ms +step:2148/2285 train_time:130451ms step_avg:60.73ms +step:2149/2285 train_time:130513ms step_avg:60.73ms +step:2150/2285 train_time:130573ms step_avg:60.73ms +step:2151/2285 train_time:130635ms step_avg:60.73ms +step:2152/2285 train_time:130695ms step_avg:60.73ms +step:2153/2285 train_time:130757ms step_avg:60.73ms +step:2154/2285 train_time:130818ms step_avg:60.73ms +step:2155/2285 train_time:130881ms step_avg:60.73ms +step:2156/2285 train_time:130940ms step_avg:60.73ms +step:2157/2285 train_time:131002ms step_avg:60.73ms +step:2158/2285 train_time:131063ms step_avg:60.73ms +step:2159/2285 train_time:131125ms step_avg:60.73ms +step:2160/2285 train_time:131185ms step_avg:60.73ms +step:2161/2285 train_time:131248ms step_avg:60.73ms +step:2162/2285 train_time:131308ms step_avg:60.73ms +step:2163/2285 train_time:131371ms step_avg:60.74ms +step:2164/2285 train_time:131431ms step_avg:60.74ms +step:2165/2285 train_time:131493ms step_avg:60.74ms +step:2166/2285 train_time:131553ms step_avg:60.74ms +step:2167/2285 train_time:131615ms step_avg:60.74ms +step:2168/2285 train_time:131676ms step_avg:60.74ms +step:2169/2285 train_time:131738ms step_avg:60.74ms +step:2170/2285 train_time:131798ms step_avg:60.74ms +step:2171/2285 train_time:131861ms step_avg:60.74ms +step:2172/2285 train_time:131921ms step_avg:60.74ms +step:2173/2285 train_time:131983ms step_avg:60.74ms +step:2174/2285 train_time:132043ms step_avg:60.74ms +step:2175/2285 train_time:132105ms step_avg:60.74ms +step:2176/2285 train_time:132165ms step_avg:60.74ms +step:2177/2285 train_time:132228ms step_avg:60.74ms +step:2178/2285 train_time:132288ms step_avg:60.74ms +step:2179/2285 train_time:132351ms step_avg:60.74ms +step:2180/2285 train_time:132412ms step_avg:60.74ms +step:2181/2285 train_time:132474ms step_avg:60.74ms +step:2182/2285 train_time:132535ms step_avg:60.74ms +step:2183/2285 train_time:132597ms step_avg:60.74ms +step:2184/2285 train_time:132657ms step_avg:60.74ms +step:2185/2285 train_time:132720ms step_avg:60.74ms +step:2186/2285 train_time:132780ms step_avg:60.74ms +step:2187/2285 train_time:132843ms step_avg:60.74ms +step:2188/2285 train_time:132903ms step_avg:60.74ms +step:2189/2285 train_time:132965ms step_avg:60.74ms +step:2190/2285 train_time:133025ms step_avg:60.74ms +step:2191/2285 train_time:133087ms step_avg:60.74ms +step:2192/2285 train_time:133147ms step_avg:60.74ms +step:2193/2285 train_time:133210ms step_avg:60.74ms +step:2194/2285 train_time:133270ms step_avg:60.74ms +step:2195/2285 train_time:133332ms step_avg:60.74ms +step:2196/2285 train_time:133392ms step_avg:60.74ms +step:2197/2285 train_time:133454ms step_avg:60.74ms +step:2198/2285 train_time:133516ms step_avg:60.74ms +step:2199/2285 train_time:133578ms step_avg:60.74ms +step:2200/2285 train_time:133638ms step_avg:60.74ms +step:2201/2285 train_time:133701ms step_avg:60.75ms +step:2202/2285 train_time:133761ms step_avg:60.75ms +step:2203/2285 train_time:133823ms step_avg:60.75ms +step:2204/2285 train_time:133882ms 
step_avg:60.75ms +step:2205/2285 train_time:133945ms step_avg:60.75ms +step:2206/2285 train_time:134005ms step_avg:60.75ms +step:2207/2285 train_time:134067ms step_avg:60.75ms +step:2208/2285 train_time:134127ms step_avg:60.75ms +step:2209/2285 train_time:134190ms step_avg:60.75ms +step:2210/2285 train_time:134251ms step_avg:60.75ms +step:2211/2285 train_time:134313ms step_avg:60.75ms +step:2212/2285 train_time:134373ms step_avg:60.75ms +step:2213/2285 train_time:134436ms step_avg:60.75ms +step:2214/2285 train_time:134496ms step_avg:60.75ms +step:2215/2285 train_time:134558ms step_avg:60.75ms +step:2216/2285 train_time:134618ms step_avg:60.75ms +step:2217/2285 train_time:134680ms step_avg:60.75ms +step:2218/2285 train_time:134740ms step_avg:60.75ms +step:2219/2285 train_time:134803ms step_avg:60.75ms +step:2220/2285 train_time:134863ms step_avg:60.75ms +step:2221/2285 train_time:134926ms step_avg:60.75ms +step:2222/2285 train_time:134986ms step_avg:60.75ms +step:2223/2285 train_time:135048ms step_avg:60.75ms +step:2224/2285 train_time:135109ms step_avg:60.75ms +step:2225/2285 train_time:135171ms step_avg:60.75ms +step:2226/2285 train_time:135231ms step_avg:60.75ms +step:2227/2285 train_time:135293ms step_avg:60.75ms +step:2228/2285 train_time:135354ms step_avg:60.75ms +step:2229/2285 train_time:135416ms step_avg:60.75ms +step:2230/2285 train_time:135477ms step_avg:60.75ms +step:2231/2285 train_time:135539ms step_avg:60.75ms +step:2232/2285 train_time:135600ms step_avg:60.75ms +step:2233/2285 train_time:135662ms step_avg:60.75ms +step:2234/2285 train_time:135721ms step_avg:60.75ms +step:2235/2285 train_time:135783ms step_avg:60.75ms +step:2236/2285 train_time:135844ms step_avg:60.75ms +step:2237/2285 train_time:135906ms step_avg:60.75ms +step:2238/2285 train_time:135966ms step_avg:60.75ms +step:2239/2285 train_time:136029ms step_avg:60.75ms +step:2240/2285 train_time:136089ms step_avg:60.75ms +step:2241/2285 train_time:136151ms step_avg:60.75ms +step:2242/2285 train_time:136211ms step_avg:60.75ms +step:2243/2285 train_time:136273ms step_avg:60.75ms +step:2244/2285 train_time:136333ms step_avg:60.75ms +step:2245/2285 train_time:136396ms step_avg:60.76ms +step:2246/2285 train_time:136456ms step_avg:60.76ms +step:2247/2285 train_time:136519ms step_avg:60.76ms +step:2248/2285 train_time:136579ms step_avg:60.76ms +step:2249/2285 train_time:136642ms step_avg:60.76ms +step:2250/2285 train_time:136702ms step_avg:60.76ms +step:2250/2285 val_loss:3.2821 train_time:136766ms step_avg:60.78ms +step:2251/2285 train_time:136784ms step_avg:60.77ms +step:2252/2285 train_time:136830ms step_avg:60.76ms +step:2253/2285 train_time:136894ms step_avg:60.76ms +step:2254/2285 train_time:136955ms step_avg:60.76ms +step:2255/2285 train_time:137016ms step_avg:60.76ms +step:2256/2285 train_time:137076ms step_avg:60.76ms +step:2257/2285 train_time:137138ms step_avg:60.76ms +step:2258/2285 train_time:137198ms step_avg:60.76ms +step:2259/2285 train_time:137260ms step_avg:60.76ms +step:2260/2285 train_time:137320ms step_avg:60.76ms +step:2261/2285 train_time:137383ms step_avg:60.76ms +step:2262/2285 train_time:137442ms step_avg:60.76ms +step:2263/2285 train_time:137504ms step_avg:60.76ms +step:2264/2285 train_time:137564ms step_avg:60.76ms +step:2265/2285 train_time:137626ms step_avg:60.76ms +step:2266/2285 train_time:137686ms step_avg:60.76ms +step:2267/2285 train_time:137749ms step_avg:60.76ms +step:2268/2285 train_time:137811ms step_avg:60.76ms +step:2269/2285 train_time:137874ms step_avg:60.76ms +step:2270/2285 
train_time:137935ms step_avg:60.76ms
+step:2271/2285 train_time:137998ms step_avg:60.77ms
+step:2272/2285 train_time:138058ms step_avg:60.76ms
+step:2273/2285 train_time:138120ms step_avg:60.77ms
+step:2274/2285 train_time:138180ms step_avg:60.77ms
+step:2275/2285 train_time:138242ms step_avg:60.77ms
+step:2276/2285 train_time:138302ms step_avg:60.77ms
+step:2277/2285 train_time:138363ms step_avg:60.77ms
+step:2278/2285 train_time:138423ms step_avg:60.77ms
+step:2279/2285 train_time:138485ms step_avg:60.77ms
+step:2280/2285 train_time:138545ms step_avg:60.77ms
+step:2281/2285 train_time:138607ms step_avg:60.77ms
+step:2282/2285 train_time:138667ms step_avg:60.77ms
+step:2283/2285 train_time:138730ms step_avg:60.77ms
+step:2284/2285 train_time:138791ms step_avg:60.77ms
+step:2285/2285 train_time:138854ms step_avg:60.77ms
+step:2285/2285 val_loss:3.2760 train_time:138916ms step_avg:60.79ms
+peak memory allocated: 29626 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-10-27_FixMuonLR/README.md b/records/track_1_short/2025-10-27_FixMuonLR/README.md
new file mode 100644
index 000000000..504ed2120
--- /dev/null
+++ b/records/track_1_short/2025-10-27_FixMuonLR/README.md
@@ -0,0 +1,65 @@
+# Faster Muon step, corrected learning rates
+
+This record improves the step time of Muon and addresses some bugs in our current effective learning rate calculation. It incorporates the results from [PR#144](https://github.com/KellerJordan/modded-nanogpt/pull/144).
+
+## Timing and Validation
+
+This record shortens the final training run by 30 steps and decreases per-step time by around 1%.
+
+This PR:
+
+```
+import scipy.stats
+import torch
+
+losses = [3.2766, 3.2794, 3.2770, 3.2776, 3.2760, 3.2802, 3.2757]
+times = [138.986, 138.838, 138.877, 138.905, 138.916, 138.846, 138.937]
+
+print("p=%.4f" % scipy.stats.ttest_1samp(losses, 3.28, alternative="less").pvalue)
+# p=0.0041
+
+print("losses:", torch.std_mean(torch.tensor(losses)))
+# losses: (std=0.001706, mean=3.277500)
+
+print("time:", torch.std_mean(torch.tensor(times)))
+# time: (std=0.052171, mean=138.900711)
+```
+
+Previous PR (timed on the same machine):
+
+```
+import scipy.stats
+import torch
+
+times = [142.379, 142.156, 141.391, 142.374, 142.316]
+
+print("time:", torch.std_mean(torch.tensor(times)))
+# time: (std=0.419136, mean=142.123200)
+```
+
+In total, this corresponds to roughly a $3.22$ second decrease in training time. On a faster machine (the one used for official timing), this will probably be $\approx 3.17$ seconds.
+
+Thank you to Prime Intellect for sponsoring my research.
+
+## Changes
+
+### (1) Vectorized Muon Step
+
+I vectorized several loops inside the Muon `step`, which slightly decreases step time. I am guessing we can apply `torch.compile` to a subpart of `step` for further gains as well. I moved the momentum buffers to be properties of groups rather than of per-parameter states, though this requires that we add a `reset()` method (similar to `Yarn`).
+
+### (2) Corrected learning rate
+
+In the previous Muon step, `eff_lr_val` was scaling the learning rate on the attention parameters by `1/2`, since they were treated as `[dim, 4 * dim]`-shaped parameters. However, in practice they are square parameters, so we should not do this. Moving the attention reshape into the `step` corrects this issue.
+
+Similarly, the MLP up-projection was also being scaled down. Following the theory that the effective learning rate should be proportional to $\sqrt{\text{output\_dim}}$, I have increased `lr_mul` on the MLP up-projection to `2.0`.
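+To make the shape dependence concrete, here is a minimal sketch. It assumes an effective-LR rule of the form $\sqrt{d_{\text{out}}/d_{\text{in}}}$, which reproduces the `1/2` factor described above; the helper `sketch_eff_lr` and the numbers are illustrative, not the exact `eff_lr_val` code from `train_gpt.py`:
+
+```
+import math
+import torch
+
+def sketch_eff_lr(p: torch.Tensor, base_lr: float, lr_mul: float = 1.0) -> float:
+    # Hypothetical rule: scale the base LR by sqrt(d_out / d_in) of the shape
+    # the optimizer actually sees, times a per-parameter lr_mul.
+    d_out, d_in = p.shape[-2], p.shape[-1]
+    return base_lr * lr_mul * math.sqrt(d_out / d_in)
+
+dim = 1024
+flat = torch.empty(dim, 4 * dim)   # attention weights in their stored shape
+square = flat.view(4, dim, dim)    # reshaped into their true square matrices
+
+print(sketch_eff_lr(flat, 0.03))   # 0.015 -- the unintended 1/2 down-scaling
+print(sketch_eff_lr(square, 0.03)) # 0.03  -- correct once reshaped in `step`
+```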
+I have removed the logic that requires all parameters in the same group to share the same learning rate and weight decay.
+
+Both of these changes meant that our previous Muon learning rate was ~twice as high as it should be, so I've decreased it to `0.03`. I have not further tuned this value.
+
+### (3) LR refactoring + WS Schedule tweak
+
+I removed the logic for iteration extension and instead changed `get_lr` to account for a "flat" section at the end. The learning rate hyperparameters have instead been changed to fractional breakpoints, which helps when testing out lower step counts. I believe that the LR schedule can be further improved.
+
+Since the WS schedule was also impacted by the iteration extension, I updated the schedule from 3 parts to 6 parts. This schedule is different from the previous three-part schedule, though it performs essentially the same as the version with iteration extension.
+
+Additionally, I corrected a subtle bug where gradients were being summed over `grad_accum_steps` but averaged over ranks. In practice this is mostly irrelevant due to magnitude invariance; however, it causes minor precision issues for $<8$ devices.
diff --git a/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt b/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt
new file mode 100644
index 000000000..183ae8822
--- /dev/null
+++ b/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt
@@ -0,0 +1,3814 @@
+import os
+import sys
+
+with open(sys.argv[0]) as f:
+    code = f.read() # read the code of this file ASAP, for logging
+import copy
+import glob
+import math
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from collections import defaultdict
+from itertools import accumulate
+from pathlib import Path
+
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
+import torch
+
+torch.empty(
+    1, device="cuda", requires_grad=True
+).backward() # prevents a bug on some systems
+import torch._dynamo as dynamo
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
+import triton
+import triton.language as tl
+from kernels import get_kernel
+from torch import Tensor, nn
+
+dynamo.config.recompile_limit = 64
+
+# -----------------------------------------------------------------------------
+# Custom operators: FP8 matmul by @YouJiacheng
+
+
+@torch.library.custom_op("nanogpt::mm", mutates_args=())
+def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
+    @torch.compile
+    def impl(x: Tensor, w: Tensor):
+        assert x.is_contiguous() and w.is_contiguous()
+        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
+        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
+        out = torch._scaled_mm(
+            x_f8,
+            w_f8.T,
+            out_dtype=torch.bfloat16,
+            scale_a=x.new_tensor(x_s, dtype=torch.float32),
+            scale_b=x.new_tensor(w_s, dtype=torch.float32),
+            use_fast_accum=True,
+        )
+        return out, x_f8, w_f8
+
+    return impl(x, w)
+
+@mm_op.register_fake
+def _(x: Tensor, w: Tensor, *_):
+    assert x.ndim == w.ndim == 2
+    assert x.shape[1] == w.shape[1]
+    assert x.device == w.device
+    assert x.is_contiguous() and w.is_contiguous()
+    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
+
+@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
+def mm_backward_op(g: Tensor,
x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + 
if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + 
offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
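+
+    Each iteration applies the odd polynomial map X <- a*X + b*(X @ X.mT) @ X + c*(X @ X.mT)^2 @ X
+    (see the loop comments below), pushing every singular value of X toward 1, so after the five
+    fixed iterations X approximates the semi-orthogonal polar factor U @ V.mT of the SVD G = U @ S @ V.mT.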
+    """
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
+
+    # Allocate buffers
+    X = X.contiguous()
+    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
+    B = torch.empty_like(A)
+    C = torch.empty_like(X)
+
+    aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm
+
+    # Perform the iterations
+    for a, b, c in polar_express_coeffs:
+        XXT(X, out=A)  # A = X @ X.mT
+        ba_plus_cAA(A, alpha=c, beta=b, out=B)  # B = b * A + c * A @ A
+        aX_plus_BX(X, B, X, beta=a, out=C)  # C = a * X + B @ X
+        X, C = C, X  # Swap references to avoid unnecessary copies
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+    return X
+
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Note: a later PR replaced Newton-Schulz with Polar Express for the orthogonalization step.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+    Empirically, however, small 1-D params also do well here: the orthogonalization step
+    approximately performs a magnitude normalization of the grad, and this hyper-optimized class
+    executes faster than the current Adam implementation for small params.
+
+    Custom distributed sizing:
+    The model stores all attn and mlp weights in the same shape, and then updates the view as
+    needed on the forward pass. This enables attn and mlp weights to be contained within the same
+    dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+    (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+    The scheduling is:
+    1. reduce scatter smear_gate (1 param, 7 padding params)
+    2. reduce scatter attn_gate (10 params, 6 padding params)
+    3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+    4. reduce scatter attn/mlp round 2 (16 mlp params)
+    5. wait on step 1, then compute update of 1 and schedule all gather
+    6. wait on step 2, then compute update of 2 and schedule all gather
+    7. wait on step 3, then compute update of 3 and schedule all gather
+       GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+       GPUs that receive params of type attn reshape before computing update
+    8. wait on step 4, then compute update of 4 and schedule all gather
+    9. wait for each all gather to complete and update params
+    Empirically, leading with small params provides an additional 0.2s improvement.
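+
+    Communication pattern of step() below: dist.reduce_scatter_tensor averages gradients
+    across ranks and leaves each rank one chunk per param group; each rank computes the
+    orthogonalized update for its chunk only; dist.all_gather_into_tensor then broadcasts
+    the updated chunks back, so every rank ends the step with identical parameters.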
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and self.world_size == 8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
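+        # Sharding arithmetic (illustrative, assuming world_size == 8): a 16-param
+        # attn/mlp group has chunk_size == 2, so rank r owns params[2*r : 2*r + 2];
+        # the 1-param smear_gate group has chunk_size == 1 and padded_num_params == 8,
+        # so only rank 0 holds a real (non-padding) shard.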
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape the attn grad chunk from [num, hdim, dim*4] to [4*num, hdim, dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine effective LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            elif params[module_idx].label == "smear_gate":
+                # dividing by magnitude is equivalent to orthogonalization for 1-D tensors
+                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+            # Apply weight decay directly to the buffer.
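+            # i.e., decoupled weight decay followed by the scaled step:
+            #     p <- p * (1 - weight_decay * wd_mul) - lr * param_lr * v
+            # (note the decay here is not multiplied by lr, unlike in DistAdam below)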
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
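+        # (c_fc is stored transposed as (dim, hdim); Muon's aspect-ratio LR scale
+        # max(1, rows/cols)**0.5 therefore evaluates to 1 instead of the
+        # (hdim/dim)**0.5 == 2 an (hdim, dim) weight would get, so lr_mul restores it)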
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
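+        # (lr_mul is read by both Muon and DistAdam via getattr(param, "lr_mul", 1.0)
+        # and multiplies that parameter's learning rate)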
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
+    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+        # Precompute BOS positions once per shard
+        self.tokens = tokens
+        self.size = tokens.numel()
+        self.quickload = quickload
+        if quickload:
+            # only scan the first 4 million tokens, then kick off an async thread to scan the rest
+            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+            self.thread = None
+            self.ready = threading.Event()
+            self.start()
+        else:
+            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.i = 0
+        self.world_size = world_size
+        self.batch_iter = 0
+
+    def _load(self):
+        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+            self.bos_idx = self.bos_idx_async
+
+    def next_batch(self, num_tokens_local: int, max_seq_len: int):
+        # if quickload was used, repoint to the full dataset after 5 batches
+        if self.quickload and self.batch_iter == 5:
+            self.get()
+        n = len(self.bos_idx)
+        starts = [[] for _ in range(self.world_size)]
+        ends = [[] for _ in range(self.world_size)]
+
+        idx = self.i
+        for r in range(self.world_size):
+            cur_len = 0
+            while cur_len <= num_tokens_local:
+                if idx >= n:
+                    raise StopIteration(f"Insufficient BOS tokens ahead of index {idx}; hit tail of shard.")
+                cur = self.bos_idx[idx]
+                starts[r].append(cur)
+                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+                          cur + max_seq_len,
+                          cur + num_tokens_local - cur_len + 1)
+                ends[r].append(end)
+                cur_len += end - cur
+                idx += 1
+
+            assert cur_len == num_tokens_local + 1
+        self.i = idx
+        self.batch_iter += 1
+        return starts, ends
+
+class DataPreloader:
+    # Helper for asynchronously loading next shard and indexing bos tokens
+    def __init__(self, file_iter, world_size: int = 1):
+        self.file_iter = file_iter
+        self.world_size = world_size
+        self.thread = None
+        self.data = None
+        self.ready = threading.Event()
+
+    def _load(self):
+        tokens = _load_data_shard(next(self.file_iter))
+        self.data = (tokens, BOSFinder(tokens, self.world_size))
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+        return self.data
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a Beginning-of-Sequence token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 01:51:26 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 33C P0 122W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 31C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 29C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 31C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 32C P0 117W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 29C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 31C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 28C P0 113W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2285 train_time:106ms step_avg:105.97ms +step:2/2285 train_time:128ms step_avg:64.03ms +step:3/2285 train_time:166ms step_avg:55.27ms +step:4/2285 train_time:222ms step_avg:55.49ms +step:5/2285 train_time:281ms step_avg:56.27ms +step:6/2285 train_time:339ms step_avg:56.55ms +step:7/2285 train_time:400ms step_avg:57.17ms +step:8/2285 train_time:458ms step_avg:57.31ms +step:9/2285 train_time:519ms step_avg:57.67ms +step:10/2285 train_time:578ms step_avg:57.76ms +step:11/2285 train_time:638ms step_avg:58.04ms +step:12/2285 train_time:697ms step_avg:58.08ms +step:13/2285 train_time:758ms step_avg:58.30ms +step:14/2285 train_time:816ms step_avg:58.32ms +step:15/2285 train_time:877ms step_avg:58.46ms +step:16/2285 train_time:935ms step_avg:58.47ms +step:17/2285 
train_time:998ms step_avg:58.68ms +step:18/2285 train_time:1060ms step_avg:58.87ms +step:19/2285 train_time:1125ms step_avg:59.20ms +step:20/2285 train_time:1185ms step_avg:59.26ms +step:21/2285 train_time:1247ms step_avg:59.38ms +step:22/2285 train_time:1306ms step_avg:59.37ms +step:23/2285 train_time:1367ms step_avg:59.44ms +step:24/2285 train_time:1426ms step_avg:59.42ms +step:25/2285 train_time:1487ms step_avg:59.47ms +step:26/2285 train_time:1545ms step_avg:59.43ms +step:27/2285 train_time:1606ms step_avg:59.48ms +step:28/2285 train_time:1665ms step_avg:59.45ms +step:29/2285 train_time:1727ms step_avg:59.54ms +step:30/2285 train_time:1785ms step_avg:59.51ms +step:31/2285 train_time:1847ms step_avg:59.57ms +step:32/2285 train_time:1906ms step_avg:59.55ms +step:33/2285 train_time:1967ms step_avg:59.61ms +step:34/2285 train_time:2027ms step_avg:59.61ms +step:35/2285 train_time:2088ms step_avg:59.67ms +step:36/2285 train_time:2148ms step_avg:59.67ms +step:37/2285 train_time:2211ms step_avg:59.74ms +step:38/2285 train_time:2270ms step_avg:59.73ms +step:39/2285 train_time:2331ms step_avg:59.78ms +step:40/2285 train_time:2390ms step_avg:59.76ms +step:41/2285 train_time:2452ms step_avg:59.80ms +step:42/2285 train_time:2511ms step_avg:59.78ms +step:43/2285 train_time:2573ms step_avg:59.83ms +step:44/2285 train_time:2632ms step_avg:59.81ms +step:45/2285 train_time:2694ms step_avg:59.87ms +step:46/2285 train_time:2753ms step_avg:59.85ms +step:47/2285 train_time:2815ms step_avg:59.89ms +step:48/2285 train_time:2874ms step_avg:59.87ms +step:49/2285 train_time:2935ms step_avg:59.90ms +step:50/2285 train_time:2994ms step_avg:59.88ms +step:51/2285 train_time:3056ms step_avg:59.92ms +step:52/2285 train_time:3115ms step_avg:59.91ms +step:53/2285 train_time:3177ms step_avg:59.95ms +step:54/2285 train_time:3237ms step_avg:59.94ms +step:55/2285 train_time:3299ms step_avg:59.97ms +step:56/2285 train_time:3357ms step_avg:59.95ms +step:57/2285 train_time:3419ms step_avg:59.98ms +step:58/2285 train_time:3477ms step_avg:59.95ms +step:59/2285 train_time:3539ms step_avg:59.98ms +step:60/2285 train_time:3598ms step_avg:59.97ms +step:61/2285 train_time:3660ms step_avg:60.00ms +step:62/2285 train_time:3719ms step_avg:59.98ms +step:63/2285 train_time:3780ms step_avg:60.00ms +step:64/2285 train_time:3839ms step_avg:59.99ms +step:65/2285 train_time:3900ms step_avg:60.01ms +step:66/2285 train_time:3959ms step_avg:59.98ms +step:67/2285 train_time:4021ms step_avg:60.01ms +step:68/2285 train_time:4079ms step_avg:59.99ms +step:69/2285 train_time:4141ms step_avg:60.01ms +step:70/2285 train_time:4200ms step_avg:60.00ms +step:71/2285 train_time:4261ms step_avg:60.01ms +step:72/2285 train_time:4320ms step_avg:60.00ms +step:73/2285 train_time:4382ms step_avg:60.03ms +step:74/2285 train_time:4441ms step_avg:60.01ms +step:75/2285 train_time:4502ms step_avg:60.03ms +step:76/2285 train_time:4562ms step_avg:60.02ms +step:77/2285 train_time:4623ms step_avg:60.04ms +step:78/2285 train_time:4681ms step_avg:60.02ms +step:79/2285 train_time:4743ms step_avg:60.04ms +step:80/2285 train_time:4801ms step_avg:60.02ms +step:81/2285 train_time:4863ms step_avg:60.03ms +step:82/2285 train_time:4921ms step_avg:60.02ms +step:83/2285 train_time:4982ms step_avg:60.02ms +step:84/2285 train_time:5041ms step_avg:60.01ms +step:85/2285 train_time:5101ms step_avg:60.02ms +step:86/2285 train_time:5161ms step_avg:60.01ms +step:87/2285 train_time:5221ms step_avg:60.01ms +step:88/2285 train_time:5280ms step_avg:60.00ms +step:89/2285 train_time:5342ms 
step_avg:60.02ms
+[steps 90-1534: per-step train_time/step_avg log lines elided; step_avg stays within 59.98-60.52 ms across the span. Validation checkpoints:]
+step:250/2285 val_loss:4.0701 train_time:15061ms step_avg:60.24ms
+step:500/2285 val_loss:3.7855 train_time:30078ms step_avg:60.16ms
+step:750/2285 val_loss:3.6572 train_time:45133ms step_avg:60.18ms
+step:1000/2285 val_loss:3.5694 train_time:60298ms step_avg:60.30ms
+step:1250/2285 val_loss:3.4998 train_time:75516ms step_avg:60.41ms
+step:1500/2285 val_loss:3.4305 train_time:90773ms step_avg:60.52ms
+step:1535/2285
train_time:92851ms step_avg:60.49ms +step:1536/2285 train_time:92911ms step_avg:60.49ms +step:1537/2285 train_time:92973ms step_avg:60.49ms +step:1538/2285 train_time:93033ms step_avg:60.49ms +step:1539/2285 train_time:93095ms step_avg:60.49ms +step:1540/2285 train_time:93155ms step_avg:60.49ms +step:1541/2285 train_time:93217ms step_avg:60.49ms +step:1542/2285 train_time:93277ms step_avg:60.49ms +step:1543/2285 train_time:93339ms step_avg:60.49ms +step:1544/2285 train_time:93398ms step_avg:60.49ms +step:1545/2285 train_time:93461ms step_avg:60.49ms +step:1546/2285 train_time:93520ms step_avg:60.49ms +step:1547/2285 train_time:93582ms step_avg:60.49ms +step:1548/2285 train_time:93642ms step_avg:60.49ms +step:1549/2285 train_time:93706ms step_avg:60.49ms +step:1550/2285 train_time:93767ms step_avg:60.49ms +step:1551/2285 train_time:93830ms step_avg:60.50ms +step:1552/2285 train_time:93891ms step_avg:60.50ms +step:1553/2285 train_time:93953ms step_avg:60.50ms +step:1554/2285 train_time:94014ms step_avg:60.50ms +step:1555/2285 train_time:94076ms step_avg:60.50ms +step:1556/2285 train_time:94135ms step_avg:60.50ms +step:1557/2285 train_time:94197ms step_avg:60.50ms +step:1558/2285 train_time:94257ms step_avg:60.50ms +step:1559/2285 train_time:94319ms step_avg:60.50ms +step:1560/2285 train_time:94378ms step_avg:60.50ms +step:1561/2285 train_time:94440ms step_avg:60.50ms +step:1562/2285 train_time:94500ms step_avg:60.50ms +step:1563/2285 train_time:94563ms step_avg:60.50ms +step:1564/2285 train_time:94623ms step_avg:60.50ms +step:1565/2285 train_time:94686ms step_avg:60.50ms +step:1566/2285 train_time:94747ms step_avg:60.50ms +step:1567/2285 train_time:94810ms step_avg:60.50ms +step:1568/2285 train_time:94870ms step_avg:60.50ms +step:1569/2285 train_time:94933ms step_avg:60.51ms +step:1570/2285 train_time:94993ms step_avg:60.50ms +step:1571/2285 train_time:95054ms step_avg:60.51ms +step:1572/2285 train_time:95114ms step_avg:60.51ms +step:1573/2285 train_time:95177ms step_avg:60.51ms +step:1574/2285 train_time:95236ms step_avg:60.51ms +step:1575/2285 train_time:95298ms step_avg:60.51ms +step:1576/2285 train_time:95358ms step_avg:60.51ms +step:1577/2285 train_time:95420ms step_avg:60.51ms +step:1578/2285 train_time:95479ms step_avg:60.51ms +step:1579/2285 train_time:95541ms step_avg:60.51ms +step:1580/2285 train_time:95601ms step_avg:60.51ms +step:1581/2285 train_time:95663ms step_avg:60.51ms +step:1582/2285 train_time:95724ms step_avg:60.51ms +step:1583/2285 train_time:95787ms step_avg:60.51ms +step:1584/2285 train_time:95848ms step_avg:60.51ms +step:1585/2285 train_time:95911ms step_avg:60.51ms +step:1586/2285 train_time:95971ms step_avg:60.51ms +step:1587/2285 train_time:96034ms step_avg:60.51ms +step:1588/2285 train_time:96094ms step_avg:60.51ms +step:1589/2285 train_time:96156ms step_avg:60.51ms +step:1590/2285 train_time:96216ms step_avg:60.51ms +step:1591/2285 train_time:96278ms step_avg:60.51ms +step:1592/2285 train_time:96337ms step_avg:60.51ms +step:1593/2285 train_time:96399ms step_avg:60.51ms +step:1594/2285 train_time:96459ms step_avg:60.51ms +step:1595/2285 train_time:96521ms step_avg:60.51ms +step:1596/2285 train_time:96580ms step_avg:60.51ms +step:1597/2285 train_time:96643ms step_avg:60.52ms +step:1598/2285 train_time:96703ms step_avg:60.52ms +step:1599/2285 train_time:96767ms step_avg:60.52ms +step:1600/2285 train_time:96827ms step_avg:60.52ms +step:1601/2285 train_time:96890ms step_avg:60.52ms +step:1602/2285 train_time:96949ms step_avg:60.52ms +step:1603/2285 train_time:97012ms 
step_avg:60.52ms +step:1604/2285 train_time:97072ms step_avg:60.52ms +step:1605/2285 train_time:97135ms step_avg:60.52ms +step:1606/2285 train_time:97195ms step_avg:60.52ms +step:1607/2285 train_time:97256ms step_avg:60.52ms +step:1608/2285 train_time:97316ms step_avg:60.52ms +step:1609/2285 train_time:97378ms step_avg:60.52ms +step:1610/2285 train_time:97437ms step_avg:60.52ms +step:1611/2285 train_time:97500ms step_avg:60.52ms +step:1612/2285 train_time:97560ms step_avg:60.52ms +step:1613/2285 train_time:97622ms step_avg:60.52ms +step:1614/2285 train_time:97682ms step_avg:60.52ms +step:1615/2285 train_time:97745ms step_avg:60.52ms +step:1616/2285 train_time:97807ms step_avg:60.52ms +step:1617/2285 train_time:97869ms step_avg:60.53ms +step:1618/2285 train_time:97929ms step_avg:60.52ms +step:1619/2285 train_time:97992ms step_avg:60.53ms +step:1620/2285 train_time:98051ms step_avg:60.53ms +step:1621/2285 train_time:98114ms step_avg:60.53ms +step:1622/2285 train_time:98174ms step_avg:60.53ms +step:1623/2285 train_time:98235ms step_avg:60.53ms +step:1624/2285 train_time:98295ms step_avg:60.53ms +step:1625/2285 train_time:98357ms step_avg:60.53ms +step:1626/2285 train_time:98417ms step_avg:60.53ms +step:1627/2285 train_time:98479ms step_avg:60.53ms +step:1628/2285 train_time:98539ms step_avg:60.53ms +step:1629/2285 train_time:98601ms step_avg:60.53ms +step:1630/2285 train_time:98662ms step_avg:60.53ms +step:1631/2285 train_time:98724ms step_avg:60.53ms +step:1632/2285 train_time:98784ms step_avg:60.53ms +step:1633/2285 train_time:98847ms step_avg:60.53ms +step:1634/2285 train_time:98908ms step_avg:60.53ms +step:1635/2285 train_time:98970ms step_avg:60.53ms +step:1636/2285 train_time:99031ms step_avg:60.53ms +step:1637/2285 train_time:99093ms step_avg:60.53ms +step:1638/2285 train_time:99153ms step_avg:60.53ms +step:1639/2285 train_time:99215ms step_avg:60.53ms +step:1640/2285 train_time:99275ms step_avg:60.53ms +step:1641/2285 train_time:99337ms step_avg:60.53ms +step:1642/2285 train_time:99396ms step_avg:60.53ms +step:1643/2285 train_time:99458ms step_avg:60.53ms +step:1644/2285 train_time:99518ms step_avg:60.53ms +step:1645/2285 train_time:99580ms step_avg:60.53ms +step:1646/2285 train_time:99640ms step_avg:60.53ms +step:1647/2285 train_time:99702ms step_avg:60.54ms +step:1648/2285 train_time:99762ms step_avg:60.53ms +step:1649/2285 train_time:99825ms step_avg:60.54ms +step:1650/2285 train_time:99885ms step_avg:60.54ms +step:1651/2285 train_time:99949ms step_avg:60.54ms +step:1652/2285 train_time:100010ms step_avg:60.54ms +step:1653/2285 train_time:100072ms step_avg:60.54ms +step:1654/2285 train_time:100132ms step_avg:60.54ms +step:1655/2285 train_time:100194ms step_avg:60.54ms +step:1656/2285 train_time:100254ms step_avg:60.54ms +step:1657/2285 train_time:100317ms step_avg:60.54ms +step:1658/2285 train_time:100376ms step_avg:60.54ms +step:1659/2285 train_time:100438ms step_avg:60.54ms +step:1660/2285 train_time:100498ms step_avg:60.54ms +step:1661/2285 train_time:100560ms step_avg:60.54ms +step:1662/2285 train_time:100620ms step_avg:60.54ms +step:1663/2285 train_time:100682ms step_avg:60.54ms +step:1664/2285 train_time:100741ms step_avg:60.54ms +step:1665/2285 train_time:100804ms step_avg:60.54ms +step:1666/2285 train_time:100864ms step_avg:60.54ms +step:1667/2285 train_time:100926ms step_avg:60.54ms +step:1668/2285 train_time:100987ms step_avg:60.54ms +step:1669/2285 train_time:101049ms step_avg:60.54ms +step:1670/2285 train_time:101110ms step_avg:60.54ms +step:1671/2285 
train_time:101172ms step_avg:60.55ms +step:1672/2285 train_time:101232ms step_avg:60.55ms +step:1673/2285 train_time:101295ms step_avg:60.55ms +step:1674/2285 train_time:101354ms step_avg:60.55ms +step:1675/2285 train_time:101416ms step_avg:60.55ms +step:1676/2285 train_time:101475ms step_avg:60.55ms +step:1677/2285 train_time:101538ms step_avg:60.55ms +step:1678/2285 train_time:101598ms step_avg:60.55ms +step:1679/2285 train_time:101660ms step_avg:60.55ms +step:1680/2285 train_time:101720ms step_avg:60.55ms +step:1681/2285 train_time:101782ms step_avg:60.55ms +step:1682/2285 train_time:101842ms step_avg:60.55ms +step:1683/2285 train_time:101905ms step_avg:60.55ms +step:1684/2285 train_time:101965ms step_avg:60.55ms +step:1685/2285 train_time:102028ms step_avg:60.55ms +step:1686/2285 train_time:102088ms step_avg:60.55ms +step:1687/2285 train_time:102152ms step_avg:60.55ms +step:1688/2285 train_time:102212ms step_avg:60.55ms +step:1689/2285 train_time:102275ms step_avg:60.55ms +step:1690/2285 train_time:102334ms step_avg:60.55ms +step:1691/2285 train_time:102396ms step_avg:60.55ms +step:1692/2285 train_time:102456ms step_avg:60.55ms +step:1693/2285 train_time:102518ms step_avg:60.55ms +step:1694/2285 train_time:102578ms step_avg:60.55ms +step:1695/2285 train_time:102640ms step_avg:60.55ms +step:1696/2285 train_time:102700ms step_avg:60.55ms +step:1697/2285 train_time:102762ms step_avg:60.56ms +step:1698/2285 train_time:102822ms step_avg:60.55ms +step:1699/2285 train_time:102885ms step_avg:60.56ms +step:1700/2285 train_time:102945ms step_avg:60.56ms +step:1701/2285 train_time:103008ms step_avg:60.56ms +step:1702/2285 train_time:103069ms step_avg:60.56ms +step:1703/2285 train_time:103132ms step_avg:60.56ms +step:1704/2285 train_time:103192ms step_avg:60.56ms +step:1705/2285 train_time:103254ms step_avg:60.56ms +step:1706/2285 train_time:103314ms step_avg:60.56ms +step:1707/2285 train_time:103378ms step_avg:60.56ms +step:1708/2285 train_time:103436ms step_avg:60.56ms +step:1709/2285 train_time:103499ms step_avg:60.56ms +step:1710/2285 train_time:103559ms step_avg:60.56ms +step:1711/2285 train_time:103621ms step_avg:60.56ms +step:1712/2285 train_time:103680ms step_avg:60.56ms +step:1713/2285 train_time:103742ms step_avg:60.56ms +step:1714/2285 train_time:103802ms step_avg:60.56ms +step:1715/2285 train_time:103865ms step_avg:60.56ms +step:1716/2285 train_time:103925ms step_avg:60.56ms +step:1717/2285 train_time:103988ms step_avg:60.56ms +step:1718/2285 train_time:104048ms step_avg:60.56ms +step:1719/2285 train_time:104112ms step_avg:60.57ms +step:1720/2285 train_time:104171ms step_avg:60.56ms +step:1721/2285 train_time:104234ms step_avg:60.57ms +step:1722/2285 train_time:104294ms step_avg:60.57ms +step:1723/2285 train_time:104356ms step_avg:60.57ms +step:1724/2285 train_time:104416ms step_avg:60.57ms +step:1725/2285 train_time:104478ms step_avg:60.57ms +step:1726/2285 train_time:104537ms step_avg:60.57ms +step:1727/2285 train_time:104599ms step_avg:60.57ms +step:1728/2285 train_time:104659ms step_avg:60.57ms +step:1729/2285 train_time:104721ms step_avg:60.57ms +step:1730/2285 train_time:104781ms step_avg:60.57ms +step:1731/2285 train_time:104843ms step_avg:60.57ms +step:1732/2285 train_time:104904ms step_avg:60.57ms +step:1733/2285 train_time:104967ms step_avg:60.57ms +step:1734/2285 train_time:105027ms step_avg:60.57ms +step:1735/2285 train_time:105090ms step_avg:60.57ms +step:1736/2285 train_time:105150ms step_avg:60.57ms +step:1737/2285 train_time:105212ms step_avg:60.57ms +step:1738/2285 
train_time:105272ms step_avg:60.57ms +step:1739/2285 train_time:105334ms step_avg:60.57ms +step:1740/2285 train_time:105393ms step_avg:60.57ms +step:1741/2285 train_time:105456ms step_avg:60.57ms +step:1742/2285 train_time:105516ms step_avg:60.57ms +step:1743/2285 train_time:105578ms step_avg:60.57ms +step:1744/2285 train_time:105638ms step_avg:60.57ms +step:1745/2285 train_time:105700ms step_avg:60.57ms +step:1746/2285 train_time:105759ms step_avg:60.57ms +step:1747/2285 train_time:105821ms step_avg:60.57ms +step:1748/2285 train_time:105881ms step_avg:60.57ms +step:1749/2285 train_time:105944ms step_avg:60.57ms +step:1750/2285 train_time:106004ms step_avg:60.57ms +step:1750/2285 val_loss:3.3698 train_time:106069ms step_avg:60.61ms +step:1751/2285 train_time:106087ms step_avg:60.59ms +step:1752/2285 train_time:106132ms step_avg:60.58ms +step:1753/2285 train_time:106194ms step_avg:60.58ms +step:1754/2285 train_time:106254ms step_avg:60.58ms +step:1755/2285 train_time:106318ms step_avg:60.58ms +step:1756/2285 train_time:106378ms step_avg:60.58ms +step:1757/2285 train_time:106439ms step_avg:60.58ms +step:1758/2285 train_time:106498ms step_avg:60.58ms +step:1759/2285 train_time:106559ms step_avg:60.58ms +step:1760/2285 train_time:106618ms step_avg:60.58ms +step:1761/2285 train_time:106680ms step_avg:60.58ms +step:1762/2285 train_time:106740ms step_avg:60.58ms +step:1763/2285 train_time:106803ms step_avg:60.58ms +step:1764/2285 train_time:106862ms step_avg:60.58ms +step:1765/2285 train_time:106923ms step_avg:60.58ms +step:1766/2285 train_time:106986ms step_avg:60.58ms +step:1767/2285 train_time:107050ms step_avg:60.58ms +step:1768/2285 train_time:107110ms step_avg:60.58ms +step:1769/2285 train_time:107173ms step_avg:60.58ms +step:1770/2285 train_time:107233ms step_avg:60.58ms +step:1771/2285 train_time:107295ms step_avg:60.58ms +step:1772/2285 train_time:107355ms step_avg:60.58ms +step:1773/2285 train_time:107417ms step_avg:60.59ms +step:1774/2285 train_time:107477ms step_avg:60.58ms +step:1775/2285 train_time:107539ms step_avg:60.59ms +step:1776/2285 train_time:107598ms step_avg:60.58ms +step:1777/2285 train_time:107660ms step_avg:60.59ms +step:1778/2285 train_time:107719ms step_avg:60.58ms +step:1779/2285 train_time:107781ms step_avg:60.59ms +step:1780/2285 train_time:107841ms step_avg:60.58ms +step:1781/2285 train_time:107903ms step_avg:60.59ms +step:1782/2285 train_time:107964ms step_avg:60.59ms +step:1783/2285 train_time:108027ms step_avg:60.59ms +step:1784/2285 train_time:108087ms step_avg:60.59ms +step:1785/2285 train_time:108150ms step_avg:60.59ms +step:1786/2285 train_time:108209ms step_avg:60.59ms +step:1787/2285 train_time:108271ms step_avg:60.59ms +step:1788/2285 train_time:108331ms step_avg:60.59ms +step:1789/2285 train_time:108393ms step_avg:60.59ms +step:1790/2285 train_time:108453ms step_avg:60.59ms +step:1791/2285 train_time:108515ms step_avg:60.59ms +step:1792/2285 train_time:108575ms step_avg:60.59ms +step:1793/2285 train_time:108637ms step_avg:60.59ms +step:1794/2285 train_time:108697ms step_avg:60.59ms +step:1795/2285 train_time:108759ms step_avg:60.59ms +step:1796/2285 train_time:108818ms step_avg:60.59ms +step:1797/2285 train_time:108881ms step_avg:60.59ms +step:1798/2285 train_time:108942ms step_avg:60.59ms +step:1799/2285 train_time:109005ms step_avg:60.59ms +step:1800/2285 train_time:109066ms step_avg:60.59ms +step:1801/2285 train_time:109128ms step_avg:60.59ms +step:1802/2285 train_time:109188ms step_avg:60.59ms +step:1803/2285 train_time:109251ms step_avg:60.59ms 
+step:1804/2285 train_time:109310ms step_avg:60.59ms +step:1805/2285 train_time:109372ms step_avg:60.59ms +step:1806/2285 train_time:109432ms step_avg:60.59ms +step:1807/2285 train_time:109494ms step_avg:60.59ms +step:1808/2285 train_time:109554ms step_avg:60.59ms +step:1809/2285 train_time:109616ms step_avg:60.59ms +step:1810/2285 train_time:109676ms step_avg:60.59ms +step:1811/2285 train_time:109738ms step_avg:60.60ms +step:1812/2285 train_time:109798ms step_avg:60.60ms +step:1813/2285 train_time:109861ms step_avg:60.60ms +step:1814/2285 train_time:109921ms step_avg:60.60ms +step:1815/2285 train_time:109984ms step_avg:60.60ms +step:1816/2285 train_time:110044ms step_avg:60.60ms +step:1817/2285 train_time:110106ms step_avg:60.60ms +step:1818/2285 train_time:110166ms step_avg:60.60ms +step:1819/2285 train_time:110228ms step_avg:60.60ms +step:1820/2285 train_time:110289ms step_avg:60.60ms +step:1821/2285 train_time:110351ms step_avg:60.60ms +step:1822/2285 train_time:110414ms step_avg:60.60ms +step:1823/2285 train_time:110473ms step_avg:60.60ms +step:1824/2285 train_time:110532ms step_avg:60.60ms +step:1825/2285 train_time:110594ms step_avg:60.60ms +step:1826/2285 train_time:110654ms step_avg:60.60ms +step:1827/2285 train_time:110717ms step_avg:60.60ms +step:1828/2285 train_time:110777ms step_avg:60.60ms +step:1829/2285 train_time:110840ms step_avg:60.60ms +step:1830/2285 train_time:110900ms step_avg:60.60ms +step:1831/2285 train_time:110963ms step_avg:60.60ms +step:1832/2285 train_time:111023ms step_avg:60.60ms +step:1833/2285 train_time:111086ms step_avg:60.60ms +step:1834/2285 train_time:111146ms step_avg:60.60ms +step:1835/2285 train_time:111208ms step_avg:60.60ms +step:1836/2285 train_time:111268ms step_avg:60.60ms +step:1837/2285 train_time:111330ms step_avg:60.60ms +step:1838/2285 train_time:111391ms step_avg:60.60ms +step:1839/2285 train_time:111453ms step_avg:60.61ms +step:1840/2285 train_time:111512ms step_avg:60.60ms +step:1841/2285 train_time:111574ms step_avg:60.61ms +step:1842/2285 train_time:111634ms step_avg:60.60ms +step:1843/2285 train_time:111696ms step_avg:60.61ms +step:1844/2285 train_time:111756ms step_avg:60.61ms +step:1845/2285 train_time:111819ms step_avg:60.61ms +step:1846/2285 train_time:111880ms step_avg:60.61ms +step:1847/2285 train_time:111943ms step_avg:60.61ms +step:1848/2285 train_time:112002ms step_avg:60.61ms +step:1849/2285 train_time:112065ms step_avg:60.61ms +step:1850/2285 train_time:112125ms step_avg:60.61ms +step:1851/2285 train_time:112188ms step_avg:60.61ms +step:1852/2285 train_time:112248ms step_avg:60.61ms +step:1853/2285 train_time:112312ms step_avg:60.61ms +step:1854/2285 train_time:112370ms step_avg:60.61ms +step:1855/2285 train_time:112432ms step_avg:60.61ms +step:1856/2285 train_time:112492ms step_avg:60.61ms +step:1857/2285 train_time:112554ms step_avg:60.61ms +step:1858/2285 train_time:112613ms step_avg:60.61ms +step:1859/2285 train_time:112675ms step_avg:60.61ms +step:1860/2285 train_time:112735ms step_avg:60.61ms +step:1861/2285 train_time:112798ms step_avg:60.61ms +step:1862/2285 train_time:112859ms step_avg:60.61ms +step:1863/2285 train_time:112921ms step_avg:60.61ms +step:1864/2285 train_time:112981ms step_avg:60.61ms +step:1865/2285 train_time:113044ms step_avg:60.61ms +step:1866/2285 train_time:113104ms step_avg:60.61ms +step:1867/2285 train_time:113167ms step_avg:60.61ms +step:1868/2285 train_time:113226ms step_avg:60.61ms +step:1869/2285 train_time:113288ms step_avg:60.61ms +step:1870/2285 train_time:113348ms step_avg:60.61ms 
+step:1871/2285 train_time:113410ms step_avg:60.61ms +step:1872/2285 train_time:113470ms step_avg:60.61ms +step:1873/2285 train_time:113532ms step_avg:60.61ms +step:1874/2285 train_time:113591ms step_avg:60.61ms +step:1875/2285 train_time:113654ms step_avg:60.62ms +step:1876/2285 train_time:113713ms step_avg:60.61ms +step:1877/2285 train_time:113775ms step_avg:60.62ms +step:1878/2285 train_time:113835ms step_avg:60.62ms +step:1879/2285 train_time:113897ms step_avg:60.62ms +step:1880/2285 train_time:113958ms step_avg:60.62ms +step:1881/2285 train_time:114021ms step_avg:60.62ms +step:1882/2285 train_time:114081ms step_avg:60.62ms +step:1883/2285 train_time:114144ms step_avg:60.62ms +step:1884/2285 train_time:114204ms step_avg:60.62ms +step:1885/2285 train_time:114266ms step_avg:60.62ms +step:1886/2285 train_time:114326ms step_avg:60.62ms +step:1887/2285 train_time:114389ms step_avg:60.62ms +step:1888/2285 train_time:114448ms step_avg:60.62ms +step:1889/2285 train_time:114511ms step_avg:60.62ms +step:1890/2285 train_time:114570ms step_avg:60.62ms +step:1891/2285 train_time:114632ms step_avg:60.62ms +step:1892/2285 train_time:114691ms step_avg:60.62ms +step:1893/2285 train_time:114754ms step_avg:60.62ms +step:1894/2285 train_time:114813ms step_avg:60.62ms +step:1895/2285 train_time:114876ms step_avg:60.62ms +step:1896/2285 train_time:114936ms step_avg:60.62ms +step:1897/2285 train_time:114999ms step_avg:60.62ms +step:1898/2285 train_time:115060ms step_avg:60.62ms +step:1899/2285 train_time:115123ms step_avg:60.62ms +step:1900/2285 train_time:115183ms step_avg:60.62ms +step:1901/2285 train_time:115246ms step_avg:60.62ms +step:1902/2285 train_time:115305ms step_avg:60.62ms +step:1903/2285 train_time:115369ms step_avg:60.62ms +step:1904/2285 train_time:115428ms step_avg:60.62ms +step:1905/2285 train_time:115490ms step_avg:60.62ms +step:1906/2285 train_time:115550ms step_avg:60.62ms +step:1907/2285 train_time:115612ms step_avg:60.63ms +step:1908/2285 train_time:115671ms step_avg:60.62ms +step:1909/2285 train_time:115734ms step_avg:60.63ms +step:1910/2285 train_time:115794ms step_avg:60.63ms +step:1911/2285 train_time:115856ms step_avg:60.63ms +step:1912/2285 train_time:115917ms step_avg:60.63ms +step:1913/2285 train_time:115980ms step_avg:60.63ms +step:1914/2285 train_time:116041ms step_avg:60.63ms +step:1915/2285 train_time:116103ms step_avg:60.63ms +step:1916/2285 train_time:116164ms step_avg:60.63ms +step:1917/2285 train_time:116226ms step_avg:60.63ms +step:1918/2285 train_time:116287ms step_avg:60.63ms +step:1919/2285 train_time:116349ms step_avg:60.63ms +step:1920/2285 train_time:116409ms step_avg:60.63ms +step:1921/2285 train_time:116471ms step_avg:60.63ms +step:1922/2285 train_time:116531ms step_avg:60.63ms +step:1923/2285 train_time:116593ms step_avg:60.63ms +step:1924/2285 train_time:116653ms step_avg:60.63ms +step:1925/2285 train_time:116714ms step_avg:60.63ms +step:1926/2285 train_time:116774ms step_avg:60.63ms +step:1927/2285 train_time:116837ms step_avg:60.63ms +step:1928/2285 train_time:116897ms step_avg:60.63ms +step:1929/2285 train_time:116960ms step_avg:60.63ms +step:1930/2285 train_time:117021ms step_avg:60.63ms +step:1931/2285 train_time:117083ms step_avg:60.63ms +step:1932/2285 train_time:117144ms step_avg:60.63ms +step:1933/2285 train_time:117206ms step_avg:60.63ms +step:1934/2285 train_time:117266ms step_avg:60.63ms +step:1935/2285 train_time:117329ms step_avg:60.64ms +step:1936/2285 train_time:117389ms step_avg:60.63ms +step:1937/2285 train_time:117451ms step_avg:60.64ms 
+step:1938/2285 train_time:117511ms step_avg:60.64ms +step:1939/2285 train_time:117573ms step_avg:60.64ms +step:1940/2285 train_time:117632ms step_avg:60.64ms +step:1941/2285 train_time:117695ms step_avg:60.64ms +step:1942/2285 train_time:117754ms step_avg:60.64ms +step:1943/2285 train_time:117817ms step_avg:60.64ms +step:1944/2285 train_time:117877ms step_avg:60.64ms +step:1945/2285 train_time:117940ms step_avg:60.64ms +step:1946/2285 train_time:118001ms step_avg:60.64ms +step:1947/2285 train_time:118063ms step_avg:60.64ms +step:1948/2285 train_time:118123ms step_avg:60.64ms +step:1949/2285 train_time:118186ms step_avg:60.64ms +step:1950/2285 train_time:118247ms step_avg:60.64ms +step:1951/2285 train_time:118308ms step_avg:60.64ms +step:1952/2285 train_time:118368ms step_avg:60.64ms +step:1953/2285 train_time:118431ms step_avg:60.64ms +step:1954/2285 train_time:118491ms step_avg:60.64ms +step:1955/2285 train_time:118553ms step_avg:60.64ms +step:1956/2285 train_time:118612ms step_avg:60.64ms +step:1957/2285 train_time:118674ms step_avg:60.64ms +step:1958/2285 train_time:118735ms step_avg:60.64ms +step:1959/2285 train_time:118797ms step_avg:60.64ms +step:1960/2285 train_time:118858ms step_avg:60.64ms +step:1961/2285 train_time:118920ms step_avg:60.64ms +step:1962/2285 train_time:118981ms step_avg:60.64ms +step:1963/2285 train_time:119044ms step_avg:60.64ms +step:1964/2285 train_time:119103ms step_avg:60.64ms +step:1965/2285 train_time:119166ms step_avg:60.64ms +step:1966/2285 train_time:119226ms step_avg:60.64ms +step:1967/2285 train_time:119289ms step_avg:60.64ms +step:1968/2285 train_time:119349ms step_avg:60.64ms +step:1969/2285 train_time:119411ms step_avg:60.65ms +step:1970/2285 train_time:119471ms step_avg:60.65ms +step:1971/2285 train_time:119533ms step_avg:60.65ms +step:1972/2285 train_time:119592ms step_avg:60.65ms +step:1973/2285 train_time:119654ms step_avg:60.65ms +step:1974/2285 train_time:119714ms step_avg:60.65ms +step:1975/2285 train_time:119777ms step_avg:60.65ms +step:1976/2285 train_time:119837ms step_avg:60.65ms +step:1977/2285 train_time:119900ms step_avg:60.65ms +step:1978/2285 train_time:119960ms step_avg:60.65ms +step:1979/2285 train_time:120023ms step_avg:60.65ms +step:1980/2285 train_time:120082ms step_avg:60.65ms +step:1981/2285 train_time:120145ms step_avg:60.65ms +step:1982/2285 train_time:120205ms step_avg:60.65ms +step:1983/2285 train_time:120268ms step_avg:60.65ms +step:1984/2285 train_time:120328ms step_avg:60.65ms +step:1985/2285 train_time:120390ms step_avg:60.65ms +step:1986/2285 train_time:120451ms step_avg:60.65ms +step:1987/2285 train_time:120512ms step_avg:60.65ms +step:1988/2285 train_time:120572ms step_avg:60.65ms +step:1989/2285 train_time:120634ms step_avg:60.65ms +step:1990/2285 train_time:120693ms step_avg:60.65ms +step:1991/2285 train_time:120756ms step_avg:60.65ms +step:1992/2285 train_time:120819ms step_avg:60.65ms +step:1993/2285 train_time:120879ms step_avg:60.65ms +step:1994/2285 train_time:120940ms step_avg:60.65ms +step:1995/2285 train_time:121003ms step_avg:60.65ms +step:1996/2285 train_time:121063ms step_avg:60.65ms +step:1997/2285 train_time:121125ms step_avg:60.65ms +step:1998/2285 train_time:121185ms step_avg:60.65ms +step:1999/2285 train_time:121248ms step_avg:60.65ms +step:2000/2285 train_time:121308ms step_avg:60.65ms +step:2000/2285 val_loss:3.3212 train_time:121371ms step_avg:60.69ms +step:2001/2285 train_time:121390ms step_avg:60.66ms +step:2002/2285 train_time:121435ms step_avg:60.66ms +step:2003/2285 train_time:121497ms 
step_avg:60.66ms +step:2004/2285 train_time:121559ms step_avg:60.66ms +step:2005/2285 train_time:121622ms step_avg:60.66ms +step:2006/2285 train_time:121683ms step_avg:60.66ms +step:2007/2285 train_time:121745ms step_avg:60.66ms +step:2008/2285 train_time:121804ms step_avg:60.66ms +step:2009/2285 train_time:121866ms step_avg:60.66ms +step:2010/2285 train_time:121925ms step_avg:60.66ms +step:2011/2285 train_time:121987ms step_avg:60.66ms +step:2012/2285 train_time:122046ms step_avg:60.66ms +step:2013/2285 train_time:122108ms step_avg:60.66ms +step:2014/2285 train_time:122167ms step_avg:60.66ms +step:2015/2285 train_time:122229ms step_avg:60.66ms +step:2016/2285 train_time:122292ms step_avg:60.66ms +step:2017/2285 train_time:122357ms step_avg:60.66ms +step:2018/2285 train_time:122418ms step_avg:60.66ms +step:2019/2285 train_time:122481ms step_avg:60.66ms +step:2020/2285 train_time:122542ms step_avg:60.66ms +step:2021/2285 train_time:122606ms step_avg:60.67ms +step:2022/2285 train_time:122667ms step_avg:60.67ms +step:2023/2285 train_time:122729ms step_avg:60.67ms +step:2024/2285 train_time:122788ms step_avg:60.67ms +step:2025/2285 train_time:122850ms step_avg:60.67ms +step:2026/2285 train_time:122909ms step_avg:60.67ms +step:2027/2285 train_time:122970ms step_avg:60.67ms +step:2028/2285 train_time:123029ms step_avg:60.67ms +step:2029/2285 train_time:123091ms step_avg:60.67ms +step:2030/2285 train_time:123151ms step_avg:60.67ms +step:2031/2285 train_time:123213ms step_avg:60.67ms +step:2032/2285 train_time:123274ms step_avg:60.67ms +step:2033/2285 train_time:123337ms step_avg:60.67ms +step:2034/2285 train_time:123398ms step_avg:60.67ms +step:2035/2285 train_time:123461ms step_avg:60.67ms +step:2036/2285 train_time:123521ms step_avg:60.67ms +step:2037/2285 train_time:123585ms step_avg:60.67ms +step:2038/2285 train_time:123645ms step_avg:60.67ms +step:2039/2285 train_time:123707ms step_avg:60.67ms +step:2040/2285 train_time:123767ms step_avg:60.67ms +step:2041/2285 train_time:123829ms step_avg:60.67ms +step:2042/2285 train_time:123889ms step_avg:60.67ms +step:2043/2285 train_time:123950ms step_avg:60.67ms +step:2044/2285 train_time:124010ms step_avg:60.67ms +step:2045/2285 train_time:124072ms step_avg:60.67ms +step:2046/2285 train_time:124132ms step_avg:60.67ms +step:2047/2285 train_time:124194ms step_avg:60.67ms +step:2048/2285 train_time:124253ms step_avg:60.67ms +step:2049/2285 train_time:124316ms step_avg:60.67ms +step:2050/2285 train_time:124377ms step_avg:60.67ms +step:2051/2285 train_time:124439ms step_avg:60.67ms +step:2052/2285 train_time:124501ms step_avg:60.67ms +step:2053/2285 train_time:124565ms step_avg:60.67ms +step:2054/2285 train_time:124625ms step_avg:60.67ms +step:2055/2285 train_time:124687ms step_avg:60.67ms +step:2056/2285 train_time:124747ms step_avg:60.67ms +step:2057/2285 train_time:124809ms step_avg:60.68ms +step:2058/2285 train_time:124869ms step_avg:60.67ms +step:2059/2285 train_time:124931ms step_avg:60.68ms +step:2060/2285 train_time:124990ms step_avg:60.67ms +step:2061/2285 train_time:125052ms step_avg:60.68ms +step:2062/2285 train_time:125112ms step_avg:60.67ms +step:2063/2285 train_time:125174ms step_avg:60.68ms +step:2064/2285 train_time:125233ms step_avg:60.68ms +step:2065/2285 train_time:125296ms step_avg:60.68ms +step:2066/2285 train_time:125356ms step_avg:60.68ms +step:2067/2285 train_time:125419ms step_avg:60.68ms +step:2068/2285 train_time:125480ms step_avg:60.68ms +step:2069/2285 train_time:125543ms step_avg:60.68ms +step:2070/2285 train_time:125604ms 
step_avg:60.68ms +step:2071/2285 train_time:125667ms step_avg:60.68ms +step:2072/2285 train_time:125726ms step_avg:60.68ms +step:2073/2285 train_time:125788ms step_avg:60.68ms +step:2074/2285 train_time:125848ms step_avg:60.68ms +step:2075/2285 train_time:125911ms step_avg:60.68ms +step:2076/2285 train_time:125971ms step_avg:60.68ms +step:2077/2285 train_time:126033ms step_avg:60.68ms +step:2078/2285 train_time:126093ms step_avg:60.68ms +step:2079/2285 train_time:126155ms step_avg:60.68ms +step:2080/2285 train_time:126214ms step_avg:60.68ms +step:2081/2285 train_time:126277ms step_avg:60.68ms +step:2082/2285 train_time:126337ms step_avg:60.68ms +step:2083/2285 train_time:126401ms step_avg:60.68ms +step:2084/2285 train_time:126461ms step_avg:60.68ms +step:2085/2285 train_time:126524ms step_avg:60.68ms +step:2086/2285 train_time:126584ms step_avg:60.68ms +step:2087/2285 train_time:126646ms step_avg:60.68ms +step:2088/2285 train_time:126706ms step_avg:60.68ms +step:2089/2285 train_time:126768ms step_avg:60.68ms +step:2090/2285 train_time:126828ms step_avg:60.68ms +step:2091/2285 train_time:126890ms step_avg:60.68ms +step:2092/2285 train_time:126950ms step_avg:60.68ms +step:2093/2285 train_time:127012ms step_avg:60.68ms +step:2094/2285 train_time:127073ms step_avg:60.68ms +step:2095/2285 train_time:127135ms step_avg:60.68ms +step:2096/2285 train_time:127194ms step_avg:60.68ms +step:2097/2285 train_time:127257ms step_avg:60.69ms +step:2098/2285 train_time:127316ms step_avg:60.68ms +step:2099/2285 train_time:127379ms step_avg:60.69ms +step:2100/2285 train_time:127439ms step_avg:60.69ms +step:2101/2285 train_time:127502ms step_avg:60.69ms +step:2102/2285 train_time:127563ms step_avg:60.69ms +step:2103/2285 train_time:127625ms step_avg:60.69ms +step:2104/2285 train_time:127685ms step_avg:60.69ms +step:2105/2285 train_time:127748ms step_avg:60.69ms +step:2106/2285 train_time:127808ms step_avg:60.69ms +step:2107/2285 train_time:127870ms step_avg:60.69ms +step:2108/2285 train_time:127930ms step_avg:60.69ms +step:2109/2285 train_time:127992ms step_avg:60.69ms +step:2110/2285 train_time:128052ms step_avg:60.69ms +step:2111/2285 train_time:128114ms step_avg:60.69ms +step:2112/2285 train_time:128174ms step_avg:60.69ms +step:2113/2285 train_time:128236ms step_avg:60.69ms +step:2114/2285 train_time:128296ms step_avg:60.69ms +step:2115/2285 train_time:128359ms step_avg:60.69ms +step:2116/2285 train_time:128419ms step_avg:60.69ms +step:2117/2285 train_time:128482ms step_avg:60.69ms +step:2118/2285 train_time:128542ms step_avg:60.69ms +step:2119/2285 train_time:128605ms step_avg:60.69ms +step:2120/2285 train_time:128666ms step_avg:60.69ms +step:2121/2285 train_time:128728ms step_avg:60.69ms +step:2122/2285 train_time:128788ms step_avg:60.69ms +step:2123/2285 train_time:128850ms step_avg:60.69ms +step:2124/2285 train_time:128911ms step_avg:60.69ms +step:2125/2285 train_time:128973ms step_avg:60.69ms +step:2126/2285 train_time:129032ms step_avg:60.69ms +step:2127/2285 train_time:129095ms step_avg:60.69ms +step:2128/2285 train_time:129154ms step_avg:60.69ms +step:2129/2285 train_time:129217ms step_avg:60.69ms +step:2130/2285 train_time:129277ms step_avg:60.69ms +step:2131/2285 train_time:129340ms step_avg:60.69ms +step:2132/2285 train_time:129400ms step_avg:60.69ms +step:2133/2285 train_time:129463ms step_avg:60.70ms +step:2134/2285 train_time:129523ms step_avg:60.70ms +step:2135/2285 train_time:129586ms step_avg:60.70ms +step:2136/2285 train_time:129646ms step_avg:60.70ms +step:2137/2285 train_time:129708ms 
step_avg:60.70ms +step:2138/2285 train_time:129769ms step_avg:60.70ms +step:2139/2285 train_time:129831ms step_avg:60.70ms +step:2140/2285 train_time:129891ms step_avg:60.70ms +step:2141/2285 train_time:129953ms step_avg:60.70ms +step:2142/2285 train_time:130013ms step_avg:60.70ms +step:2143/2285 train_time:130075ms step_avg:60.70ms +step:2144/2285 train_time:130135ms step_avg:60.70ms +step:2145/2285 train_time:130197ms step_avg:60.70ms +step:2146/2285 train_time:130257ms step_avg:60.70ms +step:2147/2285 train_time:130320ms step_avg:60.70ms +step:2148/2285 train_time:130380ms step_avg:60.70ms +step:2149/2285 train_time:130443ms step_avg:60.70ms +step:2150/2285 train_time:130503ms step_avg:60.70ms +step:2151/2285 train_time:130567ms step_avg:60.70ms +step:2152/2285 train_time:130626ms step_avg:60.70ms +step:2153/2285 train_time:130689ms step_avg:60.70ms +step:2154/2285 train_time:130749ms step_avg:60.70ms +step:2155/2285 train_time:130811ms step_avg:60.70ms +step:2156/2285 train_time:130871ms step_avg:60.70ms +step:2157/2285 train_time:130933ms step_avg:60.70ms +step:2158/2285 train_time:130993ms step_avg:60.70ms +step:2159/2285 train_time:131055ms step_avg:60.70ms +step:2160/2285 train_time:131115ms step_avg:60.70ms +step:2161/2285 train_time:131177ms step_avg:60.70ms +step:2162/2285 train_time:131237ms step_avg:60.70ms +step:2163/2285 train_time:131300ms step_avg:60.70ms +step:2164/2285 train_time:131360ms step_avg:60.70ms +step:2165/2285 train_time:131423ms step_avg:60.70ms +step:2166/2285 train_time:131484ms step_avg:60.70ms +step:2167/2285 train_time:131546ms step_avg:60.70ms +step:2168/2285 train_time:131606ms step_avg:60.70ms +step:2169/2285 train_time:131670ms step_avg:60.71ms +step:2170/2285 train_time:131730ms step_avg:60.71ms +step:2171/2285 train_time:131793ms step_avg:60.71ms +step:2172/2285 train_time:131853ms step_avg:60.71ms +step:2173/2285 train_time:131915ms step_avg:60.71ms +step:2174/2285 train_time:131976ms step_avg:60.71ms +step:2175/2285 train_time:132038ms step_avg:60.71ms +step:2176/2285 train_time:132099ms step_avg:60.71ms +step:2177/2285 train_time:132161ms step_avg:60.71ms +step:2178/2285 train_time:132221ms step_avg:60.71ms +step:2179/2285 train_time:132284ms step_avg:60.71ms +step:2180/2285 train_time:132344ms step_avg:60.71ms +step:2181/2285 train_time:132407ms step_avg:60.71ms +step:2182/2285 train_time:132467ms step_avg:60.71ms +step:2183/2285 train_time:132529ms step_avg:60.71ms +step:2184/2285 train_time:132589ms step_avg:60.71ms +step:2185/2285 train_time:132652ms step_avg:60.71ms +step:2186/2285 train_time:132712ms step_avg:60.71ms +step:2187/2285 train_time:132776ms step_avg:60.71ms +step:2188/2285 train_time:132835ms step_avg:60.71ms +step:2189/2285 train_time:132898ms step_avg:60.71ms +step:2190/2285 train_time:132958ms step_avg:60.71ms +step:2191/2285 train_time:133021ms step_avg:60.71ms +step:2192/2285 train_time:133081ms step_avg:60.71ms +step:2193/2285 train_time:133143ms step_avg:60.71ms +step:2194/2285 train_time:133203ms step_avg:60.71ms +step:2195/2285 train_time:133265ms step_avg:60.71ms +step:2196/2285 train_time:133325ms step_avg:60.71ms +step:2197/2285 train_time:133387ms step_avg:60.71ms +step:2198/2285 train_time:133447ms step_avg:60.71ms +step:2199/2285 train_time:133510ms step_avg:60.71ms +step:2200/2285 train_time:133570ms step_avg:60.71ms +step:2201/2285 train_time:133633ms step_avg:60.71ms +step:2202/2285 train_time:133693ms step_avg:60.71ms +step:2203/2285 train_time:133756ms step_avg:60.72ms +step:2204/2285 train_time:133816ms 
step_avg:60.72ms +step:2205/2285 train_time:133878ms step_avg:60.72ms +step:2206/2285 train_time:133939ms step_avg:60.72ms +step:2207/2285 train_time:134002ms step_avg:60.72ms +step:2208/2285 train_time:134062ms step_avg:60.72ms +step:2209/2285 train_time:134124ms step_avg:60.72ms +step:2210/2285 train_time:134184ms step_avg:60.72ms +step:2211/2285 train_time:134247ms step_avg:60.72ms +step:2212/2285 train_time:134307ms step_avg:60.72ms +step:2213/2285 train_time:134369ms step_avg:60.72ms +step:2214/2285 train_time:134428ms step_avg:60.72ms +step:2215/2285 train_time:134491ms step_avg:60.72ms +step:2216/2285 train_time:134551ms step_avg:60.72ms +step:2217/2285 train_time:134614ms step_avg:60.72ms +step:2218/2285 train_time:134673ms step_avg:60.72ms +step:2219/2285 train_time:134736ms step_avg:60.72ms +step:2220/2285 train_time:134796ms step_avg:60.72ms +step:2221/2285 train_time:134858ms step_avg:60.72ms +step:2222/2285 train_time:134919ms step_avg:60.72ms +step:2223/2285 train_time:134982ms step_avg:60.72ms +step:2224/2285 train_time:135043ms step_avg:60.72ms +step:2225/2285 train_time:135104ms step_avg:60.72ms +step:2226/2285 train_time:135165ms step_avg:60.72ms +step:2227/2285 train_time:135227ms step_avg:60.72ms +step:2228/2285 train_time:135287ms step_avg:60.72ms +step:2229/2285 train_time:135349ms step_avg:60.72ms +step:2230/2285 train_time:135409ms step_avg:60.72ms +step:2231/2285 train_time:135472ms step_avg:60.72ms +step:2232/2285 train_time:135531ms step_avg:60.72ms +step:2233/2285 train_time:135594ms step_avg:60.72ms +step:2234/2285 train_time:135654ms step_avg:60.72ms +step:2235/2285 train_time:135717ms step_avg:60.72ms +step:2236/2285 train_time:135777ms step_avg:60.72ms +step:2237/2285 train_time:135839ms step_avg:60.72ms +step:2238/2285 train_time:135900ms step_avg:60.72ms +step:2239/2285 train_time:135963ms step_avg:60.72ms +step:2240/2285 train_time:136023ms step_avg:60.72ms +step:2241/2285 train_time:136085ms step_avg:60.73ms +step:2242/2285 train_time:136145ms step_avg:60.72ms +step:2243/2285 train_time:136207ms step_avg:60.73ms +step:2244/2285 train_time:136268ms step_avg:60.73ms +step:2245/2285 train_time:136330ms step_avg:60.73ms +step:2246/2285 train_time:136391ms step_avg:60.73ms +step:2247/2285 train_time:136453ms step_avg:60.73ms +step:2248/2285 train_time:136512ms step_avg:60.73ms +step:2249/2285 train_time:136574ms step_avg:60.73ms +step:2250/2285 train_time:136634ms step_avg:60.73ms +step:2250/2285 val_loss:3.2861 train_time:136698ms step_avg:60.75ms +step:2251/2285 train_time:136715ms step_avg:60.74ms +step:2252/2285 train_time:136760ms step_avg:60.73ms +step:2253/2285 train_time:136823ms step_avg:60.73ms +step:2254/2285 train_time:136883ms step_avg:60.73ms +step:2255/2285 train_time:136945ms step_avg:60.73ms +step:2256/2285 train_time:137005ms step_avg:60.73ms +step:2257/2285 train_time:137066ms step_avg:60.73ms +step:2258/2285 train_time:137126ms step_avg:60.73ms +step:2259/2285 train_time:137187ms step_avg:60.73ms +step:2260/2285 train_time:137247ms step_avg:60.73ms +step:2261/2285 train_time:137310ms step_avg:60.73ms +step:2262/2285 train_time:137370ms step_avg:60.73ms +step:2263/2285 train_time:137433ms step_avg:60.73ms +step:2264/2285 train_time:137492ms step_avg:60.73ms +step:2265/2285 train_time:137555ms step_avg:60.73ms +step:2266/2285 train_time:137616ms step_avg:60.73ms +step:2267/2285 train_time:137680ms step_avg:60.73ms +step:2268/2285 train_time:137741ms step_avg:60.73ms +step:2269/2285 train_time:137804ms step_avg:60.73ms +step:2270/2285 
train_time:137864ms step_avg:60.73ms +step:2271/2285 train_time:137927ms step_avg:60.73ms +step:2272/2285 train_time:137987ms step_avg:60.73ms +step:2273/2285 train_time:138049ms step_avg:60.73ms +step:2274/2285 train_time:138109ms step_avg:60.73ms +step:2275/2285 train_time:138171ms step_avg:60.73ms +step:2276/2285 train_time:138231ms step_avg:60.73ms +step:2277/2285 train_time:138293ms step_avg:60.73ms +step:2278/2285 train_time:138353ms step_avg:60.73ms +step:2279/2285 train_time:138416ms step_avg:60.74ms +step:2280/2285 train_time:138476ms step_avg:60.74ms +step:2281/2285 train_time:138539ms step_avg:60.74ms +step:2282/2285 train_time:138599ms step_avg:60.74ms +step:2283/2285 train_time:138662ms step_avg:60.74ms +step:2284/2285 train_time:138722ms step_avg:60.74ms +step:2285/2285 train_time:138785ms step_avg:60.74ms +step:2285/2285 val_loss:3.2802 train_time:138846ms step_avg:60.76ms +peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt b/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt new file mode 100644 index 000000000..115b6510f --- /dev/null +++ b/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt @@ -0,0 +1,3814 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32)
+        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
+        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
+        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
+        grad_x = torch._scaled_mm(
+            grad_f8,
+            w_f8.T.contiguous().T,
+            out_dtype=torch.bfloat16,
+            scale_a=grad_inv_s,
+            scale_b=w_inv_s,
+            use_fast_accum=False,
+        )
+        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
+        grad_w = torch._scaled_mm(
+            x_f8.T.contiguous(),
+            grad_f8.T.contiguous().T,
+            out_dtype=torch.float32,
+            scale_a=x_inv_s,
+            scale_b=grad_inv_s,
+            use_fast_accum=False,
+        ).T
+        return grad_x, grad_w
+
+    return impl(g, x_f8, w_f8)
+
+@mm_backward_op.register_fake
+def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
+    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
+
+def backward(ctx, grad_out: Tensor, *_):
+    x_f8, w_f8 = ctx.saved_tensors
+    x_s, w_s, grad_s = ctx.scales
+    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
+        grad_out, x_f8, w_f8, x_s, w_s, grad_s
+    )
+    return grad_x, grad_w, None, None, None
+
+def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
+    *_, x_s, w_s, grad_s = inputs
+    _, x_f8, w_f8 = output
+    ctx.save_for_backward(x_f8, w_f8)
+    ctx.scales = x_s, w_s, grad_s
+    ctx.set_materialize_grads(False)
+
+mm_op.register_autograd(backward, setup_context=setup_context)
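+
+# A minimal usage sketch of the custom op (hypothetical shapes and unit scales,
+# left as a comment so the training path is unchanged): autograd flows through
+# the first output only, while the cached fp8 tensors feed the backward pass above.
+#   x = torch.randn(128, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   w = torch.randn(256, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   out, _, _ = torch.ops.nanogpt.mm(x, w, 1.0, 1.0, 1.0)  # ~= x @ w.T in bf16
+#   out.sum().backward()  # dispatches to nanogpt::mm_backward for grad_x / grad_w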
+
+# -----------------------------------------------------------------------------
+# Triton kernel for symmetric matrix multiplication by @byronxu99
+
+def _get_autotune_configs():
+    return [
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": bm,
+                "BLOCK_SIZE_N": bn,
+                "BLOCK_SIZE_K": bk,
+                "GROUP_SIZE_M": 8,
+                "LOWER_UPPER": 1,
+            },
+            num_stages=stages,
+            num_warps=warps,
+        )
+        for bm in [64, 128]
+        for bn in [64, 128, 256]
+        for bk in [64, 128]
+        for stages, warps in [(3, 4), (3, 8), (4, 4)]
+        if bm // bn <= 2 and bn // bm <= 2
+    ]
+
+@triton.jit
+def _pid_to_block(
+    pid,
+    M,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)
+
+    # Map PID to a single matrix in batch
+    batch_idx = pid // (num_pid_m * num_pid_n)
+    pid = pid % (num_pid_m * num_pid_n)
+
+    # Map PID to 2D grid of blocks
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
+
+    m_idx = pid_m * BLOCK_SIZE_M
+    n_idx = pid_n * BLOCK_SIZE_N
+    return batch_idx, m_idx, n_idx
+
+@triton.autotune(
+    configs=_get_autotune_configs(),
+    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
+)
+@triton.jit
+def XXT_kernel(
+    A_ptr, C_ptr,
+    M, K,
+    a_stride_b, a_stride_r, a_stride_c,
+    c_stride_b, c_stride_r, c_stride_c,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    LOWER_UPPER: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    batch_idx, m_idx, n_idx = _pid_to_block(
+        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
+    )
+
+    # Skip blocks that don't need to be computed
+    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
+    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
+    if skip_block_below_diag or skip_block_above_diag:
+        return
+
+    # Index into one matrix of batch
+    A_ptr += batch_idx * a_stride_b
+    C_ptr += batch_idx * c_stride_b
+
+    # Create pointer arrays for A and A.T
+    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
+    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Accumulate over blocks of K
+    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        accumulator = tl.dot(a, at, accumulator)
+        a_ptrs += BLOCK_SIZE_K * a_stride_c
+        at_ptrs += BLOCK_SIZE_K * a_stride_c
+
+    out_dtype = C_ptr.dtype.element_ty
+    output = accumulator.to(out_dtype)
+
+    # Store block of C
+    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, output, mask=c_mask)
+
+    # Store block of C mirrored across the diagonal
+    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
+    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
+
+def XXT(A: torch.Tensor, out: torch.Tensor):
+    """
+    Launch Triton kernel to compute C = A @ A.T
+    """
+    assert A.ndim == 2 or A.ndim == 3
+    M, K = A.shape[-2:]
+    assert out.size(-2) == M, "Output matrix has incorrect shape"
+    assert out.size(-1) == M, "Output matrix has incorrect shape"
+
+    batch_size = A.size(0) if A.ndim == 3 else 1
+    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
+    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    XXT_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        K=K,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+    )
+    return out
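+
+# Because C = A @ A.T is symmetric, the kernels in this section compute each block
+# on only one side of the diagonal and mirror it into the transposed position,
+# roughly halving the work of a general matmul. Hypothetical usage sketch (shapes
+# assumed for illustration):
+#   A = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
+#   C = XXT(A, out=torch.empty(1024, 1024, device="cuda", dtype=torch.bfloat16))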
+
+@triton.autotune(
+    configs=_get_autotune_configs(),
+    key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
+)
+@triton.jit
+def ba_plus_cAA_kernel(
+    A_ptr, C_ptr,
+    M,
+    a_stride_b, a_stride_r, a_stride_c,
+    c_stride_b, c_stride_r, c_stride_c,
+    alpha, beta,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    LOWER_UPPER: tl.constexpr,
+):
+    # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A
+    # Performance is slightly slower than XXT_kernel, so we use two separate kernels
+    pid = tl.program_id(axis=0)
+    batch_idx, m_idx, n_idx = _pid_to_block(
+        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
+    )
+
+    # Skip blocks that don't need to be computed
+    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
+    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
+    if skip_block_below_diag or skip_block_above_diag:
+        return
+
+    # Index into one matrix of batch
+    A_ptr += batch_idx * a_stride_b
+    C_ptr += batch_idx * c_stride_b
+
+    # Create pointer arrays for A and A.T
+    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
+    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Accumulate over blocks of K
+    for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0)
+        at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0)
+        accumulator = tl.dot(a, at, accumulator)
+        a_ptrs += BLOCK_SIZE_K * a_stride_c
+        at_ptrs += BLOCK_SIZE_K * a_stride_c
+
+    # Load block of A to add (corresponds to the current block of C)
+    offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c)
+    a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M)
+    a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32)
+
+    # Apply alpha and beta
+    accumulator *= alpha
+    accumulator += a_add * beta
+
+    out_dtype = C_ptr.dtype.element_ty
+    output = accumulator.to(out_dtype)
+
+    # Store block of C
+    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, output, mask=c_mask)
+
+    # Store block of C mirrored across the diagonal
+    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
+    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
+
+def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor):
+    """
+    Launch Triton kernel to compute C = alpha * A @ A.T + beta * A
+    """
+    assert A.ndim == 2 or A.ndim == 3
+    M, K = A.shape[-2:]
+    assert M == K, "Input matrix must be square"
+    assert out.size(-2) == M
+    assert out.size(-1) == M
+
+    batch_size = A.size(0) if A.ndim == 3 else 1
+    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
+    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    ba_plus_cAA_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+        alpha=alpha,
+        beta=beta,
+    )
+    return out
+
+# Computed for num_iters=5, safety_factor=2e-2, cushion=2
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
+]
+
+@torch.compile(dynamic=False, fullgraph=True)  # Must use dynamic=False or else it's much slower
+def polar_express(G: torch.Tensor):
+    """
+    Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
+    by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
+    Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal.
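+
+    Each (a, b, c) entry in polar_express_coeffs parameterizes one iteration of the
+    odd polynomial update X <- a*X + b*(X @ X.T) @ X + c*(X @ X.T)^2 @ X, which drives
+    all singular values of X toward 1, i.e. toward the polar (orthogonalized) factor of G.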
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class Muon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + Though empirically small 1D params perform efficiently here: + NS approximately performs a magnitude normalization of the grad + This hyper-optimized class has faster execution time than the current impl of Adam for small params + + Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine effective LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            elif params[module_idx].label == "smear_gate":
+                # dividing by the norm is equivalent to SVD orthogonalization for 1-D tensors
+                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+            # Apply weight decay directly to the buffer.
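+            # Decoupled update, in effect: p <- p * (1 - wd * wd_mul) - lr_eff * v,
+            # where lr_eff folds in the max(1, rows/cols)**0.5 shape factor and any
+            # per-param lr_mul; unlike DistAdam below, wd here is not scaled by lr.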
+ param_chunk.mul_(1 - eff_wd) + + param_chunk.add_(-eff_lr * v_chunk) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, 
alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 
* (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
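+        # (c_fc is stored transposed, as (dim, hdim): Muon's shape factor
+        # max(1, rows/cols)**0.5 is 1 for (dim, 4*dim) but would be 2 for the
+        # effective (4*dim, dim) orientation -- hence lr_mul = 2.)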
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
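+        # lr_mul / wd_mul are per-tensor multipliers read by the optimizers via
+        # getattr(param, "lr_mul", 1.0), scaling the shared group lr without
+        # needing extra param groups.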
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +def get_lr(step: int): + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min + return lr + +def get_ws(step: int): + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(args.ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = args.ws_schedule[0] + else: + new_ws_long = args.ws_schedule[ws_idx] + if new_ws_long > ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # 
rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + loss = 0 + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Tue Oct 28 02:08:52 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 40C P0 128W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 127W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 37C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 39C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 38C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 114W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2285 train_time:109ms step_avg:109.38ms +step:2/2285 train_time:130ms step_avg:64.89ms +step:3/2285 train_time:168ms step_avg:56.07ms +step:4/2285 train_time:224ms step_avg:56.11ms +step:5/2285 train_time:284ms step_avg:56.71ms +step:6/2285 train_time:342ms step_avg:56.96ms +step:7/2285 train_time:402ms step_avg:57.43ms +step:8/2285 train_time:460ms step_avg:57.54ms +step:9/2285 train_time:521ms step_avg:57.89ms +step:10/2285 train_time:580ms step_avg:57.95ms +step:11/2285 train_time:640ms step_avg:58.21ms +step:12/2285 train_time:699ms step_avg:58.22ms +step:13/2285 train_time:759ms step_avg:58.39ms +step:14/2285 train_time:818ms step_avg:58.41ms +step:15/2285 train_time:878ms step_avg:58.55ms +step:16/2285 train_time:937ms step_avg:58.54ms +step:17/2285 
train_time:999ms step_avg:58.76ms +step:18/2285 train_time:1061ms step_avg:58.93ms +step:19/2285 train_time:1126ms step_avg:59.24ms +step:20/2285 train_time:1186ms step_avg:59.32ms +step:21/2285 train_time:1248ms step_avg:59.42ms +step:22/2285 train_time:1307ms step_avg:59.40ms +step:23/2285 train_time:1368ms step_avg:59.46ms +step:24/2285 train_time:1426ms step_avg:59.42ms +step:25/2285 train_time:1487ms step_avg:59.49ms +step:26/2285 train_time:1546ms step_avg:59.46ms +step:27/2285 train_time:1607ms step_avg:59.53ms +step:28/2285 train_time:1666ms step_avg:59.49ms +step:29/2285 train_time:1727ms step_avg:59.55ms +step:30/2285 train_time:1786ms step_avg:59.53ms +step:31/2285 train_time:1847ms step_avg:59.59ms +step:32/2285 train_time:1906ms step_avg:59.56ms +step:33/2285 train_time:1968ms step_avg:59.63ms +step:34/2285 train_time:2027ms step_avg:59.63ms +step:35/2285 train_time:2090ms step_avg:59.71ms +step:36/2285 train_time:2149ms step_avg:59.70ms +step:37/2285 train_time:2211ms step_avg:59.76ms +step:38/2285 train_time:2270ms step_avg:59.73ms +step:39/2285 train_time:2331ms step_avg:59.78ms +step:40/2285 train_time:2390ms step_avg:59.76ms +step:41/2285 train_time:2452ms step_avg:59.81ms +step:42/2285 train_time:2512ms step_avg:59.80ms +step:43/2285 train_time:2573ms step_avg:59.83ms +step:44/2285 train_time:2632ms step_avg:59.82ms +step:45/2285 train_time:2693ms step_avg:59.85ms +step:46/2285 train_time:2752ms step_avg:59.83ms +step:47/2285 train_time:2813ms step_avg:59.86ms +step:48/2285 train_time:2873ms step_avg:59.85ms +step:49/2285 train_time:2934ms step_avg:59.89ms +step:50/2285 train_time:2995ms step_avg:59.89ms +step:51/2285 train_time:3057ms step_avg:59.94ms +step:52/2285 train_time:3117ms step_avg:59.94ms +step:53/2285 train_time:3178ms step_avg:59.97ms +step:54/2285 train_time:3237ms step_avg:59.94ms +step:55/2285 train_time:3299ms step_avg:59.98ms +step:56/2285 train_time:3358ms step_avg:59.96ms +step:57/2285 train_time:3419ms step_avg:59.99ms +step:58/2285 train_time:3479ms step_avg:59.98ms +step:59/2285 train_time:3542ms step_avg:60.03ms +step:60/2285 train_time:3601ms step_avg:60.01ms +step:61/2285 train_time:3662ms step_avg:60.03ms +step:62/2285 train_time:3721ms step_avg:60.01ms +step:63/2285 train_time:3782ms step_avg:60.04ms +step:64/2285 train_time:3842ms step_avg:60.02ms +step:65/2285 train_time:3903ms step_avg:60.05ms +step:66/2285 train_time:3963ms step_avg:60.05ms +step:67/2285 train_time:4025ms step_avg:60.07ms +step:68/2285 train_time:4083ms step_avg:60.05ms +step:69/2285 train_time:4145ms step_avg:60.07ms +step:70/2285 train_time:4204ms step_avg:60.05ms +step:71/2285 train_time:4265ms step_avg:60.07ms +step:72/2285 train_time:4324ms step_avg:60.05ms +step:73/2285 train_time:4385ms step_avg:60.06ms +step:74/2285 train_time:4444ms step_avg:60.06ms +step:75/2285 train_time:4506ms step_avg:60.08ms +step:76/2285 train_time:4565ms step_avg:60.06ms +step:77/2285 train_time:4626ms step_avg:60.08ms +step:78/2285 train_time:4685ms step_avg:60.06ms +step:79/2285 train_time:4747ms step_avg:60.08ms +step:80/2285 train_time:4806ms step_avg:60.08ms +step:81/2285 train_time:4868ms step_avg:60.10ms +step:82/2285 train_time:4927ms step_avg:60.08ms +step:83/2285 train_time:4988ms step_avg:60.10ms +step:84/2285 train_time:5048ms step_avg:60.09ms +step:85/2285 train_time:5109ms step_avg:60.11ms +step:86/2285 train_time:5168ms step_avg:60.10ms +step:87/2285 train_time:5229ms step_avg:60.11ms +step:88/2285 train_time:5288ms step_avg:60.09ms +step:89/2285 train_time:5349ms 
step_avg:60.10ms +step:90/2285 train_time:5408ms step_avg:60.09ms +step:91/2285 train_time:5469ms step_avg:60.10ms +step:92/2285 train_time:5529ms step_avg:60.10ms +step:93/2285 train_time:5590ms step_avg:60.11ms +step:94/2285 train_time:5650ms step_avg:60.11ms +step:95/2285 train_time:5712ms step_avg:60.12ms +step:96/2285 train_time:5770ms step_avg:60.11ms +step:97/2285 train_time:5832ms step_avg:60.12ms +step:98/2285 train_time:5892ms step_avg:60.12ms +step:99/2285 train_time:5953ms step_avg:60.13ms +step:100/2285 train_time:6012ms step_avg:60.12ms +step:101/2285 train_time:6073ms step_avg:60.13ms +step:102/2285 train_time:6131ms step_avg:60.11ms +step:103/2285 train_time:6193ms step_avg:60.13ms +step:104/2285 train_time:6252ms step_avg:60.12ms +step:105/2285 train_time:6313ms step_avg:60.12ms +step:106/2285 train_time:6372ms step_avg:60.11ms +step:107/2285 train_time:6433ms step_avg:60.12ms +step:108/2285 train_time:6493ms step_avg:60.12ms +step:109/2285 train_time:6555ms step_avg:60.14ms +step:110/2285 train_time:6614ms step_avg:60.13ms +step:111/2285 train_time:6675ms step_avg:60.14ms +step:112/2285 train_time:6734ms step_avg:60.13ms +step:113/2285 train_time:6796ms step_avg:60.14ms +step:114/2285 train_time:6855ms step_avg:60.14ms +step:115/2285 train_time:6917ms step_avg:60.15ms +step:116/2285 train_time:6976ms step_avg:60.14ms +step:117/2285 train_time:7037ms step_avg:60.15ms +step:118/2285 train_time:7096ms step_avg:60.13ms +step:119/2285 train_time:7158ms step_avg:60.15ms +step:120/2285 train_time:7217ms step_avg:60.14ms +step:121/2285 train_time:7278ms step_avg:60.15ms +step:122/2285 train_time:7337ms step_avg:60.14ms +step:123/2285 train_time:7398ms step_avg:60.14ms +step:124/2285 train_time:7456ms step_avg:60.13ms +step:125/2285 train_time:7518ms step_avg:60.14ms +step:126/2285 train_time:7577ms step_avg:60.13ms +step:127/2285 train_time:7638ms step_avg:60.14ms +step:128/2285 train_time:7697ms step_avg:60.13ms +step:129/2285 train_time:7758ms step_avg:60.14ms +step:130/2285 train_time:7817ms step_avg:60.13ms +step:131/2285 train_time:7878ms step_avg:60.14ms +step:132/2285 train_time:7937ms step_avg:60.13ms +step:133/2285 train_time:7999ms step_avg:60.14ms +step:134/2285 train_time:8058ms step_avg:60.13ms +step:135/2285 train_time:8118ms step_avg:60.14ms +step:136/2285 train_time:8177ms step_avg:60.12ms +step:137/2285 train_time:8238ms step_avg:60.13ms +step:138/2285 train_time:8296ms step_avg:60.12ms +step:139/2285 train_time:8357ms step_avg:60.12ms +step:140/2285 train_time:8417ms step_avg:60.12ms +step:141/2285 train_time:8479ms step_avg:60.13ms +step:142/2285 train_time:8537ms step_avg:60.12ms +step:143/2285 train_time:8599ms step_avg:60.13ms +step:144/2285 train_time:8658ms step_avg:60.12ms +step:145/2285 train_time:8719ms step_avg:60.13ms +step:146/2285 train_time:8778ms step_avg:60.12ms +step:147/2285 train_time:8839ms step_avg:60.13ms +step:148/2285 train_time:8898ms step_avg:60.12ms +step:149/2285 train_time:8959ms step_avg:60.13ms +step:150/2285 train_time:9018ms step_avg:60.12ms +step:151/2285 train_time:9079ms step_avg:60.13ms +step:152/2285 train_time:9138ms step_avg:60.12ms +step:153/2285 train_time:9199ms step_avg:60.12ms +step:154/2285 train_time:9257ms step_avg:60.11ms +step:155/2285 train_time:9318ms step_avg:60.12ms +step:156/2285 train_time:9377ms step_avg:60.11ms +step:157/2285 train_time:9438ms step_avg:60.11ms +step:158/2285 train_time:9497ms step_avg:60.11ms +step:159/2285 train_time:9558ms step_avg:60.11ms +step:160/2285 train_time:9617ms 
step_avg:60.10ms +step:161/2285 train_time:9678ms step_avg:60.11ms +step:162/2285 train_time:9737ms step_avg:60.10ms +step:163/2285 train_time:9798ms step_avg:60.11ms +step:164/2285 train_time:9856ms step_avg:60.10ms +step:165/2285 train_time:9918ms step_avg:60.11ms +step:166/2285 train_time:9976ms step_avg:60.10ms +step:167/2285 train_time:10037ms step_avg:60.10ms +step:168/2285 train_time:10096ms step_avg:60.10ms +step:169/2285 train_time:10158ms step_avg:60.10ms +step:170/2285 train_time:10216ms step_avg:60.09ms +step:171/2285 train_time:10277ms step_avg:60.10ms +step:172/2285 train_time:10335ms step_avg:60.09ms +step:173/2285 train_time:10397ms step_avg:60.10ms +step:174/2285 train_time:10455ms step_avg:60.09ms +step:175/2285 train_time:10516ms step_avg:60.09ms +step:176/2285 train_time:10575ms step_avg:60.08ms +step:177/2285 train_time:10637ms step_avg:60.09ms +step:178/2285 train_time:10696ms step_avg:60.09ms +step:179/2285 train_time:10757ms step_avg:60.09ms +step:180/2285 train_time:10815ms step_avg:60.09ms +step:181/2285 train_time:10876ms step_avg:60.09ms +step:182/2285 train_time:10935ms step_avg:60.08ms +step:183/2285 train_time:10997ms step_avg:60.09ms +step:184/2285 train_time:11056ms step_avg:60.08ms +step:185/2285 train_time:11117ms step_avg:60.09ms +step:186/2285 train_time:11176ms step_avg:60.08ms +step:187/2285 train_time:11237ms step_avg:60.09ms +step:188/2285 train_time:11296ms step_avg:60.08ms +step:189/2285 train_time:11357ms step_avg:60.09ms +step:190/2285 train_time:11416ms step_avg:60.08ms +step:191/2285 train_time:11477ms step_avg:60.09ms +step:192/2285 train_time:11536ms step_avg:60.08ms +step:193/2285 train_time:11597ms step_avg:60.09ms +step:194/2285 train_time:11655ms step_avg:60.08ms +step:195/2285 train_time:11716ms step_avg:60.08ms +step:196/2285 train_time:11775ms step_avg:60.08ms +step:197/2285 train_time:11837ms step_avg:60.08ms +step:198/2285 train_time:11896ms step_avg:60.08ms +step:199/2285 train_time:11957ms step_avg:60.08ms +step:200/2285 train_time:12015ms step_avg:60.08ms +step:201/2285 train_time:12076ms step_avg:60.08ms +step:202/2285 train_time:12135ms step_avg:60.07ms +step:203/2285 train_time:12196ms step_avg:60.08ms +step:204/2285 train_time:12255ms step_avg:60.07ms +step:205/2285 train_time:12316ms step_avg:60.08ms +step:206/2285 train_time:12375ms step_avg:60.07ms +step:207/2285 train_time:12436ms step_avg:60.08ms +step:208/2285 train_time:12495ms step_avg:60.07ms +step:209/2285 train_time:12556ms step_avg:60.08ms +step:210/2285 train_time:12615ms step_avg:60.07ms +step:211/2285 train_time:12676ms step_avg:60.08ms +step:212/2285 train_time:12735ms step_avg:60.07ms +step:213/2285 train_time:12797ms step_avg:60.08ms +step:214/2285 train_time:12856ms step_avg:60.07ms +step:215/2285 train_time:12917ms step_avg:60.08ms +step:216/2285 train_time:12975ms step_avg:60.07ms +step:217/2285 train_time:13036ms step_avg:60.07ms +step:218/2285 train_time:13095ms step_avg:60.07ms +step:219/2285 train_time:13156ms step_avg:60.07ms +step:220/2285 train_time:13215ms step_avg:60.07ms +step:221/2285 train_time:13276ms step_avg:60.07ms +step:222/2285 train_time:13334ms step_avg:60.06ms +step:223/2285 train_time:13395ms step_avg:60.07ms +step:224/2285 train_time:13454ms step_avg:60.06ms +step:225/2285 train_time:13515ms step_avg:60.07ms +step:226/2285 train_time:13574ms step_avg:60.06ms +step:227/2285 train_time:13635ms step_avg:60.07ms +step:228/2285 train_time:13694ms step_avg:60.06ms +step:229/2285 train_time:13756ms step_avg:60.07ms +step:230/2285 
train_time:13815ms step_avg:60.06ms +step:231/2285 train_time:13876ms step_avg:60.07ms +step:232/2285 train_time:13934ms step_avg:60.06ms +step:233/2285 train_time:13996ms step_avg:60.07ms +step:234/2285 train_time:14054ms step_avg:60.06ms +step:235/2285 train_time:14115ms step_avg:60.06ms +step:236/2285 train_time:14174ms step_avg:60.06ms +step:237/2285 train_time:14235ms step_avg:60.06ms +step:238/2285 train_time:14295ms step_avg:60.06ms +step:239/2285 train_time:14356ms step_avg:60.07ms +step:240/2285 train_time:14415ms step_avg:60.06ms +step:241/2285 train_time:14476ms step_avg:60.07ms +step:242/2285 train_time:14534ms step_avg:60.06ms +step:243/2285 train_time:14596ms step_avg:60.06ms +step:244/2285 train_time:14655ms step_avg:60.06ms +step:245/2285 train_time:14716ms step_avg:60.06ms +step:246/2285 train_time:14774ms step_avg:60.06ms +step:247/2285 train_time:14835ms step_avg:60.06ms +step:248/2285 train_time:14894ms step_avg:60.06ms +step:249/2285 train_time:14955ms step_avg:60.06ms +step:250/2285 train_time:15014ms step_avg:60.05ms +step:250/2285 val_loss:4.0722 train_time:15076ms step_avg:60.30ms +step:251/2285 train_time:15094ms step_avg:60.14ms +step:252/2285 train_time:15135ms step_avg:60.06ms +step:253/2285 train_time:15202ms step_avg:60.09ms +step:254/2285 train_time:15264ms step_avg:60.09ms +step:255/2285 train_time:15327ms step_avg:60.11ms +step:256/2285 train_time:15387ms step_avg:60.10ms +step:257/2285 train_time:15448ms step_avg:60.11ms +step:258/2285 train_time:15506ms step_avg:60.10ms +step:259/2285 train_time:15567ms step_avg:60.10ms +step:260/2285 train_time:15625ms step_avg:60.09ms +step:261/2285 train_time:15684ms step_avg:60.09ms +step:262/2285 train_time:15742ms step_avg:60.08ms +step:263/2285 train_time:15802ms step_avg:60.08ms +step:264/2285 train_time:15860ms step_avg:60.08ms +step:265/2285 train_time:15920ms step_avg:60.07ms +step:266/2285 train_time:15977ms step_avg:60.07ms +step:267/2285 train_time:16037ms step_avg:60.07ms +step:268/2285 train_time:16096ms step_avg:60.06ms +step:269/2285 train_time:16157ms step_avg:60.06ms +step:270/2285 train_time:16216ms step_avg:60.06ms +step:271/2285 train_time:16279ms step_avg:60.07ms +step:272/2285 train_time:16338ms step_avg:60.07ms +step:273/2285 train_time:16399ms step_avg:60.07ms +step:274/2285 train_time:16458ms step_avg:60.07ms +step:275/2285 train_time:16519ms step_avg:60.07ms +step:276/2285 train_time:16578ms step_avg:60.07ms +step:277/2285 train_time:16639ms step_avg:60.07ms +step:278/2285 train_time:16698ms step_avg:60.06ms +step:279/2285 train_time:16758ms step_avg:60.07ms +step:280/2285 train_time:16816ms step_avg:60.06ms +step:281/2285 train_time:16877ms step_avg:60.06ms +step:282/2285 train_time:16936ms step_avg:60.06ms +step:283/2285 train_time:16996ms step_avg:60.06ms +step:284/2285 train_time:17054ms step_avg:60.05ms +step:285/2285 train_time:17115ms step_avg:60.05ms +step:286/2285 train_time:17173ms step_avg:60.05ms +step:287/2285 train_time:17235ms step_avg:60.05ms +step:288/2285 train_time:17293ms step_avg:60.05ms +step:289/2285 train_time:17355ms step_avg:60.05ms +step:290/2285 train_time:17414ms step_avg:60.05ms +step:291/2285 train_time:17476ms step_avg:60.05ms +step:292/2285 train_time:17535ms step_avg:60.05ms +step:293/2285 train_time:17596ms step_avg:60.06ms +step:294/2285 train_time:17655ms step_avg:60.05ms +step:295/2285 train_time:17716ms step_avg:60.05ms +step:296/2285 train_time:17774ms step_avg:60.05ms +step:297/2285 train_time:17835ms step_avg:60.05ms +step:298/2285 train_time:17894ms 
step_avg:60.05ms +step:299/2285 train_time:17954ms step_avg:60.05ms +step:300/2285 train_time:18012ms step_avg:60.04ms +step:301/2285 train_time:18072ms step_avg:60.04ms +step:302/2285 train_time:18130ms step_avg:60.03ms +step:303/2285 train_time:18191ms step_avg:60.04ms +step:304/2285 train_time:18250ms step_avg:60.03ms +step:305/2285 train_time:18311ms step_avg:60.04ms +step:306/2285 train_time:18370ms step_avg:60.03ms +step:307/2285 train_time:18431ms step_avg:60.04ms +step:308/2285 train_time:18490ms step_avg:60.03ms +step:309/2285 train_time:18552ms step_avg:60.04ms +step:310/2285 train_time:18611ms step_avg:60.03ms +step:311/2285 train_time:18672ms step_avg:60.04ms +step:312/2285 train_time:18730ms step_avg:60.03ms +step:313/2285 train_time:18791ms step_avg:60.04ms +step:314/2285 train_time:18850ms step_avg:60.03ms +step:315/2285 train_time:18911ms step_avg:60.03ms +step:316/2285 train_time:18969ms step_avg:60.03ms +step:317/2285 train_time:19030ms step_avg:60.03ms +step:318/2285 train_time:19088ms step_avg:60.02ms +step:319/2285 train_time:19148ms step_avg:60.03ms +step:320/2285 train_time:19207ms step_avg:60.02ms +step:321/2285 train_time:19269ms step_avg:60.03ms +step:322/2285 train_time:19328ms step_avg:60.02ms +step:323/2285 train_time:19390ms step_avg:60.03ms +step:324/2285 train_time:19449ms step_avg:60.03ms +step:325/2285 train_time:19511ms step_avg:60.03ms +step:326/2285 train_time:19570ms step_avg:60.03ms +step:327/2285 train_time:19631ms step_avg:60.03ms +step:328/2285 train_time:19690ms step_avg:60.03ms +step:329/2285 train_time:19752ms step_avg:60.04ms +step:330/2285 train_time:19810ms step_avg:60.03ms +step:331/2285 train_time:19871ms step_avg:60.03ms +step:332/2285 train_time:19930ms step_avg:60.03ms +step:333/2285 train_time:19990ms step_avg:60.03ms +step:334/2285 train_time:20048ms step_avg:60.03ms +step:335/2285 train_time:20109ms step_avg:60.03ms +step:336/2285 train_time:20167ms step_avg:60.02ms +step:337/2285 train_time:20228ms step_avg:60.02ms +step:338/2285 train_time:20286ms step_avg:60.02ms +step:339/2285 train_time:20348ms step_avg:60.02ms +step:340/2285 train_time:20407ms step_avg:60.02ms +step:341/2285 train_time:20468ms step_avg:60.02ms +step:342/2285 train_time:20526ms step_avg:60.02ms +step:343/2285 train_time:20588ms step_avg:60.02ms +step:344/2285 train_time:20647ms step_avg:60.02ms +step:345/2285 train_time:20709ms step_avg:60.03ms +step:346/2285 train_time:20768ms step_avg:60.02ms +step:347/2285 train_time:20829ms step_avg:60.02ms +step:348/2285 train_time:20887ms step_avg:60.02ms +step:349/2285 train_time:20948ms step_avg:60.02ms +step:350/2285 train_time:21007ms step_avg:60.02ms +step:351/2285 train_time:21067ms step_avg:60.02ms +step:352/2285 train_time:21126ms step_avg:60.02ms +step:353/2285 train_time:21186ms step_avg:60.02ms +step:354/2285 train_time:21245ms step_avg:60.01ms +step:355/2285 train_time:21306ms step_avg:60.02ms +step:356/2285 train_time:21365ms step_avg:60.01ms +step:357/2285 train_time:21426ms step_avg:60.02ms +step:358/2285 train_time:21485ms step_avg:60.02ms +step:359/2285 train_time:21547ms step_avg:60.02ms +step:360/2285 train_time:21606ms step_avg:60.02ms +step:361/2285 train_time:21667ms step_avg:60.02ms +step:362/2285 train_time:21726ms step_avg:60.02ms +step:363/2285 train_time:21787ms step_avg:60.02ms +step:364/2285 train_time:21846ms step_avg:60.02ms +step:365/2285 train_time:21907ms step_avg:60.02ms +step:366/2285 train_time:21965ms step_avg:60.01ms +step:367/2285 train_time:22027ms step_avg:60.02ms +step:368/2285 
train_time:22085ms step_avg:60.01ms +step:369/2285 train_time:22146ms step_avg:60.02ms +step:370/2285 train_time:22204ms step_avg:60.01ms +step:371/2285 train_time:22264ms step_avg:60.01ms +step:372/2285 train_time:22323ms step_avg:60.01ms +step:373/2285 train_time:22384ms step_avg:60.01ms +step:374/2285 train_time:22443ms step_avg:60.01ms +step:375/2285 train_time:22504ms step_avg:60.01ms +step:376/2285 train_time:22563ms step_avg:60.01ms +step:377/2285 train_time:22624ms step_avg:60.01ms +step:378/2285 train_time:22683ms step_avg:60.01ms +step:379/2285 train_time:22745ms step_avg:60.01ms +step:380/2285 train_time:22804ms step_avg:60.01ms +step:381/2285 train_time:22865ms step_avg:60.01ms +step:382/2285 train_time:22924ms step_avg:60.01ms +step:383/2285 train_time:22985ms step_avg:60.01ms +step:384/2285 train_time:23044ms step_avg:60.01ms +step:385/2285 train_time:23105ms step_avg:60.01ms +step:386/2285 train_time:23164ms step_avg:60.01ms +step:387/2285 train_time:23225ms step_avg:60.01ms +step:388/2285 train_time:23284ms step_avg:60.01ms +step:389/2285 train_time:23345ms step_avg:60.01ms +step:390/2285 train_time:23404ms step_avg:60.01ms +step:391/2285 train_time:23466ms step_avg:60.01ms +step:392/2285 train_time:23525ms step_avg:60.01ms +step:393/2285 train_time:23587ms step_avg:60.02ms +step:394/2285 train_time:23648ms step_avg:60.02ms +step:395/2285 train_time:23709ms step_avg:60.02ms +step:396/2285 train_time:23769ms step_avg:60.02ms +step:397/2285 train_time:23830ms step_avg:60.03ms +step:398/2285 train_time:23889ms step_avg:60.02ms +step:399/2285 train_time:23951ms step_avg:60.03ms +step:400/2285 train_time:24010ms step_avg:60.02ms +step:401/2285 train_time:24071ms step_avg:60.03ms +step:402/2285 train_time:24130ms step_avg:60.03ms +step:403/2285 train_time:24192ms step_avg:60.03ms +step:404/2285 train_time:24251ms step_avg:60.03ms +step:405/2285 train_time:24312ms step_avg:60.03ms +step:406/2285 train_time:24371ms step_avg:60.03ms +step:407/2285 train_time:24433ms step_avg:60.03ms +step:408/2285 train_time:24492ms step_avg:60.03ms +step:409/2285 train_time:24554ms step_avg:60.03ms +step:410/2285 train_time:24614ms step_avg:60.03ms +step:411/2285 train_time:24675ms step_avg:60.04ms +step:412/2285 train_time:24735ms step_avg:60.04ms +step:413/2285 train_time:24796ms step_avg:60.04ms +step:414/2285 train_time:24856ms step_avg:60.04ms +step:415/2285 train_time:24917ms step_avg:60.04ms +step:416/2285 train_time:24976ms step_avg:60.04ms +step:417/2285 train_time:25037ms step_avg:60.04ms +step:418/2285 train_time:25095ms step_avg:60.04ms +step:419/2285 train_time:25157ms step_avg:60.04ms +step:420/2285 train_time:25216ms step_avg:60.04ms +step:421/2285 train_time:25277ms step_avg:60.04ms +step:422/2285 train_time:25335ms step_avg:60.04ms +step:423/2285 train_time:25397ms step_avg:60.04ms +step:424/2285 train_time:25456ms step_avg:60.04ms +step:425/2285 train_time:25518ms step_avg:60.04ms +step:426/2285 train_time:25576ms step_avg:60.04ms +step:427/2285 train_time:25638ms step_avg:60.04ms +step:428/2285 train_time:25697ms step_avg:60.04ms +step:429/2285 train_time:25758ms step_avg:60.04ms +step:430/2285 train_time:25817ms step_avg:60.04ms +step:431/2285 train_time:25879ms step_avg:60.04ms +step:432/2285 train_time:25937ms step_avg:60.04ms +step:433/2285 train_time:25999ms step_avg:60.04ms +step:434/2285 train_time:26058ms step_avg:60.04ms +step:435/2285 train_time:26120ms step_avg:60.05ms +step:436/2285 train_time:26179ms step_avg:60.04ms +step:437/2285 train_time:26239ms step_avg:60.04ms 
+step:438/2285 train_time:26299ms step_avg:60.04ms +step:439/2285 train_time:26360ms step_avg:60.04ms +step:440/2285 train_time:26419ms step_avg:60.04ms +step:441/2285 train_time:26479ms step_avg:60.04ms +step:442/2285 train_time:26538ms step_avg:60.04ms +step:443/2285 train_time:26599ms step_avg:60.04ms +step:444/2285 train_time:26658ms step_avg:60.04ms +step:445/2285 train_time:26719ms step_avg:60.04ms +step:446/2285 train_time:26778ms step_avg:60.04ms +step:447/2285 train_time:26840ms step_avg:60.04ms +step:448/2285 train_time:26899ms step_avg:60.04ms +step:449/2285 train_time:26960ms step_avg:60.05ms +step:450/2285 train_time:27019ms step_avg:60.04ms +step:451/2285 train_time:27081ms step_avg:60.05ms +step:452/2285 train_time:27140ms step_avg:60.04ms +step:453/2285 train_time:27201ms step_avg:60.05ms +step:454/2285 train_time:27260ms step_avg:60.04ms +step:455/2285 train_time:27321ms step_avg:60.05ms +step:456/2285 train_time:27380ms step_avg:60.04ms +step:457/2285 train_time:27441ms step_avg:60.05ms +step:458/2285 train_time:27500ms step_avg:60.04ms +step:459/2285 train_time:27561ms step_avg:60.05ms +step:460/2285 train_time:27620ms step_avg:60.04ms +step:461/2285 train_time:27681ms step_avg:60.05ms +step:462/2285 train_time:27740ms step_avg:60.04ms +step:463/2285 train_time:27801ms step_avg:60.05ms +step:464/2285 train_time:27860ms step_avg:60.04ms +step:465/2285 train_time:27921ms step_avg:60.04ms +step:466/2285 train_time:27980ms step_avg:60.04ms +step:467/2285 train_time:28041ms step_avg:60.05ms +step:468/2285 train_time:28100ms step_avg:60.04ms +step:469/2285 train_time:28162ms step_avg:60.05ms +step:470/2285 train_time:28220ms step_avg:60.04ms +step:471/2285 train_time:28281ms step_avg:60.05ms +step:472/2285 train_time:28340ms step_avg:60.04ms +step:473/2285 train_time:28401ms step_avg:60.04ms +step:474/2285 train_time:28461ms step_avg:60.04ms +step:475/2285 train_time:28522ms step_avg:60.05ms +step:476/2285 train_time:28581ms step_avg:60.04ms +step:477/2285 train_time:28642ms step_avg:60.05ms +step:478/2285 train_time:28702ms step_avg:60.05ms +step:479/2285 train_time:28763ms step_avg:60.05ms +step:480/2285 train_time:28822ms step_avg:60.05ms +step:481/2285 train_time:28883ms step_avg:60.05ms +step:482/2285 train_time:28942ms step_avg:60.05ms +step:483/2285 train_time:29003ms step_avg:60.05ms +step:484/2285 train_time:29062ms step_avg:60.05ms +step:485/2285 train_time:29124ms step_avg:60.05ms +step:486/2285 train_time:29182ms step_avg:60.05ms +step:487/2285 train_time:29244ms step_avg:60.05ms +step:488/2285 train_time:29302ms step_avg:60.05ms +step:489/2285 train_time:29364ms step_avg:60.05ms +step:490/2285 train_time:29423ms step_avg:60.05ms +step:491/2285 train_time:29484ms step_avg:60.05ms +step:492/2285 train_time:29543ms step_avg:60.05ms +step:493/2285 train_time:29604ms step_avg:60.05ms +step:494/2285 train_time:29663ms step_avg:60.05ms +step:495/2285 train_time:29724ms step_avg:60.05ms +step:496/2285 train_time:29782ms step_avg:60.04ms +step:497/2285 train_time:29844ms step_avg:60.05ms +step:498/2285 train_time:29903ms step_avg:60.05ms +step:499/2285 train_time:29964ms step_avg:60.05ms +step:500/2285 train_time:30023ms step_avg:60.05ms +step:500/2285 val_loss:3.7848 train_time:30086ms step_avg:60.17ms +step:501/2285 train_time:30105ms step_avg:60.09ms +step:502/2285 train_time:30146ms step_avg:60.05ms +step:503/2285 train_time:30206ms step_avg:60.05ms +step:504/2285 train_time:30266ms step_avg:60.05ms +step:505/2285 train_time:30330ms step_avg:60.06ms +step:506/2285 
train_time:30389ms step_avg:60.06ms +step:507/2285 train_time:30449ms step_avg:60.06ms +step:508/2285 train_time:30507ms step_avg:60.05ms +step:509/2285 train_time:30568ms step_avg:60.06ms +step:510/2285 train_time:30627ms step_avg:60.05ms +step:511/2285 train_time:30687ms step_avg:60.05ms +step:512/2285 train_time:30745ms step_avg:60.05ms +step:513/2285 train_time:30807ms step_avg:60.05ms +step:514/2285 train_time:30867ms step_avg:60.05ms +step:515/2285 train_time:30928ms step_avg:60.05ms +step:516/2285 train_time:30988ms step_avg:60.05ms +step:517/2285 train_time:31055ms step_avg:60.07ms +step:518/2285 train_time:31115ms step_avg:60.07ms +step:519/2285 train_time:31178ms step_avg:60.07ms +step:520/2285 train_time:31236ms step_avg:60.07ms +step:521/2285 train_time:31298ms step_avg:60.07ms +step:522/2285 train_time:31357ms step_avg:60.07ms +step:523/2285 train_time:31417ms step_avg:60.07ms +step:524/2285 train_time:31476ms step_avg:60.07ms +step:525/2285 train_time:31537ms step_avg:60.07ms +step:526/2285 train_time:31596ms step_avg:60.07ms +step:527/2285 train_time:31657ms step_avg:60.07ms +step:528/2285 train_time:31716ms step_avg:60.07ms +step:529/2285 train_time:31777ms step_avg:60.07ms +step:530/2285 train_time:31837ms step_avg:60.07ms +step:531/2285 train_time:31899ms step_avg:60.07ms +step:532/2285 train_time:31958ms step_avg:60.07ms +step:533/2285 train_time:32019ms step_avg:60.07ms +step:534/2285 train_time:32079ms step_avg:60.07ms +step:535/2285 train_time:32140ms step_avg:60.08ms +step:536/2285 train_time:32199ms step_avg:60.07ms +step:537/2285 train_time:32261ms step_avg:60.08ms +step:538/2285 train_time:32320ms step_avg:60.07ms +step:539/2285 train_time:32381ms step_avg:60.08ms +step:540/2285 train_time:32440ms step_avg:60.07ms +step:541/2285 train_time:32501ms step_avg:60.08ms +step:542/2285 train_time:32560ms step_avg:60.07ms +step:543/2285 train_time:32622ms step_avg:60.08ms +step:544/2285 train_time:32680ms step_avg:60.07ms +step:545/2285 train_time:32741ms step_avg:60.08ms +step:546/2285 train_time:32800ms step_avg:60.07ms +step:547/2285 train_time:32862ms step_avg:60.08ms +step:548/2285 train_time:32921ms step_avg:60.07ms +step:549/2285 train_time:32982ms step_avg:60.08ms +step:550/2285 train_time:33041ms step_avg:60.07ms +step:551/2285 train_time:33102ms step_avg:60.08ms +step:552/2285 train_time:33161ms step_avg:60.07ms +step:553/2285 train_time:33223ms step_avg:60.08ms +step:554/2285 train_time:33282ms step_avg:60.08ms +step:555/2285 train_time:33344ms step_avg:60.08ms +step:556/2285 train_time:33403ms step_avg:60.08ms +step:557/2285 train_time:33465ms step_avg:60.08ms +step:558/2285 train_time:33525ms step_avg:60.08ms +step:559/2285 train_time:33586ms step_avg:60.08ms +step:560/2285 train_time:33645ms step_avg:60.08ms +step:561/2285 train_time:33707ms step_avg:60.08ms +step:562/2285 train_time:33766ms step_avg:60.08ms +step:563/2285 train_time:33828ms step_avg:60.08ms +step:564/2285 train_time:33887ms step_avg:60.08ms +step:565/2285 train_time:33948ms step_avg:60.09ms +step:566/2285 train_time:34008ms step_avg:60.08ms +step:567/2285 train_time:34069ms step_avg:60.09ms +step:568/2285 train_time:34129ms step_avg:60.09ms +step:569/2285 train_time:34190ms step_avg:60.09ms +step:570/2285 train_time:34249ms step_avg:60.09ms +step:571/2285 train_time:34311ms step_avg:60.09ms +step:572/2285 train_time:34369ms step_avg:60.09ms +step:573/2285 train_time:34431ms step_avg:60.09ms +step:574/2285 train_time:34490ms step_avg:60.09ms +step:575/2285 train_time:34551ms step_avg:60.09ms 
+step:576/2285 train_time:34610ms step_avg:60.09ms +step:577/2285 train_time:34671ms step_avg:60.09ms +step:578/2285 train_time:34730ms step_avg:60.09ms +step:579/2285 train_time:34792ms step_avg:60.09ms +step:580/2285 train_time:34851ms step_avg:60.09ms +step:581/2285 train_time:34913ms step_avg:60.09ms +step:582/2285 train_time:34972ms step_avg:60.09ms +step:583/2285 train_time:35033ms step_avg:60.09ms +step:584/2285 train_time:35092ms step_avg:60.09ms +step:585/2285 train_time:35154ms step_avg:60.09ms +step:586/2285 train_time:35213ms step_avg:60.09ms +step:587/2285 train_time:35275ms step_avg:60.09ms +step:588/2285 train_time:35334ms step_avg:60.09ms +step:589/2285 train_time:35395ms step_avg:60.09ms +step:590/2285 train_time:35454ms step_avg:60.09ms +step:591/2285 train_time:35515ms step_avg:60.09ms +step:592/2285 train_time:35574ms step_avg:60.09ms +step:593/2285 train_time:35636ms step_avg:60.09ms +step:594/2285 train_time:35695ms step_avg:60.09ms +step:595/2285 train_time:35756ms step_avg:60.09ms +step:596/2285 train_time:35815ms step_avg:60.09ms +step:597/2285 train_time:35876ms step_avg:60.09ms +step:598/2285 train_time:35936ms step_avg:60.09ms +step:599/2285 train_time:35997ms step_avg:60.10ms +step:600/2285 train_time:36056ms step_avg:60.09ms +step:601/2285 train_time:36117ms step_avg:60.10ms +step:602/2285 train_time:36177ms step_avg:60.09ms +step:603/2285 train_time:36240ms step_avg:60.10ms +step:604/2285 train_time:36298ms step_avg:60.10ms +step:605/2285 train_time:36360ms step_avg:60.10ms +step:606/2285 train_time:36418ms step_avg:60.10ms +step:607/2285 train_time:36480ms step_avg:60.10ms +step:608/2285 train_time:36539ms step_avg:60.10ms +step:609/2285 train_time:36600ms step_avg:60.10ms +step:610/2285 train_time:36659ms step_avg:60.10ms +step:611/2285 train_time:36721ms step_avg:60.10ms +step:612/2285 train_time:36781ms step_avg:60.10ms +step:613/2285 train_time:36842ms step_avg:60.10ms +step:614/2285 train_time:36900ms step_avg:60.10ms +step:615/2285 train_time:36963ms step_avg:60.10ms +step:616/2285 train_time:37022ms step_avg:60.10ms +step:617/2285 train_time:37084ms step_avg:60.10ms +step:618/2285 train_time:37143ms step_avg:60.10ms +step:619/2285 train_time:37205ms step_avg:60.10ms +step:620/2285 train_time:37263ms step_avg:60.10ms +step:621/2285 train_time:37324ms step_avg:60.10ms +step:622/2285 train_time:37384ms step_avg:60.10ms +step:623/2285 train_time:37445ms step_avg:60.10ms +step:624/2285 train_time:37504ms step_avg:60.10ms +step:625/2285 train_time:37565ms step_avg:60.10ms +step:626/2285 train_time:37624ms step_avg:60.10ms +step:627/2285 train_time:37686ms step_avg:60.11ms +step:628/2285 train_time:37746ms step_avg:60.10ms +step:629/2285 train_time:37807ms step_avg:60.11ms +step:630/2285 train_time:37866ms step_avg:60.11ms +step:631/2285 train_time:37929ms step_avg:60.11ms +step:632/2285 train_time:37988ms step_avg:60.11ms +step:633/2285 train_time:38050ms step_avg:60.11ms +step:634/2285 train_time:38109ms step_avg:60.11ms +step:635/2285 train_time:38170ms step_avg:60.11ms +step:636/2285 train_time:38229ms step_avg:60.11ms +step:637/2285 train_time:38291ms step_avg:60.11ms +step:638/2285 train_time:38350ms step_avg:60.11ms +step:639/2285 train_time:38412ms step_avg:60.11ms +step:640/2285 train_time:38471ms step_avg:60.11ms +step:641/2285 train_time:38532ms step_avg:60.11ms +step:642/2285 train_time:38591ms step_avg:60.11ms +step:643/2285 train_time:38653ms step_avg:60.11ms +step:644/2285 train_time:38712ms step_avg:60.11ms +step:645/2285 train_time:38774ms 
step_avg:60.11ms +step:646/2285 train_time:38833ms step_avg:60.11ms +step:647/2285 train_time:38896ms step_avg:60.12ms +step:648/2285 train_time:38956ms step_avg:60.12ms +step:649/2285 train_time:39017ms step_avg:60.12ms +step:650/2285 train_time:39075ms step_avg:60.12ms +step:651/2285 train_time:39136ms step_avg:60.12ms +step:652/2285 train_time:39196ms step_avg:60.12ms +step:653/2285 train_time:39256ms step_avg:60.12ms +step:654/2285 train_time:39316ms step_avg:60.12ms +step:655/2285 train_time:39377ms step_avg:60.12ms +step:656/2285 train_time:39436ms step_avg:60.12ms +step:657/2285 train_time:39497ms step_avg:60.12ms +step:658/2285 train_time:39556ms step_avg:60.12ms +step:659/2285 train_time:39617ms step_avg:60.12ms +step:660/2285 train_time:39676ms step_avg:60.12ms +step:661/2285 train_time:39737ms step_avg:60.12ms +step:662/2285 train_time:39796ms step_avg:60.12ms +step:663/2285 train_time:39858ms step_avg:60.12ms +step:664/2285 train_time:39918ms step_avg:60.12ms +step:665/2285 train_time:39979ms step_avg:60.12ms +step:666/2285 train_time:40038ms step_avg:60.12ms +step:667/2285 train_time:40100ms step_avg:60.12ms +step:668/2285 train_time:40159ms step_avg:60.12ms +step:669/2285 train_time:40220ms step_avg:60.12ms +step:670/2285 train_time:40279ms step_avg:60.12ms +step:671/2285 train_time:40340ms step_avg:60.12ms +step:672/2285 train_time:40400ms step_avg:60.12ms +step:673/2285 train_time:40461ms step_avg:60.12ms +step:674/2285 train_time:40520ms step_avg:60.12ms +step:675/2285 train_time:40582ms step_avg:60.12ms +step:676/2285 train_time:40641ms step_avg:60.12ms +step:677/2285 train_time:40704ms step_avg:60.12ms +step:678/2285 train_time:40763ms step_avg:60.12ms +step:679/2285 train_time:40825ms step_avg:60.12ms +step:680/2285 train_time:40884ms step_avg:60.12ms +step:681/2285 train_time:40946ms step_avg:60.13ms +step:682/2285 train_time:41005ms step_avg:60.12ms +step:683/2285 train_time:41066ms step_avg:60.13ms +step:684/2285 train_time:41126ms step_avg:60.13ms +step:685/2285 train_time:41187ms step_avg:60.13ms +step:686/2285 train_time:41247ms step_avg:60.13ms +step:687/2285 train_time:41308ms step_avg:60.13ms +step:688/2285 train_time:41367ms step_avg:60.13ms +step:689/2285 train_time:41429ms step_avg:60.13ms +step:690/2285 train_time:41488ms step_avg:60.13ms +step:691/2285 train_time:41550ms step_avg:60.13ms +step:692/2285 train_time:41609ms step_avg:60.13ms +step:693/2285 train_time:41671ms step_avg:60.13ms +step:694/2285 train_time:41730ms step_avg:60.13ms +step:695/2285 train_time:41791ms step_avg:60.13ms +step:696/2285 train_time:41850ms step_avg:60.13ms +step:697/2285 train_time:41912ms step_avg:60.13ms +step:698/2285 train_time:41971ms step_avg:60.13ms +step:699/2285 train_time:42032ms step_avg:60.13ms +step:700/2285 train_time:42091ms step_avg:60.13ms +step:701/2285 train_time:42153ms step_avg:60.13ms +step:702/2285 train_time:42212ms step_avg:60.13ms +step:703/2285 train_time:42273ms step_avg:60.13ms +step:704/2285 train_time:42333ms step_avg:60.13ms +step:705/2285 train_time:42394ms step_avg:60.13ms +step:706/2285 train_time:42454ms step_avg:60.13ms +step:707/2285 train_time:42515ms step_avg:60.14ms +step:708/2285 train_time:42575ms step_avg:60.13ms +step:709/2285 train_time:42636ms step_avg:60.14ms +step:710/2285 train_time:42696ms step_avg:60.14ms +step:711/2285 train_time:42757ms step_avg:60.14ms +step:712/2285 train_time:42816ms step_avg:60.13ms +step:713/2285 train_time:42877ms step_avg:60.14ms +step:714/2285 train_time:42936ms step_avg:60.14ms +step:715/2285 
train_time:42998ms step_avg:60.14ms +step:716/2285 train_time:43057ms step_avg:60.14ms +step:717/2285 train_time:43118ms step_avg:60.14ms +step:718/2285 train_time:43178ms step_avg:60.14ms +step:719/2285 train_time:43240ms step_avg:60.14ms +step:720/2285 train_time:43299ms step_avg:60.14ms +step:721/2285 train_time:43360ms step_avg:60.14ms +step:722/2285 train_time:43418ms step_avg:60.14ms +step:723/2285 train_time:43480ms step_avg:60.14ms +step:724/2285 train_time:43539ms step_avg:60.14ms +step:725/2285 train_time:43601ms step_avg:60.14ms +step:726/2285 train_time:43660ms step_avg:60.14ms +step:727/2285 train_time:43721ms step_avg:60.14ms +step:728/2285 train_time:43780ms step_avg:60.14ms +step:729/2285 train_time:43842ms step_avg:60.14ms +step:730/2285 train_time:43901ms step_avg:60.14ms +step:731/2285 train_time:43962ms step_avg:60.14ms +step:732/2285 train_time:44021ms step_avg:60.14ms +step:733/2285 train_time:44083ms step_avg:60.14ms +step:734/2285 train_time:44141ms step_avg:60.14ms +step:735/2285 train_time:44203ms step_avg:60.14ms +step:736/2285 train_time:44262ms step_avg:60.14ms +step:737/2285 train_time:44323ms step_avg:60.14ms +step:738/2285 train_time:44382ms step_avg:60.14ms +step:739/2285 train_time:44444ms step_avg:60.14ms +step:740/2285 train_time:44503ms step_avg:60.14ms +step:741/2285 train_time:44565ms step_avg:60.14ms +step:742/2285 train_time:44624ms step_avg:60.14ms +step:743/2285 train_time:44686ms step_avg:60.14ms +step:744/2285 train_time:44745ms step_avg:60.14ms +step:745/2285 train_time:44806ms step_avg:60.14ms +step:746/2285 train_time:44865ms step_avg:60.14ms +step:747/2285 train_time:44927ms step_avg:60.14ms +step:748/2285 train_time:44986ms step_avg:60.14ms +step:749/2285 train_time:45048ms step_avg:60.14ms +step:750/2285 train_time:45107ms step_avg:60.14ms +step:750/2285 val_loss:3.6533 train_time:45170ms step_avg:60.23ms +step:751/2285 train_time:45189ms step_avg:60.17ms +step:752/2285 train_time:45231ms step_avg:60.15ms +step:753/2285 train_time:45294ms step_avg:60.15ms +step:754/2285 train_time:45355ms step_avg:60.15ms +step:755/2285 train_time:45416ms step_avg:60.15ms +step:756/2285 train_time:45475ms step_avg:60.15ms +step:757/2285 train_time:45536ms step_avg:60.15ms +step:758/2285 train_time:45595ms step_avg:60.15ms +step:759/2285 train_time:45655ms step_avg:60.15ms +step:760/2285 train_time:45713ms step_avg:60.15ms +step:761/2285 train_time:45774ms step_avg:60.15ms +step:762/2285 train_time:45832ms step_avg:60.15ms +step:763/2285 train_time:45893ms step_avg:60.15ms +step:764/2285 train_time:45953ms step_avg:60.15ms +step:765/2285 train_time:46015ms step_avg:60.15ms +step:766/2285 train_time:46075ms step_avg:60.15ms +step:767/2285 train_time:46137ms step_avg:60.15ms +step:768/2285 train_time:46199ms step_avg:60.15ms +step:769/2285 train_time:46262ms step_avg:60.16ms +step:770/2285 train_time:46321ms step_avg:60.16ms +step:771/2285 train_time:46384ms step_avg:60.16ms +step:772/2285 train_time:46443ms step_avg:60.16ms +step:773/2285 train_time:46505ms step_avg:60.16ms +step:774/2285 train_time:46564ms step_avg:60.16ms +step:775/2285 train_time:46625ms step_avg:60.16ms +step:776/2285 train_time:46684ms step_avg:60.16ms +step:777/2285 train_time:46745ms step_avg:60.16ms +step:778/2285 train_time:46805ms step_avg:60.16ms +step:779/2285 train_time:46866ms step_avg:60.16ms +step:780/2285 train_time:46925ms step_avg:60.16ms +step:781/2285 train_time:46987ms step_avg:60.16ms +step:782/2285 train_time:47047ms step_avg:60.16ms +step:783/2285 train_time:47110ms 
step_avg:60.17ms +step:784/2285 train_time:47170ms step_avg:60.17ms +step:785/2285 train_time:47233ms step_avg:60.17ms +step:786/2285 train_time:47293ms step_avg:60.17ms +step:787/2285 train_time:47355ms step_avg:60.17ms +step:788/2285 train_time:47415ms step_avg:60.17ms +step:789/2285 train_time:47476ms step_avg:60.17ms +step:790/2285 train_time:47536ms step_avg:60.17ms +step:791/2285 train_time:47598ms step_avg:60.17ms +step:792/2285 train_time:47657ms step_avg:60.17ms +step:793/2285 train_time:47719ms step_avg:60.17ms +step:794/2285 train_time:47778ms step_avg:60.17ms +step:795/2285 train_time:47839ms step_avg:60.18ms +step:796/2285 train_time:47899ms step_avg:60.17ms +step:797/2285 train_time:47961ms step_avg:60.18ms +step:798/2285 train_time:48020ms step_avg:60.18ms +step:799/2285 train_time:48083ms step_avg:60.18ms +step:800/2285 train_time:48142ms step_avg:60.18ms +step:801/2285 train_time:48205ms step_avg:60.18ms +step:802/2285 train_time:48264ms step_avg:60.18ms +step:803/2285 train_time:48327ms step_avg:60.18ms +step:804/2285 train_time:48387ms step_avg:60.18ms +step:805/2285 train_time:48449ms step_avg:60.18ms +step:806/2285 train_time:48508ms step_avg:60.18ms +step:807/2285 train_time:48570ms step_avg:60.19ms +step:808/2285 train_time:48629ms step_avg:60.18ms +step:809/2285 train_time:48691ms step_avg:60.19ms +step:810/2285 train_time:48750ms step_avg:60.18ms +step:811/2285 train_time:48811ms step_avg:60.19ms +step:812/2285 train_time:48871ms step_avg:60.19ms +step:813/2285 train_time:48932ms step_avg:60.19ms +step:814/2285 train_time:48991ms step_avg:60.19ms +step:815/2285 train_time:49053ms step_avg:60.19ms +step:816/2285 train_time:49112ms step_avg:60.19ms +step:817/2285 train_time:49174ms step_avg:60.19ms +step:818/2285 train_time:49234ms step_avg:60.19ms +step:819/2285 train_time:49297ms step_avg:60.19ms +step:820/2285 train_time:49356ms step_avg:60.19ms +step:821/2285 train_time:49419ms step_avg:60.19ms +step:822/2285 train_time:49478ms step_avg:60.19ms +step:823/2285 train_time:49540ms step_avg:60.19ms +step:824/2285 train_time:49600ms step_avg:60.19ms +step:825/2285 train_time:49662ms step_avg:60.20ms +step:826/2285 train_time:49721ms step_avg:60.20ms +step:827/2285 train_time:49783ms step_avg:60.20ms +step:828/2285 train_time:49842ms step_avg:60.20ms +step:829/2285 train_time:49904ms step_avg:60.20ms +step:830/2285 train_time:49963ms step_avg:60.20ms +step:831/2285 train_time:50025ms step_avg:60.20ms +step:832/2285 train_time:50085ms step_avg:60.20ms +step:833/2285 train_time:50146ms step_avg:60.20ms +step:834/2285 train_time:50206ms step_avg:60.20ms +step:835/2285 train_time:50268ms step_avg:60.20ms +step:836/2285 train_time:50328ms step_avg:60.20ms +step:837/2285 train_time:50390ms step_avg:60.20ms +step:838/2285 train_time:50450ms step_avg:60.20ms +step:839/2285 train_time:50511ms step_avg:60.20ms +step:840/2285 train_time:50571ms step_avg:60.20ms +step:841/2285 train_time:50633ms step_avg:60.21ms +step:842/2285 train_time:50692ms step_avg:60.20ms +step:843/2285 train_time:50754ms step_avg:60.21ms +step:844/2285 train_time:50813ms step_avg:60.21ms +step:845/2285 train_time:50875ms step_avg:60.21ms +step:846/2285 train_time:50934ms step_avg:60.21ms +step:847/2285 train_time:50996ms step_avg:60.21ms +step:848/2285 train_time:51055ms step_avg:60.21ms +step:849/2285 train_time:51117ms step_avg:60.21ms +step:850/2285 train_time:51177ms step_avg:60.21ms +step:851/2285 train_time:51239ms step_avg:60.21ms +step:852/2285 train_time:51299ms step_avg:60.21ms +step:853/2285 
train_time:51361ms step_avg:60.21ms +step:854/2285 train_time:51421ms step_avg:60.21ms +step:855/2285 train_time:51482ms step_avg:60.21ms +step:856/2285 train_time:51542ms step_avg:60.21ms +step:857/2285 train_time:51604ms step_avg:60.21ms +step:858/2285 train_time:51663ms step_avg:60.21ms +step:859/2285 train_time:51726ms step_avg:60.22ms +step:860/2285 train_time:51785ms step_avg:60.22ms +step:861/2285 train_time:51847ms step_avg:60.22ms +step:862/2285 train_time:51905ms step_avg:60.22ms +step:863/2285 train_time:51968ms step_avg:60.22ms +step:864/2285 train_time:52027ms step_avg:60.22ms +step:865/2285 train_time:52089ms step_avg:60.22ms +step:866/2285 train_time:52149ms step_avg:60.22ms +step:867/2285 train_time:52211ms step_avg:60.22ms +step:868/2285 train_time:52271ms step_avg:60.22ms +step:869/2285 train_time:52333ms step_avg:60.22ms +step:870/2285 train_time:52392ms step_avg:60.22ms +step:871/2285 train_time:52454ms step_avg:60.22ms +step:872/2285 train_time:52513ms step_avg:60.22ms +step:873/2285 train_time:52575ms step_avg:60.22ms +step:874/2285 train_time:52634ms step_avg:60.22ms +step:875/2285 train_time:52697ms step_avg:60.22ms +step:876/2285 train_time:52756ms step_avg:60.22ms +step:877/2285 train_time:52818ms step_avg:60.23ms +step:878/2285 train_time:52877ms step_avg:60.22ms +step:879/2285 train_time:52939ms step_avg:60.23ms +step:880/2285 train_time:52999ms step_avg:60.23ms +step:881/2285 train_time:53061ms step_avg:60.23ms +step:882/2285 train_time:53121ms step_avg:60.23ms +step:883/2285 train_time:53183ms step_avg:60.23ms +step:884/2285 train_time:53243ms step_avg:60.23ms +step:885/2285 train_time:53305ms step_avg:60.23ms +step:886/2285 train_time:53364ms step_avg:60.23ms +step:887/2285 train_time:53426ms step_avg:60.23ms +step:888/2285 train_time:53486ms step_avg:60.23ms +step:889/2285 train_time:53548ms step_avg:60.23ms +step:890/2285 train_time:53608ms step_avg:60.23ms +step:891/2285 train_time:53669ms step_avg:60.23ms +step:892/2285 train_time:53729ms step_avg:60.23ms +step:893/2285 train_time:53790ms step_avg:60.24ms +step:894/2285 train_time:53849ms step_avg:60.23ms +step:895/2285 train_time:53911ms step_avg:60.24ms +step:896/2285 train_time:53971ms step_avg:60.24ms +step:897/2285 train_time:54032ms step_avg:60.24ms +step:898/2285 train_time:54092ms step_avg:60.24ms +step:899/2285 train_time:54153ms step_avg:60.24ms +step:900/2285 train_time:54212ms step_avg:60.24ms +step:901/2285 train_time:54275ms step_avg:60.24ms +step:902/2285 train_time:54334ms step_avg:60.24ms +step:903/2285 train_time:54396ms step_avg:60.24ms +step:904/2285 train_time:54456ms step_avg:60.24ms +step:905/2285 train_time:54518ms step_avg:60.24ms +step:906/2285 train_time:54577ms step_avg:60.24ms +step:907/2285 train_time:54639ms step_avg:60.24ms +step:908/2285 train_time:54699ms step_avg:60.24ms +step:909/2285 train_time:54760ms step_avg:60.24ms +step:910/2285 train_time:54820ms step_avg:60.24ms +step:911/2285 train_time:54882ms step_avg:60.24ms +step:912/2285 train_time:54942ms step_avg:60.24ms +step:913/2285 train_time:55004ms step_avg:60.25ms +step:914/2285 train_time:55063ms step_avg:60.24ms +step:915/2285 train_time:55125ms step_avg:60.25ms +step:916/2285 train_time:55184ms step_avg:60.24ms +step:917/2285 train_time:55246ms step_avg:60.25ms +step:918/2285 train_time:55306ms step_avg:60.25ms +step:919/2285 train_time:55368ms step_avg:60.25ms +step:920/2285 train_time:55428ms step_avg:60.25ms +step:921/2285 train_time:55490ms step_avg:60.25ms +step:922/2285 train_time:55550ms step_avg:60.25ms 
+step:923/2285 train_time:55613ms step_avg:60.25ms +step:924/2285 train_time:55672ms step_avg:60.25ms +step:925/2285 train_time:55734ms step_avg:60.25ms +step:926/2285 train_time:55793ms step_avg:60.25ms +step:927/2285 train_time:55855ms step_avg:60.25ms +step:928/2285 train_time:55914ms step_avg:60.25ms +step:929/2285 train_time:55977ms step_avg:60.25ms +step:930/2285 train_time:56037ms step_avg:60.25ms +step:931/2285 train_time:56099ms step_avg:60.26ms +step:932/2285 train_time:56158ms step_avg:60.26ms +step:933/2285 train_time:56220ms step_avg:60.26ms +step:934/2285 train_time:56279ms step_avg:60.26ms +step:935/2285 train_time:56340ms step_avg:60.26ms +step:936/2285 train_time:56400ms step_avg:60.26ms +step:937/2285 train_time:56462ms step_avg:60.26ms +step:938/2285 train_time:56522ms step_avg:60.26ms +step:939/2285 train_time:56583ms step_avg:60.26ms +step:940/2285 train_time:56643ms step_avg:60.26ms +step:941/2285 train_time:56704ms step_avg:60.26ms +step:942/2285 train_time:56764ms step_avg:60.26ms +step:943/2285 train_time:56826ms step_avg:60.26ms +step:944/2285 train_time:56886ms step_avg:60.26ms +step:945/2285 train_time:56948ms step_avg:60.26ms +step:946/2285 train_time:57008ms step_avg:60.26ms +step:947/2285 train_time:57070ms step_avg:60.26ms +step:948/2285 train_time:57130ms step_avg:60.26ms +step:949/2285 train_time:57192ms step_avg:60.27ms +step:950/2285 train_time:57251ms step_avg:60.26ms +step:951/2285 train_time:57312ms step_avg:60.26ms +step:952/2285 train_time:57372ms step_avg:60.26ms +step:953/2285 train_time:57434ms step_avg:60.27ms +step:954/2285 train_time:57493ms step_avg:60.27ms +step:955/2285 train_time:57555ms step_avg:60.27ms +step:956/2285 train_time:57614ms step_avg:60.27ms +step:957/2285 train_time:57676ms step_avg:60.27ms +step:958/2285 train_time:57736ms step_avg:60.27ms +step:959/2285 train_time:57798ms step_avg:60.27ms +step:960/2285 train_time:57857ms step_avg:60.27ms +step:961/2285 train_time:57920ms step_avg:60.27ms +step:962/2285 train_time:57980ms step_avg:60.27ms +step:963/2285 train_time:58042ms step_avg:60.27ms +step:964/2285 train_time:58101ms step_avg:60.27ms +step:965/2285 train_time:58163ms step_avg:60.27ms +step:966/2285 train_time:58223ms step_avg:60.27ms +step:967/2285 train_time:58284ms step_avg:60.27ms +step:968/2285 train_time:58344ms step_avg:60.27ms +step:969/2285 train_time:58405ms step_avg:60.27ms +step:970/2285 train_time:58465ms step_avg:60.27ms +step:971/2285 train_time:58527ms step_avg:60.27ms +step:972/2285 train_time:58586ms step_avg:60.27ms +step:973/2285 train_time:58648ms step_avg:60.28ms +step:974/2285 train_time:58707ms step_avg:60.27ms +step:975/2285 train_time:58769ms step_avg:60.28ms +step:976/2285 train_time:58829ms step_avg:60.28ms +step:977/2285 train_time:58891ms step_avg:60.28ms +step:978/2285 train_time:58951ms step_avg:60.28ms +step:979/2285 train_time:59012ms step_avg:60.28ms +step:980/2285 train_time:59072ms step_avg:60.28ms +step:981/2285 train_time:59134ms step_avg:60.28ms +step:982/2285 train_time:59193ms step_avg:60.28ms +step:983/2285 train_time:59255ms step_avg:60.28ms +step:984/2285 train_time:59314ms step_avg:60.28ms +step:985/2285 train_time:59376ms step_avg:60.28ms +step:986/2285 train_time:59436ms step_avg:60.28ms +step:987/2285 train_time:59497ms step_avg:60.28ms +step:988/2285 train_time:59557ms step_avg:60.28ms +step:989/2285 train_time:59618ms step_avg:60.28ms +step:990/2285 train_time:59678ms step_avg:60.28ms +step:991/2285 train_time:59740ms step_avg:60.28ms +step:992/2285 train_time:59801ms 
step_avg:60.28ms +step:993/2285 train_time:59862ms step_avg:60.28ms +step:994/2285 train_time:59922ms step_avg:60.28ms +step:995/2285 train_time:59985ms step_avg:60.29ms +step:996/2285 train_time:60044ms step_avg:60.29ms +step:997/2285 train_time:60106ms step_avg:60.29ms +step:998/2285 train_time:60165ms step_avg:60.29ms +step:999/2285 train_time:60227ms step_avg:60.29ms +step:1000/2285 train_time:60286ms step_avg:60.29ms +step:1000/2285 val_loss:3.5663 train_time:60349ms step_avg:60.35ms +step:1001/2285 train_time:60368ms step_avg:60.31ms +step:1002/2285 train_time:60411ms step_avg:60.29ms +step:1003/2285 train_time:60472ms step_avg:60.29ms +step:1004/2285 train_time:60532ms step_avg:60.29ms +step:1005/2285 train_time:60595ms step_avg:60.29ms +step:1006/2285 train_time:60655ms step_avg:60.29ms +step:1007/2285 train_time:60716ms step_avg:60.29ms +step:1008/2285 train_time:60775ms step_avg:60.29ms +step:1009/2285 train_time:60836ms step_avg:60.29ms +step:1010/2285 train_time:60894ms step_avg:60.29ms +step:1011/2285 train_time:60955ms step_avg:60.29ms +step:1012/2285 train_time:61014ms step_avg:60.29ms +step:1013/2285 train_time:61074ms step_avg:60.29ms +step:1014/2285 train_time:61134ms step_avg:60.29ms +step:1015/2285 train_time:61195ms step_avg:60.29ms +step:1016/2285 train_time:61257ms step_avg:60.29ms +step:1017/2285 train_time:61325ms step_avg:60.30ms +step:1018/2285 train_time:61385ms step_avg:60.30ms +step:1019/2285 train_time:61447ms step_avg:60.30ms +step:1020/2285 train_time:61507ms step_avg:60.30ms +step:1021/2285 train_time:61568ms step_avg:60.30ms +step:1022/2285 train_time:61628ms step_avg:60.30ms +step:1023/2285 train_time:61690ms step_avg:60.30ms +step:1024/2285 train_time:61749ms step_avg:60.30ms +step:1025/2285 train_time:61810ms step_avg:60.30ms +step:1026/2285 train_time:61870ms step_avg:60.30ms +step:1027/2285 train_time:61931ms step_avg:60.30ms +step:1028/2285 train_time:61990ms step_avg:60.30ms +step:1029/2285 train_time:62051ms step_avg:60.30ms +step:1030/2285 train_time:62110ms step_avg:60.30ms +step:1031/2285 train_time:62172ms step_avg:60.30ms +step:1032/2285 train_time:62232ms step_avg:60.30ms +step:1033/2285 train_time:62295ms step_avg:60.31ms +step:1034/2285 train_time:62357ms step_avg:60.31ms +step:1035/2285 train_time:62420ms step_avg:60.31ms +step:1036/2285 train_time:62480ms step_avg:60.31ms +step:1037/2285 train_time:62542ms step_avg:60.31ms +step:1038/2285 train_time:62601ms step_avg:60.31ms +step:1039/2285 train_time:62663ms step_avg:60.31ms +step:1040/2285 train_time:62722ms step_avg:60.31ms +step:1041/2285 train_time:62783ms step_avg:60.31ms +step:1042/2285 train_time:62843ms step_avg:60.31ms +step:1043/2285 train_time:62904ms step_avg:60.31ms +step:1044/2285 train_time:62963ms step_avg:60.31ms +step:1045/2285 train_time:63025ms step_avg:60.31ms +step:1046/2285 train_time:63084ms step_avg:60.31ms +step:1047/2285 train_time:63146ms step_avg:60.31ms +step:1048/2285 train_time:63206ms step_avg:60.31ms +step:1049/2285 train_time:63268ms step_avg:60.31ms +step:1050/2285 train_time:63329ms step_avg:60.31ms +step:1051/2285 train_time:63391ms step_avg:60.31ms +step:1052/2285 train_time:63451ms step_avg:60.31ms +step:1053/2285 train_time:63513ms step_avg:60.32ms +step:1054/2285 train_time:63573ms step_avg:60.32ms +step:1055/2285 train_time:63635ms step_avg:60.32ms +step:1056/2285 train_time:63694ms step_avg:60.32ms +step:1057/2285 train_time:63756ms step_avg:60.32ms +step:1058/2285 train_time:63816ms step_avg:60.32ms +step:1059/2285 train_time:63878ms 
step_avg:60.32ms +step:1060/2285 train_time:63938ms step_avg:60.32ms +step:1061/2285 train_time:63999ms step_avg:60.32ms +step:1062/2285 train_time:64059ms step_avg:60.32ms +step:1063/2285 train_time:64121ms step_avg:60.32ms +step:1064/2285 train_time:64180ms step_avg:60.32ms +step:1065/2285 train_time:64242ms step_avg:60.32ms +step:1066/2285 train_time:64301ms step_avg:60.32ms +step:1067/2285 train_time:64363ms step_avg:60.32ms +step:1068/2285 train_time:64423ms step_avg:60.32ms +step:1069/2285 train_time:64485ms step_avg:60.32ms +step:1070/2285 train_time:64545ms step_avg:60.32ms +step:1071/2285 train_time:64606ms step_avg:60.32ms +step:1072/2285 train_time:64666ms step_avg:60.32ms +step:1073/2285 train_time:64728ms step_avg:60.32ms +step:1074/2285 train_time:64788ms step_avg:60.32ms +step:1075/2285 train_time:64850ms step_avg:60.33ms +step:1076/2285 train_time:64909ms step_avg:60.32ms +step:1077/2285 train_time:64971ms step_avg:60.33ms +step:1078/2285 train_time:65030ms step_avg:60.33ms +step:1079/2285 train_time:65092ms step_avg:60.33ms +step:1080/2285 train_time:65152ms step_avg:60.33ms +step:1081/2285 train_time:65214ms step_avg:60.33ms +step:1082/2285 train_time:65273ms step_avg:60.33ms +step:1083/2285 train_time:65336ms step_avg:60.33ms +step:1084/2285 train_time:65396ms step_avg:60.33ms +step:1085/2285 train_time:65458ms step_avg:60.33ms +step:1086/2285 train_time:65518ms step_avg:60.33ms +step:1087/2285 train_time:65580ms step_avg:60.33ms +step:1088/2285 train_time:65639ms step_avg:60.33ms +step:1089/2285 train_time:65701ms step_avg:60.33ms +step:1090/2285 train_time:65760ms step_avg:60.33ms +step:1091/2285 train_time:65822ms step_avg:60.33ms +step:1092/2285 train_time:65882ms step_avg:60.33ms +step:1093/2285 train_time:65943ms step_avg:60.33ms +step:1094/2285 train_time:66002ms step_avg:60.33ms +step:1095/2285 train_time:66065ms step_avg:60.33ms +step:1096/2285 train_time:66124ms step_avg:60.33ms +step:1097/2285 train_time:66186ms step_avg:60.33ms +step:1098/2285 train_time:66246ms step_avg:60.33ms +step:1099/2285 train_time:66308ms step_avg:60.33ms +step:1100/2285 train_time:66367ms step_avg:60.33ms +step:1101/2285 train_time:66429ms step_avg:60.34ms +step:1102/2285 train_time:66489ms step_avg:60.33ms +step:1103/2285 train_time:66550ms step_avg:60.34ms +step:1104/2285 train_time:66610ms step_avg:60.34ms +step:1105/2285 train_time:66673ms step_avg:60.34ms +step:1106/2285 train_time:66733ms step_avg:60.34ms +step:1107/2285 train_time:66795ms step_avg:60.34ms +step:1108/2285 train_time:66855ms step_avg:60.34ms +step:1109/2285 train_time:66917ms step_avg:60.34ms +step:1110/2285 train_time:66977ms step_avg:60.34ms +step:1111/2285 train_time:67039ms step_avg:60.34ms +step:1112/2285 train_time:67098ms step_avg:60.34ms +step:1113/2285 train_time:67160ms step_avg:60.34ms +step:1114/2285 train_time:67220ms step_avg:60.34ms +step:1115/2285 train_time:67281ms step_avg:60.34ms +step:1116/2285 train_time:67341ms step_avg:60.34ms +step:1117/2285 train_time:67403ms step_avg:60.34ms +step:1118/2285 train_time:67463ms step_avg:60.34ms +step:1119/2285 train_time:67525ms step_avg:60.34ms +step:1120/2285 train_time:67585ms step_avg:60.34ms +step:1121/2285 train_time:67646ms step_avg:60.34ms +step:1122/2285 train_time:67706ms step_avg:60.34ms +step:1123/2285 train_time:67768ms step_avg:60.35ms +step:1124/2285 train_time:67829ms step_avg:60.35ms +step:1125/2285 train_time:67890ms step_avg:60.35ms +step:1126/2285 train_time:67950ms step_avg:60.35ms +step:1127/2285 train_time:68011ms step_avg:60.35ms 
+step:1128/2285 train_time:68071ms step_avg:60.35ms +step:1129/2285 train_time:68133ms step_avg:60.35ms +step:1130/2285 train_time:68193ms step_avg:60.35ms +step:1131/2285 train_time:68255ms step_avg:60.35ms +step:1132/2285 train_time:68316ms step_avg:60.35ms +step:1133/2285 train_time:68378ms step_avg:60.35ms +step:1134/2285 train_time:68438ms step_avg:60.35ms +step:1135/2285 train_time:68500ms step_avg:60.35ms +step:1136/2285 train_time:68559ms step_avg:60.35ms +step:1137/2285 train_time:68621ms step_avg:60.35ms +step:1138/2285 train_time:68681ms step_avg:60.35ms +step:1139/2285 train_time:68743ms step_avg:60.35ms +step:1140/2285 train_time:68803ms step_avg:60.35ms +step:1141/2285 train_time:68865ms step_avg:60.35ms +step:1142/2285 train_time:68924ms step_avg:60.35ms +step:1143/2285 train_time:68986ms step_avg:60.36ms +step:1144/2285 train_time:69046ms step_avg:60.36ms +step:1145/2285 train_time:69108ms step_avg:60.36ms +step:1146/2285 train_time:69168ms step_avg:60.36ms +step:1147/2285 train_time:69231ms step_avg:60.36ms +step:1148/2285 train_time:69290ms step_avg:60.36ms +step:1149/2285 train_time:69352ms step_avg:60.36ms +step:1150/2285 train_time:69412ms step_avg:60.36ms +step:1151/2285 train_time:69474ms step_avg:60.36ms +step:1152/2285 train_time:69534ms step_avg:60.36ms +step:1153/2285 train_time:69595ms step_avg:60.36ms +step:1154/2285 train_time:69656ms step_avg:60.36ms +step:1155/2285 train_time:69719ms step_avg:60.36ms +step:1156/2285 train_time:69778ms step_avg:60.36ms +step:1157/2285 train_time:69840ms step_avg:60.36ms +step:1158/2285 train_time:69901ms step_avg:60.36ms +step:1159/2285 train_time:69963ms step_avg:60.37ms +step:1160/2285 train_time:70024ms step_avg:60.37ms +step:1161/2285 train_time:70086ms step_avg:60.37ms +step:1162/2285 train_time:70145ms step_avg:60.37ms +step:1163/2285 train_time:70207ms step_avg:60.37ms +step:1164/2285 train_time:70267ms step_avg:60.37ms +step:1165/2285 train_time:70329ms step_avg:60.37ms +step:1166/2285 train_time:70389ms step_avg:60.37ms +step:1167/2285 train_time:70451ms step_avg:60.37ms +step:1168/2285 train_time:70511ms step_avg:60.37ms +step:1169/2285 train_time:70573ms step_avg:60.37ms +step:1170/2285 train_time:70633ms step_avg:60.37ms +step:1171/2285 train_time:70694ms step_avg:60.37ms +step:1172/2285 train_time:70755ms step_avg:60.37ms +step:1173/2285 train_time:70818ms step_avg:60.37ms +step:1174/2285 train_time:70878ms step_avg:60.37ms +step:1175/2285 train_time:70940ms step_avg:60.37ms +step:1176/2285 train_time:71000ms step_avg:60.37ms +step:1177/2285 train_time:71062ms step_avg:60.38ms +step:1178/2285 train_time:71121ms step_avg:60.37ms +step:1179/2285 train_time:71183ms step_avg:60.38ms +step:1180/2285 train_time:71242ms step_avg:60.37ms +step:1181/2285 train_time:71305ms step_avg:60.38ms +step:1182/2285 train_time:71364ms step_avg:60.38ms +step:1183/2285 train_time:71427ms step_avg:60.38ms +step:1184/2285 train_time:71487ms step_avg:60.38ms +step:1185/2285 train_time:71549ms step_avg:60.38ms +step:1186/2285 train_time:71609ms step_avg:60.38ms +step:1187/2285 train_time:71672ms step_avg:60.38ms +step:1188/2285 train_time:71731ms step_avg:60.38ms +step:1189/2285 train_time:71794ms step_avg:60.38ms +step:1190/2285 train_time:71854ms step_avg:60.38ms +step:1191/2285 train_time:71917ms step_avg:60.38ms +step:1192/2285 train_time:71977ms step_avg:60.38ms +step:1193/2285 train_time:72040ms step_avg:60.39ms +step:1194/2285 train_time:72099ms step_avg:60.38ms +step:1195/2285 train_time:72161ms step_avg:60.39ms +step:1196/2285 
train_time:72221ms step_avg:60.39ms +step:1197/2285 train_time:72283ms step_avg:60.39ms +step:1198/2285 train_time:72343ms step_avg:60.39ms +step:1199/2285 train_time:72405ms step_avg:60.39ms +step:1200/2285 train_time:72464ms step_avg:60.39ms +step:1201/2285 train_time:72526ms step_avg:60.39ms +step:1202/2285 train_time:72587ms step_avg:60.39ms +step:1203/2285 train_time:72649ms step_avg:60.39ms +step:1204/2285 train_time:72709ms step_avg:60.39ms +step:1205/2285 train_time:72771ms step_avg:60.39ms +step:1206/2285 train_time:72830ms step_avg:60.39ms +step:1207/2285 train_time:72893ms step_avg:60.39ms +step:1208/2285 train_time:72953ms step_avg:60.39ms +step:1209/2285 train_time:73015ms step_avg:60.39ms +step:1210/2285 train_time:73075ms step_avg:60.39ms +step:1211/2285 train_time:73138ms step_avg:60.39ms +step:1212/2285 train_time:73198ms step_avg:60.39ms +step:1213/2285 train_time:73261ms step_avg:60.40ms +step:1214/2285 train_time:73321ms step_avg:60.40ms +step:1215/2285 train_time:73382ms step_avg:60.40ms +step:1216/2285 train_time:73442ms step_avg:60.40ms +step:1217/2285 train_time:73504ms step_avg:60.40ms +step:1218/2285 train_time:73564ms step_avg:60.40ms +step:1219/2285 train_time:73626ms step_avg:60.40ms +step:1220/2285 train_time:73686ms step_avg:60.40ms +step:1221/2285 train_time:73748ms step_avg:60.40ms +step:1222/2285 train_time:73808ms step_avg:60.40ms +step:1223/2285 train_time:73871ms step_avg:60.40ms +step:1224/2285 train_time:73931ms step_avg:60.40ms +step:1225/2285 train_time:73993ms step_avg:60.40ms +step:1226/2285 train_time:74053ms step_avg:60.40ms +step:1227/2285 train_time:74116ms step_avg:60.40ms +step:1228/2285 train_time:74176ms step_avg:60.40ms +step:1229/2285 train_time:74239ms step_avg:60.41ms +step:1230/2285 train_time:74298ms step_avg:60.41ms +step:1231/2285 train_time:74360ms step_avg:60.41ms +step:1232/2285 train_time:74420ms step_avg:60.41ms +step:1233/2285 train_time:74483ms step_avg:60.41ms +step:1234/2285 train_time:74542ms step_avg:60.41ms +step:1235/2285 train_time:74604ms step_avg:60.41ms +step:1236/2285 train_time:74664ms step_avg:60.41ms +step:1237/2285 train_time:74726ms step_avg:60.41ms +step:1238/2285 train_time:74786ms step_avg:60.41ms +step:1239/2285 train_time:74848ms step_avg:60.41ms +step:1240/2285 train_time:74908ms step_avg:60.41ms +step:1241/2285 train_time:74970ms step_avg:60.41ms +step:1242/2285 train_time:75030ms step_avg:60.41ms +step:1243/2285 train_time:75093ms step_avg:60.41ms +step:1244/2285 train_time:75153ms step_avg:60.41ms +step:1245/2285 train_time:75215ms step_avg:60.41ms +step:1246/2285 train_time:75274ms step_avg:60.41ms +step:1247/2285 train_time:75336ms step_avg:60.41ms +step:1248/2285 train_time:75396ms step_avg:60.41ms +step:1249/2285 train_time:75459ms step_avg:60.42ms +step:1250/2285 train_time:75520ms step_avg:60.42ms +step:1250/2285 val_loss:3.4929 train_time:75583ms step_avg:60.47ms +step:1251/2285 train_time:75602ms step_avg:60.43ms +step:1252/2285 train_time:75645ms step_avg:60.42ms +step:1253/2285 train_time:75706ms step_avg:60.42ms +step:1254/2285 train_time:75765ms step_avg:60.42ms +step:1255/2285 train_time:75828ms step_avg:60.42ms +step:1256/2285 train_time:75887ms step_avg:60.42ms +step:1257/2285 train_time:75948ms step_avg:60.42ms +step:1258/2285 train_time:76007ms step_avg:60.42ms +step:1259/2285 train_time:76067ms step_avg:60.42ms +step:1260/2285 train_time:76126ms step_avg:60.42ms +step:1261/2285 train_time:76188ms step_avg:60.42ms +step:1262/2285 train_time:76247ms step_avg:60.42ms +step:1263/2285 
train_time:76308ms step_avg:60.42ms +step:1264/2285 train_time:76366ms step_avg:60.42ms +step:1265/2285 train_time:76428ms step_avg:60.42ms +step:1266/2285 train_time:76496ms step_avg:60.42ms +step:1267/2285 train_time:76563ms step_avg:60.43ms +step:1268/2285 train_time:76624ms step_avg:60.43ms +step:1269/2285 train_time:76686ms step_avg:60.43ms +step:1270/2285 train_time:76745ms step_avg:60.43ms +step:1271/2285 train_time:76808ms step_avg:60.43ms +step:1272/2285 train_time:76867ms step_avg:60.43ms +step:1273/2285 train_time:76928ms step_avg:60.43ms +step:1274/2285 train_time:76987ms step_avg:60.43ms +step:1275/2285 train_time:77048ms step_avg:60.43ms +step:1276/2285 train_time:77107ms step_avg:60.43ms +step:1277/2285 train_time:77168ms step_avg:60.43ms +step:1278/2285 train_time:77227ms step_avg:60.43ms +step:1279/2285 train_time:77288ms step_avg:60.43ms +step:1280/2285 train_time:77348ms step_avg:60.43ms +step:1281/2285 train_time:77412ms step_avg:60.43ms +step:1282/2285 train_time:77475ms step_avg:60.43ms +step:1283/2285 train_time:77539ms step_avg:60.44ms +step:1284/2285 train_time:77599ms step_avg:60.44ms +step:1285/2285 train_time:77662ms step_avg:60.44ms +step:1286/2285 train_time:77722ms step_avg:60.44ms +step:1287/2285 train_time:77783ms step_avg:60.44ms +step:1288/2285 train_time:77843ms step_avg:60.44ms +step:1289/2285 train_time:77905ms step_avg:60.44ms +step:1290/2285 train_time:77964ms step_avg:60.44ms +step:1291/2285 train_time:78025ms step_avg:60.44ms +step:1292/2285 train_time:78084ms step_avg:60.44ms +step:1293/2285 train_time:78146ms step_avg:60.44ms +step:1294/2285 train_time:78205ms step_avg:60.44ms +step:1295/2285 train_time:78266ms step_avg:60.44ms +step:1296/2285 train_time:78326ms step_avg:60.44ms +step:1297/2285 train_time:78389ms step_avg:60.44ms +step:1298/2285 train_time:78451ms step_avg:60.44ms +step:1299/2285 train_time:78515ms step_avg:60.44ms +step:1300/2285 train_time:78575ms step_avg:60.44ms +step:1301/2285 train_time:78637ms step_avg:60.44ms +step:1302/2285 train_time:78697ms step_avg:60.44ms +step:1303/2285 train_time:78758ms step_avg:60.44ms +step:1304/2285 train_time:78818ms step_avg:60.44ms +step:1305/2285 train_time:78880ms step_avg:60.44ms +step:1306/2285 train_time:78940ms step_avg:60.44ms +step:1307/2285 train_time:79001ms step_avg:60.44ms +step:1308/2285 train_time:79061ms step_avg:60.44ms +step:1309/2285 train_time:79123ms step_avg:60.45ms +step:1310/2285 train_time:79182ms step_avg:60.44ms +step:1311/2285 train_time:79243ms step_avg:60.45ms +step:1312/2285 train_time:79303ms step_avg:60.44ms +step:1313/2285 train_time:79366ms step_avg:60.45ms +step:1314/2285 train_time:79426ms step_avg:60.45ms +step:1315/2285 train_time:79489ms step_avg:60.45ms +step:1316/2285 train_time:79549ms step_avg:60.45ms +step:1317/2285 train_time:79612ms step_avg:60.45ms +step:1318/2285 train_time:79672ms step_avg:60.45ms +step:1319/2285 train_time:79734ms step_avg:60.45ms +step:1320/2285 train_time:79794ms step_avg:60.45ms +step:1321/2285 train_time:79856ms step_avg:60.45ms +step:1322/2285 train_time:79915ms step_avg:60.45ms +step:1323/2285 train_time:79977ms step_avg:60.45ms +step:1324/2285 train_time:80037ms step_avg:60.45ms +step:1325/2285 train_time:80099ms step_avg:60.45ms +step:1326/2285 train_time:80159ms step_avg:60.45ms +step:1327/2285 train_time:80221ms step_avg:60.45ms +step:1328/2285 train_time:80281ms step_avg:60.45ms +step:1329/2285 train_time:80344ms step_avg:60.45ms +step:1330/2285 train_time:80404ms step_avg:60.45ms +step:1331/2285 train_time:80466ms 
step_avg:60.46ms +step:1332/2285 train_time:80525ms step_avg:60.45ms +step:1333/2285 train_time:80588ms step_avg:60.46ms +step:1334/2285 train_time:80648ms step_avg:60.46ms +step:1335/2285 train_time:80710ms step_avg:60.46ms +step:1336/2285 train_time:80769ms step_avg:60.46ms +step:1337/2285 train_time:80832ms step_avg:60.46ms +step:1338/2285 train_time:80892ms step_avg:60.46ms +step:1339/2285 train_time:80954ms step_avg:60.46ms +step:1340/2285 train_time:81014ms step_avg:60.46ms +step:1341/2285 train_time:81076ms step_avg:60.46ms +step:1342/2285 train_time:81136ms step_avg:60.46ms +step:1343/2285 train_time:81198ms step_avg:60.46ms +step:1344/2285 train_time:81257ms step_avg:60.46ms +step:1345/2285 train_time:81319ms step_avg:60.46ms +step:1346/2285 train_time:81380ms step_avg:60.46ms +step:1347/2285 train_time:81443ms step_avg:60.46ms +step:1348/2285 train_time:81503ms step_avg:60.46ms +step:1349/2285 train_time:81565ms step_avg:60.46ms +step:1350/2285 train_time:81624ms step_avg:60.46ms +step:1351/2285 train_time:81687ms step_avg:60.46ms +step:1352/2285 train_time:81747ms step_avg:60.46ms +step:1353/2285 train_time:81809ms step_avg:60.46ms +step:1354/2285 train_time:81868ms step_avg:60.46ms +step:1355/2285 train_time:81930ms step_avg:60.47ms +step:1356/2285 train_time:81991ms step_avg:60.47ms +step:1357/2285 train_time:82053ms step_avg:60.47ms +step:1358/2285 train_time:82113ms step_avg:60.47ms +step:1359/2285 train_time:82175ms step_avg:60.47ms +step:1360/2285 train_time:82235ms step_avg:60.47ms +step:1361/2285 train_time:82297ms step_avg:60.47ms +step:1362/2285 train_time:82356ms step_avg:60.47ms +step:1363/2285 train_time:82418ms step_avg:60.47ms +step:1364/2285 train_time:82478ms step_avg:60.47ms +step:1365/2285 train_time:82541ms step_avg:60.47ms +step:1366/2285 train_time:82601ms step_avg:60.47ms +step:1367/2285 train_time:82664ms step_avg:60.47ms +step:1368/2285 train_time:82724ms step_avg:60.47ms +step:1369/2285 train_time:82786ms step_avg:60.47ms +step:1370/2285 train_time:82846ms step_avg:60.47ms +step:1371/2285 train_time:82908ms step_avg:60.47ms +step:1372/2285 train_time:82968ms step_avg:60.47ms +step:1373/2285 train_time:83030ms step_avg:60.47ms +step:1374/2285 train_time:83090ms step_avg:60.47ms +step:1375/2285 train_time:83152ms step_avg:60.47ms +step:1376/2285 train_time:83212ms step_avg:60.47ms +step:1377/2285 train_time:83275ms step_avg:60.48ms +step:1378/2285 train_time:83335ms step_avg:60.47ms +step:1379/2285 train_time:83397ms step_avg:60.48ms +step:1380/2285 train_time:83457ms step_avg:60.48ms +step:1381/2285 train_time:83519ms step_avg:60.48ms +step:1382/2285 train_time:83579ms step_avg:60.48ms +step:1383/2285 train_time:83641ms step_avg:60.48ms +step:1384/2285 train_time:83702ms step_avg:60.48ms +step:1385/2285 train_time:83765ms step_avg:60.48ms +step:1386/2285 train_time:83824ms step_avg:60.48ms +step:1387/2285 train_time:83886ms step_avg:60.48ms +step:1388/2285 train_time:83945ms step_avg:60.48ms +step:1389/2285 train_time:84007ms step_avg:60.48ms +step:1390/2285 train_time:84067ms step_avg:60.48ms +step:1391/2285 train_time:84130ms step_avg:60.48ms +step:1392/2285 train_time:84191ms step_avg:60.48ms +step:1393/2285 train_time:84253ms step_avg:60.48ms +step:1394/2285 train_time:84313ms step_avg:60.48ms +step:1395/2285 train_time:84375ms step_avg:60.48ms +step:1396/2285 train_time:84435ms step_avg:60.48ms +step:1397/2285 train_time:84497ms step_avg:60.48ms +step:1398/2285 train_time:84557ms step_avg:60.48ms +step:1399/2285 train_time:84619ms step_avg:60.49ms 
+step:1400/2285 train_time:84679ms step_avg:60.48ms +step:1401/2285 train_time:84741ms step_avg:60.49ms +step:1402/2285 train_time:84801ms step_avg:60.49ms +step:1403/2285 train_time:84864ms step_avg:60.49ms +step:1404/2285 train_time:84924ms step_avg:60.49ms +step:1405/2285 train_time:84986ms step_avg:60.49ms +step:1406/2285 train_time:85046ms step_avg:60.49ms +step:1407/2285 train_time:85108ms step_avg:60.49ms +step:1408/2285 train_time:85167ms step_avg:60.49ms +step:1409/2285 train_time:85230ms step_avg:60.49ms +step:1410/2285 train_time:85290ms step_avg:60.49ms +step:1411/2285 train_time:85352ms step_avg:60.49ms +step:1412/2285 train_time:85412ms step_avg:60.49ms +step:1413/2285 train_time:85475ms step_avg:60.49ms +step:1414/2285 train_time:85534ms step_avg:60.49ms +step:1415/2285 train_time:85597ms step_avg:60.49ms +step:1416/2285 train_time:85656ms step_avg:60.49ms +step:1417/2285 train_time:85718ms step_avg:60.49ms +step:1418/2285 train_time:85778ms step_avg:60.49ms +step:1419/2285 train_time:85840ms step_avg:60.49ms +step:1420/2285 train_time:85901ms step_avg:60.49ms +step:1421/2285 train_time:85963ms step_avg:60.50ms +step:1422/2285 train_time:86024ms step_avg:60.49ms +step:1423/2285 train_time:86086ms step_avg:60.50ms +step:1424/2285 train_time:86145ms step_avg:60.50ms +step:1425/2285 train_time:86208ms step_avg:60.50ms +step:1426/2285 train_time:86267ms step_avg:60.50ms +step:1427/2285 train_time:86329ms step_avg:60.50ms +step:1428/2285 train_time:86390ms step_avg:60.50ms +step:1429/2285 train_time:86453ms step_avg:60.50ms +step:1430/2285 train_time:86513ms step_avg:60.50ms +step:1431/2285 train_time:86576ms step_avg:60.50ms +step:1432/2285 train_time:86635ms step_avg:60.50ms +step:1433/2285 train_time:86697ms step_avg:60.50ms +step:1434/2285 train_time:86757ms step_avg:60.50ms +step:1435/2285 train_time:86819ms step_avg:60.50ms +step:1436/2285 train_time:86879ms step_avg:60.50ms +step:1437/2285 train_time:86941ms step_avg:60.50ms +step:1438/2285 train_time:87001ms step_avg:60.50ms +step:1439/2285 train_time:87063ms step_avg:60.50ms +step:1440/2285 train_time:87124ms step_avg:60.50ms +step:1441/2285 train_time:87186ms step_avg:60.50ms +step:1442/2285 train_time:87246ms step_avg:60.50ms +step:1443/2285 train_time:87308ms step_avg:60.50ms +step:1444/2285 train_time:87368ms step_avg:60.50ms +step:1445/2285 train_time:87431ms step_avg:60.51ms +step:1446/2285 train_time:87491ms step_avg:60.51ms +step:1447/2285 train_time:87554ms step_avg:60.51ms +step:1448/2285 train_time:87613ms step_avg:60.51ms +step:1449/2285 train_time:87675ms step_avg:60.51ms +step:1450/2285 train_time:87735ms step_avg:60.51ms +step:1451/2285 train_time:87797ms step_avg:60.51ms +step:1452/2285 train_time:87856ms step_avg:60.51ms +step:1453/2285 train_time:87918ms step_avg:60.51ms +step:1454/2285 train_time:87978ms step_avg:60.51ms +step:1455/2285 train_time:88040ms step_avg:60.51ms +step:1456/2285 train_time:88101ms step_avg:60.51ms +step:1457/2285 train_time:88164ms step_avg:60.51ms +step:1458/2285 train_time:88223ms step_avg:60.51ms +step:1459/2285 train_time:88285ms step_avg:60.51ms +step:1460/2285 train_time:88345ms step_avg:60.51ms +step:1461/2285 train_time:88407ms step_avg:60.51ms +step:1462/2285 train_time:88467ms step_avg:60.51ms +step:1463/2285 train_time:88531ms step_avg:60.51ms +step:1464/2285 train_time:88591ms step_avg:60.51ms +step:1465/2285 train_time:88653ms step_avg:60.51ms +step:1466/2285 train_time:88713ms step_avg:60.51ms +step:1467/2285 train_time:88775ms step_avg:60.51ms +step:1468/2285 
train_time:88835ms step_avg:60.51ms +step:1469/2285 train_time:88897ms step_avg:60.52ms +step:1470/2285 train_time:88957ms step_avg:60.51ms +step:1471/2285 train_time:89019ms step_avg:60.52ms +step:1472/2285 train_time:89078ms step_avg:60.52ms +step:1473/2285 train_time:89141ms step_avg:60.52ms +step:1474/2285 train_time:89201ms step_avg:60.52ms +step:1475/2285 train_time:89264ms step_avg:60.52ms +step:1476/2285 train_time:89324ms step_avg:60.52ms +step:1477/2285 train_time:89386ms step_avg:60.52ms +step:1478/2285 train_time:89446ms step_avg:60.52ms +step:1479/2285 train_time:89508ms step_avg:60.52ms +step:1480/2285 train_time:89568ms step_avg:60.52ms +step:1481/2285 train_time:89631ms step_avg:60.52ms +step:1482/2285 train_time:89691ms step_avg:60.52ms +step:1483/2285 train_time:89754ms step_avg:60.52ms +step:1484/2285 train_time:89813ms step_avg:60.52ms +step:1485/2285 train_time:89875ms step_avg:60.52ms +step:1486/2285 train_time:89935ms step_avg:60.52ms +step:1487/2285 train_time:89997ms step_avg:60.52ms +step:1488/2285 train_time:90056ms step_avg:60.52ms +step:1489/2285 train_time:90119ms step_avg:60.52ms +step:1490/2285 train_time:90178ms step_avg:60.52ms +step:1491/2285 train_time:90241ms step_avg:60.52ms +step:1492/2285 train_time:90301ms step_avg:60.52ms +step:1493/2285 train_time:90363ms step_avg:60.52ms +step:1494/2285 train_time:90423ms step_avg:60.52ms +step:1495/2285 train_time:90486ms step_avg:60.53ms +step:1496/2285 train_time:90545ms step_avg:60.53ms +step:1497/2285 train_time:90608ms step_avg:60.53ms +step:1498/2285 train_time:90668ms step_avg:60.53ms +step:1499/2285 train_time:90731ms step_avg:60.53ms +step:1500/2285 train_time:90791ms step_avg:60.53ms +step:1500/2285 val_loss:3.4260 train_time:90854ms step_avg:60.57ms +step:1501/2285 train_time:90873ms step_avg:60.54ms +step:1502/2285 train_time:90917ms step_avg:60.53ms +step:1503/2285 train_time:90981ms step_avg:60.53ms +step:1504/2285 train_time:91042ms step_avg:60.53ms +step:1505/2285 train_time:91103ms step_avg:60.53ms +step:1506/2285 train_time:91163ms step_avg:60.53ms +step:1507/2285 train_time:91225ms step_avg:60.53ms +step:1508/2285 train_time:91284ms step_avg:60.53ms +step:1509/2285 train_time:91346ms step_avg:60.53ms +step:1510/2285 train_time:91405ms step_avg:60.53ms +step:1511/2285 train_time:91467ms step_avg:60.53ms +step:1512/2285 train_time:91527ms step_avg:60.53ms +step:1513/2285 train_time:91588ms step_avg:60.53ms +step:1514/2285 train_time:91648ms step_avg:60.53ms +step:1515/2285 train_time:91709ms step_avg:60.53ms +step:1516/2285 train_time:91770ms step_avg:60.53ms +step:1517/2285 train_time:91834ms step_avg:60.54ms +step:1518/2285 train_time:91894ms step_avg:60.54ms +step:1519/2285 train_time:91957ms step_avg:60.54ms +step:1520/2285 train_time:92017ms step_avg:60.54ms +step:1521/2285 train_time:92079ms step_avg:60.54ms +step:1522/2285 train_time:92139ms step_avg:60.54ms +step:1523/2285 train_time:92201ms step_avg:60.54ms +step:1524/2285 train_time:92261ms step_avg:60.54ms +step:1525/2285 train_time:92322ms step_avg:60.54ms +step:1526/2285 train_time:92382ms step_avg:60.54ms +step:1527/2285 train_time:92444ms step_avg:60.54ms +step:1528/2285 train_time:92504ms step_avg:60.54ms +step:1529/2285 train_time:92565ms step_avg:60.54ms +step:1530/2285 train_time:92624ms step_avg:60.54ms +step:1531/2285 train_time:92687ms step_avg:60.54ms +step:1532/2285 train_time:92748ms step_avg:60.54ms +step:1533/2285 train_time:92811ms step_avg:60.54ms +step:1534/2285 train_time:92872ms step_avg:60.54ms +step:1535/2285 
train_time:92936ms step_avg:60.54ms +step:1536/2285 train_time:92996ms step_avg:60.54ms +step:1537/2285 train_time:93058ms step_avg:60.55ms +step:1538/2285 train_time:93118ms step_avg:60.54ms +step:1539/2285 train_time:93180ms step_avg:60.55ms +step:1540/2285 train_time:93240ms step_avg:60.55ms +step:1541/2285 train_time:93302ms step_avg:60.55ms +step:1542/2285 train_time:93362ms step_avg:60.55ms +step:1543/2285 train_time:93423ms step_avg:60.55ms +step:1544/2285 train_time:93483ms step_avg:60.55ms +step:1545/2285 train_time:93544ms step_avg:60.55ms +step:1546/2285 train_time:93604ms step_avg:60.55ms +step:1547/2285 train_time:93666ms step_avg:60.55ms +step:1548/2285 train_time:93727ms step_avg:60.55ms +step:1549/2285 train_time:93789ms step_avg:60.55ms +step:1550/2285 train_time:93850ms step_avg:60.55ms +step:1551/2285 train_time:93913ms step_avg:60.55ms +step:1552/2285 train_time:93973ms step_avg:60.55ms +step:1553/2285 train_time:94036ms step_avg:60.55ms +step:1554/2285 train_time:94096ms step_avg:60.55ms +step:1555/2285 train_time:94159ms step_avg:60.55ms +step:1556/2285 train_time:94218ms step_avg:60.55ms +step:1557/2285 train_time:94281ms step_avg:60.55ms +step:1558/2285 train_time:94341ms step_avg:60.55ms +step:1559/2285 train_time:94402ms step_avg:60.55ms +step:1560/2285 train_time:94462ms step_avg:60.55ms +step:1561/2285 train_time:94524ms step_avg:60.55ms +step:1562/2285 train_time:94584ms step_avg:60.55ms +step:1563/2285 train_time:94646ms step_avg:60.55ms +step:1564/2285 train_time:94706ms step_avg:60.55ms +step:1565/2285 train_time:94769ms step_avg:60.56ms +step:1566/2285 train_time:94830ms step_avg:60.56ms +step:1567/2285 train_time:94893ms step_avg:60.56ms +step:1568/2285 train_time:94953ms step_avg:60.56ms +step:1569/2285 train_time:95015ms step_avg:60.56ms +step:1570/2285 train_time:95075ms step_avg:60.56ms +step:1571/2285 train_time:95138ms step_avg:60.56ms +step:1572/2285 train_time:95197ms step_avg:60.56ms +step:1573/2285 train_time:95260ms step_avg:60.56ms +step:1574/2285 train_time:95319ms step_avg:60.56ms +step:1575/2285 train_time:95381ms step_avg:60.56ms +step:1576/2285 train_time:95442ms step_avg:60.56ms +step:1577/2285 train_time:95504ms step_avg:60.56ms +step:1578/2285 train_time:95565ms step_avg:60.56ms +step:1579/2285 train_time:95627ms step_avg:60.56ms +step:1580/2285 train_time:95687ms step_avg:60.56ms +step:1581/2285 train_time:95750ms step_avg:60.56ms +step:1582/2285 train_time:95810ms step_avg:60.56ms +step:1583/2285 train_time:95873ms step_avg:60.56ms +step:1584/2285 train_time:95933ms step_avg:60.56ms +step:1585/2285 train_time:95996ms step_avg:60.57ms +step:1586/2285 train_time:96055ms step_avg:60.56ms +step:1587/2285 train_time:96118ms step_avg:60.57ms +step:1588/2285 train_time:96178ms step_avg:60.57ms +step:1589/2285 train_time:96240ms step_avg:60.57ms +step:1590/2285 train_time:96300ms step_avg:60.57ms +step:1591/2285 train_time:96361ms step_avg:60.57ms +step:1592/2285 train_time:96421ms step_avg:60.57ms +step:1593/2285 train_time:96483ms step_avg:60.57ms +step:1594/2285 train_time:96544ms step_avg:60.57ms +step:1595/2285 train_time:96607ms step_avg:60.57ms +step:1596/2285 train_time:96666ms step_avg:60.57ms +step:1597/2285 train_time:96728ms step_avg:60.57ms +step:1598/2285 train_time:96789ms step_avg:60.57ms +step:1599/2285 train_time:96852ms step_avg:60.57ms +step:1600/2285 train_time:96912ms step_avg:60.57ms +step:1601/2285 train_time:96975ms step_avg:60.57ms +step:1602/2285 train_time:97035ms step_avg:60.57ms +step:1603/2285 train_time:97096ms 
step_avg:60.57ms +step:1604/2285 train_time:97156ms step_avg:60.57ms +step:1605/2285 train_time:97218ms step_avg:60.57ms +step:1606/2285 train_time:97279ms step_avg:60.57ms +step:1607/2285 train_time:97341ms step_avg:60.57ms +step:1608/2285 train_time:97401ms step_avg:60.57ms +step:1609/2285 train_time:97463ms step_avg:60.57ms +step:1610/2285 train_time:97523ms step_avg:60.57ms +step:1611/2285 train_time:97585ms step_avg:60.57ms +step:1612/2285 train_time:97645ms step_avg:60.57ms +step:1613/2285 train_time:97708ms step_avg:60.58ms +step:1614/2285 train_time:97768ms step_avg:60.57ms +step:1615/2285 train_time:97830ms step_avg:60.58ms +step:1616/2285 train_time:97890ms step_avg:60.58ms +step:1617/2285 train_time:97953ms step_avg:60.58ms +step:1618/2285 train_time:98013ms step_avg:60.58ms +step:1619/2285 train_time:98075ms step_avg:60.58ms +step:1620/2285 train_time:98135ms step_avg:60.58ms +step:1621/2285 train_time:98198ms step_avg:60.58ms +step:1622/2285 train_time:98258ms step_avg:60.58ms +step:1623/2285 train_time:98320ms step_avg:60.58ms +step:1624/2285 train_time:98380ms step_avg:60.58ms +step:1625/2285 train_time:98442ms step_avg:60.58ms +step:1626/2285 train_time:98502ms step_avg:60.58ms +step:1627/2285 train_time:98564ms step_avg:60.58ms +step:1628/2285 train_time:98625ms step_avg:60.58ms +step:1629/2285 train_time:98687ms step_avg:60.58ms +step:1630/2285 train_time:98747ms step_avg:60.58ms +step:1631/2285 train_time:98809ms step_avg:60.58ms +step:1632/2285 train_time:98869ms step_avg:60.58ms +step:1633/2285 train_time:98932ms step_avg:60.58ms +step:1634/2285 train_time:98993ms step_avg:60.58ms +step:1635/2285 train_time:99055ms step_avg:60.58ms +step:1636/2285 train_time:99115ms step_avg:60.58ms +step:1637/2285 train_time:99177ms step_avg:60.58ms +step:1638/2285 train_time:99237ms step_avg:60.58ms +step:1639/2285 train_time:99300ms step_avg:60.59ms +step:1640/2285 train_time:99360ms step_avg:60.59ms +step:1641/2285 train_time:99422ms step_avg:60.59ms +step:1642/2285 train_time:99482ms step_avg:60.59ms +step:1643/2285 train_time:99544ms step_avg:60.59ms +step:1644/2285 train_time:99604ms step_avg:60.59ms +step:1645/2285 train_time:99666ms step_avg:60.59ms +step:1646/2285 train_time:99726ms step_avg:60.59ms +step:1647/2285 train_time:99789ms step_avg:60.59ms +step:1648/2285 train_time:99849ms step_avg:60.59ms +step:1649/2285 train_time:99912ms step_avg:60.59ms +step:1650/2285 train_time:99973ms step_avg:60.59ms +step:1651/2285 train_time:100036ms step_avg:60.59ms +step:1652/2285 train_time:100095ms step_avg:60.59ms +step:1653/2285 train_time:100157ms step_avg:60.59ms +step:1654/2285 train_time:100217ms step_avg:60.59ms +step:1655/2285 train_time:100279ms step_avg:60.59ms +step:1656/2285 train_time:100340ms step_avg:60.59ms +step:1657/2285 train_time:100402ms step_avg:60.59ms +step:1658/2285 train_time:100462ms step_avg:60.59ms +step:1659/2285 train_time:100524ms step_avg:60.59ms +step:1660/2285 train_time:100583ms step_avg:60.59ms +step:1661/2285 train_time:100646ms step_avg:60.59ms +step:1662/2285 train_time:100706ms step_avg:60.59ms +step:1663/2285 train_time:100768ms step_avg:60.59ms +step:1664/2285 train_time:100827ms step_avg:60.59ms +step:1665/2285 train_time:100890ms step_avg:60.59ms +step:1666/2285 train_time:100951ms step_avg:60.59ms +step:1667/2285 train_time:101014ms step_avg:60.60ms +step:1668/2285 train_time:101074ms step_avg:60.60ms +step:1669/2285 train_time:101136ms step_avg:60.60ms +step:1670/2285 train_time:101195ms step_avg:60.60ms +step:1671/2285 
train_time:101258ms step_avg:60.60ms +step:1672/2285 train_time:101318ms step_avg:60.60ms +step:1673/2285 train_time:101380ms step_avg:60.60ms +step:1674/2285 train_time:101440ms step_avg:60.60ms +step:1675/2285 train_time:101502ms step_avg:60.60ms +step:1676/2285 train_time:101562ms step_avg:60.60ms +step:1677/2285 train_time:101624ms step_avg:60.60ms +step:1678/2285 train_time:101685ms step_avg:60.60ms +step:1679/2285 train_time:101746ms step_avg:60.60ms +step:1680/2285 train_time:101806ms step_avg:60.60ms +step:1681/2285 train_time:101869ms step_avg:60.60ms +step:1682/2285 train_time:101930ms step_avg:60.60ms +step:1683/2285 train_time:101992ms step_avg:60.60ms +step:1684/2285 train_time:102052ms step_avg:60.60ms +step:1685/2285 train_time:102115ms step_avg:60.60ms +step:1686/2285 train_time:102174ms step_avg:60.60ms +step:1687/2285 train_time:102236ms step_avg:60.60ms +step:1688/2285 train_time:102297ms step_avg:60.60ms +step:1689/2285 train_time:102359ms step_avg:60.60ms +step:1690/2285 train_time:102419ms step_avg:60.60ms +step:1691/2285 train_time:102481ms step_avg:60.60ms +step:1692/2285 train_time:102541ms step_avg:60.60ms +step:1693/2285 train_time:102604ms step_avg:60.60ms +step:1694/2285 train_time:102664ms step_avg:60.60ms +step:1695/2285 train_time:102726ms step_avg:60.61ms +step:1696/2285 train_time:102786ms step_avg:60.61ms +step:1697/2285 train_time:102849ms step_avg:60.61ms +step:1698/2285 train_time:102909ms step_avg:60.61ms +step:1699/2285 train_time:102972ms step_avg:60.61ms +step:1700/2285 train_time:103033ms step_avg:60.61ms +step:1701/2285 train_time:103095ms step_avg:60.61ms +step:1702/2285 train_time:103155ms step_avg:60.61ms +step:1703/2285 train_time:103217ms step_avg:60.61ms +step:1704/2285 train_time:103277ms step_avg:60.61ms +step:1705/2285 train_time:103340ms step_avg:60.61ms +step:1706/2285 train_time:103400ms step_avg:60.61ms +step:1707/2285 train_time:103463ms step_avg:60.61ms +step:1708/2285 train_time:103522ms step_avg:60.61ms +step:1709/2285 train_time:103584ms step_avg:60.61ms +step:1710/2285 train_time:103644ms step_avg:60.61ms +step:1711/2285 train_time:103706ms step_avg:60.61ms +step:1712/2285 train_time:103766ms step_avg:60.61ms +step:1713/2285 train_time:103828ms step_avg:60.61ms +step:1714/2285 train_time:103888ms step_avg:60.61ms +step:1715/2285 train_time:103951ms step_avg:60.61ms +step:1716/2285 train_time:104010ms step_avg:60.61ms +step:1717/2285 train_time:104074ms step_avg:60.61ms +step:1718/2285 train_time:104134ms step_avg:60.61ms +step:1719/2285 train_time:104196ms step_avg:60.61ms +step:1720/2285 train_time:104257ms step_avg:60.61ms +step:1721/2285 train_time:104320ms step_avg:60.62ms +step:1722/2285 train_time:104380ms step_avg:60.62ms +step:1723/2285 train_time:104442ms step_avg:60.62ms +step:1724/2285 train_time:104503ms step_avg:60.62ms +step:1725/2285 train_time:104565ms step_avg:60.62ms +step:1726/2285 train_time:104625ms step_avg:60.62ms +step:1727/2285 train_time:104686ms step_avg:60.62ms +step:1728/2285 train_time:104746ms step_avg:60.62ms +step:1729/2285 train_time:104808ms step_avg:60.62ms +step:1730/2285 train_time:104868ms step_avg:60.62ms +step:1731/2285 train_time:104931ms step_avg:60.62ms +step:1732/2285 train_time:104992ms step_avg:60.62ms +step:1733/2285 train_time:105054ms step_avg:60.62ms +step:1734/2285 train_time:105114ms step_avg:60.62ms +step:1735/2285 train_time:105176ms step_avg:60.62ms +step:1736/2285 train_time:105237ms step_avg:60.62ms +step:1737/2285 train_time:105299ms step_avg:60.62ms +step:1738/2285 
train_time:105359ms step_avg:60.62ms +step:1739/2285 train_time:105421ms step_avg:60.62ms +step:1740/2285 train_time:105481ms step_avg:60.62ms +step:1741/2285 train_time:105543ms step_avg:60.62ms +step:1742/2285 train_time:105602ms step_avg:60.62ms +step:1743/2285 train_time:105664ms step_avg:60.62ms +step:1744/2285 train_time:105724ms step_avg:60.62ms +step:1745/2285 train_time:105787ms step_avg:60.62ms +step:1746/2285 train_time:105847ms step_avg:60.62ms +step:1747/2285 train_time:105910ms step_avg:60.62ms +step:1748/2285 train_time:105970ms step_avg:60.62ms +step:1749/2285 train_time:106033ms step_avg:60.63ms +step:1750/2285 train_time:106093ms step_avg:60.62ms +step:1750/2285 val_loss:3.3655 train_time:106157ms step_avg:60.66ms +step:1751/2285 train_time:106175ms step_avg:60.64ms +step:1752/2285 train_time:106221ms step_avg:60.63ms +step:1753/2285 train_time:106283ms step_avg:60.63ms +step:1754/2285 train_time:106344ms step_avg:60.63ms +step:1755/2285 train_time:106408ms step_avg:60.63ms +step:1756/2285 train_time:106468ms step_avg:60.63ms +step:1757/2285 train_time:106530ms step_avg:60.63ms +step:1758/2285 train_time:106588ms step_avg:60.63ms +step:1759/2285 train_time:106650ms step_avg:60.63ms +step:1760/2285 train_time:106709ms step_avg:60.63ms +step:1761/2285 train_time:106770ms step_avg:60.63ms +step:1762/2285 train_time:106830ms step_avg:60.63ms +step:1763/2285 train_time:106891ms step_avg:60.63ms +step:1764/2285 train_time:106951ms step_avg:60.63ms +step:1765/2285 train_time:107013ms step_avg:60.63ms +step:1766/2285 train_time:107075ms step_avg:60.63ms +step:1767/2285 train_time:107141ms step_avg:60.63ms +step:1768/2285 train_time:107201ms step_avg:60.63ms +step:1769/2285 train_time:107264ms step_avg:60.64ms +step:1770/2285 train_time:107325ms step_avg:60.64ms +step:1771/2285 train_time:107387ms step_avg:60.64ms +step:1772/2285 train_time:107447ms step_avg:60.64ms +step:1773/2285 train_time:107510ms step_avg:60.64ms +step:1774/2285 train_time:107569ms step_avg:60.64ms +step:1775/2285 train_time:107630ms step_avg:60.64ms +step:1776/2285 train_time:107689ms step_avg:60.64ms +step:1777/2285 train_time:107751ms step_avg:60.64ms +step:1778/2285 train_time:107810ms step_avg:60.64ms +step:1779/2285 train_time:107872ms step_avg:60.64ms +step:1780/2285 train_time:107931ms step_avg:60.64ms +step:1781/2285 train_time:107993ms step_avg:60.64ms +step:1782/2285 train_time:108055ms step_avg:60.64ms +step:1783/2285 train_time:108119ms step_avg:60.64ms +step:1784/2285 train_time:108179ms step_avg:60.64ms +step:1785/2285 train_time:108243ms step_avg:60.64ms +step:1786/2285 train_time:108303ms step_avg:60.64ms +step:1787/2285 train_time:108365ms step_avg:60.64ms +step:1788/2285 train_time:108425ms step_avg:60.64ms +step:1789/2285 train_time:108487ms step_avg:60.64ms +step:1790/2285 train_time:108546ms step_avg:60.64ms +step:1791/2285 train_time:108608ms step_avg:60.64ms +step:1792/2285 train_time:108667ms step_avg:60.64ms +step:1793/2285 train_time:108729ms step_avg:60.64ms +step:1794/2285 train_time:108788ms step_avg:60.64ms +step:1795/2285 train_time:108850ms step_avg:60.64ms +step:1796/2285 train_time:108909ms step_avg:60.64ms +step:1797/2285 train_time:108972ms step_avg:60.64ms +step:1798/2285 train_time:109032ms step_avg:60.64ms +step:1799/2285 train_time:109096ms step_avg:60.64ms +step:1800/2285 train_time:109157ms step_avg:60.64ms +step:1801/2285 train_time:109219ms step_avg:60.64ms +step:1802/2285 train_time:109280ms step_avg:60.64ms +step:1803/2285 train_time:109342ms step_avg:60.64ms 
+step:1804/2285 train_time:109402ms step_avg:60.64ms +step:1805/2285 train_time:109464ms step_avg:60.64ms +step:1806/2285 train_time:109525ms step_avg:60.64ms +step:1807/2285 train_time:109587ms step_avg:60.65ms +step:1808/2285 train_time:109646ms step_avg:60.64ms +step:1809/2285 train_time:109708ms step_avg:60.65ms +step:1810/2285 train_time:109767ms step_avg:60.64ms +step:1811/2285 train_time:109829ms step_avg:60.65ms +step:1812/2285 train_time:109888ms step_avg:60.64ms +step:1813/2285 train_time:109951ms step_avg:60.65ms +step:1814/2285 train_time:110011ms step_avg:60.65ms +step:1815/2285 train_time:110074ms step_avg:60.65ms +step:1816/2285 train_time:110135ms step_avg:60.65ms +step:1817/2285 train_time:110198ms step_avg:60.65ms +step:1818/2285 train_time:110259ms step_avg:60.65ms +step:1819/2285 train_time:110321ms step_avg:60.65ms +step:1820/2285 train_time:110381ms step_avg:60.65ms +step:1821/2285 train_time:110443ms step_avg:60.65ms +step:1822/2285 train_time:110503ms step_avg:60.65ms +step:1823/2285 train_time:110565ms step_avg:60.65ms +step:1824/2285 train_time:110624ms step_avg:60.65ms +step:1825/2285 train_time:110686ms step_avg:60.65ms +step:1826/2285 train_time:110746ms step_avg:60.65ms +step:1827/2285 train_time:110808ms step_avg:60.65ms +step:1828/2285 train_time:110868ms step_avg:60.65ms +step:1829/2285 train_time:110931ms step_avg:60.65ms +step:1830/2285 train_time:110990ms step_avg:60.65ms +step:1831/2285 train_time:111053ms step_avg:60.65ms +step:1832/2285 train_time:111113ms step_avg:60.65ms +step:1833/2285 train_time:111176ms step_avg:60.65ms +step:1834/2285 train_time:111236ms step_avg:60.65ms +step:1835/2285 train_time:111299ms step_avg:60.65ms +step:1836/2285 train_time:111359ms step_avg:60.65ms +step:1837/2285 train_time:111421ms step_avg:60.65ms +step:1838/2285 train_time:111481ms step_avg:60.65ms +step:1839/2285 train_time:111543ms step_avg:60.65ms +step:1840/2285 train_time:111603ms step_avg:60.65ms +step:1841/2285 train_time:111665ms step_avg:60.65ms +step:1842/2285 train_time:111725ms step_avg:60.65ms +step:1843/2285 train_time:111787ms step_avg:60.66ms +step:1844/2285 train_time:111847ms step_avg:60.65ms +step:1845/2285 train_time:111910ms step_avg:60.66ms +step:1846/2285 train_time:111970ms step_avg:60.66ms +step:1847/2285 train_time:112032ms step_avg:60.66ms +step:1848/2285 train_time:112092ms step_avg:60.66ms +step:1849/2285 train_time:112155ms step_avg:60.66ms +step:1850/2285 train_time:112215ms step_avg:60.66ms +step:1851/2285 train_time:112278ms step_avg:60.66ms +step:1852/2285 train_time:112338ms step_avg:60.66ms +step:1853/2285 train_time:112401ms step_avg:60.66ms +step:1854/2285 train_time:112461ms step_avg:60.66ms +step:1855/2285 train_time:112523ms step_avg:60.66ms +step:1856/2285 train_time:112583ms step_avg:60.66ms +step:1857/2285 train_time:112644ms step_avg:60.66ms +step:1858/2285 train_time:112704ms step_avg:60.66ms +step:1859/2285 train_time:112767ms step_avg:60.66ms +step:1860/2285 train_time:112827ms step_avg:60.66ms +step:1861/2285 train_time:112890ms step_avg:60.66ms +step:1862/2285 train_time:112949ms step_avg:60.66ms +step:1863/2285 train_time:113011ms step_avg:60.66ms +step:1864/2285 train_time:113072ms step_avg:60.66ms +step:1865/2285 train_time:113134ms step_avg:60.66ms +step:1866/2285 train_time:113194ms step_avg:60.66ms +step:1867/2285 train_time:113257ms step_avg:60.66ms +step:1868/2285 train_time:113318ms step_avg:60.66ms +step:1869/2285 train_time:113380ms step_avg:60.66ms +step:1870/2285 train_time:113439ms step_avg:60.66ms 
+step:1871/2285 train_time:113501ms step_avg:60.66ms +step:1872/2285 train_time:113561ms step_avg:60.66ms +step:1873/2285 train_time:113623ms step_avg:60.66ms +step:1874/2285 train_time:113683ms step_avg:60.66ms +step:1875/2285 train_time:113745ms step_avg:60.66ms +step:1876/2285 train_time:113805ms step_avg:60.66ms +step:1877/2285 train_time:113868ms step_avg:60.66ms +step:1878/2285 train_time:113928ms step_avg:60.66ms +step:1879/2285 train_time:113990ms step_avg:60.67ms +step:1880/2285 train_time:114050ms step_avg:60.66ms +step:1881/2285 train_time:114112ms step_avg:60.67ms +step:1882/2285 train_time:114173ms step_avg:60.67ms +step:1883/2285 train_time:114236ms step_avg:60.67ms +step:1884/2285 train_time:114296ms step_avg:60.67ms +step:1885/2285 train_time:114358ms step_avg:60.67ms +step:1886/2285 train_time:114418ms step_avg:60.67ms +step:1887/2285 train_time:114481ms step_avg:60.67ms +step:1888/2285 train_time:114540ms step_avg:60.67ms +step:1889/2285 train_time:114602ms step_avg:60.67ms +step:1890/2285 train_time:114662ms step_avg:60.67ms +step:1891/2285 train_time:114725ms step_avg:60.67ms +step:1892/2285 train_time:114785ms step_avg:60.67ms +step:1893/2285 train_time:114847ms step_avg:60.67ms +step:1894/2285 train_time:114907ms step_avg:60.67ms +step:1895/2285 train_time:114970ms step_avg:60.67ms +step:1896/2285 train_time:115030ms step_avg:60.67ms +step:1897/2285 train_time:115092ms step_avg:60.67ms +step:1898/2285 train_time:115153ms step_avg:60.67ms +step:1899/2285 train_time:115216ms step_avg:60.67ms +step:1900/2285 train_time:115276ms step_avg:60.67ms +step:1901/2285 train_time:115338ms step_avg:60.67ms +step:1902/2285 train_time:115397ms step_avg:60.67ms +step:1903/2285 train_time:115461ms step_avg:60.67ms +step:1904/2285 train_time:115520ms step_avg:60.67ms +step:1905/2285 train_time:115583ms step_avg:60.67ms +step:1906/2285 train_time:115642ms step_avg:60.67ms +step:1907/2285 train_time:115705ms step_avg:60.67ms +step:1908/2285 train_time:115765ms step_avg:60.67ms +step:1909/2285 train_time:115827ms step_avg:60.67ms +step:1910/2285 train_time:115888ms step_avg:60.67ms +step:1911/2285 train_time:115950ms step_avg:60.68ms +step:1912/2285 train_time:116010ms step_avg:60.67ms +step:1913/2285 train_time:116073ms step_avg:60.68ms +step:1914/2285 train_time:116133ms step_avg:60.68ms +step:1915/2285 train_time:116196ms step_avg:60.68ms +step:1916/2285 train_time:116256ms step_avg:60.68ms +step:1917/2285 train_time:116318ms step_avg:60.68ms +step:1918/2285 train_time:116379ms step_avg:60.68ms +step:1919/2285 train_time:116441ms step_avg:60.68ms +step:1920/2285 train_time:116501ms step_avg:60.68ms +step:1921/2285 train_time:116564ms step_avg:60.68ms +step:1922/2285 train_time:116625ms step_avg:60.68ms +step:1923/2285 train_time:116687ms step_avg:60.68ms +step:1924/2285 train_time:116747ms step_avg:60.68ms +step:1925/2285 train_time:116809ms step_avg:60.68ms +step:1926/2285 train_time:116869ms step_avg:60.68ms +step:1927/2285 train_time:116932ms step_avg:60.68ms +step:1928/2285 train_time:116992ms step_avg:60.68ms +step:1929/2285 train_time:117054ms step_avg:60.68ms +step:1930/2285 train_time:117115ms step_avg:60.68ms +step:1931/2285 train_time:117177ms step_avg:60.68ms +step:1932/2285 train_time:117237ms step_avg:60.68ms +step:1933/2285 train_time:117300ms step_avg:60.68ms +step:1934/2285 train_time:117359ms step_avg:60.68ms +step:1935/2285 train_time:117421ms step_avg:60.68ms +step:1936/2285 train_time:117482ms step_avg:60.68ms +step:1937/2285 train_time:117544ms step_avg:60.68ms 
+step:1938/2285 train_time:117604ms step_avg:60.68ms +step:1939/2285 train_time:117666ms step_avg:60.68ms +step:1940/2285 train_time:117726ms step_avg:60.68ms +step:1941/2285 train_time:117789ms step_avg:60.68ms +step:1942/2285 train_time:117849ms step_avg:60.68ms +step:1943/2285 train_time:117911ms step_avg:60.69ms +step:1944/2285 train_time:117971ms step_avg:60.68ms +step:1945/2285 train_time:118033ms step_avg:60.69ms +step:1946/2285 train_time:118093ms step_avg:60.69ms +step:1947/2285 train_time:118156ms step_avg:60.69ms +step:1948/2285 train_time:118216ms step_avg:60.69ms +step:1949/2285 train_time:118279ms step_avg:60.69ms +step:1950/2285 train_time:118339ms step_avg:60.69ms +step:1951/2285 train_time:118401ms step_avg:60.69ms +step:1952/2285 train_time:118461ms step_avg:60.69ms +step:1953/2285 train_time:118523ms step_avg:60.69ms +step:1954/2285 train_time:118583ms step_avg:60.69ms +step:1955/2285 train_time:118645ms step_avg:60.69ms +step:1956/2285 train_time:118706ms step_avg:60.69ms +step:1957/2285 train_time:118769ms step_avg:60.69ms +step:1958/2285 train_time:118829ms step_avg:60.69ms +step:1959/2285 train_time:118891ms step_avg:60.69ms +step:1960/2285 train_time:118951ms step_avg:60.69ms +step:1961/2285 train_time:119013ms step_avg:60.69ms +step:1962/2285 train_time:119073ms step_avg:60.69ms +step:1963/2285 train_time:119136ms step_avg:60.69ms +step:1964/2285 train_time:119196ms step_avg:60.69ms +step:1965/2285 train_time:119258ms step_avg:60.69ms +step:1966/2285 train_time:119319ms step_avg:60.69ms +step:1967/2285 train_time:119381ms step_avg:60.69ms +step:1968/2285 train_time:119440ms step_avg:60.69ms +step:1969/2285 train_time:119503ms step_avg:60.69ms +step:1970/2285 train_time:119564ms step_avg:60.69ms +step:1971/2285 train_time:119627ms step_avg:60.69ms +step:1972/2285 train_time:119687ms step_avg:60.69ms +step:1973/2285 train_time:119749ms step_avg:60.69ms +step:1974/2285 train_time:119809ms step_avg:60.69ms +step:1975/2285 train_time:119871ms step_avg:60.69ms +step:1976/2285 train_time:119932ms step_avg:60.69ms +step:1977/2285 train_time:119994ms step_avg:60.69ms +step:1978/2285 train_time:120053ms step_avg:60.69ms +step:1979/2285 train_time:120116ms step_avg:60.70ms +step:1980/2285 train_time:120176ms step_avg:60.69ms +step:1981/2285 train_time:120238ms step_avg:60.70ms +step:1982/2285 train_time:120298ms step_avg:60.70ms +step:1983/2285 train_time:120361ms step_avg:60.70ms +step:1984/2285 train_time:120420ms step_avg:60.70ms +step:1985/2285 train_time:120483ms step_avg:60.70ms +step:1986/2285 train_time:120543ms step_avg:60.70ms +step:1987/2285 train_time:120605ms step_avg:60.70ms +step:1988/2285 train_time:120665ms step_avg:60.70ms +step:1989/2285 train_time:120728ms step_avg:60.70ms +step:1990/2285 train_time:120788ms step_avg:60.70ms +step:1991/2285 train_time:120850ms step_avg:60.70ms +step:1992/2285 train_time:120911ms step_avg:60.70ms +step:1993/2285 train_time:120973ms step_avg:60.70ms +step:1994/2285 train_time:121033ms step_avg:60.70ms +step:1995/2285 train_time:121096ms step_avg:60.70ms +step:1996/2285 train_time:121156ms step_avg:60.70ms +step:1997/2285 train_time:121219ms step_avg:60.70ms +step:1998/2285 train_time:121279ms step_avg:60.70ms +step:1999/2285 train_time:121341ms step_avg:60.70ms +step:2000/2285 train_time:121401ms step_avg:60.70ms +step:2000/2285 val_loss:3.3166 train_time:121465ms step_avg:60.73ms +step:2001/2285 train_time:121483ms step_avg:60.71ms +step:2002/2285 train_time:121526ms step_avg:60.70ms +step:2003/2285 train_time:121588ms 
step_avg:60.70ms +step:2004/2285 train_time:121648ms step_avg:60.70ms +step:2005/2285 train_time:121712ms step_avg:60.70ms +step:2006/2285 train_time:121771ms step_avg:60.70ms +step:2007/2285 train_time:121833ms step_avg:60.70ms +step:2008/2285 train_time:121892ms step_avg:60.70ms +step:2009/2285 train_time:121954ms step_avg:60.70ms +step:2010/2285 train_time:122013ms step_avg:60.70ms +step:2011/2285 train_time:122074ms step_avg:60.70ms +step:2012/2285 train_time:122133ms step_avg:60.70ms +step:2013/2285 train_time:122195ms step_avg:60.70ms +step:2014/2285 train_time:122255ms step_avg:60.70ms +step:2015/2285 train_time:122316ms step_avg:60.70ms +step:2016/2285 train_time:122379ms step_avg:60.70ms +step:2017/2285 train_time:122447ms step_avg:60.71ms +step:2018/2285 train_time:122508ms step_avg:60.71ms +step:2019/2285 train_time:122571ms step_avg:60.71ms +step:2020/2285 train_time:122631ms step_avg:60.71ms +step:2021/2285 train_time:122694ms step_avg:60.71ms +step:2022/2285 train_time:122754ms step_avg:60.71ms +step:2023/2285 train_time:122816ms step_avg:60.71ms +step:2024/2285 train_time:122876ms step_avg:60.71ms +step:2025/2285 train_time:122937ms step_avg:60.71ms +step:2026/2285 train_time:122998ms step_avg:60.71ms +step:2027/2285 train_time:123059ms step_avg:60.71ms +step:2028/2285 train_time:123118ms step_avg:60.71ms +step:2029/2285 train_time:123180ms step_avg:60.71ms +step:2030/2285 train_time:123239ms step_avg:60.71ms +step:2031/2285 train_time:123302ms step_avg:60.71ms +step:2032/2285 train_time:123363ms step_avg:60.71ms +step:2033/2285 train_time:123426ms step_avg:60.71ms +step:2034/2285 train_time:123487ms step_avg:60.71ms +step:2035/2285 train_time:123549ms step_avg:60.71ms +step:2036/2285 train_time:123610ms step_avg:60.71ms +step:2037/2285 train_time:123672ms step_avg:60.71ms +step:2038/2285 train_time:123733ms step_avg:60.71ms +step:2039/2285 train_time:123796ms step_avg:60.71ms +step:2040/2285 train_time:123856ms step_avg:60.71ms +step:2041/2285 train_time:123918ms step_avg:60.71ms +step:2042/2285 train_time:123978ms step_avg:60.71ms +step:2043/2285 train_time:124039ms step_avg:60.71ms +step:2044/2285 train_time:124099ms step_avg:60.71ms +step:2045/2285 train_time:124161ms step_avg:60.71ms +step:2046/2285 train_time:124221ms step_avg:60.71ms +step:2047/2285 train_time:124284ms step_avg:60.72ms +step:2048/2285 train_time:124344ms step_avg:60.72ms +step:2049/2285 train_time:124408ms step_avg:60.72ms +step:2050/2285 train_time:124468ms step_avg:60.72ms +step:2051/2285 train_time:124531ms step_avg:60.72ms +step:2052/2285 train_time:124591ms step_avg:60.72ms +step:2053/2285 train_time:124653ms step_avg:60.72ms +step:2054/2285 train_time:124714ms step_avg:60.72ms +step:2055/2285 train_time:124776ms step_avg:60.72ms +step:2056/2285 train_time:124836ms step_avg:60.72ms +step:2057/2285 train_time:124898ms step_avg:60.72ms +step:2058/2285 train_time:124958ms step_avg:60.72ms +step:2059/2285 train_time:125020ms step_avg:60.72ms +step:2060/2285 train_time:125079ms step_avg:60.72ms +step:2061/2285 train_time:125142ms step_avg:60.72ms +step:2062/2285 train_time:125202ms step_avg:60.72ms +step:2063/2285 train_time:125264ms step_avg:60.72ms +step:2064/2285 train_time:125324ms step_avg:60.72ms +step:2065/2285 train_time:125386ms step_avg:60.72ms +step:2066/2285 train_time:125447ms step_avg:60.72ms +step:2067/2285 train_time:125509ms step_avg:60.72ms +step:2068/2285 train_time:125570ms step_avg:60.72ms +step:2069/2285 train_time:125633ms step_avg:60.72ms +step:2070/2285 train_time:125693ms 
step_avg:60.72ms +step:2071/2285 train_time:125755ms step_avg:60.72ms +step:2072/2285 train_time:125815ms step_avg:60.72ms +step:2073/2285 train_time:125878ms step_avg:60.72ms +step:2074/2285 train_time:125938ms step_avg:60.72ms +step:2075/2285 train_time:126001ms step_avg:60.72ms +step:2076/2285 train_time:126061ms step_avg:60.72ms +step:2077/2285 train_time:126123ms step_avg:60.72ms +step:2078/2285 train_time:126183ms step_avg:60.72ms +step:2079/2285 train_time:126245ms step_avg:60.72ms +step:2080/2285 train_time:126304ms step_avg:60.72ms +step:2081/2285 train_time:126366ms step_avg:60.72ms +step:2082/2285 train_time:126427ms step_avg:60.72ms +step:2083/2285 train_time:126489ms step_avg:60.72ms +step:2084/2285 train_time:126549ms step_avg:60.72ms +step:2085/2285 train_time:126612ms step_avg:60.73ms +step:2086/2285 train_time:126673ms step_avg:60.73ms +step:2087/2285 train_time:126735ms step_avg:60.73ms +step:2088/2285 train_time:126796ms step_avg:60.73ms +step:2089/2285 train_time:126857ms step_avg:60.73ms +step:2090/2285 train_time:126918ms step_avg:60.73ms +step:2091/2285 train_time:126980ms step_avg:60.73ms +step:2092/2285 train_time:127040ms step_avg:60.73ms +step:2093/2285 train_time:127103ms step_avg:60.73ms +step:2094/2285 train_time:127163ms step_avg:60.73ms +step:2095/2285 train_time:127224ms step_avg:60.73ms +step:2096/2285 train_time:127284ms step_avg:60.73ms +step:2097/2285 train_time:127346ms step_avg:60.73ms +step:2098/2285 train_time:127406ms step_avg:60.73ms +step:2099/2285 train_time:127468ms step_avg:60.73ms +step:2100/2285 train_time:127529ms step_avg:60.73ms +step:2101/2285 train_time:127591ms step_avg:60.73ms +step:2102/2285 train_time:127652ms step_avg:60.73ms +step:2103/2285 train_time:127715ms step_avg:60.73ms +step:2104/2285 train_time:127775ms step_avg:60.73ms +step:2105/2285 train_time:127837ms step_avg:60.73ms +step:2106/2285 train_time:127898ms step_avg:60.73ms +step:2107/2285 train_time:127960ms step_avg:60.73ms +step:2108/2285 train_time:128020ms step_avg:60.73ms +step:2109/2285 train_time:128082ms step_avg:60.73ms +step:2110/2285 train_time:128142ms step_avg:60.73ms +step:2111/2285 train_time:128205ms step_avg:60.73ms +step:2112/2285 train_time:128265ms step_avg:60.73ms +step:2113/2285 train_time:128327ms step_avg:60.73ms +step:2114/2285 train_time:128387ms step_avg:60.73ms +step:2115/2285 train_time:128450ms step_avg:60.73ms +step:2116/2285 train_time:128510ms step_avg:60.73ms +step:2117/2285 train_time:128572ms step_avg:60.73ms +step:2118/2285 train_time:128632ms step_avg:60.73ms +step:2119/2285 train_time:128695ms step_avg:60.73ms +step:2120/2285 train_time:128755ms step_avg:60.73ms +step:2121/2285 train_time:128818ms step_avg:60.73ms +step:2122/2285 train_time:128878ms step_avg:60.73ms +step:2123/2285 train_time:128940ms step_avg:60.73ms +step:2124/2285 train_time:129000ms step_avg:60.73ms +step:2125/2285 train_time:129063ms step_avg:60.74ms +step:2126/2285 train_time:129123ms step_avg:60.74ms +step:2127/2285 train_time:129185ms step_avg:60.74ms +step:2128/2285 train_time:129245ms step_avg:60.74ms +step:2129/2285 train_time:129307ms step_avg:60.74ms +step:2130/2285 train_time:129367ms step_avg:60.74ms +step:2131/2285 train_time:129429ms step_avg:60.74ms +step:2132/2285 train_time:129490ms step_avg:60.74ms +step:2133/2285 train_time:129552ms step_avg:60.74ms +step:2134/2285 train_time:129612ms step_avg:60.74ms +step:2135/2285 train_time:129675ms step_avg:60.74ms +step:2136/2285 train_time:129736ms step_avg:60.74ms +step:2137/2285 train_time:129799ms 
step_avg:60.74ms +step:2138/2285 train_time:129859ms step_avg:60.74ms +step:2139/2285 train_time:129921ms step_avg:60.74ms +step:2140/2285 train_time:129981ms step_avg:60.74ms +step:2141/2285 train_time:130044ms step_avg:60.74ms +step:2142/2285 train_time:130104ms step_avg:60.74ms +step:2143/2285 train_time:130166ms step_avg:60.74ms +step:2144/2285 train_time:130226ms step_avg:60.74ms +step:2145/2285 train_time:130288ms step_avg:60.74ms +step:2146/2285 train_time:130348ms step_avg:60.74ms +step:2147/2285 train_time:130411ms step_avg:60.74ms +step:2148/2285 train_time:130471ms step_avg:60.74ms +step:2149/2285 train_time:130533ms step_avg:60.74ms +step:2150/2285 train_time:130593ms step_avg:60.74ms +step:2151/2285 train_time:130656ms step_avg:60.74ms +step:2152/2285 train_time:130716ms step_avg:60.74ms +step:2153/2285 train_time:130778ms step_avg:60.74ms +step:2154/2285 train_time:130838ms step_avg:60.74ms +step:2155/2285 train_time:130901ms step_avg:60.74ms +step:2156/2285 train_time:130961ms step_avg:60.74ms +step:2157/2285 train_time:131024ms step_avg:60.74ms +step:2158/2285 train_time:131084ms step_avg:60.74ms +step:2159/2285 train_time:131147ms step_avg:60.74ms +step:2160/2285 train_time:131207ms step_avg:60.74ms +step:2161/2285 train_time:131269ms step_avg:60.74ms +step:2162/2285 train_time:131329ms step_avg:60.74ms +step:2163/2285 train_time:131392ms step_avg:60.75ms +step:2164/2285 train_time:131452ms step_avg:60.74ms +step:2165/2285 train_time:131514ms step_avg:60.75ms +step:2166/2285 train_time:131574ms step_avg:60.75ms +step:2167/2285 train_time:131636ms step_avg:60.75ms +step:2168/2285 train_time:131697ms step_avg:60.75ms +step:2169/2285 train_time:131759ms step_avg:60.75ms +step:2170/2285 train_time:131819ms step_avg:60.75ms +step:2171/2285 train_time:131882ms step_avg:60.75ms +step:2172/2285 train_time:131942ms step_avg:60.75ms +step:2173/2285 train_time:132004ms step_avg:60.75ms +step:2174/2285 train_time:132064ms step_avg:60.75ms +step:2175/2285 train_time:132127ms step_avg:60.75ms +step:2176/2285 train_time:132187ms step_avg:60.75ms +step:2177/2285 train_time:132250ms step_avg:60.75ms +step:2178/2285 train_time:132310ms step_avg:60.75ms +step:2179/2285 train_time:132372ms step_avg:60.75ms +step:2180/2285 train_time:132432ms step_avg:60.75ms +step:2181/2285 train_time:132495ms step_avg:60.75ms +step:2182/2285 train_time:132554ms step_avg:60.75ms +step:2183/2285 train_time:132617ms step_avg:60.75ms +step:2184/2285 train_time:132677ms step_avg:60.75ms +step:2185/2285 train_time:132739ms step_avg:60.75ms +step:2186/2285 train_time:132799ms step_avg:60.75ms +step:2187/2285 train_time:132861ms step_avg:60.75ms +step:2188/2285 train_time:132922ms step_avg:60.75ms +step:2189/2285 train_time:132984ms step_avg:60.75ms +step:2190/2285 train_time:133044ms step_avg:60.75ms +step:2191/2285 train_time:133106ms step_avg:60.75ms +step:2192/2285 train_time:133166ms step_avg:60.75ms +step:2193/2285 train_time:133229ms step_avg:60.75ms +step:2194/2285 train_time:133289ms step_avg:60.75ms +step:2195/2285 train_time:133351ms step_avg:60.75ms +step:2196/2285 train_time:133411ms step_avg:60.75ms +step:2197/2285 train_time:133473ms step_avg:60.75ms +step:2198/2285 train_time:133533ms step_avg:60.75ms +step:2199/2285 train_time:133597ms step_avg:60.75ms +step:2200/2285 train_time:133657ms step_avg:60.75ms +step:2201/2285 train_time:133719ms step_avg:60.75ms +step:2202/2285 train_time:133779ms step_avg:60.75ms +step:2203/2285 train_time:133842ms step_avg:60.75ms +step:2204/2285 train_time:133902ms 
step_avg:60.75ms +step:2205/2285 train_time:133965ms step_avg:60.75ms +step:2206/2285 train_time:134025ms step_avg:60.75ms +step:2207/2285 train_time:134087ms step_avg:60.76ms +step:2208/2285 train_time:134148ms step_avg:60.76ms +step:2209/2285 train_time:134210ms step_avg:60.76ms +step:2210/2285 train_time:134270ms step_avg:60.76ms +step:2211/2285 train_time:134333ms step_avg:60.76ms +step:2212/2285 train_time:134393ms step_avg:60.76ms +step:2213/2285 train_time:134455ms step_avg:60.76ms +step:2214/2285 train_time:134515ms step_avg:60.76ms +step:2215/2285 train_time:134578ms step_avg:60.76ms +step:2216/2285 train_time:134638ms step_avg:60.76ms +step:2217/2285 train_time:134701ms step_avg:60.76ms +step:2218/2285 train_time:134760ms step_avg:60.76ms +step:2219/2285 train_time:134823ms step_avg:60.76ms +step:2220/2285 train_time:134883ms step_avg:60.76ms +step:2221/2285 train_time:134945ms step_avg:60.76ms +step:2222/2285 train_time:135005ms step_avg:60.76ms +step:2223/2285 train_time:135068ms step_avg:60.76ms +step:2224/2285 train_time:135128ms step_avg:60.76ms +step:2225/2285 train_time:135190ms step_avg:60.76ms +step:2226/2285 train_time:135250ms step_avg:60.76ms +step:2227/2285 train_time:135313ms step_avg:60.76ms +step:2228/2285 train_time:135373ms step_avg:60.76ms +step:2229/2285 train_time:135435ms step_avg:60.76ms +step:2230/2285 train_time:135495ms step_avg:60.76ms +step:2231/2285 train_time:135557ms step_avg:60.76ms +step:2232/2285 train_time:135617ms step_avg:60.76ms +step:2233/2285 train_time:135679ms step_avg:60.76ms +step:2234/2285 train_time:135739ms step_avg:60.76ms +step:2235/2285 train_time:135801ms step_avg:60.76ms +step:2236/2285 train_time:135861ms step_avg:60.76ms +step:2237/2285 train_time:135924ms step_avg:60.76ms +step:2238/2285 train_time:135984ms step_avg:60.76ms +step:2239/2285 train_time:136046ms step_avg:60.76ms +step:2240/2285 train_time:136106ms step_avg:60.76ms +step:2241/2285 train_time:136168ms step_avg:60.76ms +step:2242/2285 train_time:136229ms step_avg:60.76ms +step:2243/2285 train_time:136293ms step_avg:60.76ms +step:2244/2285 train_time:136353ms step_avg:60.76ms +step:2245/2285 train_time:136416ms step_avg:60.76ms +step:2246/2285 train_time:136476ms step_avg:60.76ms +step:2247/2285 train_time:136538ms step_avg:60.76ms +step:2248/2285 train_time:136598ms step_avg:60.76ms +step:2249/2285 train_time:136660ms step_avg:60.76ms +step:2250/2285 train_time:136720ms step_avg:60.76ms +step:2250/2285 val_loss:3.2816 train_time:136784ms step_avg:60.79ms +step:2251/2285 train_time:136802ms step_avg:60.77ms +step:2252/2285 train_time:136847ms step_avg:60.77ms +step:2253/2285 train_time:136910ms step_avg:60.77ms +step:2254/2285 train_time:136971ms step_avg:60.77ms +step:2255/2285 train_time:137034ms step_avg:60.77ms +step:2256/2285 train_time:137094ms step_avg:60.77ms +step:2257/2285 train_time:137156ms step_avg:60.77ms +step:2258/2285 train_time:137217ms step_avg:60.77ms +step:2259/2285 train_time:137279ms step_avg:60.77ms +step:2260/2285 train_time:137339ms step_avg:60.77ms +step:2261/2285 train_time:137402ms step_avg:60.77ms +step:2262/2285 train_time:137461ms step_avg:60.77ms +step:2263/2285 train_time:137523ms step_avg:60.77ms +step:2264/2285 train_time:137584ms step_avg:60.77ms +step:2265/2285 train_time:137646ms step_avg:60.77ms +step:2266/2285 train_time:137706ms step_avg:60.77ms +step:2267/2285 train_time:137771ms step_avg:60.77ms +step:2268/2285 train_time:137832ms step_avg:60.77ms +step:2269/2285 train_time:137895ms step_avg:60.77ms +step:2270/2285 
train_time:137956ms step_avg:60.77ms +step:2271/2285 train_time:138018ms step_avg:60.77ms +step:2272/2285 train_time:138078ms step_avg:60.77ms +step:2273/2285 train_time:138141ms step_avg:60.77ms +step:2274/2285 train_time:138202ms step_avg:60.77ms +step:2275/2285 train_time:138264ms step_avg:60.78ms +step:2276/2285 train_time:138323ms step_avg:60.77ms +step:2277/2285 train_time:138385ms step_avg:60.78ms +step:2278/2285 train_time:138445ms step_avg:60.77ms +step:2279/2285 train_time:138507ms step_avg:60.78ms +step:2280/2285 train_time:138567ms step_avg:60.77ms +step:2281/2285 train_time:138629ms step_avg:60.78ms +step:2282/2285 train_time:138690ms step_avg:60.78ms +step:2283/2285 train_time:138753ms step_avg:60.78ms +step:2284/2285 train_time:138813ms step_avg:60.78ms +step:2285/2285 train_time:138876ms step_avg:60.78ms +step:2285/2285 val_loss:3.2757 train_time:138937ms step_avg:60.80ms +peak memory allocated: 29626 MiB reserved: 50528 MiB
diff --git a/train_gpt.py b/train_gpt.py
index fc09ee34d..7030c04f1 100644
--- a/train_gpt.py
+++ b/train_gpt.py
@@ -10,10 +10,11 @@
 import time
 import uuid
 from dataclasses import dataclass
+from collections import defaultdict
 from itertools import accumulate
 from pathlib import Path

-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 import torch

 torch.empty(
@@ -417,7 +418,7 @@ class Muon(torch.optim.Optimizer):
     Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
     processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
     matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
+    the advantage that it can be stably run in bfloat16 on the GPU. Note: a later PR replaced Newton-Schulz with Polar Express for the orthogonalization step.

     Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
@@ -427,10 +428,10 @@ class Muon(torch.optim.Optimizer):
     This hyper-optimized class has faster execution time than the current impl of Adam for small params

     Custom distributed sizing:
-    The model stores all attn and mlp weights in the same shape, and then updates the view as
-    needed on the forward pass. This enables attn and mlp weights to be contained within the same
-    dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
-    (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+    The model stores all attn and mlp weights in the same shape, and then updates the view as
+    needed on the forward pass. This enables attn and mlp weights to be contained within the same
+    dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+    (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
     The scheduling is:
     1. reduce scatter smear_gate (1 param 7 padding params)
     2. reduce scatter attn_gate (10 params 6 padding params)
     9. wait for each all gather to complete and update params
     Empirically, leading with small params provides an additional 0.2s improvement.
""" - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum) + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 # custom sizing requires 8 GPUs if custom_sizing and dist.get_world_size()==8: param_groups = self.generate_custom_param_groups(params) @@ -454,93 +456,81 @@ def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, custom_siz param_groups = self.generate_standard_param_groups(params) super().__init__(param_groups, defaults) + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + def generate_standard_param_groups(self, params): """ Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per size, while giving attn its own param group for resize op. + Creates one param group per module. """ - params = list(params) + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + param_groups = [] - attn_subset = [p for p in params if p.label == 'attn'] - non_attn_subset = [p for p in params if p.label != 'attn'] - param_groups.append(dict(params=attn_subset)) + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - sizes = {p.shape for p in non_attn_subset} - for size in sizes: - group_params = [p for p in non_attn_subset if p.shape == size] - param_groups.append(dict(params=group_params)) return param_groups - + def generate_custom_param_groups(self, params): """ - Implementation requires that a single GPU does not receive both attn + Implementation requires that a single GPU does not receive both attn and mlp params when a param group is split across GPUs. """ - label_ranks = { - 'smear_gate': 1, # 1 param - 'attn_gate': 2, # 10 params - 'attn': 3, # 10 params - 'mlp': 4, # 22 params - } - params = list(params) - params.sort(key=lambda x: label_ranks.get(x.label)) + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + idx = 0 - group_sizes = [1,10,16,16] - assert len(params)==sum(group_sizes) + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) param_groups = [] for size in group_sizes: - group_params = params[idx:idx+size] - param_groups.append(dict(params=group_params)) + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) idx += size + return param_groups @torch.no_grad() def step(self): # Efficient systems-wise implementation of step developed by @YouJiacheng, # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, and @vagrawal. + # @ryanyang0, @vagrawal, and @varunneal. 
     @torch.no_grad()
     def step(self):
         # Efficient systems-wise implementation of step developed by @YouJiacheng,
         # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
-        # @ryanyang0, and @vagrawal.
+        # @ryanyang0, @vagrawal, and @varunneal.
         rank = dist.get_rank()
-        world_size = dist.get_world_size()
         group_infos = []
         for group in self.param_groups:
             params: list[Tensor] = group["params"]
             if not params:
                 continue
-            num_params = len(params)
-            padded_num_params = (
-                (num_params + world_size - 1) // world_size * world_size
-            )
-
-            grads_to_stack = [p.grad for p in params]
-            if padded_num_params > num_params:
-                padding_grad = torch.zeros_like(params[0].grad)
-                grads_to_stack.extend(
-                    [padding_grad] * (padded_num_params - num_params)
-                )
-
-            stacked_grads = torch.stack(grads_to_stack)
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size

-            chunk_size = padded_num_params // world_size
-            grad_chunk = torch.empty(
-                (chunk_size, *params[0].grad.shape),
-                dtype=stacked_grads.dtype,
-                device=stacked_grads.device,
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
             )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
             reduce_future = dist.reduce_scatter_tensor(
                 grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
             ).get_future()

-            group_infos.append(
-                {
-                    "params": params,
-                    "grad_chunk": grad_chunk,
-                    "reduce_future": reduce_future,
-                    "chunk_size": chunk_size,
-                    "padded_num_params": padded_num_params,
-                }
-            )
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))

         all_gather_infos = []
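The two passes above implement a ZeRO-style update: a reduce-scatter averages gradients so each rank owns one shard of the stacked parameters, the shard is updated locally, and an all-gather rebuilds the full stack on every rank. A self-contained sketch of just that communication pattern, assuming an initialized process group and plain SGD standing in for the Muon update:

import torch
import torch.distributed as dist

@torch.no_grad()
def sharded_step(params: list[torch.Tensor], lr: float):
    world_size, rank = dist.get_world_size(), dist.get_rank()
    chunk_size = (len(params) + world_size - 1) // world_size
    # Pad the gradient stack with zeros so it splits evenly across ranks.
    stacked = torch.zeros((chunk_size * world_size, *params[0].shape),
                          dtype=params[0].dtype, device=params[0].device)
    for i, p in enumerate(params):
        stacked[i].copy_(p.grad)
    # Each rank receives the cross-rank average of its own shard.
    shard = torch.empty_like(stacked[:chunk_size])
    dist.reduce_scatter_tensor(shard, stacked, op=dist.ReduceOp.AVG)
    # Update only the local shard (SGD here, in place of orthogonalized Muon).
    start = rank * chunk_size
    for i in range(chunk_size):
        if start + i < len(params):
            shard[i] = params[start + i] - lr * shard[i]
    # Re-assemble the full updated stack everywhere and write it back.
    dist.all_gather_into_tensor(stacked, shard)
    for i, p in enumerate(params):
        p.copy_(stacked[i])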
         # Second pass: wait for gradients, compute updates for the local shard of parameters,
@@ -548,111 +538,93 @@ def step(self):
         for group, info in zip(self.param_groups, group_infos):
             info["reduce_future"].wait()
-            params = info["params"]
+            params = group["params"]
             grad_chunk = info["grad_chunk"]
-            chunk_size = info["chunk_size"]
-            start_idx = rank * chunk_size
-
-            # Determine effective LR and WD once per group, assuming constant for same-shaped params.
-            # This helps in vectorizing operations later.
-            p_example = params[0]  # All params in a group have the same shape.
-            eff_lr_val = (
-                group["lr"]
-                * max(1, p_example.size(-2) / p_example.size(-1)) ** 0.5
-                * getattr(p_example, "lr_mul", 1.0)
-            )
-            eff_weight_decay_val = (
-                group["lr"]
-                * group["weight_decay"]
-                * getattr(p_example, "wd_mul", 1.0)
-            )
-
-            # Prepare a contiguous buffer for the updated parameters for this rank's chunk.
-            # This buffer will serve as the input_tensor for dist.all_gather_into_tensor.
-            updated_param_chunk = torch.empty(
-                (chunk_size, *p_example.shape),
-                dtype=p_example.dtype,
-                device=p_example.device,
-            )
-
-            # List to collect update_grad tensors for batched zeropower computation.
-            update_grads_for_zeropower = []
-
-            # Process each parameter in this rank's chunk.
-            for i in range(chunk_size):
-                param_idx = start_idx + i
-
-                if param_idx >= len(params):
-                    # For padding: Fill the corresponding part of the updated_param_chunk with zeros.
-                    # These padded entries will not be used by other ranks in the all_gather, but
-                    # initializing them prevents uninitialized memory access issues.
-                    updated_param_chunk[i].zero_()
-                    # Also append a zero tensor for zeropower input if it must be padded.
-                    update_grads_for_zeropower.append(
-                        torch.zeros_like(p_example.grad)
-                    )
-                    continue
-                param = params[param_idx]
-                grad = grad_chunk[
-                    i
-                ]  # This gradient corresponds to the current parameter param.
-                state = self.state[param]
-
-                # Initialize momentum buffer if not present
-                if not state:
-                    state["momentum_buffer"] = torch.zeros_like(grad)
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size

-                momentum_buffer = state["momentum_buffer"]
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
-                # Apply momentum update directly to the persistent momentum buffer in-place.
-                momentum_buffer.lerp_(grad, 1 - group["momentum"])
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)

-                # Compute the actual `update_grad` for zeropower. This creates a new tensor.
-                update_grad = grad.lerp(momentum_buffer, group["momentum"])
-                update_grads_for_zeropower.append(update_grad)
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["weight_decay"] * group["param_wd"]

-                # Copy the current parameter value into the temporary buffer.
-                updated_param_chunk[i].copy_(param)
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            elif params[module_idx].label == "smear_gate":
+                # dividing by magnitude is equivalent to SVD for 1d tensors
+                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
+            else:
+                v_chunk = polar_express(updated_grads)
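The lerp pair above is the Nesterov-flavored momentum used in this codebase: one in-place EMA update of the buffer, then a blend of raw gradient and buffer (the patch does the second lerp in-place on the grad chunk; the sketch below uses the out-of-place form for clarity). A small standalone check of the algebra:

import torch

m = 0.95
g = torch.randn(4, 4)      # incoming gradient
buf = torch.zeros_like(g)  # persistent momentum buffer, zero at the first step

buf.lerp_(g, 1 - m)        # buf <- m*buf + (1-m)*g   (EMA of gradients)
upd = g.lerp(buf, m)       # upd <- (1-m)*g + m*buf   (Nesterov-style blend)

# closed form for the first step: upd == (1-m)*g + m*(1-m)*g == (1-m^2)*g
assert torch.allclose(upd, (1 - m * m) * g)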
-                # Apply weight decay directly to the buffer.
-                updated_param_chunk[i].mul_(1 - eff_weight_decay_val)
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))

-            # Stack the individual `update_grad` tensors for efficient batched zeropower computation.
-            batched_update_grads = torch.stack(update_grads_for_zeropower)
+            v_chunk = v_chunk.view(grad_shape)

-            # Compute zeropower for the entire chunk in a single, batched call.
-            original_shape = batched_update_grads.shape
-            # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] to apply polar_express independently to Q,K,V,O
-            param_idx = start_idx if start_idx < len(params) else 0
-            if getattr(params[param_idx], 'label', None) == 'attn':
-                for p in params[param_idx:param_idx+chunk_size]:
-                    assert getattr(params[param_idx], 'label', None)=='attn', "GPU cannot mix attn and mlp params"
-                batch = 4 * original_shape[0]
-                d1 = original_shape[1]
-                d2 = original_shape[2] // 4
-                batched = batched_update_grads.view(batch, d1, d2)
-                v_chunk = polar_express(batched)
-                v_chunk = v_chunk.view(original_shape)
-            else:
-                v_chunk = polar_express(batched_update_grads)
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+            # Apply weight decay directly to the buffer.
+            param_chunk.mul_(1 - eff_wd)

-            # Add the computed zeropower update to the parameters in the buffer.
-            # This loop applies the zeropower output (v_chunk) to the `updated_param_chunk` buffer.
-            for i in range(chunk_size):
-                param_idx = start_idx + i
-                if param_idx >= len(params):  # Skip padded entries again.
-                    continue
+            param_chunk.add_(-eff_lr * v_chunk)
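The NorMuon block above adapts per-row step sizes via the rsqrt of a running second moment, then rescales so the update's overall magnitude is unchanged. A toy sketch of that invariant (EMA buffer and chunk batching omitted, shapes illustrative):

import torch

v = torch.randn(8, 4)                        # orthogonalized update, rows along the larger dim
buf = v.square().mean(dim=-1, keepdim=True)  # per-row second moment (here: single step, no EMA)
v_norm = v.norm()                            # total magnitude before row scaling

v = v * buf.clamp_min(1e-10).rsqrt()         # per-row adaptive step, as in NorMuon
v = v * (v_norm / v.norm().clamp_min(1e-10)) # renormalize: overall magnitude restored

assert torch.allclose(v.norm(), v_norm)      # rows re-weighted, total norm preserved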
- updated_param_chunk[i].add_(v_chunk[i], alpha=-eff_lr_val) + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() stacked_params = torch.empty( - (info["padded_num_params"], *params[0].shape), - dtype=params[0].dtype, - device=params[0].device, + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, ) + gather_future = dist.all_gather_into_tensor( - stacked_params, updated_param_chunk, async_op=True + stacked_params, updated_params, async_op=True ).get_future() all_gather_infos.append( @@ -676,6 +648,7 @@ def step(self): class DistAdam(torch.optim.Optimizer): def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) params = list(params) sizes = {p.shape for p in params} @@ -685,13 +658,18 @@ def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, group_params = [p for p in params if p.shape == size] param_groups.append(dict(params=group_params)) super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) # DistributedAdam implementation by @vagrawal @torch.compile @torch.no_grad() def step(self): rank = dist.get_rank() - world_size = dist.get_world_size() reduce_scatter_futures: list[torch.Future] = [] all_gather_futures: list[torch.Future] = [] grad_slices = [] @@ -699,7 +677,7 @@ def step(self): params: list[Tensor] = group["params"] for param in params: grad = param.grad - rank_size = grad.shape[0] // world_size + rank_size = grad.shape[0] // self.world_size grad_slice = torch.empty_like(grad[:rank_size]) reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) grad_slices.append(grad_slice) @@ -712,26 +690,12 @@ def step(self): params = group['params'] for param in params: reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // world_size + rank_size = param.shape[0] // self.world_size p_slice = param[rank * rank_size:(rank + 1) * rank_size] lr = group['lr'] * getattr(param, "lr_mul", 1.0) state = self.state[param] g_slice = grad_slices[idx] - # State init - if not state: - state["step"] = torch.tensor( - 0, dtype=torch.int64, device=param.device - ) - state["exp_avg"] = torch.zeros( - p_slice.shape, - dtype=torch.bfloat16, - device=p_slice.device, - ) - state["exp_avg_sq"] = torch.zeros( - p_slice.shape, - dtype=torch.bfloat16, - device=p_slice.device, - ) + exp_avg = state["exp_avg"] exp_avg_sq = state["exp_avg_sq"] state["step"] += 1 @@ -748,7 +712,7 @@ def step(self): bias2 = 1 - beta2 ** t # compute step denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (torch.sqrt(bias2) / bias1) + step_size = lr * (bias2 ** 0.5 / bias1) update = exp_avg.div(denom).mul_(step_size) p_slice.add_(other=update, alpha=-1.0) idx += 1 @@ -770,10 +734,8 @@ def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, self.grad_s = grad_s def reset_parameters(self) -> None: - std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3) - bound = (3 ** 0.5) * std with 
torch.no_grad(): - self.weight.uniform_(-bound, bound) + self.weight.zero_() # @Grad62304977 and others def forward(self, x: Tensor): if self.use_fp8 and self.training: @@ -790,7 +752,7 @@ def __init__(self, head_dim, max_seq_len): self.head_dim = head_dim self.max_seq_len = max_seq_len self.reset() - + def reset(self): angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) @@ -858,6 +820,7 @@ def __init__(self, dim: int, head_dim: int, num_heads: int): self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) # label module to enable custom optimizer sizing self.qkvo_w.label='attn' + with torch.no_grad(): self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero @@ -866,7 +829,6 @@ def __init__(self, dim: int, head_dim: int, num_heads: int): self.attn_gate = CastedLinear(12, num_heads) # label module to enable custom optimizer sizing self.attn_gate.weight.label = 'attn_gate' - self.attn_gate.weight.detach().zero_() def forward(self, x: Tensor, attn_args: AttnArgs): B, T = x.size(0), x.size(1) # batch size, sequence length @@ -888,14 +850,16 @@ def forward(self, x: Tensor, attn_args: AttnArgs): max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) y = y.view(B, T, self.num_heads, self.head_dim) y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) return y + class MLP(nn.Module): def __init__(self, dim: int): super().__init__() @@ -904,8 +868,11 @@ def __init__(self, dim: int): self.c_fc = nn.Parameter(torch.empty(dim, hdim)) self.c_proj = nn.Parameter(torch.empty(dim, hdim)) # label modules to enable custom optimizer sizing - self.c_fc.label='mlp' - self.c_proj.label='mlp' + self.c_fc.label = 'mlp_up' + self.c_proj.label = 'mlp_down' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
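# Worked numbers for the corrective factor above (dim=768, hdim=3072 in this model):
# Muon scales each group's LR by max(1, rows/cols)**0.5 of the stored shape.
#   stored c_fc [768, 3072]:           max(1, 768/3072)**0.5 = 1.0
#   effective orientation [3072, 768]: max(1, 3072/768)**0.5 = 2.0
# lr_mul = 2. restores the factor lost by storing c_fc transposed.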
+ std = 0.5 * (dim ** -0.5) bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng with torch.no_grad(): @@ -946,7 +913,6 @@ def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: i vocab_size = next_multiple_of_n(vocab_size, n=128) self.embed = nn.Embedding(vocab_size, model_dim) self.smear_gate = CastedLinear(12, 1) - self.smear_gate.weight.detach().zero_() # label modules to enable custom optimizer sizing self.smear_gate.weight.label = 'smear_gate' # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 @@ -958,7 +924,6 @@ def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: i # suggested to me by @Grad62304977. this originates from Karpathy's experiments. use_fp8 = not os.environ.get("DISABLE_FP8", False) self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - self.lm_head.weight.detach().zero_() # @Grad62304977 # Add learnable skip connection weights for decoder layers assert num_layers % 2 == 0 pad = (-num_layers * 5 - 2) % dist.get_world_size() @@ -1003,19 +968,19 @@ def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_sho x = self.embed(input_seq) - # smear token embed forward 1 position @classiclarryd + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) x = x0 = norm(x[None]) # U-net design by @brendanh0gan skip_connections = [] - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - backout_lambda = self.scalars[5 * len(self.blocks)+1] - n = len(self.blocks) // 2 x_backout = None @@ -1094,12 +1059,12 @@ def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False) def _load(self): self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() self.ready.set() - + def start(self): self.ready.clear() self.thread = threading.Thread(target=self._load) self.thread.start() - + def get(self): if self.thread: self.ready.wait() @@ -1142,17 +1107,17 @@ def __init__(self, file_iter, world_size: int = 1): self.thread = None self.data = None self.ready = threading.Event() - + def _load(self): tokens = _load_data_shard(next(self.file_iter)) self.data = (tokens, BOSFinder(tokens, self.world_size)) self.ready.set() - + def start(self): self.ready.clear() self.thread = threading.Thread(target=self._load) self.thread.start() - + def get(self): if self.thread: self.ready.wait() @@ -1244,18 +1209,16 @@ class Hyperparameters: train_max_seq_len: int = 128 * 16 val_batch_size: int = 4 * 64 * 1024 * 8 # optimization - num_scheduled_iterations: int = 2290 # number of steps to complete lr and ws schedule - num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws - num_iterations: int = num_scheduled_iterations + num_extension_iterations - 
cooldown_frac: int = 0.45 # fraction of num_scheduled_iterations spent cooling down the learning rate + num_iterations: int = 2285 + lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) + lr_min = 0.1 # evaluation and logging run_id: str = f"{uuid.uuid4()}" val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end save_checkpoint: bool = False # attention masking block_size: int = 128 - ws_schedule: tuple = (3, 7, 11) - ws_validate: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_schedule: tuple = (3, 5, 7, 9, 11, 13) ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN args = Hyperparameters() @@ -1335,29 +1298,30 @@ def nvidia_smi(): eps=1e-8, weight_decay=0.0, ) -optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.06, momentum=0.95, weight_decay=0.0) +optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) optimizers = [optimizer1, optimizer2] for opt in optimizers: for group in opt.param_groups: group["initial_lr"] = group["lr"] -# learning rate schedule: stable then linear decay def get_lr(step: int): - x = min(0.9999, step / args.num_iterations) - assert 0 <= x < 1 - lr = 1.0 - if x >= 1 - args.cooldown_frac: - w = (1 - x) / args.cooldown_frac - lr = w * 1.0 + (1 - w) * 0.1 + assert step < args.num_iterations + # Three part schedule: flat, linear decrease, flat + lr_schedule = args.lr_schedule + x = step / args.num_iterations + + if x < lr_schedule[0]: + return 1.0 + elif x < lr_schedule[1]: + progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) + lr = 1.0 - (1.0 - args.lr_min) * progress + else: + lr = args.lr_min return lr def get_ws(step: int): - # set short window size to half of long window size - # on final step return specific ws for validation - if step == args.num_iterations: - return args.ws_validate // 2, args.ws_validate - x = min(step / (1 + args.num_scheduled_iterations), 0.9999) - assert 0 <= x < 1 + assert step <= args.num_iterations + x = step / (args.num_iterations + 1) ws_idx = int(len(args.ws_schedule) * x) return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] @@ -1415,7 +1379,7 @@ def step_optimizers(step: int, optimizers, model): model.yarn.reset() ws_long = args.ws_schedule[0] else: - new_ws_long = args.ws_schedule[ws_idx] + new_ws_long = args.ws_schedule[ws_idx] if new_ws_long > ws_long: model.yarn.apply(ws_long, new_ws_long) ws_long = new_ws_long @@ -1425,6 +1389,7 @@ def step_optimizers(step: int, optimizers, model): model.zero_grad(set_to_none=True) model.yarn.reset() # rotary buffer is not stored in state_dict model.load_state_dict(initial_state["model"]) +optimizer2.reset() # momentum buffer not in state dict for opt, opt_state in zip(optimizers, initial_state["optimizers"]): opt.load_state_dict(opt_state) del train_loader, initial_state @@ -1482,11 +1447,13 @@ def step_optimizers(step: int, optimizers, model): break # --------------- TRAINING SECTION ----------------- + loss = 0 for _ in range(grad_accum_steps): inputs, targets, cum_seqlens = next(train_loader) - model(inputs, targets, cum_seqlens, ws_short, ws_long).backward() + loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps + loss.backward() step_optimizers(step, optimizers, model) - + # logging approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) print0(f"step:{step+1}/{train_steps} 
train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) From 2c394c766896439889658ad33480275f14e29e77 Mon Sep 17 00:00:00 2001 From: Varun Date: Tue, 28 Oct 2025 00:02:34 -0400 Subject: [PATCH 2/2] Cautious weight decay --- .../3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt | 3814 ----------------- .../6c588921-a777-458d-8003-f608774f040c.txt | 3814 ----------------- .../6e1efe80-8453-4ef6-a34d-8c73543618a8.txt | 3814 ----------------- .../72231598-c098-4e79-94f2-26952a4bbdc6.txt | 3814 ----------------- .../74ef00d7-4030-46f2-a269-bea707f0f0bd.txt | 3814 ----------------- .../2025-10-27_FixMuonLR/README.md | 65 - .../f196cb62-827b-4bb1-94f0-4169eb1c9375.txt | 3814 ----------------- .../fc12c205-f953-4028-bfdf-0519c72fb269.txt | 3814 ----------------- .../1aac0132-a891-4ed9-b358-0fd2abd1b019.txt | 3772 ++++++++++++++++ .../1f62a34f-fb60-4228-bd77-639ac781809f.txt | 3772 ++++++++++++++++ .../4c0708bf-8091-46ef-9557-f945cdf287c7.txt | 3772 ++++++++++++++++ .../541679f5-b0e3-4a19-a5ae-34b6a4f2d896.txt | 3772 ++++++++++++++++ .../7f53cbe9-4553-44fd-97e6-7e479337fdab.txt | 3772 ++++++++++++++++ .../2025-11-10_CautiousWD/README.md | 65 + .../a33fa276-234b-4c9c-9b78-43d85c411e8d.txt | 3772 ++++++++++++++++ .../aeaf2a6d-2a2e-4414-bc48-293946e087fc.txt | 3772 ++++++++++++++++ .../assets/cwd_condition_numbers.jpg | Bin 0 -> 604484 bytes .../assets/validation_loss.jpg | Bin 0 -> 98872 bytes .../cd7f6820-13cb-4ac8-b7ff-5cdac53faada.txt | 3772 ++++++++++++++++ .../cdef87a8-cc95-4916-bc6a-83c9615d24c2.txt | 3772 ++++++++++++++++ .../f2223004-18ce-47d6-bff7-065ce3a78092.txt | 3772 ++++++++++++++++ train_gpt.py | 93 +- 22 files changed, 37831 insertions(+), 26810 deletions(-) delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/README.md delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt delete mode 100644 records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/1aac0132-a891-4ed9-b358-0fd2abd1b019.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/1f62a34f-fb60-4228-bd77-639ac781809f.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/4c0708bf-8091-46ef-9557-f945cdf287c7.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/541679f5-b0e3-4a19-a5ae-34b6a4f2d896.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/7f53cbe9-4553-44fd-97e6-7e479337fdab.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/README.md create mode 100644 records/track_1_short/2025-11-10_CautiousWD/a33fa276-234b-4c9c-9b78-43d85c411e8d.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/aeaf2a6d-2a2e-4414-bc48-293946e087fc.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/assets/cwd_condition_numbers.jpg create mode 100644 records/track_1_short/2025-11-10_CautiousWD/assets/validation_loss.jpg create mode 100644 
records/track_1_short/2025-11-10_CautiousWD/cd7f6820-13cb-4ac8-b7ff-5cdac53faada.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/cdef87a8-cc95-4916-bc6a-83c9615d24c2.txt create mode 100644 records/track_1_short/2025-11-10_CautiousWD/f2223004-18ce-47d6-bff7-065ce3a78092.txt diff --git a/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt b/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt deleted file mode 100644 index 7004bc11e..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/3309f49f-8ea2-4f8c-95a4-d3deb9e19f46.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) - w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - 
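# A pure-float sketch of the scale bookkeeping in the fp8 matmuls above:
# quantization divides by a scale and _scaled_mm multiplies it back in, so up
# to fp8 rounding the product is unchanged. Sizes here are illustrative.
import torch
x, w = torch.randn(4, 8), torch.randn(16, 8)
x_s, w_s = 2.0, 0.5
out = ((x / x_s) @ (w / w_s).T) * (x_s * w_s)   # emulates the scaled fp8 path
assert torch.allclose(out, x @ w.T, atol=1e-5)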
-@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - 
a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + 
tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
-        for group, info in zip(self.param_groups, group_infos):
-            info["reduce_future"].wait()
-
-            params = group["params"]
-            grad_chunk = info["grad_chunk"]
-            chunk_size = group["chunk_size"]
-            padded_num_params = chunk_size * self.world_size
-
-            start_idx = rank * chunk_size
-            module_idx = start_idx if start_idx < len(params) else 0
-
-            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
-
-            if "momentum_buffer" not in group:
-                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
-            momentum_buffer = group["momentum_buffer"]
-            # Apply momentum update to the persistent momentum buffer in-place
-            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
-            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
-
-            grad_shape = updated_grads.shape
-            if params[module_idx].label == 'attn':
-                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
-                for p in params[module_idx:module_idx + num_params]:
-                    assert p.label == 'attn'
-                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
-            ref_param = params[module_idx]
-            param_shape = ref_param.shape
-
-            if "second_momentum_buffer" not in group:
-                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
-                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
-                )
-            second_momentum_buffer = group["second_momentum_buffer"]
-
-            if "param_lr" not in group:
-                group["param_lr"] = (
-                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
-                    * ref_param.new_tensor(
-                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                    ).view(-1, 1, 1)
-                )
-
-                group["param_wd"] = ref_param.new_tensor(
-                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                ).view(-1, 1, 1)
-
-            # Determine effective LR and WD
-            eff_lr = group["lr"] * group["param_lr"]
-            eff_wd = group["weight_decay"] * group["param_wd"]
-
-            # Compute zeropower for the entire chunk in a single, batched call.
-            if num_params == 0:
-                v_chunk = updated_grads
-            elif params[module_idx].label == "smear_gate":
-                # dividing by the magnitude is the equivalent of SVN (singular value normalization) for 1-D tensors
-                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
-            else:
-                v_chunk = polar_express(updated_grads)
-
-            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
-            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
-            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
-            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
-            v_chunk.mul_(step_size)
-            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
-
-            v_chunk = v_chunk.view(grad_shape)
-
-            updated_params = torch.empty_like(grad_chunk)
-            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
-            # Apply weight decay directly to the buffer.
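# A quick check of the 1-D claim above: for a single-row matrix, the polar
# factor (all singular values mapped to 1) is exactly the row divided by its norm.
import torch
v = torch.randn(1, 16).double()
u, s, vh = torch.linalg.svd(v, full_matrices=False)
assert torch.allclose(u @ vh, v / v.norm())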
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
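# Per-parameter multipliers like lr_mul above (and wd_mul elsewhere) are plain
# attributes stashed on the tensors; both Muon and DistAdam read them with
# getattr and a default of 1.0. A minimal sketch (base_lr is illustrative):
import torch
p = torch.nn.Parameter(torch.zeros(4, 4))
p.lr_mul = 75.
base_lr = 1e-3
eff_lr = base_lr * getattr(p, "lr_mul", 1.0)   # -> 0.075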
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd - def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): - # Precompute BOS positions once per shard - self.tokens=tokens - self.size = tokens.numel() - self.quickload = quickload - if quickload: - # only scan first 4 million tokens, then kickoff async thread to scan rest - self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.thread = None - self.ready = threading.Event() - self.start() - else: - self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.i = 0 - self.world_size = world_size - self.batch_iter = 0 - - def _load(self): - self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - self.bos_idx = self.bos_idx_async - - def next_batch(self, num_tokens_local: int, max_seq_len: int): - # if quickload was used, repoint to the full dataset after 5 batches - if self.quickload and self.batch_iter==5: - self.get() - n = len(self.bos_idx) - starts = [[] for _ in range(self.world_size)] - ends = [[] for _ in range(self.world_size)] - - idx = self.i - for r in range(self.world_size): - cur_len = 0 - while cur_len <= num_tokens_local: - if idx >= n: - raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") - cur = self.bos_idx[idx] - starts[r].append(cur) - end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, - cur + max_seq_len, - cur + num_tokens_local - cur_len + 1) - ends[r].append(end) - cur_len += end - cur - idx += 1 - - assert cur_len == num_tokens_local + 1 - self.i = idx - self.batch_iter+=1 - return starts, ends - -class DataPreloader: - # Helper for asynchronously loading next shard and indexing bos tokens - def __init__(self, file_iter, world_size: int = 1): - self.file_iter = file_iter - self.world_size = world_size - self.thread = None - self.data = None - self.ready = threading.Event() - - def _load(self): - tokens = _load_data_shard(next(self.file_iter)) - self.data = (tokens, BOSFinder(tokens, self.world_size)) - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - return self.data - -def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): - # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" - num_tokens = num_tokens // grad_accum_steps - - files = [Path(file) for file in sorted(glob.glob(filename_pattern))] - if not files: - raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") - - file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training - tokens = _load_data_shard(next(file_iter)) - if align_to_bos: - finder = BOSFinder(tokens, world_size=world_size, quickload=True) - preloader = 
def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
    # align_to_bos: each sequence begins with a Beginning-of-Sequence token; sequences are truncated to max_seq_len
    rank = dist.get_rank() if dist.is_initialized() else 0
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size"
    num_tokens = num_tokens // grad_accum_steps

    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    if not files:
        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")

    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
    tokens = _load_data_shard(next(file_iter))
    if align_to_bos:
        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
        preloader = DataPreloader(file_iter, world_size)
        preloader.start()
    else:
        pos = 0  # for the unaligned case

    while True:
        num_tokens_local = num_tokens // world_size
        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400

        if align_to_bos:
            try:
                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
            except StopIteration:
                # This shard is exhausted, load the next one in the next loop iteration.
                tokens, finder = preloader.get()
                preloader.start()
                continue

            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
            _inputs = buf[:-1]
            _targets = buf[1:]
            end_idxs[-1] -= 1  # last document was one token too long, to account for the _targets offset
            cum_lengths = (end_idxs - start_idxs).cumsum(0)

        else:
            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
                tokens, pos = _load_data_shard(next(file_iter)), 0

            pos_local = pos + rank * num_tokens_local
            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
            _inputs = buf[:-1].view(num_tokens_local)
            _targets = buf[1:].view(num_tokens_local)

            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
            pos += num_tokens

        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
        _cum_lengths[0] = 0
        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths

        new_params = yield (
            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True),
        )

        if new_params is not None:
            # makes it possible for the generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "Num tokens must be divisible by world size and grad accum steps"
            num_tokens = new_num_tokens
            max_seq_len = new_max_seq_len
            grad_accum_steps = new_grad_accum_steps
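# A worked example of the padded cumulative-lengths tensor built above (sizes invented for
# this sketch): three documents of lengths 8, 8 and 4 in a 20-token local batch, padded out
# to max_num_docs = 6. Unused slots repeat the batch length, i.e. zero-length documents,
# which keeps the tensor shape static for the attention kernel.
def _cum_lengths_demo():
    doc_lengths = torch.tensor([8, 8, 4])
    num_tokens_local, max_num_docs = 20, 6
    cum_lengths = doc_lengths.cumsum(0)                  # tensor([ 8, 16, 20])
    _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
    _cum_lengths[0] = 0
    _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths   # tensor([ 0,  8, 16, 20, 20, 20])
    return _cum_lengths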
# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data
    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_batch_size: int = 2048 * 16 * 8
    train_max_seq_len: int = 128 * 16
    val_batch_size: int = 4 * 64 * 1024 * 8
    # optimization
    num_iterations: int = 2285
    lr_schedule = (0.5, 0.98)  # breakpoints for the 3-part schedule: (flat, linear decay, flat)
    lr_min = 0.1
    # evaluation and logging
    run_id: str = f"{uuid.uuid4()}"
    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint: bool = False
    # attention masking
    block_size: int = 128
    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN

args = Hyperparameters()

data_path = os.environ.get("DATA_PATH", ".")
args.train_files = os.path.join(data_path, args.train_files)
args.val_files = os.path.join(data_path, args.val_files)

# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert 8 % world_size == 0, "world_size must be a divisor of 8"
grad_accum_steps = 8 // world_size
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0)  # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = args.run_id
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)

def print0(s, console=False):
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0("=" * 100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
print0(f"Running Triton version {triton.__version__}")

def nvidia_smi():
    import subprocess  # avoid top-level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout

print0(nvidia_smi())
print0("=" * 100)

model: nn.Module = GPT(
    vocab_size=50257,
    num_layers=12,
    num_heads=6,
    head_dim=128,
    model_dim=768,
    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size),
).cuda()
for m in model.modules():
    if isinstance(m, (nn.Embedding, nn.Linear)):
        m.bfloat16()
for param in model.parameters():
    dist.broadcast(param.detach(), 0)

# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
gate_params = [p for n, p in model.named_parameters() if "gate" in n]
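# A hypothetical sanity check, not in the original script: the five lists above should
# partition model.parameters() exactly (no parameter optimized twice, none left out) before
# they are handed to the two optimizers below.
def _check_param_partition():
    groups = [hidden_matrix_params, embed_params, scalar_params, head_params, gate_params]
    seen = [id(p) for group in groups for p in group]
    assert len(seen) == len(set(seen)), "a parameter appears in more than one group"
    assert set(seen) == {id(p) for p in model.parameters()}, "a parameter is in no group"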
# init the optimizer(s)
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = DistAdam(
    scalar_params + head_params + embed_params,
    lr=0.008,
    betas=(0.65, 0.95),
    eps=1e-8,
    weight_decay=0.0,
)
optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]

def get_lr(step: int):
    assert step < args.num_iterations
    # Three-part schedule: flat, linear decrease, flat
    lr_schedule = args.lr_schedule
    x = step / args.num_iterations

    if x < lr_schedule[0]:
        return 1.0
    elif x < lr_schedule[1]:
        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
        lr = 1.0 - (1.0 - args.lr_min) * progress
    else:
        lr = args.lr_min
    return lr

def get_ws(step: int):
    assert step <= args.num_iterations
    x = step / (args.num_iterations + 1)
    ws_idx = int(len(args.ws_schedule) * x)
    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]

def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
    # warmup phase: linearly increase momentum from min to max
    # cooldown phase: linearly decrease momentum from max to min
    momentum_cd_start = args.num_iterations - muon_cooldown_steps
    if step < muon_warmup_steps:
        frac = step / muon_warmup_steps
        momentum = momentum_min + frac * (momentum_max - momentum_min)
    elif step > momentum_cd_start:
        frac = (step - momentum_cd_start) / muon_cooldown_steps
        momentum = momentum_max - frac * (momentum_max - momentum_min)
    else:
        momentum = momentum_max
    return momentum

def step_optimizers(step: int, optimizers, model):
    # update lr
    for optimizer in optimizers:
        for group in optimizer.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)

    # set muon momentum based on step
    momentum = get_muon_momentum(step)
    for group in optimizers[1].param_groups:
        group["momentum"] = momentum

    # on even steps, only step the Muon params
    # on odd steps, step all params
    if step % 2 == 0:
        optimizers[1].step()
        optimizers[1].zero_grad(set_to_none=True)
    else:
        for optimizer in optimizers:
            optimizer.step()
        model.zero_grad(set_to_none=True)

model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
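# An illustrative trace of the three schedules defined above (the helper is not part of the
# original script): with num_iterations = 2285 the LR multiplier stays at 1.0 until roughly
# step 1142, decays linearly to lr_min = 0.1 by roughly step 2239, the attention window
# grows from 3 to 13 blocks in six stages, and Muon momentum ramps 0.85 -> 0.95 over the
# first 300 steps before easing back to 0.85 over the final 50.
def _schedule_trace():
    for s in (0, 1142, 1713, 2239, 2284):
        print(s, round(get_lr(s), 3), get_ws(s), round(get_muon_momentum(s), 3))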
########################################
#            Warmup kernels            #
########################################

# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 30
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
for step in range(warmup_steps):
    inputs, targets, cum_seqlens = next(train_loader)
    # each window size is a new graph, need to warm up each with Yarn.attn_scale
    ws_idx = step % len(args.ws_schedule)
    if ws_idx == 0:
        model.yarn.reset()
        ws_long = args.ws_schedule[0]
    else:
        new_ws_long = args.ws_schedule[ws_idx]
        if new_ws_long > ws_long:
            model.yarn.apply(ws_long, new_ws_long)
            ws_long = new_ws_long
    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
model.yarn.reset()  # rotary buffer is not stored in state_dict
model.load_state_dict(initial_state["model"])
optimizer2.reset()  # momentum buffer is not in the state dict
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del train_loader, initial_state

########################################
#       Training and validation        #
########################################

train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
ws_short, ws_long = get_ws(0)
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    ws_short, new_ws_long = get_ws(step)
    if new_ws_long != ws_long:
        model.yarn.apply(ws_long, new_ws_long)
        ws_long = new_ws_long

    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        if last_step:
            ws_long = args.ws_validate_post_yarn_ext
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        assert args.val_tokens % args.val_batch_size == 0
        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets, cum_seqlens = next(val_loader)
                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION -----------------
    loss = 0
    for _ in range(grad_accum_steps):
        inputs, targets, cum_seqlens = next(train_loader)
        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
    loss.backward()
    step_optimizers(step, optimizers, model)

    # logging
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)

print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
dist.destroy_process_group()
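# A back-of-envelope token budget implied by the Hyperparameters above; _token_budget is an
# illustrative helper, not part of the original script.
def _token_budget():
    tokens_per_step = 2048 * 16 * 8               # args.train_batch_size = 262,144 tokens
    total_train_tokens = tokens_per_step * 2285   # num_iterations -> 598,999,040 (~0.6B)
    val_iters = 10485760 // (4 * 64 * 1024 * 8)   # val_tokens / val_batch_size = 5 per eval
    return tokens_per_step, total_train_tokens, val_iters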
====================================================================================================
Running Python 3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]
Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
Running Triton version 3.5.0
Tue Oct 28 02:17:34 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:19:00.0 Off |                    0 |
| N/A   40C    P0            130W /  700W |    5858MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  |   00000000:3B:00.0 Off |                    0 |
| N/A   33C    P0            126W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  |   00000000:4C:00.0 Off |                    0 |
| N/A   32C    P0            121W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  |   00000000:5D:00.0 Off |                    0 |
| N/A   38C    P0            124W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  |   00000000:9B:00.0 Off |                    0 |
| N/A   39C    P0            121W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  |   00000000:BB:00.0 Off |                    0 |
| N/A   32C    P0            120W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  |   00000000:CB:00.0 Off |                    0 |
| N/A   38C    P0            125W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  |   00000000:DB:00.0 Off |                    0 |
| N/A   31C    P0            115W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+

====================================================================================================
step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms
step:1/2285 train_time:121ms step_avg:121.14ms
step:2/2285 train_time:142ms step_avg:70.78ms
step:3/2285 train_time:180ms step_avg:60.11ms
step:4/2285 train_time:237ms step_avg:59.15ms
step:5/2285 train_time:296ms step_avg:59.16ms
step:6/2285 train_time:354ms step_avg:58.97ms
step:7/2285 train_time:414ms step_avg:59.19ms
step:8/2285 train_time:473ms step_avg:59.14ms
step:9/2285 train_time:534ms step_avg:59.34ms
step:10/2285 train_time:593ms step_avg:59.28ms
step:11/2285 train_time:653ms step_avg:59.39ms
step:12/2285 train_time:712ms step_avg:59.35ms
step:13/2285 train_time:773ms step_avg:59.45ms
step:14/2285 train_time:831ms step_avg:59.37ms
step:15/2285 train_time:892ms step_avg:59.49ms
step:16/2285 train_time:951ms step_avg:59.42ms
step:17/2285
train_time:1014ms step_avg:59.62ms -step:18/2285 train_time:1077ms step_avg:59.81ms -step:19/2285 train_time:1141ms step_avg:60.08ms -step:20/2285 train_time:1204ms step_avg:60.18ms -step:21/2285 train_time:1265ms step_avg:60.24ms -step:22/2285 train_time:1323ms step_avg:60.15ms -step:23/2285 train_time:1385ms step_avg:60.20ms -step:24/2285 train_time:1444ms step_avg:60.16ms -step:25/2285 train_time:1505ms step_avg:60.19ms -step:26/2285 train_time:1563ms step_avg:60.13ms -step:27/2285 train_time:1625ms step_avg:60.19ms -step:28/2285 train_time:1683ms step_avg:60.12ms -step:29/2285 train_time:1744ms step_avg:60.14ms -step:30/2285 train_time:1803ms step_avg:60.10ms -step:31/2285 train_time:1864ms step_avg:60.14ms -step:32/2285 train_time:1923ms step_avg:60.11ms -step:33/2285 train_time:1985ms step_avg:60.16ms -step:34/2285 train_time:2045ms step_avg:60.15ms -step:35/2285 train_time:2107ms step_avg:60.21ms -step:36/2285 train_time:2166ms step_avg:60.18ms -step:37/2285 train_time:2228ms step_avg:60.22ms -step:38/2285 train_time:2287ms step_avg:60.19ms -step:39/2285 train_time:2348ms step_avg:60.21ms -step:40/2285 train_time:2408ms step_avg:60.19ms -step:41/2285 train_time:2469ms step_avg:60.22ms -step:42/2285 train_time:2528ms step_avg:60.18ms -step:43/2285 train_time:2589ms step_avg:60.21ms -step:44/2285 train_time:2648ms step_avg:60.17ms -step:45/2285 train_time:2709ms step_avg:60.21ms -step:46/2285 train_time:2769ms step_avg:60.19ms -step:47/2285 train_time:2830ms step_avg:60.22ms -step:48/2285 train_time:2889ms step_avg:60.20ms -step:49/2285 train_time:2952ms step_avg:60.24ms -step:50/2285 train_time:3011ms step_avg:60.22ms -step:51/2285 train_time:3074ms step_avg:60.27ms -step:52/2285 train_time:3133ms step_avg:60.25ms -step:53/2285 train_time:3195ms step_avg:60.28ms -step:54/2285 train_time:3254ms step_avg:60.26ms -step:55/2285 train_time:3316ms step_avg:60.29ms -step:56/2285 train_time:3375ms step_avg:60.27ms -step:57/2285 train_time:3436ms step_avg:60.29ms -step:58/2285 train_time:3496ms step_avg:60.27ms -step:59/2285 train_time:3557ms step_avg:60.30ms -step:60/2285 train_time:3618ms step_avg:60.29ms -step:61/2285 train_time:3679ms step_avg:60.31ms -step:62/2285 train_time:3739ms step_avg:60.30ms -step:63/2285 train_time:3800ms step_avg:60.32ms -step:64/2285 train_time:3859ms step_avg:60.30ms -step:65/2285 train_time:3922ms step_avg:60.33ms -step:66/2285 train_time:3980ms step_avg:60.31ms -step:67/2285 train_time:4042ms step_avg:60.33ms -step:68/2285 train_time:4101ms step_avg:60.32ms -step:69/2285 train_time:4163ms step_avg:60.33ms -step:70/2285 train_time:4221ms step_avg:60.30ms -step:71/2285 train_time:4283ms step_avg:60.32ms -step:72/2285 train_time:4341ms step_avg:60.29ms -step:73/2285 train_time:4403ms step_avg:60.31ms -step:74/2285 train_time:4461ms step_avg:60.29ms -step:75/2285 train_time:4523ms step_avg:60.30ms -step:76/2285 train_time:4581ms step_avg:60.28ms -step:77/2285 train_time:4643ms step_avg:60.29ms -step:78/2285 train_time:4701ms step_avg:60.27ms -step:79/2285 train_time:4763ms step_avg:60.29ms -step:80/2285 train_time:4821ms step_avg:60.27ms -step:81/2285 train_time:4882ms step_avg:60.28ms -step:82/2285 train_time:4941ms step_avg:60.26ms -step:83/2285 train_time:5002ms step_avg:60.27ms -step:84/2285 train_time:5062ms step_avg:60.26ms -step:85/2285 train_time:5123ms step_avg:60.27ms -step:86/2285 train_time:5182ms step_avg:60.25ms -step:87/2285 train_time:5243ms step_avg:60.26ms -step:88/2285 train_time:5301ms step_avg:60.24ms -step:89/2285 train_time:5363ms 
step_avg:60.26ms -step:90/2285 train_time:5421ms step_avg:60.24ms -step:91/2285 train_time:5483ms step_avg:60.25ms -step:92/2285 train_time:5541ms step_avg:60.23ms -step:93/2285 train_time:5602ms step_avg:60.24ms -step:94/2285 train_time:5661ms step_avg:60.22ms -step:95/2285 train_time:5722ms step_avg:60.24ms -step:96/2285 train_time:5781ms step_avg:60.22ms -step:97/2285 train_time:5843ms step_avg:60.23ms -step:98/2285 train_time:5902ms step_avg:60.22ms -step:99/2285 train_time:5963ms step_avg:60.23ms -step:100/2285 train_time:6022ms step_avg:60.22ms -step:101/2285 train_time:6083ms step_avg:60.22ms -step:102/2285 train_time:6142ms step_avg:60.22ms -step:103/2285 train_time:6203ms step_avg:60.22ms -step:104/2285 train_time:6261ms step_avg:60.21ms -step:105/2285 train_time:6322ms step_avg:60.21ms -step:106/2285 train_time:6381ms step_avg:60.20ms -step:107/2285 train_time:6442ms step_avg:60.21ms -step:108/2285 train_time:6501ms step_avg:60.20ms -step:109/2285 train_time:6562ms step_avg:60.20ms -step:110/2285 train_time:6621ms step_avg:60.19ms -step:111/2285 train_time:6682ms step_avg:60.20ms -step:112/2285 train_time:6741ms step_avg:60.18ms -step:113/2285 train_time:6802ms step_avg:60.19ms -step:114/2285 train_time:6860ms step_avg:60.18ms -step:115/2285 train_time:6922ms step_avg:60.19ms -step:116/2285 train_time:6980ms step_avg:60.17ms -step:117/2285 train_time:7041ms step_avg:60.18ms -step:118/2285 train_time:7099ms step_avg:60.16ms -step:119/2285 train_time:7160ms step_avg:60.17ms -step:120/2285 train_time:7219ms step_avg:60.16ms -step:121/2285 train_time:7280ms step_avg:60.17ms -step:122/2285 train_time:7339ms step_avg:60.16ms -step:123/2285 train_time:7400ms step_avg:60.16ms -step:124/2285 train_time:7459ms step_avg:60.15ms -step:125/2285 train_time:7520ms step_avg:60.16ms -step:126/2285 train_time:7579ms step_avg:60.15ms -step:127/2285 train_time:7639ms step_avg:60.15ms -step:128/2285 train_time:7698ms step_avg:60.14ms -step:129/2285 train_time:7759ms step_avg:60.15ms -step:130/2285 train_time:7818ms step_avg:60.14ms -step:131/2285 train_time:7879ms step_avg:60.15ms -step:132/2285 train_time:7938ms step_avg:60.14ms -step:133/2285 train_time:8000ms step_avg:60.15ms -step:134/2285 train_time:8059ms step_avg:60.14ms -step:135/2285 train_time:8120ms step_avg:60.15ms -step:136/2285 train_time:8179ms step_avg:60.14ms -step:137/2285 train_time:8240ms step_avg:60.14ms -step:138/2285 train_time:8298ms step_avg:60.13ms -step:139/2285 train_time:8359ms step_avg:60.14ms -step:140/2285 train_time:8418ms step_avg:60.13ms -step:141/2285 train_time:8479ms step_avg:60.14ms -step:142/2285 train_time:8538ms step_avg:60.13ms -step:143/2285 train_time:8599ms step_avg:60.14ms -step:144/2285 train_time:8658ms step_avg:60.13ms -step:145/2285 train_time:8719ms step_avg:60.13ms -step:146/2285 train_time:8778ms step_avg:60.12ms -step:147/2285 train_time:8839ms step_avg:60.13ms -step:148/2285 train_time:8898ms step_avg:60.12ms -step:149/2285 train_time:8959ms step_avg:60.13ms -step:150/2285 train_time:9018ms step_avg:60.12ms -step:151/2285 train_time:9079ms step_avg:60.13ms -step:152/2285 train_time:9138ms step_avg:60.12ms -step:153/2285 train_time:9200ms step_avg:60.13ms -step:154/2285 train_time:9258ms step_avg:60.12ms -step:155/2285 train_time:9319ms step_avg:60.12ms -step:156/2285 train_time:9378ms step_avg:60.12ms -step:157/2285 train_time:9439ms step_avg:60.12ms -step:158/2285 train_time:9498ms step_avg:60.11ms -step:159/2285 train_time:9559ms step_avg:60.12ms -step:160/2285 train_time:9617ms 
step_avg:60.11ms -step:161/2285 train_time:9679ms step_avg:60.12ms -step:162/2285 train_time:9737ms step_avg:60.11ms -step:163/2285 train_time:9799ms step_avg:60.11ms -step:164/2285 train_time:9857ms step_avg:60.11ms -step:165/2285 train_time:9919ms step_avg:60.11ms -step:166/2285 train_time:9977ms step_avg:60.10ms -step:167/2285 train_time:10038ms step_avg:60.11ms -step:168/2285 train_time:10096ms step_avg:60.10ms -step:169/2285 train_time:10158ms step_avg:60.10ms -step:170/2285 train_time:10217ms step_avg:60.10ms -step:171/2285 train_time:10278ms step_avg:60.11ms -step:172/2285 train_time:10336ms step_avg:60.09ms -step:173/2285 train_time:10398ms step_avg:60.10ms -step:174/2285 train_time:10456ms step_avg:60.09ms -step:175/2285 train_time:10517ms step_avg:60.10ms -step:176/2285 train_time:10576ms step_avg:60.09ms -step:177/2285 train_time:10637ms step_avg:60.10ms -step:178/2285 train_time:10696ms step_avg:60.09ms -step:179/2285 train_time:10756ms step_avg:60.09ms -step:180/2285 train_time:10815ms step_avg:60.08ms -step:181/2285 train_time:10876ms step_avg:60.09ms -step:182/2285 train_time:10935ms step_avg:60.08ms -step:183/2285 train_time:10997ms step_avg:60.09ms -step:184/2285 train_time:11055ms step_avg:60.08ms -step:185/2285 train_time:11117ms step_avg:60.09ms -step:186/2285 train_time:11175ms step_avg:60.08ms -step:187/2285 train_time:11236ms step_avg:60.08ms -step:188/2285 train_time:11294ms step_avg:60.08ms -step:189/2285 train_time:11356ms step_avg:60.08ms -step:190/2285 train_time:11415ms step_avg:60.08ms -step:191/2285 train_time:11476ms step_avg:60.09ms -step:192/2285 train_time:11535ms step_avg:60.08ms -step:193/2285 train_time:11596ms step_avg:60.08ms -step:194/2285 train_time:11654ms step_avg:60.07ms -step:195/2285 train_time:11715ms step_avg:60.08ms -step:196/2285 train_time:11774ms step_avg:60.07ms -step:197/2285 train_time:11836ms step_avg:60.08ms -step:198/2285 train_time:11894ms step_avg:60.07ms -step:199/2285 train_time:11956ms step_avg:60.08ms -step:200/2285 train_time:12015ms step_avg:60.07ms -step:201/2285 train_time:12076ms step_avg:60.08ms -step:202/2285 train_time:12134ms step_avg:60.07ms -step:203/2285 train_time:12195ms step_avg:60.07ms -step:204/2285 train_time:12254ms step_avg:60.07ms -step:205/2285 train_time:12315ms step_avg:60.07ms -step:206/2285 train_time:12375ms step_avg:60.07ms -step:207/2285 train_time:12435ms step_avg:60.07ms -step:208/2285 train_time:12494ms step_avg:60.07ms -step:209/2285 train_time:12555ms step_avg:60.07ms -step:210/2285 train_time:12614ms step_avg:60.07ms -step:211/2285 train_time:12675ms step_avg:60.07ms -step:212/2285 train_time:12734ms step_avg:60.06ms -step:213/2285 train_time:12795ms step_avg:60.07ms -step:214/2285 train_time:12853ms step_avg:60.06ms -step:215/2285 train_time:12914ms step_avg:60.07ms -step:216/2285 train_time:12973ms step_avg:60.06ms -step:217/2285 train_time:13034ms step_avg:60.07ms -step:218/2285 train_time:13093ms step_avg:60.06ms -step:219/2285 train_time:13154ms step_avg:60.06ms -step:220/2285 train_time:13213ms step_avg:60.06ms -step:221/2285 train_time:13274ms step_avg:60.06ms -step:222/2285 train_time:13334ms step_avg:60.06ms -step:223/2285 train_time:13395ms step_avg:60.07ms -step:224/2285 train_time:13454ms step_avg:60.06ms -step:225/2285 train_time:13515ms step_avg:60.07ms -step:226/2285 train_time:13575ms step_avg:60.06ms -step:227/2285 train_time:13636ms step_avg:60.07ms -step:228/2285 train_time:13695ms step_avg:60.07ms -step:229/2285 train_time:13756ms step_avg:60.07ms -step:230/2285 
train_time:13814ms step_avg:60.06ms -step:231/2285 train_time:13876ms step_avg:60.07ms -step:232/2285 train_time:13934ms step_avg:60.06ms -step:233/2285 train_time:13996ms step_avg:60.07ms -step:234/2285 train_time:14054ms step_avg:60.06ms -step:235/2285 train_time:14115ms step_avg:60.07ms -step:236/2285 train_time:14174ms step_avg:60.06ms -step:237/2285 train_time:14235ms step_avg:60.06ms -step:238/2285 train_time:14294ms step_avg:60.06ms -step:239/2285 train_time:14355ms step_avg:60.06ms -step:240/2285 train_time:14414ms step_avg:60.06ms -step:241/2285 train_time:14476ms step_avg:60.07ms -step:242/2285 train_time:14534ms step_avg:60.06ms -step:243/2285 train_time:14596ms step_avg:60.06ms -step:244/2285 train_time:14654ms step_avg:60.06ms -step:245/2285 train_time:14715ms step_avg:60.06ms -step:246/2285 train_time:14774ms step_avg:60.06ms -step:247/2285 train_time:14835ms step_avg:60.06ms -step:248/2285 train_time:14894ms step_avg:60.06ms -step:249/2285 train_time:14956ms step_avg:60.06ms -step:250/2285 train_time:15015ms step_avg:60.06ms -step:250/2285 val_loss:4.0863 train_time:15078ms step_avg:60.31ms -step:251/2285 train_time:15096ms step_avg:60.14ms -step:252/2285 train_time:15139ms step_avg:60.08ms -step:253/2285 train_time:15207ms step_avg:60.11ms -step:254/2285 train_time:15271ms step_avg:60.12ms -step:255/2285 train_time:15336ms step_avg:60.14ms -step:256/2285 train_time:15395ms step_avg:60.13ms -step:257/2285 train_time:15455ms step_avg:60.14ms -step:258/2285 train_time:15514ms step_avg:60.13ms -step:259/2285 train_time:15574ms step_avg:60.13ms -step:260/2285 train_time:15632ms step_avg:60.12ms -step:261/2285 train_time:15692ms step_avg:60.12ms -step:262/2285 train_time:15750ms step_avg:60.12ms -step:263/2285 train_time:15810ms step_avg:60.12ms -step:264/2285 train_time:15868ms step_avg:60.11ms -step:265/2285 train_time:15928ms step_avg:60.11ms -step:266/2285 train_time:15986ms step_avg:60.10ms -step:267/2285 train_time:16048ms step_avg:60.10ms -step:268/2285 train_time:16107ms step_avg:60.10ms -step:269/2285 train_time:16169ms step_avg:60.11ms -step:270/2285 train_time:16230ms step_avg:60.11ms -step:271/2285 train_time:16292ms step_avg:60.12ms -step:272/2285 train_time:16351ms step_avg:60.11ms -step:273/2285 train_time:16412ms step_avg:60.12ms -step:274/2285 train_time:16471ms step_avg:60.11ms -step:275/2285 train_time:16531ms step_avg:60.11ms -step:276/2285 train_time:16590ms step_avg:60.11ms -step:277/2285 train_time:16650ms step_avg:60.11ms -step:278/2285 train_time:16708ms step_avg:60.10ms -step:279/2285 train_time:16768ms step_avg:60.10ms -step:280/2285 train_time:16826ms step_avg:60.09ms -step:281/2285 train_time:16887ms step_avg:60.10ms -step:282/2285 train_time:16945ms step_avg:60.09ms -step:283/2285 train_time:17005ms step_avg:60.09ms -step:284/2285 train_time:17064ms step_avg:60.08ms -step:285/2285 train_time:17125ms step_avg:60.09ms -step:286/2285 train_time:17184ms step_avg:60.08ms -step:287/2285 train_time:17246ms step_avg:60.09ms -step:288/2285 train_time:17305ms step_avg:60.09ms -step:289/2285 train_time:17366ms step_avg:60.09ms -step:290/2285 train_time:17425ms step_avg:60.09ms -step:291/2285 train_time:17487ms step_avg:60.09ms -step:292/2285 train_time:17546ms step_avg:60.09ms -step:293/2285 train_time:17607ms step_avg:60.09ms -step:294/2285 train_time:17665ms step_avg:60.09ms -step:295/2285 train_time:17726ms step_avg:60.09ms -step:296/2285 train_time:17784ms step_avg:60.08ms -step:297/2285 train_time:17844ms step_avg:60.08ms -step:298/2285 train_time:17902ms 
step_avg:60.07ms -step:299/2285 train_time:17962ms step_avg:60.07ms -step:300/2285 train_time:18020ms step_avg:60.07ms -step:301/2285 train_time:18081ms step_avg:60.07ms -step:302/2285 train_time:18141ms step_avg:60.07ms -step:303/2285 train_time:18202ms step_avg:60.07ms -step:304/2285 train_time:18261ms step_avg:60.07ms -step:305/2285 train_time:18323ms step_avg:60.08ms -step:306/2285 train_time:18382ms step_avg:60.07ms -step:307/2285 train_time:18443ms step_avg:60.08ms -step:308/2285 train_time:18502ms step_avg:60.07ms -step:309/2285 train_time:18563ms step_avg:60.07ms -step:310/2285 train_time:18622ms step_avg:60.07ms -step:311/2285 train_time:18683ms step_avg:60.07ms -step:312/2285 train_time:18742ms step_avg:60.07ms -step:313/2285 train_time:18803ms step_avg:60.07ms -step:314/2285 train_time:18861ms step_avg:60.07ms -step:315/2285 train_time:18922ms step_avg:60.07ms -step:316/2285 train_time:18980ms step_avg:60.06ms -step:317/2285 train_time:19041ms step_avg:60.07ms -step:318/2285 train_time:19099ms step_avg:60.06ms -step:319/2285 train_time:19161ms step_avg:60.06ms -step:320/2285 train_time:19220ms step_avg:60.06ms -step:321/2285 train_time:19281ms step_avg:60.07ms -step:322/2285 train_time:19340ms step_avg:60.06ms -step:323/2285 train_time:19401ms step_avg:60.07ms -step:324/2285 train_time:19460ms step_avg:60.06ms -step:325/2285 train_time:19521ms step_avg:60.07ms -step:326/2285 train_time:19580ms step_avg:60.06ms -step:327/2285 train_time:19641ms step_avg:60.06ms -step:328/2285 train_time:19700ms step_avg:60.06ms -step:329/2285 train_time:19761ms step_avg:60.06ms -step:330/2285 train_time:19819ms step_avg:60.06ms -step:331/2285 train_time:19880ms step_avg:60.06ms -step:332/2285 train_time:19939ms step_avg:60.06ms -step:333/2285 train_time:20000ms step_avg:60.06ms -step:334/2285 train_time:20058ms step_avg:60.05ms -step:335/2285 train_time:20119ms step_avg:60.06ms -step:336/2285 train_time:20178ms step_avg:60.05ms -step:337/2285 train_time:20239ms step_avg:60.06ms -step:338/2285 train_time:20298ms step_avg:60.05ms -step:339/2285 train_time:20359ms step_avg:60.06ms -step:340/2285 train_time:20418ms step_avg:60.05ms -step:341/2285 train_time:20480ms step_avg:60.06ms -step:342/2285 train_time:20539ms step_avg:60.06ms -step:343/2285 train_time:20600ms step_avg:60.06ms -step:344/2285 train_time:20659ms step_avg:60.05ms -step:345/2285 train_time:20720ms step_avg:60.06ms -step:346/2285 train_time:20779ms step_avg:60.05ms -step:347/2285 train_time:20840ms step_avg:60.06ms -step:348/2285 train_time:20899ms step_avg:60.05ms -step:349/2285 train_time:20959ms step_avg:60.06ms -step:350/2285 train_time:21018ms step_avg:60.05ms -step:351/2285 train_time:21078ms step_avg:60.05ms -step:352/2285 train_time:21137ms step_avg:60.05ms -step:353/2285 train_time:21198ms step_avg:60.05ms -step:354/2285 train_time:21257ms step_avg:60.05ms -step:355/2285 train_time:21319ms step_avg:60.05ms -step:356/2285 train_time:21378ms step_avg:60.05ms -step:357/2285 train_time:21440ms step_avg:60.06ms -step:358/2285 train_time:21499ms step_avg:60.05ms -step:359/2285 train_time:21560ms step_avg:60.06ms -step:360/2285 train_time:21620ms step_avg:60.05ms -step:361/2285 train_time:21681ms step_avg:60.06ms -step:362/2285 train_time:21739ms step_avg:60.05ms -step:363/2285 train_time:21800ms step_avg:60.06ms -step:364/2285 train_time:21859ms step_avg:60.05ms -step:365/2285 train_time:21920ms step_avg:60.05ms -step:366/2285 train_time:21978ms step_avg:60.05ms -step:367/2285 train_time:22040ms step_avg:60.05ms -step:368/2285 
train_time:22098ms step_avg:60.05ms -step:369/2285 train_time:22159ms step_avg:60.05ms -step:370/2285 train_time:22218ms step_avg:60.05ms -step:371/2285 train_time:22279ms step_avg:60.05ms -step:372/2285 train_time:22339ms step_avg:60.05ms -step:373/2285 train_time:22400ms step_avg:60.05ms -step:374/2285 train_time:22458ms step_avg:60.05ms -step:375/2285 train_time:22520ms step_avg:60.05ms -step:376/2285 train_time:22579ms step_avg:60.05ms -step:377/2285 train_time:22639ms step_avg:60.05ms -step:378/2285 train_time:22698ms step_avg:60.05ms -step:379/2285 train_time:22759ms step_avg:60.05ms -step:380/2285 train_time:22818ms step_avg:60.05ms -step:381/2285 train_time:22879ms step_avg:60.05ms -step:382/2285 train_time:22938ms step_avg:60.05ms -step:383/2285 train_time:23000ms step_avg:60.05ms -step:384/2285 train_time:23059ms step_avg:60.05ms -step:385/2285 train_time:23121ms step_avg:60.05ms -step:386/2285 train_time:23180ms step_avg:60.05ms -step:387/2285 train_time:23242ms step_avg:60.06ms -step:388/2285 train_time:23301ms step_avg:60.05ms -step:389/2285 train_time:23363ms step_avg:60.06ms -step:390/2285 train_time:23422ms step_avg:60.06ms -step:391/2285 train_time:23483ms step_avg:60.06ms -step:392/2285 train_time:23543ms step_avg:60.06ms -step:393/2285 train_time:23603ms step_avg:60.06ms -step:394/2285 train_time:23662ms step_avg:60.06ms -step:395/2285 train_time:23723ms step_avg:60.06ms -step:396/2285 train_time:23782ms step_avg:60.06ms -step:397/2285 train_time:23843ms step_avg:60.06ms -step:398/2285 train_time:23902ms step_avg:60.06ms -step:399/2285 train_time:23963ms step_avg:60.06ms -step:400/2285 train_time:24022ms step_avg:60.06ms -step:401/2285 train_time:24083ms step_avg:60.06ms -step:402/2285 train_time:24142ms step_avg:60.06ms -step:403/2285 train_time:24204ms step_avg:60.06ms -step:404/2285 train_time:24263ms step_avg:60.06ms -step:405/2285 train_time:24324ms step_avg:60.06ms -step:406/2285 train_time:24383ms step_avg:60.06ms -step:407/2285 train_time:24445ms step_avg:60.06ms -step:408/2285 train_time:24504ms step_avg:60.06ms -step:409/2285 train_time:24566ms step_avg:60.06ms -step:410/2285 train_time:24625ms step_avg:60.06ms -step:411/2285 train_time:24686ms step_avg:60.06ms -step:412/2285 train_time:24745ms step_avg:60.06ms -step:413/2285 train_time:24806ms step_avg:60.06ms -step:414/2285 train_time:24864ms step_avg:60.06ms -step:415/2285 train_time:24926ms step_avg:60.06ms -step:416/2285 train_time:24985ms step_avg:60.06ms -step:417/2285 train_time:25046ms step_avg:60.06ms -step:418/2285 train_time:25105ms step_avg:60.06ms -step:419/2285 train_time:25166ms step_avg:60.06ms -step:420/2285 train_time:25225ms step_avg:60.06ms -step:421/2285 train_time:25287ms step_avg:60.06ms -step:422/2285 train_time:25346ms step_avg:60.06ms -step:423/2285 train_time:25408ms step_avg:60.07ms -step:424/2285 train_time:25467ms step_avg:60.06ms -step:425/2285 train_time:25528ms step_avg:60.07ms -step:426/2285 train_time:25587ms step_avg:60.06ms -step:427/2285 train_time:25648ms step_avg:60.07ms -step:428/2285 train_time:25707ms step_avg:60.06ms -step:429/2285 train_time:25768ms step_avg:60.07ms -step:430/2285 train_time:25827ms step_avg:60.06ms -step:431/2285 train_time:25888ms step_avg:60.06ms -step:432/2285 train_time:25947ms step_avg:60.06ms -step:433/2285 train_time:26008ms step_avg:60.07ms -step:434/2285 train_time:26067ms step_avg:60.06ms -step:435/2285 train_time:26129ms step_avg:60.07ms -step:436/2285 train_time:26188ms step_avg:60.07ms -step:437/2285 train_time:26250ms step_avg:60.07ms 
-step:438/2285 train_time:26309ms step_avg:60.07ms -step:439/2285 train_time:26370ms step_avg:60.07ms -step:440/2285 train_time:26429ms step_avg:60.07ms -step:441/2285 train_time:26491ms step_avg:60.07ms -step:442/2285 train_time:26550ms step_avg:60.07ms -step:443/2285 train_time:26610ms step_avg:60.07ms -step:444/2285 train_time:26669ms step_avg:60.07ms -step:445/2285 train_time:26730ms step_avg:60.07ms -step:446/2285 train_time:26790ms step_avg:60.07ms -step:447/2285 train_time:26851ms step_avg:60.07ms -step:448/2285 train_time:26909ms step_avg:60.07ms -step:449/2285 train_time:26971ms step_avg:60.07ms -step:450/2285 train_time:27029ms step_avg:60.07ms -step:451/2285 train_time:27091ms step_avg:60.07ms -step:452/2285 train_time:27149ms step_avg:60.07ms -step:453/2285 train_time:27211ms step_avg:60.07ms -step:454/2285 train_time:27271ms step_avg:60.07ms -step:455/2285 train_time:27332ms step_avg:60.07ms -step:456/2285 train_time:27392ms step_avg:60.07ms -step:457/2285 train_time:27453ms step_avg:60.07ms -step:458/2285 train_time:27512ms step_avg:60.07ms -step:459/2285 train_time:27573ms step_avg:60.07ms -step:460/2285 train_time:27632ms step_avg:60.07ms -step:461/2285 train_time:27694ms step_avg:60.07ms -step:462/2285 train_time:27753ms step_avg:60.07ms -step:463/2285 train_time:27814ms step_avg:60.07ms -step:464/2285 train_time:27873ms step_avg:60.07ms -step:465/2285 train_time:27934ms step_avg:60.07ms -step:466/2285 train_time:27993ms step_avg:60.07ms -step:467/2285 train_time:28054ms step_avg:60.07ms -step:468/2285 train_time:28113ms step_avg:60.07ms -step:469/2285 train_time:28175ms step_avg:60.07ms -step:470/2285 train_time:28235ms step_avg:60.07ms -step:471/2285 train_time:28296ms step_avg:60.08ms -step:472/2285 train_time:28355ms step_avg:60.07ms -step:473/2285 train_time:28416ms step_avg:60.08ms -step:474/2285 train_time:28475ms step_avg:60.07ms -step:475/2285 train_time:28537ms step_avg:60.08ms -step:476/2285 train_time:28596ms step_avg:60.08ms -step:477/2285 train_time:28657ms step_avg:60.08ms -step:478/2285 train_time:28717ms step_avg:60.08ms -step:479/2285 train_time:28778ms step_avg:60.08ms -step:480/2285 train_time:28837ms step_avg:60.08ms -step:481/2285 train_time:28898ms step_avg:60.08ms -step:482/2285 train_time:28957ms step_avg:60.08ms -step:483/2285 train_time:29019ms step_avg:60.08ms -step:484/2285 train_time:29078ms step_avg:60.08ms -step:485/2285 train_time:29140ms step_avg:60.08ms -step:486/2285 train_time:29199ms step_avg:60.08ms -step:487/2285 train_time:29260ms step_avg:60.08ms -step:488/2285 train_time:29320ms step_avg:60.08ms -step:489/2285 train_time:29382ms step_avg:60.09ms -step:490/2285 train_time:29441ms step_avg:60.08ms -step:491/2285 train_time:29502ms step_avg:60.09ms -step:492/2285 train_time:29561ms step_avg:60.08ms -step:493/2285 train_time:29623ms step_avg:60.09ms -step:494/2285 train_time:29682ms step_avg:60.08ms -step:495/2285 train_time:29744ms step_avg:60.09ms -step:496/2285 train_time:29803ms step_avg:60.09ms -step:497/2285 train_time:29864ms step_avg:60.09ms -step:498/2285 train_time:29923ms step_avg:60.09ms -step:499/2285 train_time:29984ms step_avg:60.09ms -step:500/2285 train_time:30043ms step_avg:60.09ms -step:500/2285 val_loss:3.7891 train_time:30106ms step_avg:60.21ms -step:501/2285 train_time:30125ms step_avg:60.13ms -step:502/2285 train_time:30166ms step_avg:60.09ms -step:503/2285 train_time:30227ms step_avg:60.09ms -step:504/2285 train_time:30287ms step_avg:60.09ms -step:505/2285 train_time:30349ms step_avg:60.10ms -step:506/2285 
train_time:30408ms step_avg:60.10ms -step:507/2285 train_time:30469ms step_avg:60.10ms -step:508/2285 train_time:30528ms step_avg:60.10ms -step:509/2285 train_time:30591ms step_avg:60.10ms -step:510/2285 train_time:30649ms step_avg:60.10ms -step:511/2285 train_time:30711ms step_avg:60.10ms -step:512/2285 train_time:30769ms step_avg:60.10ms -step:513/2285 train_time:30830ms step_avg:60.10ms -step:514/2285 train_time:30890ms step_avg:60.10ms -step:515/2285 train_time:30952ms step_avg:60.10ms -step:516/2285 train_time:31012ms step_avg:60.10ms -step:517/2285 train_time:31078ms step_avg:60.11ms -step:518/2285 train_time:31138ms step_avg:60.11ms -step:519/2285 train_time:31199ms step_avg:60.11ms -step:520/2285 train_time:31258ms step_avg:60.11ms -step:521/2285 train_time:31321ms step_avg:60.12ms -step:522/2285 train_time:31380ms step_avg:60.12ms -step:523/2285 train_time:31441ms step_avg:60.12ms -step:524/2285 train_time:31500ms step_avg:60.11ms -step:525/2285 train_time:31561ms step_avg:60.12ms -step:526/2285 train_time:31620ms step_avg:60.11ms -step:527/2285 train_time:31681ms step_avg:60.12ms -step:528/2285 train_time:31740ms step_avg:60.11ms -step:529/2285 train_time:31801ms step_avg:60.12ms -step:530/2285 train_time:31860ms step_avg:60.11ms -step:531/2285 train_time:31921ms step_avg:60.12ms -step:532/2285 train_time:31981ms step_avg:60.11ms -step:533/2285 train_time:32043ms step_avg:60.12ms -step:534/2285 train_time:32103ms step_avg:60.12ms -step:535/2285 train_time:32164ms step_avg:60.12ms -step:536/2285 train_time:32224ms step_avg:60.12ms -step:537/2285 train_time:32286ms step_avg:60.12ms -step:538/2285 train_time:32345ms step_avg:60.12ms -step:539/2285 train_time:32406ms step_avg:60.12ms -step:540/2285 train_time:32465ms step_avg:60.12ms -step:541/2285 train_time:32527ms step_avg:60.12ms -step:542/2285 train_time:32585ms step_avg:60.12ms -step:543/2285 train_time:32647ms step_avg:60.12ms -step:544/2285 train_time:32705ms step_avg:60.12ms -step:545/2285 train_time:32767ms step_avg:60.12ms -step:546/2285 train_time:32826ms step_avg:60.12ms -step:547/2285 train_time:32887ms step_avg:60.12ms -step:548/2285 train_time:32946ms step_avg:60.12ms -step:549/2285 train_time:33008ms step_avg:60.12ms -step:550/2285 train_time:33069ms step_avg:60.13ms -step:551/2285 train_time:33130ms step_avg:60.13ms -step:552/2285 train_time:33190ms step_avg:60.13ms -step:553/2285 train_time:33251ms step_avg:60.13ms -step:554/2285 train_time:33310ms step_avg:60.13ms -step:555/2285 train_time:33372ms step_avg:60.13ms -step:556/2285 train_time:33430ms step_avg:60.13ms -step:557/2285 train_time:33492ms step_avg:60.13ms -step:558/2285 train_time:33551ms step_avg:60.13ms -step:559/2285 train_time:33612ms step_avg:60.13ms -step:560/2285 train_time:33672ms step_avg:60.13ms -step:561/2285 train_time:33733ms step_avg:60.13ms -step:562/2285 train_time:33792ms step_avg:60.13ms -step:563/2285 train_time:33854ms step_avg:60.13ms -step:564/2285 train_time:33914ms step_avg:60.13ms -step:565/2285 train_time:33975ms step_avg:60.13ms -step:566/2285 train_time:34035ms step_avg:60.13ms -step:567/2285 train_time:34097ms step_avg:60.14ms -step:568/2285 train_time:34156ms step_avg:60.13ms -step:569/2285 train_time:34217ms step_avg:60.14ms -step:570/2285 train_time:34276ms step_avg:60.13ms -step:571/2285 train_time:34338ms step_avg:60.14ms -step:572/2285 train_time:34397ms step_avg:60.13ms -step:573/2285 train_time:34458ms step_avg:60.14ms -step:574/2285 train_time:34517ms step_avg:60.13ms -step:575/2285 train_time:34578ms step_avg:60.14ms 
-step:576/2285 train_time:34637ms step_avg:60.13ms -step:577/2285 train_time:34698ms step_avg:60.14ms -step:578/2285 train_time:34757ms step_avg:60.13ms -step:579/2285 train_time:34819ms step_avg:60.14ms -step:580/2285 train_time:34878ms step_avg:60.13ms -step:581/2285 train_time:34940ms step_avg:60.14ms -step:582/2285 train_time:34998ms step_avg:60.13ms -step:583/2285 train_time:35060ms step_avg:60.14ms -step:584/2285 train_time:35119ms step_avg:60.14ms -step:585/2285 train_time:35180ms step_avg:60.14ms -step:586/2285 train_time:35239ms step_avg:60.14ms -step:587/2285 train_time:35301ms step_avg:60.14ms -step:588/2285 train_time:35360ms step_avg:60.14ms -step:589/2285 train_time:35421ms step_avg:60.14ms -step:590/2285 train_time:35480ms step_avg:60.14ms -step:591/2285 train_time:35542ms step_avg:60.14ms -step:592/2285 train_time:35601ms step_avg:60.14ms -step:593/2285 train_time:35662ms step_avg:60.14ms -step:594/2285 train_time:35721ms step_avg:60.14ms -step:595/2285 train_time:35782ms step_avg:60.14ms -step:596/2285 train_time:35841ms step_avg:60.14ms -step:597/2285 train_time:35902ms step_avg:60.14ms -step:598/2285 train_time:35961ms step_avg:60.14ms -step:599/2285 train_time:36022ms step_avg:60.14ms -step:600/2285 train_time:36081ms step_avg:60.14ms -step:601/2285 train_time:36142ms step_avg:60.14ms -step:602/2285 train_time:36201ms step_avg:60.13ms -step:603/2285 train_time:36263ms step_avg:60.14ms -step:604/2285 train_time:36321ms step_avg:60.13ms -step:605/2285 train_time:36383ms step_avg:60.14ms -step:606/2285 train_time:36443ms step_avg:60.14ms -step:607/2285 train_time:36504ms step_avg:60.14ms -step:608/2285 train_time:36564ms step_avg:60.14ms -step:609/2285 train_time:36624ms step_avg:60.14ms -step:610/2285 train_time:36683ms step_avg:60.14ms -step:611/2285 train_time:36744ms step_avg:60.14ms -step:612/2285 train_time:36803ms step_avg:60.14ms -step:613/2285 train_time:36865ms step_avg:60.14ms -step:614/2285 train_time:36924ms step_avg:60.14ms -step:615/2285 train_time:36986ms step_avg:60.14ms -step:616/2285 train_time:37045ms step_avg:60.14ms -step:617/2285 train_time:37106ms step_avg:60.14ms -step:618/2285 train_time:37165ms step_avg:60.14ms -step:619/2285 train_time:37227ms step_avg:60.14ms -step:620/2285 train_time:37286ms step_avg:60.14ms -step:621/2285 train_time:37348ms step_avg:60.14ms -step:622/2285 train_time:37407ms step_avg:60.14ms -step:623/2285 train_time:37469ms step_avg:60.14ms -step:624/2285 train_time:37529ms step_avg:60.14ms -step:625/2285 train_time:37590ms step_avg:60.14ms -step:626/2285 train_time:37649ms step_avg:60.14ms -step:627/2285 train_time:37710ms step_avg:60.14ms -step:628/2285 train_time:37770ms step_avg:60.14ms -step:629/2285 train_time:37831ms step_avg:60.15ms -step:630/2285 train_time:37891ms step_avg:60.14ms -step:631/2285 train_time:37953ms step_avg:60.15ms -step:632/2285 train_time:38012ms step_avg:60.15ms -step:633/2285 train_time:38074ms step_avg:60.15ms -step:634/2285 train_time:38133ms step_avg:60.15ms -step:635/2285 train_time:38195ms step_avg:60.15ms -step:636/2285 train_time:38254ms step_avg:60.15ms -step:637/2285 train_time:38316ms step_avg:60.15ms -step:638/2285 train_time:38376ms step_avg:60.15ms -step:639/2285 train_time:38437ms step_avg:60.15ms -step:640/2285 train_time:38496ms step_avg:60.15ms -step:641/2285 train_time:38558ms step_avg:60.15ms -step:642/2285 train_time:38617ms step_avg:60.15ms -step:643/2285 train_time:38679ms step_avg:60.15ms -step:644/2285 train_time:38738ms step_avg:60.15ms -step:645/2285 train_time:38799ms 
step_avg:60.15ms -step:646/2285 train_time:38858ms step_avg:60.15ms -step:647/2285 train_time:38920ms step_avg:60.15ms -step:648/2285 train_time:38979ms step_avg:60.15ms -step:649/2285 train_time:39040ms step_avg:60.15ms -step:650/2285 train_time:39099ms step_avg:60.15ms -step:651/2285 train_time:39161ms step_avg:60.15ms -step:652/2285 train_time:39220ms step_avg:60.15ms -step:653/2285 train_time:39281ms step_avg:60.15ms -step:654/2285 train_time:39340ms step_avg:60.15ms -step:655/2285 train_time:39401ms step_avg:60.15ms -step:656/2285 train_time:39460ms step_avg:60.15ms -step:657/2285 train_time:39522ms step_avg:60.15ms -step:658/2285 train_time:39581ms step_avg:60.15ms -step:659/2285 train_time:39642ms step_avg:60.16ms -step:660/2285 train_time:39702ms step_avg:60.15ms -step:661/2285 train_time:39764ms step_avg:60.16ms -step:662/2285 train_time:39823ms step_avg:60.15ms -step:663/2285 train_time:39884ms step_avg:60.16ms -step:664/2285 train_time:39943ms step_avg:60.16ms -step:665/2285 train_time:40004ms step_avg:60.16ms -step:666/2285 train_time:40063ms step_avg:60.16ms -step:667/2285 train_time:40125ms step_avg:60.16ms -step:668/2285 train_time:40184ms step_avg:60.16ms -step:669/2285 train_time:40245ms step_avg:60.16ms -step:670/2285 train_time:40304ms step_avg:60.16ms -step:671/2285 train_time:40366ms step_avg:60.16ms -step:672/2285 train_time:40426ms step_avg:60.16ms -step:673/2285 train_time:40488ms step_avg:60.16ms -step:674/2285 train_time:40546ms step_avg:60.16ms -step:675/2285 train_time:40608ms step_avg:60.16ms -step:676/2285 train_time:40668ms step_avg:60.16ms -step:677/2285 train_time:40729ms step_avg:60.16ms -step:678/2285 train_time:40788ms step_avg:60.16ms -step:679/2285 train_time:40850ms step_avg:60.16ms -step:680/2285 train_time:40910ms step_avg:60.16ms -step:681/2285 train_time:40972ms step_avg:60.16ms -step:682/2285 train_time:41031ms step_avg:60.16ms -step:683/2285 train_time:41093ms step_avg:60.16ms -step:684/2285 train_time:41152ms step_avg:60.16ms -step:685/2285 train_time:41214ms step_avg:60.17ms -step:686/2285 train_time:41273ms step_avg:60.17ms -step:687/2285 train_time:41335ms step_avg:60.17ms -step:688/2285 train_time:41394ms step_avg:60.17ms -step:689/2285 train_time:41455ms step_avg:60.17ms -step:690/2285 train_time:41515ms step_avg:60.17ms -step:691/2285 train_time:41577ms step_avg:60.17ms -step:692/2285 train_time:41636ms step_avg:60.17ms -step:693/2285 train_time:41698ms step_avg:60.17ms -step:694/2285 train_time:41757ms step_avg:60.17ms -step:695/2285 train_time:41819ms step_avg:60.17ms -step:696/2285 train_time:41878ms step_avg:60.17ms -step:697/2285 train_time:41939ms step_avg:60.17ms -step:698/2285 train_time:41998ms step_avg:60.17ms -step:699/2285 train_time:42060ms step_avg:60.17ms -step:700/2285 train_time:42119ms step_avg:60.17ms -step:701/2285 train_time:42180ms step_avg:60.17ms -step:702/2285 train_time:42239ms step_avg:60.17ms -step:703/2285 train_time:42300ms step_avg:60.17ms -step:704/2285 train_time:42359ms step_avg:60.17ms -step:705/2285 train_time:42421ms step_avg:60.17ms -step:706/2285 train_time:42480ms step_avg:60.17ms -step:707/2285 train_time:42541ms step_avg:60.17ms -step:708/2285 train_time:42600ms step_avg:60.17ms -step:709/2285 train_time:42662ms step_avg:60.17ms -step:710/2285 train_time:42721ms step_avg:60.17ms -step:711/2285 train_time:42783ms step_avg:60.17ms -step:712/2285 train_time:42841ms step_avg:60.17ms -step:713/2285 train_time:42903ms step_avg:60.17ms -step:714/2285 train_time:42962ms step_avg:60.17ms -step:715/2285 
train_time:43023ms step_avg:60.17ms
+step:716/2285 train_time:43082ms step_avg:60.17ms
[per-step train_time/step_avg lines for steps 717-749 elided; step_avg holds near 60.17-60.18ms]
+step:750/2285 train_time:45138ms step_avg:60.18ms
+step:750/2285 val_loss:3.6571 train_time:45200ms step_avg:60.27ms
[per-step train_time/step_avg lines for steps 751-999 elided; step_avg creeps from ~60.21ms to ~60.34ms]
+step:1000/2285 train_time:60335ms step_avg:60.33ms
+step:1000/2285 val_loss:3.5674 train_time:60398ms step_avg:60.40ms
[per-step train_time/step_avg lines for steps 1001-1249 elided; step_avg creeps from ~60.36ms to ~60.45ms]
+step:1250/2285 train_time:75560ms step_avg:60.45ms
+step:1250/2285 val_loss:3.4957 train_time:75624ms step_avg:60.50ms
[per-step train_time/step_avg lines for steps 1251-1499 elided; step_avg creeps from ~60.47ms to ~60.56ms]
+step:1500/2285 train_time:90834ms step_avg:60.56ms
+step:1500/2285 val_loss:3.4280 train_time:90899ms step_avg:60.60ms
[per-step train_time/step_avg lines for steps 1501-1749 elided; step_avg creeps from ~60.57ms to ~60.65ms]
+step:1750/2285 train_time:106141ms step_avg:60.65ms
+step:1750/2285 val_loss:3.3665 train_time:106205ms step_avg:60.69ms
[per-step train_time/step_avg lines for steps 1751-1999 elided; step_avg creeps from ~60.66ms to ~60.72ms]
+step:2000/2285 train_time:121448ms step_avg:60.72ms
+step:2000/2285 val_loss:3.3174 train_time:121512ms step_avg:60.76ms
[per-step train_time/step_avg lines for steps 2001-2136 elided; step_avg creeps from ~60.73ms to ~60.76ms]
+step:2137/2285 train_time:129848ms
step_avg:60.76ms -step:2138/2285 train_time:129908ms step_avg:60.76ms -step:2139/2285 train_time:129971ms step_avg:60.76ms -step:2140/2285 train_time:130031ms step_avg:60.76ms -step:2141/2285 train_time:130093ms step_avg:60.76ms -step:2142/2285 train_time:130153ms step_avg:60.76ms -step:2143/2285 train_time:130215ms step_avg:60.76ms -step:2144/2285 train_time:130275ms step_avg:60.76ms -step:2145/2285 train_time:130337ms step_avg:60.76ms -step:2146/2285 train_time:130397ms step_avg:60.76ms -step:2147/2285 train_time:130459ms step_avg:60.76ms -step:2148/2285 train_time:130520ms step_avg:60.76ms -step:2149/2285 train_time:130582ms step_avg:60.76ms -step:2150/2285 train_time:130642ms step_avg:60.76ms -step:2151/2285 train_time:130705ms step_avg:60.76ms -step:2152/2285 train_time:130766ms step_avg:60.76ms -step:2153/2285 train_time:130828ms step_avg:60.77ms -step:2154/2285 train_time:130888ms step_avg:60.77ms -step:2155/2285 train_time:130951ms step_avg:60.77ms -step:2156/2285 train_time:131011ms step_avg:60.77ms -step:2157/2285 train_time:131074ms step_avg:60.77ms -step:2158/2285 train_time:131134ms step_avg:60.77ms -step:2159/2285 train_time:131196ms step_avg:60.77ms -step:2160/2285 train_time:131256ms step_avg:60.77ms -step:2161/2285 train_time:131318ms step_avg:60.77ms -step:2162/2285 train_time:131378ms step_avg:60.77ms -step:2163/2285 train_time:131441ms step_avg:60.77ms -step:2164/2285 train_time:131501ms step_avg:60.77ms -step:2165/2285 train_time:131564ms step_avg:60.77ms -step:2166/2285 train_time:131624ms step_avg:60.77ms -step:2167/2285 train_time:131686ms step_avg:60.77ms -step:2168/2285 train_time:131746ms step_avg:60.77ms -step:2169/2285 train_time:131808ms step_avg:60.77ms -step:2170/2285 train_time:131868ms step_avg:60.77ms -step:2171/2285 train_time:131931ms step_avg:60.77ms -step:2172/2285 train_time:131991ms step_avg:60.77ms -step:2173/2285 train_time:132054ms step_avg:60.77ms -step:2174/2285 train_time:132113ms step_avg:60.77ms -step:2175/2285 train_time:132175ms step_avg:60.77ms -step:2176/2285 train_time:132235ms step_avg:60.77ms -step:2177/2285 train_time:132297ms step_avg:60.77ms -step:2178/2285 train_time:132357ms step_avg:60.77ms -step:2179/2285 train_time:132419ms step_avg:60.77ms -step:2180/2285 train_time:132479ms step_avg:60.77ms -step:2181/2285 train_time:132541ms step_avg:60.77ms -step:2182/2285 train_time:132602ms step_avg:60.77ms -step:2183/2285 train_time:132665ms step_avg:60.77ms -step:2184/2285 train_time:132725ms step_avg:60.77ms -step:2185/2285 train_time:132788ms step_avg:60.77ms -step:2186/2285 train_time:132848ms step_avg:60.77ms -step:2187/2285 train_time:132911ms step_avg:60.77ms -step:2188/2285 train_time:132971ms step_avg:60.77ms -step:2189/2285 train_time:133034ms step_avg:60.77ms -step:2190/2285 train_time:133093ms step_avg:60.77ms -step:2191/2285 train_time:133156ms step_avg:60.77ms -step:2192/2285 train_time:133216ms step_avg:60.77ms -step:2193/2285 train_time:133277ms step_avg:60.77ms -step:2194/2285 train_time:133337ms step_avg:60.77ms -step:2195/2285 train_time:133399ms step_avg:60.77ms -step:2196/2285 train_time:133459ms step_avg:60.77ms -step:2197/2285 train_time:133522ms step_avg:60.77ms -step:2198/2285 train_time:133582ms step_avg:60.77ms -step:2199/2285 train_time:133645ms step_avg:60.78ms -step:2200/2285 train_time:133705ms step_avg:60.78ms -step:2201/2285 train_time:133768ms step_avg:60.78ms -step:2202/2285 train_time:133828ms step_avg:60.78ms -step:2203/2285 train_time:133890ms step_avg:60.78ms -step:2204/2285 train_time:133951ms 
step_avg:60.78ms -step:2205/2285 train_time:134013ms step_avg:60.78ms -step:2206/2285 train_time:134073ms step_avg:60.78ms -step:2207/2285 train_time:134135ms step_avg:60.78ms -step:2208/2285 train_time:134195ms step_avg:60.78ms -step:2209/2285 train_time:134257ms step_avg:60.78ms -step:2210/2285 train_time:134317ms step_avg:60.78ms -step:2211/2285 train_time:134380ms step_avg:60.78ms -step:2212/2285 train_time:134439ms step_avg:60.78ms -step:2213/2285 train_time:134502ms step_avg:60.78ms -step:2214/2285 train_time:134563ms step_avg:60.78ms -step:2215/2285 train_time:134626ms step_avg:60.78ms -step:2216/2285 train_time:134685ms step_avg:60.78ms -step:2217/2285 train_time:134748ms step_avg:60.78ms -step:2218/2285 train_time:134808ms step_avg:60.78ms -step:2219/2285 train_time:134870ms step_avg:60.78ms -step:2220/2285 train_time:134930ms step_avg:60.78ms -step:2221/2285 train_time:134992ms step_avg:60.78ms -step:2222/2285 train_time:135052ms step_avg:60.78ms -step:2223/2285 train_time:135114ms step_avg:60.78ms -step:2224/2285 train_time:135175ms step_avg:60.78ms -step:2225/2285 train_time:135237ms step_avg:60.78ms -step:2226/2285 train_time:135297ms step_avg:60.78ms -step:2227/2285 train_time:135360ms step_avg:60.78ms -step:2228/2285 train_time:135420ms step_avg:60.78ms -step:2229/2285 train_time:135482ms step_avg:60.78ms -step:2230/2285 train_time:135543ms step_avg:60.78ms -step:2231/2285 train_time:135606ms step_avg:60.78ms -step:2232/2285 train_time:135666ms step_avg:60.78ms -step:2233/2285 train_time:135729ms step_avg:60.78ms -step:2234/2285 train_time:135788ms step_avg:60.78ms -step:2235/2285 train_time:135850ms step_avg:60.78ms -step:2236/2285 train_time:135910ms step_avg:60.78ms -step:2237/2285 train_time:135973ms step_avg:60.78ms -step:2238/2285 train_time:136032ms step_avg:60.78ms -step:2239/2285 train_time:136095ms step_avg:60.78ms -step:2240/2285 train_time:136155ms step_avg:60.78ms -step:2241/2285 train_time:136217ms step_avg:60.78ms -step:2242/2285 train_time:136277ms step_avg:60.78ms -step:2243/2285 train_time:136340ms step_avg:60.78ms -step:2244/2285 train_time:136400ms step_avg:60.78ms -step:2245/2285 train_time:136463ms step_avg:60.79ms -step:2246/2285 train_time:136523ms step_avg:60.79ms -step:2247/2285 train_time:136586ms step_avg:60.79ms -step:2248/2285 train_time:136646ms step_avg:60.79ms -step:2249/2285 train_time:136708ms step_avg:60.79ms -step:2250/2285 train_time:136769ms step_avg:60.79ms -step:2250/2285 val_loss:3.2821 train_time:136832ms step_avg:60.81ms -step:2251/2285 train_time:136851ms step_avg:60.80ms -step:2252/2285 train_time:136896ms step_avg:60.79ms -step:2253/2285 train_time:136961ms step_avg:60.79ms -step:2254/2285 train_time:137022ms step_avg:60.79ms -step:2255/2285 train_time:137085ms step_avg:60.79ms -step:2256/2285 train_time:137145ms step_avg:60.79ms -step:2257/2285 train_time:137206ms step_avg:60.79ms -step:2258/2285 train_time:137266ms step_avg:60.79ms -step:2259/2285 train_time:137327ms step_avg:60.79ms -step:2260/2285 train_time:137387ms step_avg:60.79ms -step:2261/2285 train_time:137449ms step_avg:60.79ms -step:2262/2285 train_time:137508ms step_avg:60.79ms -step:2263/2285 train_time:137570ms step_avg:60.79ms -step:2264/2285 train_time:137630ms step_avg:60.79ms -step:2265/2285 train_time:137692ms step_avg:60.79ms -step:2266/2285 train_time:137753ms step_avg:60.79ms -step:2267/2285 train_time:137818ms step_avg:60.79ms -step:2268/2285 train_time:137880ms step_avg:60.79ms -step:2269/2285 train_time:137943ms step_avg:60.79ms -step:2270/2285 
train_time:138004ms step_avg:60.79ms -step:2271/2285 train_time:138067ms step_avg:60.80ms -step:2272/2285 train_time:138128ms step_avg:60.80ms -step:2273/2285 train_time:138190ms step_avg:60.80ms -step:2274/2285 train_time:138250ms step_avg:60.80ms -step:2275/2285 train_time:138312ms step_avg:60.80ms -step:2276/2285 train_time:138371ms step_avg:60.80ms -step:2277/2285 train_time:138433ms step_avg:60.80ms -step:2278/2285 train_time:138493ms step_avg:60.80ms -step:2279/2285 train_time:138555ms step_avg:60.80ms -step:2280/2285 train_time:138615ms step_avg:60.80ms -step:2281/2285 train_time:138677ms step_avg:60.80ms -step:2282/2285 train_time:138738ms step_avg:60.80ms -step:2283/2285 train_time:138802ms step_avg:60.80ms -step:2284/2285 train_time:138863ms step_avg:60.80ms -step:2285/2285 train_time:138925ms step_avg:60.80ms -step:2285/2285 val_loss:3.2766 train_time:138986ms step_avg:60.83ms -peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt b/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt deleted file mode 100644 index 473bd8971..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/6c588921-a777-458d-8003-f608774f040c.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) - w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - -@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
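For intuition, the five (a, b, c) tuples above define the per-iteration polynomial X <- a*X + b*(X X^T)X + c*(X X^T)^2 X. A minimal dense-PyTorch sketch of the same scheme (illustrative only: float64 instead of bfloat16, no fused kernels or preallocated buffers) that can be used to check that the singular values land near 1:

    import torch

    coeffs = [  # same (a, b, c) tuples as polar_express_coeffs above
        (8.156554524902461, -22.48329292557795, 15.878769915207462),
        (4.042929935166739, -2.808917465908714, 0.5000178451051316),
        (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
        (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
        (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
    ]

    def polar_express_reference(G: torch.Tensor) -> torch.Tensor:
        X = G.clone()
        if X.size(-2) > X.size(-1):
            X = X.mT                       # always iterate on the wide orientation
        X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
        for a, b, c in coeffs:
            A = X @ X.mT                   # what XXT() computes
            B = b * A + c * (A @ A)        # what ba_plus_cAA() computes
            X = a * X + B @ X              # what addmm/baddbmm computes
        return X.mT if G.size(-2) > G.size(-1) else X

    S = torch.linalg.svdvals(polar_express_reference(torch.randn(256, 512, dtype=torch.float64)))
    print(S.min().item(), S.max().item())  # both should be close to 1.0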
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
-        for group, info in zip(self.param_groups, group_infos):
-            info["reduce_future"].wait()
-
-            params = group["params"]
-            grad_chunk = info["grad_chunk"]
-            chunk_size = group["chunk_size"]
-            padded_num_params = chunk_size * self.world_size
-
-            start_idx = rank * chunk_size
-            module_idx = start_idx if start_idx < len(params) else 0
-
-            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
-
-            if "momentum_buffer" not in group:
-                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
-            momentum_buffer = group["momentum_buffer"]
-            # Apply momentum update to the persistent momentum buffer in-place
-            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
-            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
-
-            grad_shape = updated_grads.shape
-            if params[module_idx].label == 'attn':
-                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
-                for p in params[module_idx:module_idx + num_params]:
-                    assert p.label == 'attn'
-                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
-            ref_param = params[module_idx]
-            param_shape = ref_param.shape
-
-            if "second_momentum_buffer" not in group:
-                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
-                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
-                )
-            second_momentum_buffer = group["second_momentum_buffer"]
-
-            if "param_lr" not in group:
-                group["param_lr"] = (
-                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
-                    * ref_param.new_tensor(
-                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                    ).view(-1, 1, 1)
-                )
-
-                group["param_wd"] = ref_param.new_tensor(
-                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                ).view(-1, 1, 1)
-
-            # Determine LR and WD
-            eff_lr = group["lr"] * group["param_lr"]
-            eff_wd = group["weight_decay"] * group["param_wd"]
-
-            # Compute zeropower for the entire chunk in a single, batched call.
-            if num_params == 0:
-                v_chunk = updated_grads
-            elif params[module_idx].label == "smear_gate":
-                # dividing by magnitude is equivalent to orthogonalization for 1-D tensors
-                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
-            else:
-                v_chunk = polar_express(updated_grads)
-
-            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
-            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
-            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
-            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
-            v_chunk.mul_(step_size)
-            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
-
-            v_chunk = v_chunk.view(grad_shape)
-
-            updated_params = torch.empty_like(grad_chunk)
-            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
-            # Apply weight decay directly to the buffer.
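The NorMuon block in the loop above equalizes per-row (or per-column) energy of the orthogonalized update while restoring its original Frobenius norm, so only the distribution of energy changes. The same arithmetic on one tall matrix, as a self-contained sketch (shapes are illustrative):

    import torch

    def normuon_rescale(v: torch.Tensor, second_moment: torch.Tensor, beta2: float = 0.95) -> torch.Tensor:
        # v: orthogonalized update with rows >= cols; second_moment: (rows, 1) running buffer
        total_norm = v.norm()
        second_moment.lerp_(v.square().mean(dim=-1, keepdim=True), 1 - beta2)  # EMA of per-row energy
        v = v * second_moment.clamp_min(1e-10).rsqrt()                         # equalize row magnitudes
        return v * (total_norm / v.norm().clamp_min(1e-10))                    # restore overall norm

    v = torch.randn(3072, 768)
    buf = torch.zeros(3072, 1)
    u = normuon_rescale(v, buf)
    print(round(v.norm().item(), 3), round(u.norm().item(), 3))  # Frobenius norm preserved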
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
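The corrective factor follows from Muon's per-matrix scale max(1., rows / cols) ** 0.5 in step() above, which only sees the stored shape, while c_fc is applied transposed in forward(). A worked check, assuming the model_dim = 768 configuration used below:

    # c_fc is allocated as (dim, hdim) = (768, 3072) but applied as c_fc.T in forward()
    muon_scale = lambda rows, cols: max(1.0, rows / cols) ** 0.5
    print(muon_scale(768, 3072))   # 1.0 -> what step() computes from the stored shape
    print(muon_scale(3072, 768))   # 2.0 -> what the applied orientation warrants
    # lr_mul = 2. bridges the gap; c_proj is already stored in its applied
    # orientation (768 out, 3072 in), so it needs no correction.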
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
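For reference, with num_layers = 12 and world_size = 8, the flat scalars vector above packs 12 + 24 + 24 + 1 + 1 = 62 entries plus 2 padding entries (64 total); forward() below slices it as in this sketch of the index map:

    num_layers, world_size = 12, 8
    layout = {
        "skip_weights":   (0, num_layers // 2),                      # scalars[:6] (layer 0 is skipped, so not all are consumed)
        "block_lambdas":  (num_layers, 3 * num_layers),              # scalars[12:36].view(-1, 2)
        "sa_lambdas":     (3 * num_layers, 5 * num_layers),          # scalars[36:60].view(-1, 2)
        "smear_lambda":   (5 * num_layers, 5 * num_layers + 1),      # scalars[60]
        "backout_lambda": (5 * num_layers + 1, 5 * num_layers + 2),  # scalars[61]
    }
    pad = (-num_layers * 5 - 2) % world_size  # 2, so the size divides world_size
    for name, (lo, hi) in layout.items():
        print(f"{name}: scalars[{lo}:{hi}]")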
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
-    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
-        # Precompute BOS positions once per shard
-        self.tokens = tokens
-        self.size = tokens.numel()
-        self.quickload = quickload
-        if quickload:
-            # only scan first 4 million tokens, then kick off an async thread to scan the rest
-            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-            self.thread = None
-            self.ready = threading.Event()
-            self.start()
-        else:
-            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.i = 0
-        self.world_size = world_size
-        self.batch_iter = 0
-
-    def _load(self):
-        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-            self.bos_idx = self.bos_idx_async
-
-    def next_batch(self, num_tokens_local: int, max_seq_len: int):
-        # if quickload was used, repoint to the full dataset after 5 batches
-        if self.quickload and self.batch_iter == 5:
-            self.get()
-        n = len(self.bos_idx)
-        starts = [[] for _ in range(self.world_size)]
-        ends = [[] for _ in range(self.world_size)]
-
-        idx = self.i
-        for r in range(self.world_size):
-            cur_len = 0
-            while cur_len <= num_tokens_local:
-                if idx >= n:
-                    # `cur` may be unbound on the first iteration, so report the index instead
-                    raise StopIteration(f"Insufficient BOS tokens after index {idx}; hit tail of shard.")
-                cur = self.bos_idx[idx]
-                starts[r].append(cur)
-                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
-                          cur + max_seq_len,
-                          cur + num_tokens_local - cur_len + 1)
-                ends[r].append(end)
-                cur_len += end - cur
-                idx += 1
-
-            assert cur_len == num_tokens_local + 1
-        self.i = idx
-        self.batch_iter += 1
-        return starts, ends
-
-class DataPreloader:
-    # Helper for asynchronously loading next shard and indexing bos tokens
-    def __init__(self, file_iter, world_size: int = 1):
-        self.file_iter = file_iter
-        self.world_size = world_size
-        self.thread = None
-        self.data = None
-        self.ready = threading.Event()
-
-    def _load(self):
-        tokens = _load_data_shard(next(self.file_iter))
-        self.data = (tokens, BOSFinder(tokens, self.world_size))
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-        return self.data
-
-def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
-    # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
-    rank = dist.get_rank() if dist.is_initialized() else 0
-    world_size = dist.get_world_size() if dist.is_initialized() else 1
-    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
-    num_tokens = num_tokens // grad_accum_steps
-
-    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
-    if not files:
-        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
-
-    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
-    tokens = _load_data_shard(next(file_iter))
-    if align_to_bos:
-        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
-        preloader =
DataPreloader(file_iter, world_size) - preloader.start() - else: - pos = 0 # for unaligned case - - while True: - num_tokens_local = num_tokens // world_size - max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 - - if align_to_bos: - try: - seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) - start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) - except StopIteration: - # This shard is exhausted, load the next one in the next loop iteration. - tokens, finder = preloader.get() - preloader.start() - continue - - buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) - _inputs = buf[:-1] - _targets = buf[1:] - end_idxs[-1] -= 1 # last document was too long to account for _targets offset - cum_lengths = (end_idxs - start_idxs).cumsum(0) - - else: - if pos + num_tokens + 1 >= len(tokens): # should not occur for val data - tokens, pos = _load_data_shard(next(file_iter)), 0 - - pos_local = pos + rank * num_tokens_local - buf = tokens[pos_local: pos_local + num_tokens_local + 1] - _inputs = buf[:-1].view(num_tokens_local, ) - _targets = buf[1:].view(num_tokens_local, ) - - cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] - pos += num_tokens - - - _cum_lengths = torch.full((max_num_docs,), num_tokens_local) - _cum_lengths[0] = 0 - _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths - - new_params = yield ( - _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), - _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), - _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) - ) - - if new_params is not None: - # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() - new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params - assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" - num_tokens = new_num_tokens - max_seq_len = new_max_seq_len - grad_accum_steps = new_grad_accum_steps - - -# ----------------------------------------------------------------------------- -# int main - -@dataclass -class Hyperparameters: - # data - train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on - val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on - val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons - train_batch_size: int = 2048 * 16 * 8 - train_max_seq_len: int = 128 * 16 - val_batch_size: int = 4 * 64 * 1024 * 8 - # optimization - num_iterations: int = 2285 - lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat) - lr_min = 0.1 - # evaluation and logging - run_id: str = f"{uuid.uuid4()}" - val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end - save_checkpoint: bool = False - # attention masking - block_size: int = 128 - ws_schedule: tuple = (3, 5, 7, 9, 11, 13) - ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN - -args = Hyperparameters() - -data_path = os.environ.get("DATA_PATH", ".") -args.train_files = os.path.join(data_path, args.train_files) -args.val_files = os.path.join(data_path, args.val_files) - -# torchrun sets these env variables -rank = int(os.environ["RANK"]) -world_size = int(os.environ["WORLD_SIZE"]) -assert 8 % world_size == 0, "world_size must be a divisor of 8" -grad_accum_steps = 8 // world_size -assert torch.cuda.is_available() -device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) -torch.cuda.set_device(device) -dist.init_process_group(backend="nccl", device_id=device) -dist.barrier() -master_process = (rank == 0) # this process will do logging, checkpointing etc. - -# begin logging -logfile = None -if master_process: - run_id = args.run_id - os.makedirs("logs", exist_ok=True) - logfile = f"logs/{run_id}.txt" - print(logfile) -def print0(s, console=False): - if master_process: - with open(logfile, "a") as f: - if console: - print(s) - print(s, file=f) - -# begin by printing this file (the Python code) -print0(code) -print0("="*100) -# log information about the hardware/software environment this is running on -print0(f"Running Python {sys.version}") -print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") -print0(f"Running Triton version {triton.__version__}") - -def nvidia_smi(): - import subprocess # avoid top level import - return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout -print0(nvidia_smi()) -print0("="*100) - -model: nn.Module = GPT( - vocab_size=50257, - num_layers=12, - num_heads=6, - head_dim=128, - model_dim=768, - max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) -).cuda() -for m in model.modules(): - if isinstance(m, (nn.Embedding, nn.Linear)): - m.bfloat16() -for param in model.parameters(): - dist.broadcast(param.detach(), 0) - -# collect the parameters to optimize -hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] -embed_params = [p for n, p in model.named_parameters() if "embed" in n] -scalar_params = [p for p in model.parameters() if p.ndim < 2] -head_params = [model.lm_head.weight] -gate_params = [p for n, p in model.named_parameters() if "gate" in n] - -# init the optimizer(s) -# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence -# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 -optimizer1 = DistAdam( - scalar_params + head_params + embed_params, - lr=0.008, - betas=(0.65, 0.95), - eps=1e-8, - weight_decay=0.0, -) -optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0) -optimizers = [optimizer1, optimizer2] -for opt in optimizers: - for group in opt.param_groups: - group["initial_lr"] = group["lr"] - -def get_lr(step: int): - assert step < args.num_iterations - # Three part schedule: flat, linear decrease, flat - lr_schedule = args.lr_schedule - x = step / args.num_iterations - - if x < lr_schedule[0]: - return 1.0 - elif x < lr_schedule[1]: - progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0]) - lr = 1.0 - (1.0 - args.lr_min) * progress - else: - lr = args.lr_min - return lr - -def get_ws(step: int): - assert step <= args.num_iterations - x = step / (args.num_iterations + 1) - ws_idx = int(len(args.ws_schedule) * x) - return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] - -def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): - # warmup phase: linearly increase momentum from min to max - # cooldown phase: linearly decrease momentum from max to min - momentum_cd_start = args.num_iterations - muon_cooldown_steps - if step < muon_warmup_steps: - frac = step / muon_warmup_steps - momentum = momentum_min + frac * (momentum_max - momentum_min) - elif step > momentum_cd_start: - frac = (step - momentum_cd_start) / muon_cooldown_steps - momentum = momentum_max - frac * (momentum_max - momentum_min) - else: - momentum = momentum_max - return momentum - -def step_optimizers(step: int, optimizers, model): - # update lr - for optimizer in optimizers: - for group in optimizer.param_groups: - group["lr"] = group["initial_lr"] * get_lr(step) - - # set muon momentum based on step - momentum = get_muon_momentum(step) - for group in optimizers[1].param_groups: - group["momentum"] = momentum - - # on even steps, only step Muon params - # on odd steps, step all params - if step%2==0: - optimizers[1].step() - optimizers[1].zero_grad(set_to_none=True) - else: - for optimizer in optimizers: - optimizer.step() - model.zero_grad(set_to_none=True) - -model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) - -######################################## -# Warmup kernels # -######################################## - -# Warmup the training kernels, then re-initialize the state so we aren't cheating -warmup_steps = 30 -initial_state = dict(model=copy.deepcopy(model.state_dict()), - optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state -train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) -for step in range(warmup_steps): - inputs, targets, cum_seqlens = next(train_loader) - # each window size is a new graph, need to warm up each with Yarn.attn_scale - ws_idx = step % len(args.ws_schedule) - if ws_idx==0: - model.yarn.reset() - ws_long = args.ws_schedule[0] - else: - new_ws_long = args.ws_schedule[ws_idx] - if new_ws_long > ws_long: - model.yarn.apply(ws_long, new_ws_long) - ws_long = new_ws_long - model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() - for opt in optimizers: - opt.step() - model.zero_grad(set_to_none=True) -model.yarn.reset() # 
rotary buffer is not stored in state_dict -model.load_state_dict(initial_state["model"]) -optimizer2.reset() # momentum buffer not in state dict -for opt, opt_state in zip(optimizers, initial_state["optimizers"]): - opt.load_state_dict(opt_state) -del train_loader, initial_state - -######################################## -# Training and validation # -######################################## - -train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) -training_time_ms = 0 -# start the clock -torch.cuda.synchronize() -t0 = time.perf_counter() -# begin training -train_steps = args.num_iterations -ws_short, ws_long = get_ws(0) -for step in range(train_steps + 1): - last_step = (step == train_steps) - ws_short, new_ws_long = get_ws(step) - if new_ws_long != ws_long: - model.yarn.apply(ws_long, new_ws_long) - ws_long=new_ws_long - - # --------------- VALIDATION SECTION ----------------- - if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): - if last_step: - ws_long = args.ws_validate_post_yarn_ext - # stop the clock - torch.cuda.synchronize() - training_time_ms += 1000 * (time.perf_counter() - t0) - model.eval() - assert args.val_tokens % args.val_batch_size == 0 - val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size - val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) - val_loss = 0 - with torch.no_grad(): - for _ in range(val_steps): - inputs, targets, cum_seqlens = next(val_loader) - val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) - val_loss /= val_steps - del val_loader - dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) - print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) - model.train() - # start the clock again - torch.cuda.synchronize() - t0 = time.perf_counter() - - if last_step: - if master_process and args.save_checkpoint: - log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) - os.makedirs(f"logs/{run_id}", exist_ok=True) - torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") - # the last step only has the validation loop, so break to avoid training - break - - # --------------- TRAINING SECTION ----------------- - loss = 0 - for _ in range(grad_accum_steps): - inputs, targets, cum_seqlens = next(train_loader) - loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps - loss.backward() - step_optimizers(step, optimizers, model) - - # logging - approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) - print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) - -print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " - f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) -dist.destroy_process_group() - -==================================================================================================== -Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] -Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 -Running Triton version 3.5.0 -Tue Oct 28 01:55:48 2025 -+-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | -| N/A 40C P0 128W / 700W | 5858MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | -| N/A 33C P0 128W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | -| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | -| N/A 37C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | -| N/A 38C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | -| N/A 32C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | -| N/A 37C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | -| N/A 31C P0 116W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -+-----------------------------------------------------------------------------------------+ - -==================================================================================================== -step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms -step:1/2285 train_time:116ms step_avg:115.76ms -step:2/2285 train_time:137ms step_avg:68.68ms -step:3/2285 train_time:174ms step_avg:58.06ms -step:4/2285 train_time:230ms step_avg:57.53ms -step:5/2285 train_time:289ms step_avg:57.89ms -step:6/2285 train_time:348ms step_avg:57.93ms -step:7/2285 train_time:408ms step_avg:58.30ms -step:8/2285 train_time:466ms step_avg:58.28ms -step:9/2285 train_time:527ms step_avg:58.60ms -step:10/2285 train_time:585ms step_avg:58.55ms -step:11/2285 train_time:646ms step_avg:58.73ms -step:12/2285 train_time:704ms step_avg:58.68ms -step:13/2285 train_time:765ms step_avg:58.87ms -step:14/2285 train_time:824ms step_avg:58.83ms -step:15/2285 train_time:884ms step_avg:58.96ms -step:16/2285 train_time:943ms step_avg:58.95ms -step:17/2285 
train_time:1006ms step_avg:59.20ms -step:18/2285 train_time:1070ms step_avg:59.42ms -step:19/2285 train_time:1134ms step_avg:59.70ms -step:20/2285 train_time:1195ms step_avg:59.73ms -step:21/2285 train_time:1257ms step_avg:59.85ms -step:22/2285 train_time:1315ms step_avg:59.79ms -step:23/2285 train_time:1377ms step_avg:59.89ms -step:24/2285 train_time:1436ms step_avg:59.85ms -step:25/2285 train_time:1498ms step_avg:59.91ms -step:26/2285 train_time:1557ms step_avg:59.88ms -step:27/2285 train_time:1618ms step_avg:59.94ms -step:28/2285 train_time:1677ms step_avg:59.90ms -step:29/2285 train_time:1738ms step_avg:59.93ms -step:30/2285 train_time:1796ms step_avg:59.88ms -step:31/2285 train_time:1858ms step_avg:59.93ms -step:32/2285 train_time:1917ms step_avg:59.90ms -step:33/2285 train_time:1979ms step_avg:59.97ms -step:34/2285 train_time:2039ms step_avg:59.98ms -step:35/2285 train_time:2103ms step_avg:60.07ms -step:36/2285 train_time:2164ms step_avg:60.11ms -step:37/2285 train_time:2227ms step_avg:60.18ms -step:38/2285 train_time:2286ms step_avg:60.15ms -step:39/2285 train_time:2347ms step_avg:60.19ms -step:40/2285 train_time:2407ms step_avg:60.18ms -step:41/2285 train_time:2467ms step_avg:60.18ms -step:42/2285 train_time:2526ms step_avg:60.15ms -step:43/2285 train_time:2588ms step_avg:60.18ms -step:44/2285 train_time:2646ms step_avg:60.14ms -step:45/2285 train_time:2708ms step_avg:60.18ms -step:46/2285 train_time:2768ms step_avg:60.18ms -step:47/2285 train_time:2830ms step_avg:60.20ms -step:48/2285 train_time:2888ms step_avg:60.17ms -step:49/2285 train_time:2950ms step_avg:60.20ms -step:50/2285 train_time:3009ms step_avg:60.18ms -step:51/2285 train_time:3070ms step_avg:60.21ms -step:52/2285 train_time:3130ms step_avg:60.18ms -step:53/2285 train_time:3191ms step_avg:60.21ms -step:54/2285 train_time:3250ms step_avg:60.19ms -step:55/2285 train_time:3312ms step_avg:60.21ms -step:56/2285 train_time:3371ms step_avg:60.20ms -step:57/2285 train_time:3432ms step_avg:60.21ms -step:58/2285 train_time:3490ms step_avg:60.18ms -step:59/2285 train_time:3551ms step_avg:60.19ms -step:60/2285 train_time:3610ms step_avg:60.17ms -step:61/2285 train_time:3672ms step_avg:60.19ms -step:62/2285 train_time:3730ms step_avg:60.16ms -step:63/2285 train_time:3792ms step_avg:60.19ms -step:64/2285 train_time:3850ms step_avg:60.16ms -step:65/2285 train_time:3911ms step_avg:60.17ms -step:66/2285 train_time:3970ms step_avg:60.15ms -step:67/2285 train_time:4032ms step_avg:60.18ms -step:68/2285 train_time:4091ms step_avg:60.15ms -step:69/2285 train_time:4152ms step_avg:60.17ms -step:70/2285 train_time:4211ms step_avg:60.15ms -step:71/2285 train_time:4271ms step_avg:60.16ms -step:72/2285 train_time:4330ms step_avg:60.14ms -step:73/2285 train_time:4391ms step_avg:60.15ms -step:74/2285 train_time:4450ms step_avg:60.13ms -step:75/2285 train_time:4511ms step_avg:60.15ms -step:76/2285 train_time:4570ms step_avg:60.13ms -step:77/2285 train_time:4631ms step_avg:60.14ms -step:78/2285 train_time:4690ms step_avg:60.13ms -step:79/2285 train_time:4751ms step_avg:60.14ms -step:80/2285 train_time:4809ms step_avg:60.12ms -step:81/2285 train_time:4871ms step_avg:60.14ms -step:82/2285 train_time:4930ms step_avg:60.12ms -step:83/2285 train_time:4992ms step_avg:60.14ms -step:84/2285 train_time:5050ms step_avg:60.12ms -step:85/2285 train_time:5112ms step_avg:60.14ms -step:86/2285 train_time:5170ms step_avg:60.12ms -step:87/2285 train_time:5232ms step_avg:60.14ms -step:88/2285 train_time:5290ms step_avg:60.12ms -step:89/2285 train_time:5351ms 
step_avg:60.13ms -step:90/2285 train_time:5410ms step_avg:60.11ms -step:91/2285 train_time:5471ms step_avg:60.12ms -step:92/2285 train_time:5530ms step_avg:60.11ms -step:93/2285 train_time:5590ms step_avg:60.11ms -step:94/2285 train_time:5649ms step_avg:60.10ms -step:95/2285 train_time:5710ms step_avg:60.11ms -step:96/2285 train_time:5769ms step_avg:60.09ms -step:97/2285 train_time:5830ms step_avg:60.10ms -step:98/2285 train_time:5889ms step_avg:60.09ms -step:99/2285 train_time:5950ms step_avg:60.10ms -step:100/2285 train_time:6009ms step_avg:60.09ms -step:101/2285 train_time:6070ms step_avg:60.10ms -step:102/2285 train_time:6129ms step_avg:60.09ms -step:103/2285 train_time:6190ms step_avg:60.10ms -step:104/2285 train_time:6249ms step_avg:60.09ms -step:105/2285 train_time:6310ms step_avg:60.10ms -step:106/2285 train_time:6369ms step_avg:60.08ms -step:107/2285 train_time:6430ms step_avg:60.09ms -step:108/2285 train_time:6489ms step_avg:60.08ms -step:109/2285 train_time:6549ms step_avg:60.09ms -step:110/2285 train_time:6608ms step_avg:60.07ms -step:111/2285 train_time:6669ms step_avg:60.08ms -step:112/2285 train_time:6727ms step_avg:60.07ms -step:113/2285 train_time:6788ms step_avg:60.07ms -step:114/2285 train_time:6847ms step_avg:60.06ms -step:115/2285 train_time:6908ms step_avg:60.07ms -step:116/2285 train_time:6967ms step_avg:60.06ms -step:117/2285 train_time:7029ms step_avg:60.08ms -step:118/2285 train_time:7088ms step_avg:60.07ms -step:119/2285 train_time:7149ms step_avg:60.08ms -step:120/2285 train_time:7208ms step_avg:60.07ms -step:121/2285 train_time:7269ms step_avg:60.08ms -step:122/2285 train_time:7328ms step_avg:60.07ms -step:123/2285 train_time:7389ms step_avg:60.07ms -step:124/2285 train_time:7447ms step_avg:60.06ms -step:125/2285 train_time:7509ms step_avg:60.07ms -step:126/2285 train_time:7567ms step_avg:60.06ms -step:127/2285 train_time:7628ms step_avg:60.06ms -step:128/2285 train_time:7687ms step_avg:60.05ms -step:129/2285 train_time:7748ms step_avg:60.06ms -step:130/2285 train_time:7807ms step_avg:60.05ms -step:131/2285 train_time:7868ms step_avg:60.06ms -step:132/2285 train_time:7927ms step_avg:60.05ms -step:133/2285 train_time:7988ms step_avg:60.06ms -step:134/2285 train_time:8047ms step_avg:60.05ms -step:135/2285 train_time:8108ms step_avg:60.06ms -step:136/2285 train_time:8167ms step_avg:60.05ms -step:137/2285 train_time:8228ms step_avg:60.06ms -step:138/2285 train_time:8287ms step_avg:60.05ms -step:139/2285 train_time:8348ms step_avg:60.06ms -step:140/2285 train_time:8407ms step_avg:60.05ms -step:141/2285 train_time:8468ms step_avg:60.06ms -step:142/2285 train_time:8527ms step_avg:60.05ms -step:143/2285 train_time:8588ms step_avg:60.06ms -step:144/2285 train_time:8646ms step_avg:60.04ms -step:145/2285 train_time:8708ms step_avg:60.05ms -step:146/2285 train_time:8766ms step_avg:60.04ms -step:147/2285 train_time:8828ms step_avg:60.05ms -step:148/2285 train_time:8886ms step_avg:60.04ms -step:149/2285 train_time:8947ms step_avg:60.05ms -step:150/2285 train_time:9006ms step_avg:60.04ms -step:151/2285 train_time:9067ms step_avg:60.05ms -step:152/2285 train_time:9127ms step_avg:60.04ms -step:153/2285 train_time:9188ms step_avg:60.05ms -step:154/2285 train_time:9246ms step_avg:60.04ms -step:155/2285 train_time:9307ms step_avg:60.05ms -step:156/2285 train_time:9366ms step_avg:60.04ms -step:157/2285 train_time:9428ms step_avg:60.05ms -step:158/2285 train_time:9486ms step_avg:60.04ms -step:159/2285 train_time:9547ms step_avg:60.04ms -step:160/2285 train_time:9605ms 
step_avg:60.03ms -step:161/2285 train_time:9667ms step_avg:60.04ms -step:162/2285 train_time:9726ms step_avg:60.04ms -step:163/2285 train_time:9787ms step_avg:60.04ms -step:164/2285 train_time:9845ms step_avg:60.03ms -step:165/2285 train_time:9906ms step_avg:60.04ms -step:166/2285 train_time:9965ms step_avg:60.03ms -step:167/2285 train_time:10026ms step_avg:60.04ms -step:168/2285 train_time:10086ms step_avg:60.03ms -step:169/2285 train_time:10146ms step_avg:60.04ms -step:170/2285 train_time:10205ms step_avg:60.03ms -step:171/2285 train_time:10266ms step_avg:60.04ms -step:172/2285 train_time:10325ms step_avg:60.03ms -step:173/2285 train_time:10387ms step_avg:60.04ms -step:174/2285 train_time:10445ms step_avg:60.03ms -step:175/2285 train_time:10506ms step_avg:60.04ms -step:176/2285 train_time:10565ms step_avg:60.03ms -step:177/2285 train_time:10627ms step_avg:60.04ms -step:178/2285 train_time:10685ms step_avg:60.03ms -step:179/2285 train_time:10746ms step_avg:60.03ms -step:180/2285 train_time:10805ms step_avg:60.03ms -step:181/2285 train_time:10866ms step_avg:60.03ms -step:182/2285 train_time:10926ms step_avg:60.04ms -step:183/2285 train_time:10987ms step_avg:60.04ms -step:184/2285 train_time:11045ms step_avg:60.03ms -step:185/2285 train_time:11106ms step_avg:60.03ms -step:186/2285 train_time:11165ms step_avg:60.03ms -step:187/2285 train_time:11226ms step_avg:60.03ms -step:188/2285 train_time:11285ms step_avg:60.03ms -step:189/2285 train_time:11346ms step_avg:60.03ms -step:190/2285 train_time:11405ms step_avg:60.03ms -step:191/2285 train_time:11466ms step_avg:60.03ms -step:192/2285 train_time:11525ms step_avg:60.02ms -step:193/2285 train_time:11586ms step_avg:60.03ms -step:194/2285 train_time:11644ms step_avg:60.02ms -step:195/2285 train_time:11705ms step_avg:60.03ms -step:196/2285 train_time:11764ms step_avg:60.02ms -step:197/2285 train_time:11825ms step_avg:60.03ms -step:198/2285 train_time:11884ms step_avg:60.02ms -step:199/2285 train_time:11946ms step_avg:60.03ms -step:200/2285 train_time:12005ms step_avg:60.02ms -step:201/2285 train_time:12066ms step_avg:60.03ms -step:202/2285 train_time:12125ms step_avg:60.03ms -step:203/2285 train_time:12186ms step_avg:60.03ms -step:204/2285 train_time:12245ms step_avg:60.02ms -step:205/2285 train_time:12306ms step_avg:60.03ms -step:206/2285 train_time:12366ms step_avg:60.03ms -step:207/2285 train_time:12427ms step_avg:60.03ms -step:208/2285 train_time:12485ms step_avg:60.02ms -step:209/2285 train_time:12546ms step_avg:60.03ms -step:210/2285 train_time:12606ms step_avg:60.03ms -step:211/2285 train_time:12667ms step_avg:60.04ms -step:212/2285 train_time:12726ms step_avg:60.03ms -step:213/2285 train_time:12787ms step_avg:60.03ms -step:214/2285 train_time:12846ms step_avg:60.03ms -step:215/2285 train_time:12907ms step_avg:60.03ms -step:216/2285 train_time:12966ms step_avg:60.03ms -step:217/2285 train_time:13027ms step_avg:60.03ms -step:218/2285 train_time:13086ms step_avg:60.03ms -step:219/2285 train_time:13147ms step_avg:60.03ms -step:220/2285 train_time:13206ms step_avg:60.03ms -step:221/2285 train_time:13267ms step_avg:60.03ms -step:222/2285 train_time:13326ms step_avg:60.03ms -step:223/2285 train_time:13387ms step_avg:60.03ms -step:224/2285 train_time:13445ms step_avg:60.02ms -step:225/2285 train_time:13507ms step_avg:60.03ms -step:226/2285 train_time:13565ms step_avg:60.02ms -step:227/2285 train_time:13627ms step_avg:60.03ms -step:228/2285 train_time:13685ms step_avg:60.02ms -step:229/2285 train_time:13746ms step_avg:60.03ms -step:230/2285 
train_time:13805ms step_avg:60.02ms -step:231/2285 train_time:13866ms step_avg:60.03ms -step:232/2285 train_time:13925ms step_avg:60.02ms -step:233/2285 train_time:13986ms step_avg:60.03ms -step:234/2285 train_time:14045ms step_avg:60.02ms -step:235/2285 train_time:14107ms step_avg:60.03ms -step:236/2285 train_time:14166ms step_avg:60.02ms -step:237/2285 train_time:14227ms step_avg:60.03ms -step:238/2285 train_time:14286ms step_avg:60.02ms -step:239/2285 train_time:14346ms step_avg:60.03ms -step:240/2285 train_time:14406ms step_avg:60.02ms -step:241/2285 train_time:14467ms step_avg:60.03ms -step:242/2285 train_time:14526ms step_avg:60.02ms -step:243/2285 train_time:14586ms step_avg:60.03ms -step:244/2285 train_time:14645ms step_avg:60.02ms -step:245/2285 train_time:14706ms step_avg:60.02ms -step:246/2285 train_time:14765ms step_avg:60.02ms -step:247/2285 train_time:14826ms step_avg:60.02ms -step:248/2285 train_time:14884ms step_avg:60.02ms -step:249/2285 train_time:14945ms step_avg:60.02ms -step:250/2285 train_time:15004ms step_avg:60.01ms -step:250/2285 val_loss:4.0876 train_time:15067ms step_avg:60.27ms -step:251/2285 train_time:15086ms step_avg:60.10ms -step:252/2285 train_time:15126ms step_avg:60.02ms -step:253/2285 train_time:15194ms step_avg:60.06ms -step:254/2285 train_time:15259ms step_avg:60.08ms -step:255/2285 train_time:15320ms step_avg:60.08ms -step:256/2285 train_time:15379ms step_avg:60.07ms -step:257/2285 train_time:15439ms step_avg:60.07ms -step:258/2285 train_time:15497ms step_avg:60.07ms -step:259/2285 train_time:15557ms step_avg:60.07ms -step:260/2285 train_time:15615ms step_avg:60.06ms -step:261/2285 train_time:15676ms step_avg:60.06ms -step:262/2285 train_time:15733ms step_avg:60.05ms -step:263/2285 train_time:15793ms step_avg:60.05ms -step:264/2285 train_time:15851ms step_avg:60.04ms -step:265/2285 train_time:15911ms step_avg:60.04ms -step:266/2285 train_time:15969ms step_avg:60.03ms -step:267/2285 train_time:16031ms step_avg:60.04ms -step:268/2285 train_time:16090ms step_avg:60.04ms -step:269/2285 train_time:16153ms step_avg:60.05ms -step:270/2285 train_time:16216ms step_avg:60.06ms -step:271/2285 train_time:16277ms step_avg:60.06ms -step:272/2285 train_time:16335ms step_avg:60.06ms -step:273/2285 train_time:16397ms step_avg:60.06ms -step:274/2285 train_time:16455ms step_avg:60.06ms -step:275/2285 train_time:16516ms step_avg:60.06ms -step:276/2285 train_time:16576ms step_avg:60.06ms -step:277/2285 train_time:16635ms step_avg:60.05ms -step:278/2285 train_time:16693ms step_avg:60.05ms -step:279/2285 train_time:16753ms step_avg:60.05ms -step:280/2285 train_time:16811ms step_avg:60.04ms -step:281/2285 train_time:16870ms step_avg:60.04ms -step:282/2285 train_time:16928ms step_avg:60.03ms -step:283/2285 train_time:16989ms step_avg:60.03ms -step:284/2285 train_time:17047ms step_avg:60.02ms -step:285/2285 train_time:17109ms step_avg:60.03ms -step:286/2285 train_time:17169ms step_avg:60.03ms -step:287/2285 train_time:17231ms step_avg:60.04ms -step:288/2285 train_time:17290ms step_avg:60.04ms -step:289/2285 train_time:17352ms step_avg:60.04ms -step:290/2285 train_time:17411ms step_avg:60.04ms -step:291/2285 train_time:17472ms step_avg:60.04ms -step:292/2285 train_time:17531ms step_avg:60.04ms -step:293/2285 train_time:17592ms step_avg:60.04ms -step:294/2285 train_time:17651ms step_avg:60.04ms -step:295/2285 train_time:17712ms step_avg:60.04ms -step:296/2285 train_time:17770ms step_avg:60.03ms -step:297/2285 train_time:17829ms step_avg:60.03ms -step:298/2285 train_time:17887ms 
step_avg:60.02ms -step:299/2285 train_time:17948ms step_avg:60.03ms -step:300/2285 train_time:18006ms step_avg:60.02ms -step:301/2285 train_time:18067ms step_avg:60.02ms -step:302/2285 train_time:18126ms step_avg:60.02ms -step:303/2285 train_time:18187ms step_avg:60.02ms -step:304/2285 train_time:18246ms step_avg:60.02ms -step:305/2285 train_time:18309ms step_avg:60.03ms -step:306/2285 train_time:18367ms step_avg:60.02ms -step:307/2285 train_time:18429ms step_avg:60.03ms -step:308/2285 train_time:18488ms step_avg:60.03ms -step:309/2285 train_time:18549ms step_avg:60.03ms -step:310/2285 train_time:18609ms step_avg:60.03ms -step:311/2285 train_time:18669ms step_avg:60.03ms -step:312/2285 train_time:18728ms step_avg:60.02ms -step:313/2285 train_time:18788ms step_avg:60.03ms -step:314/2285 train_time:18846ms step_avg:60.02ms -step:315/2285 train_time:18907ms step_avg:60.02ms -step:316/2285 train_time:18965ms step_avg:60.02ms -step:317/2285 train_time:19027ms step_avg:60.02ms -step:318/2285 train_time:19085ms step_avg:60.01ms -step:319/2285 train_time:19147ms step_avg:60.02ms -step:320/2285 train_time:19206ms step_avg:60.02ms -step:321/2285 train_time:19268ms step_avg:60.03ms -step:322/2285 train_time:19327ms step_avg:60.02ms -step:323/2285 train_time:19389ms step_avg:60.03ms -step:324/2285 train_time:19448ms step_avg:60.02ms -step:325/2285 train_time:19510ms step_avg:60.03ms -step:326/2285 train_time:19569ms step_avg:60.03ms -step:327/2285 train_time:19629ms step_avg:60.03ms -step:328/2285 train_time:19688ms step_avg:60.02ms -step:329/2285 train_time:19748ms step_avg:60.02ms -step:330/2285 train_time:19807ms step_avg:60.02ms -step:331/2285 train_time:19867ms step_avg:60.02ms -step:332/2285 train_time:19925ms step_avg:60.02ms -step:333/2285 train_time:19986ms step_avg:60.02ms -step:334/2285 train_time:20045ms step_avg:60.01ms -step:335/2285 train_time:20105ms step_avg:60.02ms -step:336/2285 train_time:20164ms step_avg:60.01ms -step:337/2285 train_time:20225ms step_avg:60.01ms -step:338/2285 train_time:20284ms step_avg:60.01ms -step:339/2285 train_time:20345ms step_avg:60.01ms -step:340/2285 train_time:20404ms step_avg:60.01ms -step:341/2285 train_time:20467ms step_avg:60.02ms -step:342/2285 train_time:20526ms step_avg:60.02ms -step:343/2285 train_time:20587ms step_avg:60.02ms -step:344/2285 train_time:20646ms step_avg:60.02ms -step:345/2285 train_time:20707ms step_avg:60.02ms -step:346/2285 train_time:20766ms step_avg:60.02ms -step:347/2285 train_time:20827ms step_avg:60.02ms -step:348/2285 train_time:20885ms step_avg:60.01ms -step:349/2285 train_time:20946ms step_avg:60.02ms -step:350/2285 train_time:21004ms step_avg:60.01ms -step:351/2285 train_time:21065ms step_avg:60.01ms -step:352/2285 train_time:21123ms step_avg:60.01ms -step:353/2285 train_time:21184ms step_avg:60.01ms -step:354/2285 train_time:21242ms step_avg:60.01ms -step:355/2285 train_time:21304ms step_avg:60.01ms -step:356/2285 train_time:21362ms step_avg:60.01ms -step:357/2285 train_time:21425ms step_avg:60.01ms -step:358/2285 train_time:21483ms step_avg:60.01ms -step:359/2285 train_time:21544ms step_avg:60.01ms -step:360/2285 train_time:21603ms step_avg:60.01ms -step:361/2285 train_time:21664ms step_avg:60.01ms -step:362/2285 train_time:21723ms step_avg:60.01ms -step:363/2285 train_time:21783ms step_avg:60.01ms -step:364/2285 train_time:21842ms step_avg:60.01ms -step:365/2285 train_time:21902ms step_avg:60.01ms -step:366/2285 train_time:21960ms step_avg:60.00ms -step:367/2285 train_time:22021ms step_avg:60.00ms -step:368/2285 
train_time:22079ms step_avg:60.00ms -step:369/2285 train_time:22140ms step_avg:60.00ms -step:370/2285 train_time:22197ms step_avg:59.99ms -step:371/2285 train_time:22259ms step_avg:60.00ms -step:372/2285 train_time:22317ms step_avg:59.99ms -step:373/2285 train_time:22378ms step_avg:60.00ms -step:374/2285 train_time:22437ms step_avg:59.99ms -step:375/2285 train_time:22497ms step_avg:59.99ms -step:376/2285 train_time:22556ms step_avg:59.99ms -step:377/2285 train_time:22617ms step_avg:59.99ms -step:378/2285 train_time:22676ms step_avg:59.99ms -step:379/2285 train_time:22736ms step_avg:59.99ms -step:380/2285 train_time:22794ms step_avg:59.99ms -step:381/2285 train_time:22855ms step_avg:59.99ms -step:382/2285 train_time:22914ms step_avg:59.98ms -step:383/2285 train_time:22975ms step_avg:59.99ms -step:384/2285 train_time:23034ms step_avg:59.98ms -step:385/2285 train_time:23095ms step_avg:59.99ms -step:386/2285 train_time:23154ms step_avg:59.98ms -step:387/2285 train_time:23215ms step_avg:59.99ms -step:388/2285 train_time:23274ms step_avg:59.98ms -step:389/2285 train_time:23335ms step_avg:59.99ms -step:390/2285 train_time:23394ms step_avg:59.99ms -step:391/2285 train_time:23456ms step_avg:59.99ms -step:392/2285 train_time:23515ms step_avg:59.99ms -step:393/2285 train_time:23576ms step_avg:59.99ms -step:394/2285 train_time:23635ms step_avg:59.99ms -step:395/2285 train_time:23696ms step_avg:59.99ms -step:396/2285 train_time:23755ms step_avg:59.99ms -step:397/2285 train_time:23817ms step_avg:59.99ms -step:398/2285 train_time:23875ms step_avg:59.99ms -step:399/2285 train_time:23936ms step_avg:59.99ms -step:400/2285 train_time:23995ms step_avg:59.99ms -step:401/2285 train_time:24055ms step_avg:59.99ms -step:402/2285 train_time:24114ms step_avg:59.99ms -step:403/2285 train_time:24175ms step_avg:59.99ms -step:404/2285 train_time:24234ms step_avg:59.98ms -step:405/2285 train_time:24296ms step_avg:59.99ms -step:406/2285 train_time:24355ms step_avg:59.99ms -step:407/2285 train_time:24417ms step_avg:59.99ms -step:408/2285 train_time:24475ms step_avg:59.99ms -step:409/2285 train_time:24536ms step_avg:59.99ms -step:410/2285 train_time:24595ms step_avg:59.99ms -step:411/2285 train_time:24657ms step_avg:59.99ms -step:412/2285 train_time:24715ms step_avg:59.99ms -step:413/2285 train_time:24776ms step_avg:59.99ms -step:414/2285 train_time:24835ms step_avg:59.99ms -step:415/2285 train_time:24896ms step_avg:59.99ms -step:416/2285 train_time:24955ms step_avg:59.99ms -step:417/2285 train_time:25017ms step_avg:59.99ms -step:418/2285 train_time:25075ms step_avg:59.99ms -step:419/2285 train_time:25136ms step_avg:59.99ms -step:420/2285 train_time:25195ms step_avg:59.99ms -step:421/2285 train_time:25256ms step_avg:59.99ms -step:422/2285 train_time:25315ms step_avg:59.99ms -step:423/2285 train_time:25376ms step_avg:59.99ms -step:424/2285 train_time:25435ms step_avg:59.99ms -step:425/2285 train_time:25496ms step_avg:59.99ms -step:426/2285 train_time:25555ms step_avg:59.99ms -step:427/2285 train_time:25617ms step_avg:59.99ms -step:428/2285 train_time:25675ms step_avg:59.99ms -step:429/2285 train_time:25736ms step_avg:59.99ms -step:430/2285 train_time:25795ms step_avg:59.99ms -step:431/2285 train_time:25856ms step_avg:59.99ms -step:432/2285 train_time:25915ms step_avg:59.99ms -step:433/2285 train_time:25976ms step_avg:59.99ms -step:434/2285 train_time:26035ms step_avg:59.99ms -step:435/2285 train_time:26096ms step_avg:59.99ms -step:436/2285 train_time:26155ms step_avg:59.99ms -step:437/2285 train_time:26216ms step_avg:59.99ms 
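
Aside on the logit softcap in the forward pass printed above: 30 * torch.sigmoid(logits / 7.5) equals the shifted tanh cap 15 * (tanh(logits / 15) + 1), via the identity 2*sigmoid(2y) = tanh(y) + 1 with y = logits / 15. A minimal numeric check (illustrative sketch, not part of the training script):

import torch

x = torch.linspace(-100.0, 100.0, steps=9)
soft = 30 * torch.sigmoid(x / 7.5)    # form used in GPT.forward above
ref = 15 * (torch.tanh(x / 15) + 1)   # equivalent shifted tanh softcap
assert torch.allclose(soft, ref, atol=1e-5)
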
-step:438/2285 train_time:26275ms step_avg:59.99ms -step:439/2285 train_time:26336ms step_avg:59.99ms -step:440/2285 train_time:26394ms step_avg:59.99ms -step:441/2285 train_time:26456ms step_avg:59.99ms -step:442/2285 train_time:26515ms step_avg:59.99ms -step:443/2285 train_time:26576ms step_avg:59.99ms -step:444/2285 train_time:26635ms step_avg:59.99ms -step:445/2285 train_time:26696ms step_avg:59.99ms -step:446/2285 train_time:26755ms step_avg:59.99ms -step:447/2285 train_time:26816ms step_avg:59.99ms -step:448/2285 train_time:26875ms step_avg:59.99ms -step:449/2285 train_time:26936ms step_avg:59.99ms -step:450/2285 train_time:26995ms step_avg:59.99ms -step:451/2285 train_time:27056ms step_avg:59.99ms -step:452/2285 train_time:27115ms step_avg:59.99ms -step:453/2285 train_time:27177ms step_avg:59.99ms -step:454/2285 train_time:27236ms step_avg:59.99ms -step:455/2285 train_time:27297ms step_avg:59.99ms -step:456/2285 train_time:27356ms step_avg:59.99ms -step:457/2285 train_time:27417ms step_avg:59.99ms -step:458/2285 train_time:27476ms step_avg:59.99ms -step:459/2285 train_time:27537ms step_avg:59.99ms -step:460/2285 train_time:27595ms step_avg:59.99ms -step:461/2285 train_time:27656ms step_avg:59.99ms -step:462/2285 train_time:27715ms step_avg:59.99ms -step:463/2285 train_time:27779ms step_avg:60.00ms -step:464/2285 train_time:27834ms step_avg:59.99ms -step:465/2285 train_time:27895ms step_avg:59.99ms -step:466/2285 train_time:27954ms step_avg:59.99ms -step:467/2285 train_time:28015ms step_avg:59.99ms -step:468/2285 train_time:28074ms step_avg:59.99ms -step:469/2285 train_time:28136ms step_avg:59.99ms -step:470/2285 train_time:28195ms step_avg:59.99ms -step:471/2285 train_time:28256ms step_avg:59.99ms -step:472/2285 train_time:28315ms step_avg:59.99ms -step:473/2285 train_time:28376ms step_avg:59.99ms -step:474/2285 train_time:28435ms step_avg:59.99ms -step:475/2285 train_time:28496ms step_avg:59.99ms -step:476/2285 train_time:28555ms step_avg:59.99ms -step:477/2285 train_time:28616ms step_avg:59.99ms -step:478/2285 train_time:28675ms step_avg:59.99ms -step:479/2285 train_time:28736ms step_avg:59.99ms -step:480/2285 train_time:28795ms step_avg:59.99ms -step:481/2285 train_time:28856ms step_avg:59.99ms -step:482/2285 train_time:28915ms step_avg:59.99ms -step:483/2285 train_time:28975ms step_avg:59.99ms -step:484/2285 train_time:29034ms step_avg:59.99ms -step:485/2285 train_time:29095ms step_avg:59.99ms -step:486/2285 train_time:29154ms step_avg:59.99ms -step:487/2285 train_time:29216ms step_avg:59.99ms -step:488/2285 train_time:29275ms step_avg:59.99ms -step:489/2285 train_time:29336ms step_avg:59.99ms -step:490/2285 train_time:29395ms step_avg:59.99ms -step:491/2285 train_time:29457ms step_avg:59.99ms -step:492/2285 train_time:29516ms step_avg:59.99ms -step:493/2285 train_time:29578ms step_avg:60.00ms -step:494/2285 train_time:29636ms step_avg:59.99ms -step:495/2285 train_time:29697ms step_avg:59.99ms -step:496/2285 train_time:29757ms step_avg:59.99ms -step:497/2285 train_time:29818ms step_avg:60.00ms -step:498/2285 train_time:29877ms step_avg:59.99ms -step:499/2285 train_time:29938ms step_avg:60.00ms -step:500/2285 train_time:29996ms step_avg:59.99ms -step:500/2285 val_loss:3.7874 train_time:30059ms step_avg:60.12ms -step:501/2285 train_time:30087ms step_avg:60.05ms -step:502/2285 train_time:30120ms step_avg:60.00ms -step:503/2285 train_time:30180ms step_avg:60.00ms -step:504/2285 train_time:30241ms step_avg:60.00ms -step:505/2285 train_time:30303ms step_avg:60.01ms -step:506/2285 
train_time:30362ms step_avg:60.00ms -step:507/2285 train_time:30423ms step_avg:60.01ms -step:508/2285 train_time:30481ms step_avg:60.00ms -step:509/2285 train_time:30542ms step_avg:60.00ms -step:510/2285 train_time:30600ms step_avg:60.00ms -step:511/2285 train_time:30660ms step_avg:60.00ms -step:512/2285 train_time:30719ms step_avg:60.00ms -step:513/2285 train_time:30779ms step_avg:60.00ms -step:514/2285 train_time:30838ms step_avg:60.00ms -step:515/2285 train_time:30898ms step_avg:60.00ms -step:516/2285 train_time:30957ms step_avg:60.00ms -step:517/2285 train_time:31023ms step_avg:60.01ms -step:518/2285 train_time:31085ms step_avg:60.01ms -step:519/2285 train_time:31147ms step_avg:60.01ms -step:520/2285 train_time:31206ms step_avg:60.01ms -step:521/2285 train_time:31267ms step_avg:60.01ms -step:522/2285 train_time:31326ms step_avg:60.01ms -step:523/2285 train_time:31387ms step_avg:60.01ms -step:524/2285 train_time:31446ms step_avg:60.01ms -step:525/2285 train_time:31507ms step_avg:60.01ms -step:526/2285 train_time:31566ms step_avg:60.01ms -step:527/2285 train_time:31627ms step_avg:60.01ms -step:528/2285 train_time:31686ms step_avg:60.01ms -step:529/2285 train_time:31747ms step_avg:60.01ms -step:530/2285 train_time:31806ms step_avg:60.01ms -step:531/2285 train_time:31867ms step_avg:60.01ms -step:532/2285 train_time:31926ms step_avg:60.01ms -step:533/2285 train_time:31989ms step_avg:60.02ms -step:534/2285 train_time:32048ms step_avg:60.02ms -step:535/2285 train_time:32110ms step_avg:60.02ms -step:536/2285 train_time:32170ms step_avg:60.02ms -step:537/2285 train_time:32233ms step_avg:60.02ms -step:538/2285 train_time:32293ms step_avg:60.02ms -step:539/2285 train_time:32354ms step_avg:60.03ms -step:540/2285 train_time:32413ms step_avg:60.02ms -step:541/2285 train_time:32474ms step_avg:60.03ms -step:542/2285 train_time:32533ms step_avg:60.02ms -step:543/2285 train_time:32594ms step_avg:60.03ms -step:544/2285 train_time:32653ms step_avg:60.02ms -step:545/2285 train_time:32714ms step_avg:60.03ms -step:546/2285 train_time:32773ms step_avg:60.02ms -step:547/2285 train_time:32835ms step_avg:60.03ms -step:548/2285 train_time:32894ms step_avg:60.03ms -step:549/2285 train_time:32957ms step_avg:60.03ms -step:550/2285 train_time:33016ms step_avg:60.03ms -step:551/2285 train_time:33077ms step_avg:60.03ms -step:552/2285 train_time:33137ms step_avg:60.03ms -step:553/2285 train_time:33199ms step_avg:60.03ms -step:554/2285 train_time:33258ms step_avg:60.03ms -step:555/2285 train_time:33320ms step_avg:60.04ms -step:556/2285 train_time:33378ms step_avg:60.03ms -step:557/2285 train_time:33439ms step_avg:60.03ms -step:558/2285 train_time:33498ms step_avg:60.03ms -step:559/2285 train_time:33559ms step_avg:60.03ms -step:560/2285 train_time:33618ms step_avg:60.03ms -step:561/2285 train_time:33680ms step_avg:60.04ms -step:562/2285 train_time:33739ms step_avg:60.03ms -step:563/2285 train_time:33800ms step_avg:60.04ms -step:564/2285 train_time:33859ms step_avg:60.03ms -step:565/2285 train_time:33921ms step_avg:60.04ms -step:566/2285 train_time:33979ms step_avg:60.03ms -step:567/2285 train_time:34041ms step_avg:60.04ms -step:568/2285 train_time:34100ms step_avg:60.04ms -step:569/2285 train_time:34162ms step_avg:60.04ms -step:570/2285 train_time:34220ms step_avg:60.04ms -step:571/2285 train_time:34282ms step_avg:60.04ms -step:572/2285 train_time:34341ms step_avg:60.04ms -step:573/2285 train_time:34402ms step_avg:60.04ms -step:574/2285 train_time:34461ms step_avg:60.04ms -step:575/2285 train_time:34522ms step_avg:60.04ms 
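
For reference while reading these timings: with num_iterations = 2285, lr_schedule = (0.5, 0.98), and lr_min = 0.1, the get_lr multiplier defined above is still in its flat phase here; it holds 1.0 through roughly step 1142, decays linearly to 0.1 by about step 2239, then stays flat, so Muon's effective learning rate is its initial 0.03 times the multiplier. A standalone restatement of the schedule with the run's constants (illustrative sketch, not part of the training script):

def lr_mult(step: int, n: int = 2285, bps=(0.5, 0.98), lr_min: float = 0.1) -> float:
    # same three-part schedule as get_lr above: flat, linear decay, flat
    x = step / n
    if x < bps[0]:
        return 1.0
    if x < bps[1]:
        return 1.0 - (1.0 - lr_min) * (x - bps[0]) / (bps[1] - bps[0])
    return lr_min

for s in (0, 1142, 1690, 2239, 2284):
    print(s, round(0.03 * lr_mult(s), 5))  # 0.03, 0.03, ~0.01652, ~0.00301, 0.003
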
-step:576/2285 train_time:34580ms step_avg:60.04ms -step:577/2285 train_time:34642ms step_avg:60.04ms -step:578/2285 train_time:34700ms step_avg:60.04ms -step:579/2285 train_time:34762ms step_avg:60.04ms -step:580/2285 train_time:34820ms step_avg:60.04ms -step:581/2285 train_time:34881ms step_avg:60.04ms -step:582/2285 train_time:34940ms step_avg:60.03ms -step:583/2285 train_time:35002ms step_avg:60.04ms -step:584/2285 train_time:35061ms step_avg:60.04ms -step:585/2285 train_time:35122ms step_avg:60.04ms -step:586/2285 train_time:35181ms step_avg:60.04ms -step:587/2285 train_time:35242ms step_avg:60.04ms -step:588/2285 train_time:35301ms step_avg:60.04ms -step:589/2285 train_time:35362ms step_avg:60.04ms -step:590/2285 train_time:35421ms step_avg:60.04ms -step:591/2285 train_time:35482ms step_avg:60.04ms -step:592/2285 train_time:35541ms step_avg:60.04ms -step:593/2285 train_time:35602ms step_avg:60.04ms -step:594/2285 train_time:35661ms step_avg:60.04ms -step:595/2285 train_time:35722ms step_avg:60.04ms -step:596/2285 train_time:35781ms step_avg:60.03ms -step:597/2285 train_time:35842ms step_avg:60.04ms -step:598/2285 train_time:35901ms step_avg:60.03ms -step:599/2285 train_time:35962ms step_avg:60.04ms -step:600/2285 train_time:36021ms step_avg:60.03ms -step:601/2285 train_time:36082ms step_avg:60.04ms -step:602/2285 train_time:36141ms step_avg:60.03ms -step:603/2285 train_time:36202ms step_avg:60.04ms -step:604/2285 train_time:36261ms step_avg:60.03ms -step:605/2285 train_time:36322ms step_avg:60.04ms -step:606/2285 train_time:36381ms step_avg:60.03ms -step:607/2285 train_time:36443ms step_avg:60.04ms -step:608/2285 train_time:36502ms step_avg:60.04ms -step:609/2285 train_time:36562ms step_avg:60.04ms -step:610/2285 train_time:36621ms step_avg:60.03ms -step:611/2285 train_time:36682ms step_avg:60.04ms -step:612/2285 train_time:36741ms step_avg:60.03ms -step:613/2285 train_time:36802ms step_avg:60.04ms -step:614/2285 train_time:36861ms step_avg:60.03ms -step:615/2285 train_time:36922ms step_avg:60.04ms -step:616/2285 train_time:36981ms step_avg:60.03ms -step:617/2285 train_time:37043ms step_avg:60.04ms -step:618/2285 train_time:37102ms step_avg:60.04ms -step:619/2285 train_time:37163ms step_avg:60.04ms -step:620/2285 train_time:37221ms step_avg:60.03ms -step:621/2285 train_time:37282ms step_avg:60.04ms -step:622/2285 train_time:37341ms step_avg:60.03ms -step:623/2285 train_time:37402ms step_avg:60.04ms -step:624/2285 train_time:37461ms step_avg:60.03ms -step:625/2285 train_time:37522ms step_avg:60.04ms -step:626/2285 train_time:37581ms step_avg:60.03ms -step:627/2285 train_time:37642ms step_avg:60.04ms -step:628/2285 train_time:37701ms step_avg:60.03ms -step:629/2285 train_time:37763ms step_avg:60.04ms -step:630/2285 train_time:37821ms step_avg:60.03ms -step:631/2285 train_time:37882ms step_avg:60.04ms -step:632/2285 train_time:37941ms step_avg:60.03ms -step:633/2285 train_time:38003ms step_avg:60.04ms -step:634/2285 train_time:38061ms step_avg:60.03ms -step:635/2285 train_time:38122ms step_avg:60.03ms -step:636/2285 train_time:38181ms step_avg:60.03ms -step:637/2285 train_time:38242ms step_avg:60.03ms -step:638/2285 train_time:38301ms step_avg:60.03ms -step:639/2285 train_time:38362ms step_avg:60.03ms -step:640/2285 train_time:38420ms step_avg:60.03ms -step:641/2285 train_time:38482ms step_avg:60.03ms -step:642/2285 train_time:38541ms step_avg:60.03ms -step:643/2285 train_time:38602ms step_avg:60.03ms -step:644/2285 train_time:38661ms step_avg:60.03ms -step:645/2285 train_time:38723ms 
step_avg:60.03ms -step:646/2285 train_time:38781ms step_avg:60.03ms -step:647/2285 train_time:38842ms step_avg:60.03ms -step:648/2285 train_time:38901ms step_avg:60.03ms -step:649/2285 train_time:38962ms step_avg:60.03ms -step:650/2285 train_time:39021ms step_avg:60.03ms -step:651/2285 train_time:39081ms step_avg:60.03ms -step:652/2285 train_time:39140ms step_avg:60.03ms -step:653/2285 train_time:39202ms step_avg:60.03ms -step:654/2285 train_time:39260ms step_avg:60.03ms -step:655/2285 train_time:39322ms step_avg:60.03ms -step:656/2285 train_time:39381ms step_avg:60.03ms -step:657/2285 train_time:39442ms step_avg:60.03ms -step:658/2285 train_time:39501ms step_avg:60.03ms -step:659/2285 train_time:39563ms step_avg:60.03ms -step:660/2285 train_time:39622ms step_avg:60.03ms -step:661/2285 train_time:39684ms step_avg:60.04ms -step:662/2285 train_time:39743ms step_avg:60.03ms -step:663/2285 train_time:39804ms step_avg:60.04ms -step:664/2285 train_time:39862ms step_avg:60.03ms -step:665/2285 train_time:39923ms step_avg:60.03ms -step:666/2285 train_time:39982ms step_avg:60.03ms -step:667/2285 train_time:40042ms step_avg:60.03ms -step:668/2285 train_time:40101ms step_avg:60.03ms -step:669/2285 train_time:40163ms step_avg:60.03ms -step:670/2285 train_time:40221ms step_avg:60.03ms -step:671/2285 train_time:40282ms step_avg:60.03ms -step:672/2285 train_time:40341ms step_avg:60.03ms -step:673/2285 train_time:40402ms step_avg:60.03ms -step:674/2285 train_time:40461ms step_avg:60.03ms -step:675/2285 train_time:40522ms step_avg:60.03ms -step:676/2285 train_time:40582ms step_avg:60.03ms -step:677/2285 train_time:40643ms step_avg:60.03ms -step:678/2285 train_time:40702ms step_avg:60.03ms -step:679/2285 train_time:40764ms step_avg:60.03ms -step:680/2285 train_time:40822ms step_avg:60.03ms -step:681/2285 train_time:40884ms step_avg:60.03ms -step:682/2285 train_time:40942ms step_avg:60.03ms -step:683/2285 train_time:41003ms step_avg:60.03ms -step:684/2285 train_time:41062ms step_avg:60.03ms -step:685/2285 train_time:41123ms step_avg:60.03ms -step:686/2285 train_time:41181ms step_avg:60.03ms -step:687/2285 train_time:41242ms step_avg:60.03ms -step:688/2285 train_time:41301ms step_avg:60.03ms -step:689/2285 train_time:41364ms step_avg:60.03ms -step:690/2285 train_time:41422ms step_avg:60.03ms -step:691/2285 train_time:41483ms step_avg:60.03ms -step:692/2285 train_time:41542ms step_avg:60.03ms -step:693/2285 train_time:41603ms step_avg:60.03ms -step:694/2285 train_time:41662ms step_avg:60.03ms -step:695/2285 train_time:41724ms step_avg:60.03ms -step:696/2285 train_time:41783ms step_avg:60.03ms -step:697/2285 train_time:41843ms step_avg:60.03ms -step:698/2285 train_time:41902ms step_avg:60.03ms -step:699/2285 train_time:41964ms step_avg:60.03ms -step:700/2285 train_time:42022ms step_avg:60.03ms -step:701/2285 train_time:42083ms step_avg:60.03ms -step:702/2285 train_time:42142ms step_avg:60.03ms -step:703/2285 train_time:42203ms step_avg:60.03ms -step:704/2285 train_time:42262ms step_avg:60.03ms -step:705/2285 train_time:42323ms step_avg:60.03ms -step:706/2285 train_time:42382ms step_avg:60.03ms -step:707/2285 train_time:42443ms step_avg:60.03ms -step:708/2285 train_time:42502ms step_avg:60.03ms -step:709/2285 train_time:42563ms step_avg:60.03ms -step:710/2285 train_time:42622ms step_avg:60.03ms -step:711/2285 train_time:42684ms step_avg:60.03ms -step:712/2285 train_time:42742ms step_avg:60.03ms -step:713/2285 train_time:42804ms step_avg:60.03ms -step:714/2285 train_time:42863ms step_avg:60.03ms -step:715/2285 
train_time:42925ms step_avg:60.03ms
step:716/2285 train_time:42983ms step_avg:60.03ms
[per-step timing lines for steps 717-750 omitted: train_time 43044ms -> 45028ms, step_avg steady at 60.03-60.04ms]
step:750/2285 val_loss:3.6604 train_time:45092ms step_avg:60.12ms
[per-step timing lines for steps 751-1000 omitted: train_time 45116ms -> 60209ms, step_avg 60.04ms -> 60.21ms]
step:1000/2285 val_loss:3.5730 train_time:60272ms step_avg:60.27ms
[per-step timing lines for steps 1001-1250 omitted: train_time 60293ms -> 75439ms, step_avg 60.21ms -> 60.35ms]
step:1250/2285 val_loss:3.4968 train_time:75503ms step_avg:60.40ms
[per-step timing lines for steps 1251-1500 omitted: train_time 75531ms -> 90687ms, step_avg 60.35ms -> 60.46ms]
step:1500/2285 val_loss:3.4294 train_time:90751ms step_avg:60.50ms
[per-step timing lines for steps 1501-1750 omitted: train_time 90773ms -> 105988ms, step_avg 60.46ms -> 60.56ms]
step:1750/2285 val_loss:3.3689 train_time:106052ms step_avg:60.60ms
[per-step timing lines for steps 1751-2000 omitted: train_time 106072ms -> 121292ms, step_avg 60.57ms -> 60.65ms]
step:2000/2285 val_loss:3.3201 train_time:121356ms step_avg:60.68ms
[per-step timing lines for steps 2001-2136 omitted: train_time 121381ms -> 129640ms, step_avg 60.65ms -> 60.69ms]
step:2137/2285 train_time:129703ms step_avg:60.69ms
-step:2138/2285 train_time:129763ms step_avg:60.69ms -step:2139/2285 train_time:129825ms step_avg:60.69ms -step:2140/2285 train_time:129885ms step_avg:60.69ms -step:2141/2285 train_time:129947ms step_avg:60.69ms -step:2142/2285 train_time:130007ms step_avg:60.69ms -step:2143/2285 train_time:130070ms step_avg:60.70ms -step:2144/2285 train_time:130130ms step_avg:60.69ms -step:2145/2285 train_time:130193ms step_avg:60.70ms -step:2146/2285 train_time:130252ms step_avg:60.70ms -step:2147/2285 train_time:130315ms step_avg:60.70ms -step:2148/2285 train_time:130376ms step_avg:60.70ms -step:2149/2285 train_time:130438ms step_avg:60.70ms -step:2150/2285 train_time:130498ms step_avg:60.70ms -step:2151/2285 train_time:130561ms step_avg:60.70ms -step:2152/2285 train_time:130620ms step_avg:60.70ms -step:2153/2285 train_time:130683ms step_avg:60.70ms -step:2154/2285 train_time:130743ms step_avg:60.70ms -step:2155/2285 train_time:130805ms step_avg:60.70ms -step:2156/2285 train_time:130867ms step_avg:60.70ms -step:2157/2285 train_time:130929ms step_avg:60.70ms -step:2158/2285 train_time:130990ms step_avg:60.70ms -step:2159/2285 train_time:131052ms step_avg:60.70ms -step:2160/2285 train_time:131112ms step_avg:60.70ms -step:2161/2285 train_time:131174ms step_avg:60.70ms -step:2162/2285 train_time:131235ms step_avg:60.70ms -step:2163/2285 train_time:131297ms step_avg:60.70ms -step:2164/2285 train_time:131358ms step_avg:60.70ms -step:2165/2285 train_time:131421ms step_avg:60.70ms -step:2166/2285 train_time:131481ms step_avg:60.70ms -step:2167/2285 train_time:131544ms step_avg:60.70ms -step:2168/2285 train_time:131604ms step_avg:60.70ms -step:2169/2285 train_time:131667ms step_avg:60.70ms -step:2170/2285 train_time:131727ms step_avg:60.70ms -step:2171/2285 train_time:131789ms step_avg:60.70ms -step:2172/2285 train_time:131850ms step_avg:60.70ms -step:2173/2285 train_time:131912ms step_avg:60.71ms -step:2174/2285 train_time:131972ms step_avg:60.70ms -step:2175/2285 train_time:132035ms step_avg:60.71ms -step:2176/2285 train_time:132094ms step_avg:60.71ms -step:2177/2285 train_time:132157ms step_avg:60.71ms -step:2178/2285 train_time:132217ms step_avg:60.71ms -step:2179/2285 train_time:132279ms step_avg:60.71ms -step:2180/2285 train_time:132339ms step_avg:60.71ms -step:2181/2285 train_time:132401ms step_avg:60.71ms -step:2182/2285 train_time:132461ms step_avg:60.71ms -step:2183/2285 train_time:132523ms step_avg:60.71ms -step:2184/2285 train_time:132584ms step_avg:60.71ms -step:2185/2285 train_time:132646ms step_avg:60.71ms -step:2186/2285 train_time:132706ms step_avg:60.71ms -step:2187/2285 train_time:132769ms step_avg:60.71ms -step:2188/2285 train_time:132829ms step_avg:60.71ms -step:2189/2285 train_time:132892ms step_avg:60.71ms -step:2190/2285 train_time:132952ms step_avg:60.71ms -step:2191/2285 train_time:133014ms step_avg:60.71ms -step:2192/2285 train_time:133075ms step_avg:60.71ms -step:2193/2285 train_time:133137ms step_avg:60.71ms -step:2194/2285 train_time:133197ms step_avg:60.71ms -step:2195/2285 train_time:133259ms step_avg:60.71ms -step:2196/2285 train_time:133319ms step_avg:60.71ms -step:2197/2285 train_time:133382ms step_avg:60.71ms -step:2198/2285 train_time:133441ms step_avg:60.71ms -step:2199/2285 train_time:133504ms step_avg:60.71ms -step:2200/2285 train_time:133564ms step_avg:60.71ms -step:2201/2285 train_time:133626ms step_avg:60.71ms -step:2202/2285 train_time:133686ms step_avg:60.71ms -step:2203/2285 train_time:133750ms step_avg:60.71ms -step:2204/2285 train_time:133810ms step_avg:60.71ms 
-step:2205/2285 train_time:133873ms step_avg:60.71ms -step:2206/2285 train_time:133933ms step_avg:60.71ms -step:2207/2285 train_time:133996ms step_avg:60.71ms -step:2208/2285 train_time:134056ms step_avg:60.71ms -step:2209/2285 train_time:134118ms step_avg:60.71ms -step:2210/2285 train_time:134178ms step_avg:60.71ms -step:2211/2285 train_time:134240ms step_avg:60.71ms -step:2212/2285 train_time:134301ms step_avg:60.71ms -step:2213/2285 train_time:134363ms step_avg:60.72ms -step:2214/2285 train_time:134424ms step_avg:60.72ms -step:2215/2285 train_time:134487ms step_avg:60.72ms -step:2216/2285 train_time:134547ms step_avg:60.72ms -step:2217/2285 train_time:134610ms step_avg:60.72ms -step:2218/2285 train_time:134670ms step_avg:60.72ms -step:2219/2285 train_time:134733ms step_avg:60.72ms -step:2220/2285 train_time:134792ms step_avg:60.72ms -step:2221/2285 train_time:134854ms step_avg:60.72ms -step:2222/2285 train_time:134914ms step_avg:60.72ms -step:2223/2285 train_time:134976ms step_avg:60.72ms -step:2224/2285 train_time:135037ms step_avg:60.72ms -step:2225/2285 train_time:135099ms step_avg:60.72ms -step:2226/2285 train_time:135159ms step_avg:60.72ms -step:2227/2285 train_time:135221ms step_avg:60.72ms -step:2228/2285 train_time:135281ms step_avg:60.72ms -step:2229/2285 train_time:135343ms step_avg:60.72ms -step:2230/2285 train_time:135403ms step_avg:60.72ms -step:2231/2285 train_time:135465ms step_avg:60.72ms -step:2232/2285 train_time:135525ms step_avg:60.72ms -step:2233/2285 train_time:135588ms step_avg:60.72ms -step:2234/2285 train_time:135649ms step_avg:60.72ms -step:2235/2285 train_time:135711ms step_avg:60.72ms -step:2236/2285 train_time:135771ms step_avg:60.72ms -step:2237/2285 train_time:135833ms step_avg:60.72ms -step:2238/2285 train_time:135894ms step_avg:60.72ms -step:2239/2285 train_time:135955ms step_avg:60.72ms -step:2240/2285 train_time:136015ms step_avg:60.72ms -step:2241/2285 train_time:136078ms step_avg:60.72ms -step:2242/2285 train_time:136138ms step_avg:60.72ms -step:2243/2285 train_time:136200ms step_avg:60.72ms -step:2244/2285 train_time:136260ms step_avg:60.72ms -step:2245/2285 train_time:136323ms step_avg:60.72ms -step:2246/2285 train_time:136383ms step_avg:60.72ms -step:2247/2285 train_time:136446ms step_avg:60.72ms -step:2248/2285 train_time:136505ms step_avg:60.72ms -step:2249/2285 train_time:136568ms step_avg:60.72ms -step:2250/2285 train_time:136628ms step_avg:60.72ms -step:2250/2285 val_loss:3.2856 train_time:136692ms step_avg:60.75ms -step:2251/2285 train_time:136714ms step_avg:60.73ms -step:2252/2285 train_time:136754ms step_avg:60.73ms -step:2253/2285 train_time:136819ms step_avg:60.73ms -step:2254/2285 train_time:136880ms step_avg:60.73ms -step:2255/2285 train_time:136942ms step_avg:60.73ms -step:2256/2285 train_time:137002ms step_avg:60.73ms -step:2257/2285 train_time:137064ms step_avg:60.73ms -step:2258/2285 train_time:137123ms step_avg:60.73ms -step:2259/2285 train_time:137184ms step_avg:60.73ms -step:2260/2285 train_time:137244ms step_avg:60.73ms -step:2261/2285 train_time:137306ms step_avg:60.73ms -step:2262/2285 train_time:137366ms step_avg:60.73ms -step:2263/2285 train_time:137428ms step_avg:60.73ms -step:2264/2285 train_time:137488ms step_avg:60.73ms -step:2265/2285 train_time:137550ms step_avg:60.73ms -step:2266/2285 train_time:137610ms step_avg:60.73ms -step:2267/2285 train_time:137674ms step_avg:60.73ms -step:2268/2285 train_time:137735ms step_avg:60.73ms -step:2269/2285 train_time:137799ms step_avg:60.73ms -step:2270/2285 train_time:137859ms 
step_avg:60.73ms -step:2271/2285 train_time:137923ms step_avg:60.73ms -step:2272/2285 train_time:137983ms step_avg:60.73ms -step:2273/2285 train_time:138045ms step_avg:60.73ms -step:2274/2285 train_time:138104ms step_avg:60.73ms -step:2275/2285 train_time:138166ms step_avg:60.73ms -step:2276/2285 train_time:138225ms step_avg:60.73ms -step:2277/2285 train_time:138287ms step_avg:60.73ms -step:2278/2285 train_time:138346ms step_avg:60.73ms -step:2279/2285 train_time:138408ms step_avg:60.73ms -step:2280/2285 train_time:138468ms step_avg:60.73ms -step:2281/2285 train_time:138530ms step_avg:60.73ms -step:2282/2285 train_time:138590ms step_avg:60.73ms -step:2283/2285 train_time:138653ms step_avg:60.73ms -step:2284/2285 train_time:138714ms step_avg:60.73ms -step:2285/2285 train_time:138777ms step_avg:60.73ms -step:2285/2285 val_loss:3.2794 train_time:138838ms step_avg:60.76ms -peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt b/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt deleted file mode 100644 index 17f0ced3a..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/6e1efe80-8453-4ef6-a34d-8c73543618a8.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) 
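# Note on the FP8 backward: the incoming gradient is re-quantized to
# float8_e5m2 below, since e5m2 trades mantissa bits for exponent range and
# better fits gradient magnitudes, while x_f8 and w_f8 keep their forward-pass
# e4m3 encoding; the *_inv_s tensors hold the per-tensor scales that
# torch._scaled_mm applies to undo the earlier div()-based quantization.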
- w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - -@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, 
BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + 
(offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
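    Each iteration applies the odd matrix polynomial p(X) = a*X + b*(X @ X.mT) @ X + c*(X @ X.mT)^2 @ X using the precomputed (a, b, c) tuples in polar_express_coeffs, pushing the singular values of X toward 1; after the five iterations X approximates the polar factor U @ V.mT of G.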
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
- for group, info in zip(self.param_groups, group_infos): - info["reduce_future"].wait() - - params = group["params"] - grad_chunk = info["grad_chunk"] - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - start_idx = rank * chunk_size - module_idx = start_idx if start_idx < len(params) else 0 - - num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank - - if "momentum_buffer" not in group: - group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) - momentum_buffer = group["momentum_buffer"] - # Apply momentum update to the persistent momentum buffer in-place - momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) - updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) - - grad_shape = updated_grads.shape - if params[module_idx].label == 'attn': - # Reshape attn grads from [num, hdim, dim*4] to [4*num, hdim, dim] - for p in params[module_idx:module_idx + num_params]: - assert p.label == 'attn' - updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) - ref_param = params[module_idx] - param_shape = ref_param.shape - - if "second_momentum_buffer" not in group: - group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) - if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) - ) - second_momentum_buffer = group["second_momentum_buffer"] - - if "param_lr" not in group: - group["param_lr"] = ( - max(1., param_shape[-2] / param_shape[-1]) ** 0.5 - * ref_param.new_tensor( - [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - ) - - group["param_wd"] = ref_param.new_tensor( - [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - - # Determine effective LR and WD - eff_lr = group["lr"] * group["param_lr"] - eff_wd = group["weight_decay"] * group["param_wd"] - - # Compute zeropower for the entire chunk in a single, batched call. - if num_params == 0: - v_chunk = updated_grads - elif params[module_idx].label == "smear_gate": - # dividing by magnitude is equivalent to orthogonalization for 1d tensors - v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) - else: - v_chunk = polar_express(updated_grads) - - # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) - v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) - second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) - step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() - v_chunk.mul_(step_size) - v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) - - v_chunk = v_chunk.view(grad_shape) - - updated_params = torch.empty_like(grad_chunk) - param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) - # Apply weight decay directly to the buffer.
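# That is, p <- p * (1 - weight_decay * wd_mul), decoupled from the gradient
# term; note that unlike DistAdam below, this decay is not scaled by lr.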
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
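# Why lr_mul = 2: c_fc is stored as (dim, hdim) = (768, 3072) but applied
# transposed in forward(), so Muon's shape-based LR scale max(1, rows/cols)**0.5
# evaluates to 1.0 for the stored shape, while the effective (3072, 768)
# orientation would give (3072/768)**0.5 = 2.0.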
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
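# lr_mul (and wd_mul) are per-parameter multipliers that both Muon and DistAdam
# read via getattr(param, "lr_mul", 1.0); the 75x here gives the embedding
# tables a much larger effective learning rate than the hidden matrices.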
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd - def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): - # Precompute BOS positions once per shard - self.tokens=tokens - self.size = tokens.numel() - self.quickload = quickload - if quickload: - # only scan the first 4 million tokens, then kick off an async thread to scan the rest - self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.thread = None - self.ready = threading.Event() - self.start() - else: - self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.i = 0 - self.world_size = world_size - self.batch_iter = 0 - - def _load(self): - self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - self.bos_idx = self.bos_idx_async - - def next_batch(self, num_tokens_local: int, max_seq_len: int): - # if quickload was used, repoint to the full dataset after 5 batches - if self.quickload and self.batch_iter==5: - self.get() - n = len(self.bos_idx) - starts = [[] for _ in range(self.world_size)] - ends = [[] for _ in range(self.world_size)] - - idx = self.i - for r in range(self.world_size): - cur_len = 0 - while cur_len <= num_tokens_local: - if idx >= n: - raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.") - cur = self.bos_idx[idx] - starts[r].append(cur) - end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, - cur + max_seq_len, - cur + num_tokens_local - cur_len + 1) - ends[r].append(end) - cur_len += end - cur - idx += 1 - - assert cur_len == num_tokens_local + 1 - self.i = idx - self.batch_iter+=1 - return starts, ends - -class DataPreloader: - # Helper for asynchronously loading next shard and indexing bos tokens - def __init__(self, file_iter, world_size: int = 1): - self.file_iter = file_iter - self.world_size = world_size - self.thread = None - self.data = None - self.ready = threading.Event() - - def _load(self): - tokens = _load_data_shard(next(self.file_iter)) - self.data = (tokens, BOSFinder(tokens, self.world_size)) - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - return self.data - -def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): - # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" - num_tokens = num_tokens // grad_accum_steps - - files = [Path(file) for file in sorted(glob.glob(filename_pattern))] - if not files: - raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") - - file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training - tokens = _load_data_shard(next(file_iter)) - if align_to_bos: - finder = BOSFinder(tokens, world_size=world_size, quickload=True) - preloader =
-def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
- # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
- rank = dist.get_rank() if dist.is_initialized() else 0
- world_size = dist.get_world_size() if dist.is_initialized() else 1
- assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
- num_tokens = num_tokens // grad_accum_steps
-
- files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
- if not files:
- raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
-
- file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training
- tokens = _load_data_shard(next(file_iter))
- if align_to_bos:
- finder = BOSFinder(tokens, world_size=world_size, quickload=True)
- preloader = DataPreloader(file_iter, world_size)
- preloader.start()
- else:
- pos = 0 # for unaligned case
-
- while True:
- num_tokens_local = num_tokens // world_size
- max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400
-
- if align_to_bos:
- try:
- seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
- start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
- except StopIteration:
- # This shard is exhausted, load the next one in the next loop iteration.
- tokens, finder = preloader.get()
- preloader.start()
- continue
-
- buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
- _inputs = buf[:-1]
- _targets = buf[1:]
- end_idxs[-1] -= 1 # last document was too long to account for _targets offset
- cum_lengths = (end_idxs - start_idxs).cumsum(0)
-
- else:
- if pos + num_tokens + 1 >= len(tokens): # should not occur for val data
- tokens, pos = _load_data_shard(next(file_iter)), 0
-
- pos_local = pos + rank * num_tokens_local
- buf = tokens[pos_local: pos_local + num_tokens_local + 1]
- _inputs = buf[:-1].view(num_tokens_local)
- _targets = buf[1:].view(num_tokens_local)
-
- cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
- pos += num_tokens
-
- _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
- _cum_lengths[0] = 0
- _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
-
- new_params = yield (
- _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
- _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
- _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
- )
-
- if new_params is not None:
- # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
- new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
- assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
- num_tokens = new_num_tokens // new_grad_accum_steps
- max_seq_len = new_max_seq_len
- grad_accum_steps = new_grad_accum_steps
-
-
-# -----------------------------------------------------------------------------
-# int main
-
-@dataclass
-class Hyperparameters:
- # data
- train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
- val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
- val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
- train_batch_size: int = 2048 * 16 * 8
- train_max_seq_len: int = 128 * 16
- val_batch_size: int = 4 * 64 * 1024 * 8
- # optimization
- num_iterations: int = 2285
- lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat)
- lr_min = 0.1
- # evaluation and logging
- run_id: str = f"{uuid.uuid4()}"
- val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end - save_checkpoint: bool = False - # attention masking - block_size: int = 128 - ws_schedule: tuple = (3, 5, 7, 9, 11, 13) - ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN - -args = Hyperparameters() - -data_path = os.environ.get("DATA_PATH", ".") -args.train_files = os.path.join(data_path, args.train_files) -args.val_files = os.path.join(data_path, args.val_files) - -# torchrun sets these env variables -rank = int(os.environ["RANK"]) -world_size = int(os.environ["WORLD_SIZE"]) -assert 8 % world_size == 0, "world_size must be a divisor of 8" -grad_accum_steps = 8 // world_size -assert torch.cuda.is_available() -device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) -torch.cuda.set_device(device) -dist.init_process_group(backend="nccl", device_id=device) -dist.barrier() -master_process = (rank == 0) # this process will do logging, checkpointing etc. - -# begin logging -logfile = None -if master_process: - run_id = args.run_id - os.makedirs("logs", exist_ok=True) - logfile = f"logs/{run_id}.txt" - print(logfile) -def print0(s, console=False): - if master_process: - with open(logfile, "a") as f: - if console: - print(s) - print(s, file=f) - -# begin by printing this file (the Python code) -print0(code) -print0("="*100) -# log information about the hardware/software environment this is running on -print0(f"Running Python {sys.version}") -print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") -print0(f"Running Triton version {triton.__version__}") - -def nvidia_smi(): - import subprocess # avoid top level import - return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout -print0(nvidia_smi()) -print0("="*100) - -model: nn.Module = GPT( - vocab_size=50257, - num_layers=12, - num_heads=6, - head_dim=128, - model_dim=768, - max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) -).cuda() -for m in model.modules(): - if isinstance(m, (nn.Embedding, nn.Linear)): - m.bfloat16() -for param in model.parameters(): - dist.broadcast(param.detach(), 0) - -# collect the parameters to optimize -hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] -embed_params = [p for n, p in model.named_parameters() if "embed" in n] -scalar_params = [p for p in model.parameters() if p.ndim < 2] -head_params = [model.lm_head.weight] -gate_params = [p for n, p in model.named_parameters() if "gate" in n] - -# init the optimizer(s) -# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence
-# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
-optimizer1 = DistAdam(
- scalar_params + head_params + embed_params,
- lr=0.008,
- betas=(0.65, 0.95),
- eps=1e-8,
- weight_decay=0.0,
-)
-optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
-optimizers = [optimizer1, optimizer2]
-for opt in optimizers:
- for group in opt.param_groups:
- group["initial_lr"] = group["lr"]
-
-def get_lr(step: int):
- assert step < args.num_iterations
- # Three part schedule: flat, linear decrease, flat
- lr_schedule = args.lr_schedule
- x = step / args.num_iterations
-
- if x < lr_schedule[0]:
- lr = 1.0
- elif x < lr_schedule[1]:
- progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
- lr = 1.0 - (1.0 - args.lr_min) * progress
- else:
- lr = args.lr_min
- return lr
-
-def get_ws(step: int):
- assert step <= args.num_iterations
- x = step / (args.num_iterations + 1)
- ws_idx = int(len(args.ws_schedule) * x)
- return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
-
-def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
- # warmup phase: linearly increase momentum from min to max
- # cooldown phase: linearly decrease momentum from max to min
- momentum_cd_start = args.num_iterations - muon_cooldown_steps
- if step < muon_warmup_steps:
- frac = step / muon_warmup_steps
- momentum = momentum_min + frac * (momentum_max - momentum_min)
- elif step > momentum_cd_start:
- frac = (step - momentum_cd_start) / muon_cooldown_steps
- momentum = momentum_max - frac * (momentum_max - momentum_min)
- else:
- momentum = momentum_max
- return momentum
-
-def step_optimizers(step: int, optimizers, model):
- # update lr
- for optimizer in optimizers:
- for group in optimizer.param_groups:
- group["lr"] = group["initial_lr"] * get_lr(step)
-
- # set muon momentum based on step
- momentum = get_muon_momentum(step)
- for group in optimizers[1].param_groups:
- group["momentum"] = momentum
-
- # on even steps, only step Muon params
- # on odd steps, step all params
- if step % 2 == 0:
- optimizers[1].step()
- optimizers[1].zero_grad(set_to_none=True)
- else:
- for optimizer in optimizers:
- optimizer.step()
- model.zero_grad(set_to_none=True)
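-
-# [Editor's sketch, not part of the original script: spot-checks of the schedule
-# helpers above. With lr_schedule = (0.5, 0.98) and lr_min = 0.1, the LR multiplier
-# is flat at 1.0 for the first half, decays linearly to 0.1 by 98% of training,
-# then stays flat; Muon momentum ramps 0.85 -> 0.95 over the first 300 steps and
-# back down over the last 50. Numbers below assume num_iterations = 2285.]
-def _schedule_demo():
- assert get_lr(0) == 1.0 # flat phase
- assert abs(get_lr(1691) - 0.55) < 0.01 # ~74% of the run: midway down the decay
- assert get_lr(2284) == 0.1 # final flat phase
- assert abs(get_muon_momentum(150) - 0.90) < 1e-6 # halfway through warmup
- assert get_muon_momentum(1000) == 0.95 # plateau between warmup and cooldown
-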
-model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
-
-########################################
-# Warmup kernels #
-########################################
-
-# Warmup the training kernels, then re-initialize the state so we aren't cheating
-warmup_steps = 30
-initial_state = dict(model=copy.deepcopy(model.state_dict()),
- optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-for step in range(warmup_steps):
- inputs, targets, cum_seqlens = next(train_loader)
- # each window size is a new graph, need to warm up each with Yarn.attn_scale
- ws_idx = step % len(args.ws_schedule)
- if ws_idx == 0:
- model.yarn.reset()
- ws_long = args.ws_schedule[0]
- else:
- new_ws_long = args.ws_schedule[ws_idx]
- if new_ws_long > ws_long:
- model.yarn.apply(ws_long, new_ws_long)
- ws_long = new_ws_long
- model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
- for opt in optimizers:
- opt.step()
- model.zero_grad(set_to_none=True)
-model.yarn.reset() # rotary buffer is not stored in state_dict
-model.load_state_dict(initial_state["model"])
-optimizer2.reset() # momentum buffer not in state dict
-for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
- opt.load_state_dict(opt_state)
-del train_loader, initial_state
-
-########################################
-# Training and validation #
-########################################
-
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-training_time_ms = 0
-# start the clock
-torch.cuda.synchronize()
-t0 = time.perf_counter()
-# begin training
-train_steps = args.num_iterations
-ws_short, ws_long = get_ws(0)
-for step in range(train_steps + 1):
- last_step = (step == train_steps)
- ws_short, new_ws_long = get_ws(step)
- if new_ws_long != ws_long:
- model.yarn.apply(ws_long, new_ws_long)
- ws_long = new_ws_long
-
- # --------------- VALIDATION SECTION -----------------
- if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
- if last_step:
- ws_long = args.ws_validate_post_yarn_ext
- # stop the clock
- torch.cuda.synchronize()
- training_time_ms += 1000 * (time.perf_counter() - t0)
- model.eval()
- assert args.val_tokens % args.val_batch_size == 0
- val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
- val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
- val_loss = 0
- with torch.no_grad():
- for _ in range(val_steps):
- inputs, targets, cum_seqlens = next(val_loader)
- val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
- val_loss /= val_steps
- del val_loader
- dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
- print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
- model.train()
- # start the clock again
- torch.cuda.synchronize()
- t0 = time.perf_counter()
-
- if last_step:
- if master_process and args.save_checkpoint:
- log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
- os.makedirs(f"logs/{run_id}", exist_ok=True)
- torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
- # the last step only has the validation loop, so break to avoid training
- break
-
- # --------------- TRAINING SECTION -----------------
- loss = 0
- for _ in range(grad_accum_steps):
- inputs, targets, cum_seqlens = next(train_loader)
- loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
- loss.backward()
- step_optimizers(step, optimizers, model)
-
- # logging
- approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
- print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
-
-print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
- f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
-dist.destroy_process_group()
-
-====================================================================================================
-Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
-Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
-Running Triton version 3.5.0
-Tue Oct 28 02:00:09 2025
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | -| N/A 41C P0 129W / 700W | 5858MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | -| N/A 33C P0 127W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | -| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | -| N/A 38C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | -| N/A 39C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | -| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | -| N/A 38C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | -| N/A 31C P0 115W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -+-----------------------------------------------------------------------------------------+ - -==================================================================================================== -step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.11ms -step:1/2285 train_time:110ms step_avg:109.58ms -step:2/2285 train_time:131ms step_avg:65.39ms -step:3/2285 train_time:168ms step_avg:56.15ms -step:4/2285 train_time:225ms step_avg:56.20ms -step:5/2285 train_time:284ms step_avg:56.77ms -step:6/2285 train_time:342ms step_avg:56.97ms -step:7/2285 train_time:402ms step_avg:57.48ms -step:8/2285 train_time:461ms step_avg:57.59ms -step:9/2285 train_time:521ms step_avg:57.93ms -step:10/2285 train_time:580ms step_avg:58.01ms -step:11/2285 train_time:641ms step_avg:58.29ms -step:12/2285 train_time:700ms step_avg:58.33ms -step:13/2285 train_time:760ms step_avg:58.48ms -step:14/2285 train_time:820ms step_avg:58.55ms -step:15/2285 train_time:881ms step_avg:58.70ms -step:16/2285 train_time:939ms step_avg:58.69ms -step:17/2285 
train_time:1003ms step_avg:58.98ms -step:18/2285 train_time:1065ms step_avg:59.19ms -step:19/2285 train_time:1131ms step_avg:59.50ms -step:20/2285 train_time:1190ms step_avg:59.50ms -step:21/2285 train_time:1252ms step_avg:59.60ms -step:22/2285 train_time:1311ms step_avg:59.57ms -step:23/2285 train_time:1371ms step_avg:59.62ms -step:24/2285 train_time:1430ms step_avg:59.58ms -step:25/2285 train_time:1491ms step_avg:59.63ms -step:26/2285 train_time:1549ms step_avg:59.59ms -step:27/2285 train_time:1610ms step_avg:59.65ms -step:28/2285 train_time:1669ms step_avg:59.61ms -step:29/2285 train_time:1730ms step_avg:59.67ms -step:30/2285 train_time:1789ms step_avg:59.64ms -step:31/2285 train_time:1851ms step_avg:59.70ms -step:32/2285 train_time:1910ms step_avg:59.68ms -step:33/2285 train_time:1972ms step_avg:59.75ms -step:34/2285 train_time:2031ms step_avg:59.74ms -step:35/2285 train_time:2093ms step_avg:59.81ms -step:36/2285 train_time:2152ms step_avg:59.78ms -step:37/2285 train_time:2214ms step_avg:59.84ms -step:38/2285 train_time:2273ms step_avg:59.81ms -step:39/2285 train_time:2334ms step_avg:59.86ms -step:40/2285 train_time:2393ms step_avg:59.83ms -step:41/2285 train_time:2454ms step_avg:59.85ms -step:42/2285 train_time:2512ms step_avg:59.82ms -step:43/2285 train_time:2574ms step_avg:59.85ms -step:44/2285 train_time:2633ms step_avg:59.83ms -step:45/2285 train_time:2694ms step_avg:59.86ms -step:46/2285 train_time:2753ms step_avg:59.84ms -step:47/2285 train_time:2814ms step_avg:59.87ms -step:48/2285 train_time:2873ms step_avg:59.85ms -step:49/2285 train_time:2934ms step_avg:59.88ms -step:50/2285 train_time:2993ms step_avg:59.86ms -step:51/2285 train_time:3054ms step_avg:59.89ms -step:52/2285 train_time:3113ms step_avg:59.87ms -step:53/2285 train_time:3175ms step_avg:59.91ms -step:54/2285 train_time:3234ms step_avg:59.89ms -step:55/2285 train_time:3296ms step_avg:59.92ms -step:56/2285 train_time:3355ms step_avg:59.91ms -step:57/2285 train_time:3416ms step_avg:59.94ms -step:58/2285 train_time:3475ms step_avg:59.92ms -step:59/2285 train_time:3537ms step_avg:59.95ms -step:60/2285 train_time:3596ms step_avg:59.93ms -step:61/2285 train_time:3657ms step_avg:59.96ms -step:62/2285 train_time:3717ms step_avg:59.95ms -step:63/2285 train_time:3778ms step_avg:59.97ms -step:64/2285 train_time:3838ms step_avg:59.96ms -step:65/2285 train_time:3899ms step_avg:59.99ms -step:66/2285 train_time:3958ms step_avg:59.98ms -step:67/2285 train_time:4020ms step_avg:60.00ms -step:68/2285 train_time:4080ms step_avg:60.00ms -step:69/2285 train_time:4142ms step_avg:60.03ms -step:70/2285 train_time:4201ms step_avg:60.02ms -step:71/2285 train_time:4263ms step_avg:60.04ms -step:72/2285 train_time:4322ms step_avg:60.03ms -step:73/2285 train_time:4383ms step_avg:60.04ms -step:74/2285 train_time:4442ms step_avg:60.03ms -step:75/2285 train_time:4503ms step_avg:60.04ms -step:76/2285 train_time:4562ms step_avg:60.03ms -step:77/2285 train_time:4624ms step_avg:60.05ms -step:78/2285 train_time:4684ms step_avg:60.05ms -step:79/2285 train_time:4745ms step_avg:60.07ms -step:80/2285 train_time:4804ms step_avg:60.06ms -step:81/2285 train_time:4866ms step_avg:60.07ms -step:82/2285 train_time:4925ms step_avg:60.06ms -step:83/2285 train_time:4986ms step_avg:60.07ms -step:84/2285 train_time:5045ms step_avg:60.06ms -step:85/2285 train_time:5106ms step_avg:60.08ms -step:86/2285 train_time:5165ms step_avg:60.06ms -step:87/2285 train_time:5226ms step_avg:60.07ms -step:88/2285 train_time:5284ms step_avg:60.05ms -step:89/2285 train_time:5345ms 
step_avg:60.06ms -step:90/2285 train_time:5404ms step_avg:60.04ms -step:91/2285 train_time:5465ms step_avg:60.06ms -step:92/2285 train_time:5524ms step_avg:60.04ms -step:93/2285 train_time:5585ms step_avg:60.05ms -step:94/2285 train_time:5644ms step_avg:60.04ms -step:95/2285 train_time:5705ms step_avg:60.05ms -step:96/2285 train_time:5764ms step_avg:60.04ms -step:97/2285 train_time:5825ms step_avg:60.06ms -step:98/2285 train_time:5884ms step_avg:60.04ms -step:99/2285 train_time:5945ms step_avg:60.06ms -step:100/2285 train_time:6004ms step_avg:60.04ms -step:101/2285 train_time:6066ms step_avg:60.06ms -step:102/2285 train_time:6124ms step_avg:60.04ms -step:103/2285 train_time:6186ms step_avg:60.06ms -step:104/2285 train_time:6245ms step_avg:60.05ms -step:105/2285 train_time:6305ms step_avg:60.05ms -step:106/2285 train_time:6364ms step_avg:60.03ms -step:107/2285 train_time:6425ms step_avg:60.05ms -step:108/2285 train_time:6484ms step_avg:60.03ms -step:109/2285 train_time:6545ms step_avg:60.04ms -step:110/2285 train_time:6604ms step_avg:60.03ms -step:111/2285 train_time:6666ms step_avg:60.05ms -step:112/2285 train_time:6725ms step_avg:60.04ms -step:113/2285 train_time:6786ms step_avg:60.05ms -step:114/2285 train_time:6845ms step_avg:60.04ms -step:115/2285 train_time:6906ms step_avg:60.05ms -step:116/2285 train_time:6966ms step_avg:60.05ms -step:117/2285 train_time:7027ms step_avg:60.06ms -step:118/2285 train_time:7086ms step_avg:60.05ms -step:119/2285 train_time:7147ms step_avg:60.06ms -step:120/2285 train_time:7206ms step_avg:60.05ms -step:121/2285 train_time:7267ms step_avg:60.06ms -step:122/2285 train_time:7327ms step_avg:60.06ms -step:123/2285 train_time:7388ms step_avg:60.07ms -step:124/2285 train_time:7447ms step_avg:60.05ms -step:125/2285 train_time:7507ms step_avg:60.06ms -step:126/2285 train_time:7566ms step_avg:60.05ms -step:127/2285 train_time:7627ms step_avg:60.06ms -step:128/2285 train_time:7687ms step_avg:60.05ms -step:129/2285 train_time:7748ms step_avg:60.06ms -step:130/2285 train_time:7807ms step_avg:60.06ms -step:131/2285 train_time:7869ms step_avg:60.07ms -step:132/2285 train_time:7929ms step_avg:60.06ms -step:133/2285 train_time:7990ms step_avg:60.07ms -step:134/2285 train_time:8049ms step_avg:60.07ms -step:135/2285 train_time:8110ms step_avg:60.07ms -step:136/2285 train_time:8169ms step_avg:60.06ms -step:137/2285 train_time:8230ms step_avg:60.07ms -step:138/2285 train_time:8289ms step_avg:60.06ms -step:139/2285 train_time:8350ms step_avg:60.07ms -step:140/2285 train_time:8408ms step_avg:60.06ms -step:141/2285 train_time:8469ms step_avg:60.06ms -step:142/2285 train_time:8528ms step_avg:60.06ms -step:143/2285 train_time:8588ms step_avg:60.06ms -step:144/2285 train_time:8647ms step_avg:60.05ms -step:145/2285 train_time:8708ms step_avg:60.06ms -step:146/2285 train_time:8767ms step_avg:60.05ms -step:147/2285 train_time:8829ms step_avg:60.06ms -step:148/2285 train_time:8888ms step_avg:60.05ms -step:149/2285 train_time:8949ms step_avg:60.06ms -step:150/2285 train_time:9008ms step_avg:60.05ms -step:151/2285 train_time:9069ms step_avg:60.06ms -step:152/2285 train_time:9128ms step_avg:60.05ms -step:153/2285 train_time:9188ms step_avg:60.05ms -step:154/2285 train_time:9247ms step_avg:60.05ms -step:155/2285 train_time:9308ms step_avg:60.05ms -step:156/2285 train_time:9367ms step_avg:60.05ms -step:157/2285 train_time:9428ms step_avg:60.05ms -step:158/2285 train_time:9487ms step_avg:60.04ms -step:159/2285 train_time:9548ms step_avg:60.05ms -step:160/2285 train_time:9606ms 
step_avg:60.04ms -step:161/2285 train_time:9666ms step_avg:60.04ms -step:162/2285 train_time:9725ms step_avg:60.03ms -step:163/2285 train_time:9786ms step_avg:60.04ms -step:164/2285 train_time:9845ms step_avg:60.03ms -step:165/2285 train_time:9907ms step_avg:60.04ms -step:166/2285 train_time:9966ms step_avg:60.03ms -step:167/2285 train_time:10027ms step_avg:60.04ms -step:168/2285 train_time:10086ms step_avg:60.03ms -step:169/2285 train_time:10147ms step_avg:60.04ms -step:170/2285 train_time:10205ms step_avg:60.03ms -step:171/2285 train_time:10266ms step_avg:60.04ms -step:172/2285 train_time:10325ms step_avg:60.03ms -step:173/2285 train_time:10386ms step_avg:60.04ms -step:174/2285 train_time:10445ms step_avg:60.03ms -step:175/2285 train_time:10506ms step_avg:60.04ms -step:176/2285 train_time:10565ms step_avg:60.03ms -step:177/2285 train_time:10626ms step_avg:60.03ms -step:178/2285 train_time:10685ms step_avg:60.03ms -step:179/2285 train_time:10746ms step_avg:60.03ms -step:180/2285 train_time:10805ms step_avg:60.03ms -step:181/2285 train_time:10866ms step_avg:60.03ms -step:182/2285 train_time:10925ms step_avg:60.03ms -step:183/2285 train_time:10985ms step_avg:60.03ms -step:184/2285 train_time:11044ms step_avg:60.02ms -step:185/2285 train_time:11105ms step_avg:60.03ms -step:186/2285 train_time:11165ms step_avg:60.03ms -step:187/2285 train_time:11226ms step_avg:60.03ms -step:188/2285 train_time:11284ms step_avg:60.02ms -step:189/2285 train_time:11345ms step_avg:60.03ms -step:190/2285 train_time:11404ms step_avg:60.02ms -step:191/2285 train_time:11465ms step_avg:60.03ms -step:192/2285 train_time:11524ms step_avg:60.02ms -step:193/2285 train_time:11585ms step_avg:60.03ms -step:194/2285 train_time:11644ms step_avg:60.02ms -step:195/2285 train_time:11705ms step_avg:60.03ms -step:196/2285 train_time:11763ms step_avg:60.02ms -step:197/2285 train_time:11825ms step_avg:60.02ms -step:198/2285 train_time:11883ms step_avg:60.02ms -step:199/2285 train_time:11944ms step_avg:60.02ms -step:200/2285 train_time:12003ms step_avg:60.01ms -step:201/2285 train_time:12064ms step_avg:60.02ms -step:202/2285 train_time:12123ms step_avg:60.02ms -step:203/2285 train_time:12184ms step_avg:60.02ms -step:204/2285 train_time:12243ms step_avg:60.02ms -step:205/2285 train_time:12304ms step_avg:60.02ms -step:206/2285 train_time:12363ms step_avg:60.02ms -step:207/2285 train_time:12425ms step_avg:60.02ms -step:208/2285 train_time:12483ms step_avg:60.01ms -step:209/2285 train_time:12544ms step_avg:60.02ms -step:210/2285 train_time:12603ms step_avg:60.01ms -step:211/2285 train_time:12664ms step_avg:60.02ms -step:212/2285 train_time:12723ms step_avg:60.01ms -step:213/2285 train_time:12784ms step_avg:60.02ms -step:214/2285 train_time:12843ms step_avg:60.01ms -step:215/2285 train_time:12904ms step_avg:60.02ms -step:216/2285 train_time:12963ms step_avg:60.01ms -step:217/2285 train_time:13025ms step_avg:60.02ms -step:218/2285 train_time:13083ms step_avg:60.01ms -step:219/2285 train_time:13145ms step_avg:60.02ms -step:220/2285 train_time:13203ms step_avg:60.01ms -step:221/2285 train_time:13264ms step_avg:60.02ms -step:222/2285 train_time:13323ms step_avg:60.01ms -step:223/2285 train_time:13384ms step_avg:60.02ms -step:224/2285 train_time:13443ms step_avg:60.01ms -step:225/2285 train_time:13504ms step_avg:60.02ms -step:226/2285 train_time:13563ms step_avg:60.01ms -step:227/2285 train_time:13625ms step_avg:60.02ms -step:228/2285 train_time:13683ms step_avg:60.01ms -step:229/2285 train_time:13744ms step_avg:60.02ms -step:230/2285 
train_time:13802ms step_avg:60.01ms -step:231/2285 train_time:13865ms step_avg:60.02ms -step:232/2285 train_time:13922ms step_avg:60.01ms -step:233/2285 train_time:13983ms step_avg:60.01ms -step:234/2285 train_time:14042ms step_avg:60.01ms -step:235/2285 train_time:14103ms step_avg:60.01ms -step:236/2285 train_time:14162ms step_avg:60.01ms -step:237/2285 train_time:14224ms step_avg:60.02ms -step:238/2285 train_time:14282ms step_avg:60.01ms -step:239/2285 train_time:14343ms step_avg:60.01ms -step:240/2285 train_time:14402ms step_avg:60.01ms -step:241/2285 train_time:14464ms step_avg:60.02ms -step:242/2285 train_time:14522ms step_avg:60.01ms -step:243/2285 train_time:14583ms step_avg:60.01ms -step:244/2285 train_time:14642ms step_avg:60.01ms -step:245/2285 train_time:14703ms step_avg:60.01ms -step:246/2285 train_time:14761ms step_avg:60.00ms -step:247/2285 train_time:14823ms step_avg:60.01ms -step:248/2285 train_time:14882ms step_avg:60.01ms -step:249/2285 train_time:14943ms step_avg:60.01ms -step:250/2285 train_time:15001ms step_avg:60.01ms -step:250/2285 val_loss:4.0805 train_time:15064ms step_avg:60.26ms -step:251/2285 train_time:15083ms step_avg:60.09ms -step:252/2285 train_time:15123ms step_avg:60.01ms -step:253/2285 train_time:15191ms step_avg:60.04ms -step:254/2285 train_time:15258ms step_avg:60.07ms -step:255/2285 train_time:15320ms step_avg:60.08ms -step:256/2285 train_time:15379ms step_avg:60.08ms -step:257/2285 train_time:15440ms step_avg:60.08ms -step:258/2285 train_time:15498ms step_avg:60.07ms -step:259/2285 train_time:15558ms step_avg:60.07ms -step:260/2285 train_time:15616ms step_avg:60.06ms -step:261/2285 train_time:15676ms step_avg:60.06ms -step:262/2285 train_time:15734ms step_avg:60.05ms -step:263/2285 train_time:15794ms step_avg:60.05ms -step:264/2285 train_time:15852ms step_avg:60.04ms -step:265/2285 train_time:15912ms step_avg:60.04ms -step:266/2285 train_time:15969ms step_avg:60.04ms -step:267/2285 train_time:16030ms step_avg:60.04ms -step:268/2285 train_time:16090ms step_avg:60.04ms -step:269/2285 train_time:16153ms step_avg:60.05ms -step:270/2285 train_time:16213ms step_avg:60.05ms -step:271/2285 train_time:16276ms step_avg:60.06ms -step:272/2285 train_time:16335ms step_avg:60.06ms -step:273/2285 train_time:16396ms step_avg:60.06ms -step:274/2285 train_time:16455ms step_avg:60.05ms -step:275/2285 train_time:16515ms step_avg:60.06ms -step:276/2285 train_time:16574ms step_avg:60.05ms -step:277/2285 train_time:16634ms step_avg:60.05ms -step:278/2285 train_time:16692ms step_avg:60.04ms -step:279/2285 train_time:16753ms step_avg:60.05ms -step:280/2285 train_time:16811ms step_avg:60.04ms -step:281/2285 train_time:16872ms step_avg:60.04ms -step:282/2285 train_time:16930ms step_avg:60.04ms -step:283/2285 train_time:16991ms step_avg:60.04ms -step:284/2285 train_time:17049ms step_avg:60.03ms -step:285/2285 train_time:17110ms step_avg:60.04ms -step:286/2285 train_time:17170ms step_avg:60.04ms -step:287/2285 train_time:17232ms step_avg:60.04ms -step:288/2285 train_time:17291ms step_avg:60.04ms -step:289/2285 train_time:17353ms step_avg:60.05ms -step:290/2285 train_time:17412ms step_avg:60.04ms -step:291/2285 train_time:17473ms step_avg:60.05ms -step:292/2285 train_time:17532ms step_avg:60.04ms -step:293/2285 train_time:17593ms step_avg:60.04ms -step:294/2285 train_time:17651ms step_avg:60.04ms -step:295/2285 train_time:17711ms step_avg:60.04ms -step:296/2285 train_time:17770ms step_avg:60.03ms -step:297/2285 train_time:17830ms step_avg:60.03ms -step:298/2285 train_time:17888ms 
step_avg:60.03ms -step:299/2285 train_time:17949ms step_avg:60.03ms -step:300/2285 train_time:18008ms step_avg:60.03ms -step:301/2285 train_time:18069ms step_avg:60.03ms -step:302/2285 train_time:18128ms step_avg:60.03ms -step:303/2285 train_time:18190ms step_avg:60.03ms -step:304/2285 train_time:18248ms step_avg:60.03ms -step:305/2285 train_time:18310ms step_avg:60.03ms -step:306/2285 train_time:18369ms step_avg:60.03ms -step:307/2285 train_time:18431ms step_avg:60.04ms -step:308/2285 train_time:18489ms step_avg:60.03ms -step:309/2285 train_time:18551ms step_avg:60.03ms -step:310/2285 train_time:18609ms step_avg:60.03ms -step:311/2285 train_time:18670ms step_avg:60.03ms -step:312/2285 train_time:18728ms step_avg:60.03ms -step:313/2285 train_time:18789ms step_avg:60.03ms -step:314/2285 train_time:18847ms step_avg:60.02ms -step:315/2285 train_time:18908ms step_avg:60.03ms -step:316/2285 train_time:18967ms step_avg:60.02ms -step:317/2285 train_time:19027ms step_avg:60.02ms -step:318/2285 train_time:19086ms step_avg:60.02ms -step:319/2285 train_time:19147ms step_avg:60.02ms -step:320/2285 train_time:19206ms step_avg:60.02ms -step:321/2285 train_time:19268ms step_avg:60.02ms -step:322/2285 train_time:19326ms step_avg:60.02ms -step:323/2285 train_time:19387ms step_avg:60.02ms -step:324/2285 train_time:19446ms step_avg:60.02ms -step:325/2285 train_time:19507ms step_avg:60.02ms -step:326/2285 train_time:19566ms step_avg:60.02ms -step:327/2285 train_time:19627ms step_avg:60.02ms -step:328/2285 train_time:19686ms step_avg:60.02ms -step:329/2285 train_time:19746ms step_avg:60.02ms -step:330/2285 train_time:19804ms step_avg:60.01ms -step:331/2285 train_time:19865ms step_avg:60.01ms -step:332/2285 train_time:19923ms step_avg:60.01ms -step:333/2285 train_time:19984ms step_avg:60.01ms -step:334/2285 train_time:20042ms step_avg:60.01ms -step:335/2285 train_time:20103ms step_avg:60.01ms -step:336/2285 train_time:20161ms step_avg:60.00ms -step:337/2285 train_time:20222ms step_avg:60.01ms -step:338/2285 train_time:20281ms step_avg:60.00ms -step:339/2285 train_time:20342ms step_avg:60.01ms -step:340/2285 train_time:20401ms step_avg:60.00ms -step:341/2285 train_time:20462ms step_avg:60.01ms -step:342/2285 train_time:20521ms step_avg:60.00ms -step:343/2285 train_time:20582ms step_avg:60.01ms -step:344/2285 train_time:20641ms step_avg:60.00ms -step:345/2285 train_time:20702ms step_avg:60.01ms -step:346/2285 train_time:20761ms step_avg:60.00ms -step:347/2285 train_time:20822ms step_avg:60.00ms -step:348/2285 train_time:20880ms step_avg:60.00ms -step:349/2285 train_time:20941ms step_avg:60.00ms -step:350/2285 train_time:20999ms step_avg:60.00ms -step:351/2285 train_time:21060ms step_avg:60.00ms -step:352/2285 train_time:21119ms step_avg:60.00ms -step:353/2285 train_time:21179ms step_avg:60.00ms -step:354/2285 train_time:21238ms step_avg:59.99ms -step:355/2285 train_time:21298ms step_avg:60.00ms -step:356/2285 train_time:21357ms step_avg:59.99ms -step:357/2285 train_time:21419ms step_avg:60.00ms -step:358/2285 train_time:21478ms step_avg:59.99ms -step:359/2285 train_time:21539ms step_avg:60.00ms -step:360/2285 train_time:21598ms step_avg:59.99ms -step:361/2285 train_time:21659ms step_avg:60.00ms -step:362/2285 train_time:21718ms step_avg:59.99ms -step:363/2285 train_time:21779ms step_avg:60.00ms -step:364/2285 train_time:21837ms step_avg:59.99ms -step:365/2285 train_time:21898ms step_avg:59.99ms -step:366/2285 train_time:21956ms step_avg:59.99ms -step:367/2285 train_time:22017ms step_avg:59.99ms -step:368/2285 
train_time:22075ms step_avg:59.99ms -step:369/2285 train_time:22136ms step_avg:59.99ms -step:370/2285 train_time:22194ms step_avg:59.98ms -step:371/2285 train_time:22255ms step_avg:59.99ms -step:372/2285 train_time:22314ms step_avg:59.98ms -step:373/2285 train_time:22375ms step_avg:59.99ms -step:374/2285 train_time:22434ms step_avg:59.98ms -step:375/2285 train_time:22495ms step_avg:59.99ms -step:376/2285 train_time:22553ms step_avg:59.98ms -step:377/2285 train_time:22615ms step_avg:59.99ms -step:378/2285 train_time:22674ms step_avg:59.98ms -step:379/2285 train_time:22734ms step_avg:59.98ms -step:380/2285 train_time:22793ms step_avg:59.98ms -step:381/2285 train_time:22853ms step_avg:59.98ms -step:382/2285 train_time:22912ms step_avg:59.98ms -step:383/2285 train_time:22973ms step_avg:59.98ms -step:384/2285 train_time:23032ms step_avg:59.98ms -step:385/2285 train_time:23094ms step_avg:59.98ms -step:386/2285 train_time:23153ms step_avg:59.98ms -step:387/2285 train_time:23214ms step_avg:59.98ms -step:388/2285 train_time:23273ms step_avg:59.98ms -step:389/2285 train_time:23335ms step_avg:59.99ms -step:390/2285 train_time:23394ms step_avg:59.98ms -step:391/2285 train_time:23455ms step_avg:59.99ms -step:392/2285 train_time:23514ms step_avg:59.98ms -step:393/2285 train_time:23575ms step_avg:59.99ms -step:394/2285 train_time:23634ms step_avg:59.99ms -step:395/2285 train_time:23696ms step_avg:59.99ms -step:396/2285 train_time:23755ms step_avg:59.99ms -step:397/2285 train_time:23816ms step_avg:59.99ms -step:398/2285 train_time:23875ms step_avg:59.99ms -step:399/2285 train_time:23936ms step_avg:59.99ms -step:400/2285 train_time:23995ms step_avg:59.99ms -step:401/2285 train_time:24056ms step_avg:59.99ms -step:402/2285 train_time:24115ms step_avg:59.99ms -step:403/2285 train_time:24177ms step_avg:59.99ms -step:404/2285 train_time:24236ms step_avg:59.99ms -step:405/2285 train_time:24297ms step_avg:59.99ms -step:406/2285 train_time:24356ms step_avg:59.99ms -step:407/2285 train_time:24418ms step_avg:60.00ms -step:408/2285 train_time:24477ms step_avg:59.99ms -step:409/2285 train_time:24539ms step_avg:60.00ms -step:410/2285 train_time:24597ms step_avg:59.99ms -step:411/2285 train_time:24658ms step_avg:60.00ms -step:412/2285 train_time:24718ms step_avg:59.99ms -step:413/2285 train_time:24779ms step_avg:60.00ms -step:414/2285 train_time:24838ms step_avg:60.00ms -step:415/2285 train_time:24899ms step_avg:60.00ms -step:416/2285 train_time:24959ms step_avg:60.00ms -step:417/2285 train_time:25020ms step_avg:60.00ms -step:418/2285 train_time:25079ms step_avg:60.00ms -step:419/2285 train_time:25140ms step_avg:60.00ms -step:420/2285 train_time:25199ms step_avg:60.00ms -step:421/2285 train_time:25261ms step_avg:60.00ms -step:422/2285 train_time:25320ms step_avg:60.00ms -step:423/2285 train_time:25381ms step_avg:60.00ms -step:424/2285 train_time:25440ms step_avg:60.00ms -step:425/2285 train_time:25501ms step_avg:60.00ms -step:426/2285 train_time:25560ms step_avg:60.00ms -step:427/2285 train_time:25622ms step_avg:60.00ms -step:428/2285 train_time:25681ms step_avg:60.00ms -step:429/2285 train_time:25742ms step_avg:60.00ms -step:430/2285 train_time:25801ms step_avg:60.00ms -step:431/2285 train_time:25863ms step_avg:60.01ms -step:432/2285 train_time:25922ms step_avg:60.00ms -step:433/2285 train_time:25983ms step_avg:60.01ms -step:434/2285 train_time:26042ms step_avg:60.00ms -step:435/2285 train_time:26104ms step_avg:60.01ms -step:436/2285 train_time:26163ms step_avg:60.01ms -step:437/2285 train_time:26225ms step_avg:60.01ms 
-step:438/2285 train_time:26284ms step_avg:60.01ms -step:439/2285 train_time:26345ms step_avg:60.01ms -step:440/2285 train_time:26404ms step_avg:60.01ms -step:441/2285 train_time:26466ms step_avg:60.01ms -step:442/2285 train_time:26525ms step_avg:60.01ms -step:443/2285 train_time:26587ms step_avg:60.02ms -step:444/2285 train_time:26646ms step_avg:60.01ms -step:445/2285 train_time:26707ms step_avg:60.02ms -step:446/2285 train_time:26766ms step_avg:60.01ms -step:447/2285 train_time:26827ms step_avg:60.02ms -step:448/2285 train_time:26887ms step_avg:60.02ms -step:449/2285 train_time:26948ms step_avg:60.02ms -step:450/2285 train_time:27007ms step_avg:60.02ms -step:451/2285 train_time:27068ms step_avg:60.02ms -step:452/2285 train_time:27127ms step_avg:60.02ms -step:453/2285 train_time:27189ms step_avg:60.02ms -step:454/2285 train_time:27247ms step_avg:60.02ms -step:455/2285 train_time:27309ms step_avg:60.02ms -step:456/2285 train_time:27367ms step_avg:60.02ms -step:457/2285 train_time:27429ms step_avg:60.02ms -step:458/2285 train_time:27489ms step_avg:60.02ms -step:459/2285 train_time:27550ms step_avg:60.02ms -step:460/2285 train_time:27609ms step_avg:60.02ms -step:461/2285 train_time:27671ms step_avg:60.02ms -step:462/2285 train_time:27730ms step_avg:60.02ms -step:463/2285 train_time:27791ms step_avg:60.02ms -step:464/2285 train_time:27850ms step_avg:60.02ms -step:465/2285 train_time:27911ms step_avg:60.02ms -step:466/2285 train_time:27970ms step_avg:60.02ms -step:467/2285 train_time:28032ms step_avg:60.03ms -step:468/2285 train_time:28091ms step_avg:60.02ms -step:469/2285 train_time:28153ms step_avg:60.03ms -step:470/2285 train_time:28213ms step_avg:60.03ms -step:471/2285 train_time:28274ms step_avg:60.03ms -step:472/2285 train_time:28333ms step_avg:60.03ms -step:473/2285 train_time:28395ms step_avg:60.03ms -step:474/2285 train_time:28454ms step_avg:60.03ms -step:475/2285 train_time:28515ms step_avg:60.03ms -step:476/2285 train_time:28574ms step_avg:60.03ms -step:477/2285 train_time:28636ms step_avg:60.03ms -step:478/2285 train_time:28694ms step_avg:60.03ms -step:479/2285 train_time:28755ms step_avg:60.03ms -step:480/2285 train_time:28814ms step_avg:60.03ms -step:481/2285 train_time:28875ms step_avg:60.03ms -step:482/2285 train_time:28934ms step_avg:60.03ms -step:483/2285 train_time:28995ms step_avg:60.03ms -step:484/2285 train_time:29054ms step_avg:60.03ms -step:485/2285 train_time:29116ms step_avg:60.03ms -step:486/2285 train_time:29175ms step_avg:60.03ms -step:487/2285 train_time:29236ms step_avg:60.03ms -step:488/2285 train_time:29295ms step_avg:60.03ms -step:489/2285 train_time:29356ms step_avg:60.03ms -step:490/2285 train_time:29415ms step_avg:60.03ms -step:491/2285 train_time:29476ms step_avg:60.03ms -step:492/2285 train_time:29536ms step_avg:60.03ms -step:493/2285 train_time:29596ms step_avg:60.03ms -step:494/2285 train_time:29655ms step_avg:60.03ms -step:495/2285 train_time:29717ms step_avg:60.03ms -step:496/2285 train_time:29776ms step_avg:60.03ms -step:497/2285 train_time:29838ms step_avg:60.04ms -step:498/2285 train_time:29896ms step_avg:60.03ms -step:499/2285 train_time:29958ms step_avg:60.04ms -step:500/2285 train_time:30017ms step_avg:60.03ms -step:500/2285 val_loss:3.7901 train_time:30079ms step_avg:60.16ms -step:501/2285 train_time:30099ms step_avg:60.08ms -step:502/2285 train_time:30140ms step_avg:60.04ms -step:503/2285 train_time:30200ms step_avg:60.04ms -step:504/2285 train_time:30258ms step_avg:60.04ms -step:505/2285 train_time:30319ms step_avg:60.04ms -step:506/2285 
train_time:30378ms step_avg:60.04ms -step:507/2285 train_time:30439ms step_avg:60.04ms -step:508/2285 train_time:30497ms step_avg:60.03ms -step:509/2285 train_time:30557ms step_avg:60.03ms -step:510/2285 train_time:30615ms step_avg:60.03ms -step:511/2285 train_time:30676ms step_avg:60.03ms -step:512/2285 train_time:30734ms step_avg:60.03ms -step:513/2285 train_time:30794ms step_avg:60.03ms -step:514/2285 train_time:30852ms step_avg:60.02ms -step:515/2285 train_time:30913ms step_avg:60.02ms -step:516/2285 train_time:30973ms step_avg:60.02ms -step:517/2285 train_time:31038ms step_avg:60.04ms -step:518/2285 train_time:31100ms step_avg:60.04ms -step:519/2285 train_time:31161ms step_avg:60.04ms -step:520/2285 train_time:31220ms step_avg:60.04ms -step:521/2285 train_time:31282ms step_avg:60.04ms -step:522/2285 train_time:31341ms step_avg:60.04ms -step:523/2285 train_time:31402ms step_avg:60.04ms -step:524/2285 train_time:31461ms step_avg:60.04ms -step:525/2285 train_time:31522ms step_avg:60.04ms -step:526/2285 train_time:31581ms step_avg:60.04ms -step:527/2285 train_time:31642ms step_avg:60.04ms -step:528/2285 train_time:31701ms step_avg:60.04ms -step:529/2285 train_time:31762ms step_avg:60.04ms -step:530/2285 train_time:31822ms step_avg:60.04ms -step:531/2285 train_time:31883ms step_avg:60.04ms -step:532/2285 train_time:31943ms step_avg:60.04ms -step:533/2285 train_time:32007ms step_avg:60.05ms -step:534/2285 train_time:32067ms step_avg:60.05ms -step:535/2285 train_time:32130ms step_avg:60.06ms -step:536/2285 train_time:32189ms step_avg:60.05ms -step:537/2285 train_time:32251ms step_avg:60.06ms -step:538/2285 train_time:32310ms step_avg:60.06ms -step:539/2285 train_time:32371ms step_avg:60.06ms -step:540/2285 train_time:32431ms step_avg:60.06ms -step:541/2285 train_time:32492ms step_avg:60.06ms -step:542/2285 train_time:32551ms step_avg:60.06ms -step:543/2285 train_time:32612ms step_avg:60.06ms -step:544/2285 train_time:32671ms step_avg:60.06ms -step:545/2285 train_time:32733ms step_avg:60.06ms -step:546/2285 train_time:32792ms step_avg:60.06ms -step:547/2285 train_time:32853ms step_avg:60.06ms -step:548/2285 train_time:32912ms step_avg:60.06ms -step:549/2285 train_time:32974ms step_avg:60.06ms -step:550/2285 train_time:33033ms step_avg:60.06ms -step:551/2285 train_time:33095ms step_avg:60.06ms -step:552/2285 train_time:33154ms step_avg:60.06ms -step:553/2285 train_time:33216ms step_avg:60.06ms -step:554/2285 train_time:33275ms step_avg:60.06ms -step:555/2285 train_time:33336ms step_avg:60.06ms -step:556/2285 train_time:33395ms step_avg:60.06ms -step:557/2285 train_time:33456ms step_avg:60.06ms -step:558/2285 train_time:33515ms step_avg:60.06ms -step:559/2285 train_time:33576ms step_avg:60.06ms -step:560/2285 train_time:33635ms step_avg:60.06ms -step:561/2285 train_time:33696ms step_avg:60.06ms -step:562/2285 train_time:33755ms step_avg:60.06ms -step:563/2285 train_time:33817ms step_avg:60.07ms -step:564/2285 train_time:33876ms step_avg:60.06ms -step:565/2285 train_time:33938ms step_avg:60.07ms -step:566/2285 train_time:33996ms step_avg:60.06ms -step:567/2285 train_time:34058ms step_avg:60.07ms -step:568/2285 train_time:34117ms step_avg:60.07ms -step:569/2285 train_time:34178ms step_avg:60.07ms -step:570/2285 train_time:34237ms step_avg:60.06ms -step:571/2285 train_time:34298ms step_avg:60.07ms -step:572/2285 train_time:34356ms step_avg:60.06ms -step:573/2285 train_time:34418ms step_avg:60.07ms -step:574/2285 train_time:34476ms step_avg:60.06ms -step:575/2285 train_time:34537ms step_avg:60.06ms 
-step:576/2285 train_time:34595ms step_avg:60.06ms -step:577/2285 train_time:34656ms step_avg:60.06ms -step:578/2285 train_time:34716ms step_avg:60.06ms -step:579/2285 train_time:34777ms step_avg:60.06ms -step:580/2285 train_time:34836ms step_avg:60.06ms -step:581/2285 train_time:34897ms step_avg:60.06ms -step:582/2285 train_time:34956ms step_avg:60.06ms -step:583/2285 train_time:35017ms step_avg:60.06ms -step:584/2285 train_time:35077ms step_avg:60.06ms -step:585/2285 train_time:35138ms step_avg:60.07ms -step:586/2285 train_time:35197ms step_avg:60.06ms -step:587/2285 train_time:35258ms step_avg:60.06ms -step:588/2285 train_time:35317ms step_avg:60.06ms -step:589/2285 train_time:35378ms step_avg:60.06ms -step:590/2285 train_time:35436ms step_avg:60.06ms -step:591/2285 train_time:35497ms step_avg:60.06ms -step:592/2285 train_time:35556ms step_avg:60.06ms -step:593/2285 train_time:35617ms step_avg:60.06ms -step:594/2285 train_time:35676ms step_avg:60.06ms -step:595/2285 train_time:35739ms step_avg:60.07ms -step:596/2285 train_time:35796ms step_avg:60.06ms -step:597/2285 train_time:35857ms step_avg:60.06ms -step:598/2285 train_time:35916ms step_avg:60.06ms -step:599/2285 train_time:35978ms step_avg:60.06ms -step:600/2285 train_time:36036ms step_avg:60.06ms -step:601/2285 train_time:36098ms step_avg:60.06ms -step:602/2285 train_time:36157ms step_avg:60.06ms -step:603/2285 train_time:36218ms step_avg:60.06ms -step:604/2285 train_time:36277ms step_avg:60.06ms -step:605/2285 train_time:36339ms step_avg:60.06ms -step:606/2285 train_time:36397ms step_avg:60.06ms -step:607/2285 train_time:36459ms step_avg:60.06ms -step:608/2285 train_time:36517ms step_avg:60.06ms -step:609/2285 train_time:36578ms step_avg:60.06ms -step:610/2285 train_time:36637ms step_avg:60.06ms -step:611/2285 train_time:36698ms step_avg:60.06ms -step:612/2285 train_time:36756ms step_avg:60.06ms -step:613/2285 train_time:36818ms step_avg:60.06ms -step:614/2285 train_time:36877ms step_avg:60.06ms -step:615/2285 train_time:36938ms step_avg:60.06ms -step:616/2285 train_time:36997ms step_avg:60.06ms -step:617/2285 train_time:37059ms step_avg:60.06ms -step:618/2285 train_time:37118ms step_avg:60.06ms -step:619/2285 train_time:37179ms step_avg:60.06ms -step:620/2285 train_time:37238ms step_avg:60.06ms -step:621/2285 train_time:37299ms step_avg:60.06ms -step:622/2285 train_time:37359ms step_avg:60.06ms -step:623/2285 train_time:37420ms step_avg:60.06ms -step:624/2285 train_time:37478ms step_avg:60.06ms -step:625/2285 train_time:37540ms step_avg:60.06ms -step:626/2285 train_time:37599ms step_avg:60.06ms -step:627/2285 train_time:37660ms step_avg:60.06ms -step:628/2285 train_time:37719ms step_avg:60.06ms -step:629/2285 train_time:37781ms step_avg:60.06ms -step:630/2285 train_time:37840ms step_avg:60.06ms -step:631/2285 train_time:37902ms step_avg:60.07ms -step:632/2285 train_time:37961ms step_avg:60.06ms -step:633/2285 train_time:38022ms step_avg:60.07ms -step:634/2285 train_time:38081ms step_avg:60.06ms -step:635/2285 train_time:38143ms step_avg:60.07ms -step:636/2285 train_time:38202ms step_avg:60.07ms -step:637/2285 train_time:38263ms step_avg:60.07ms -step:638/2285 train_time:38323ms step_avg:60.07ms -step:639/2285 train_time:38384ms step_avg:60.07ms -step:640/2285 train_time:38444ms step_avg:60.07ms -step:641/2285 train_time:38506ms step_avg:60.07ms -step:642/2285 train_time:38566ms step_avg:60.07ms -step:643/2285 train_time:38627ms step_avg:60.07ms -step:644/2285 train_time:38687ms step_avg:60.07ms -step:645/2285 train_time:38748ms 
step_avg:60.07ms -step:646/2285 train_time:38808ms step_avg:60.07ms -step:647/2285 train_time:38869ms step_avg:60.08ms -step:648/2285 train_time:38928ms step_avg:60.07ms -step:649/2285 train_time:38990ms step_avg:60.08ms -step:650/2285 train_time:39050ms step_avg:60.08ms -step:651/2285 train_time:39111ms step_avg:60.08ms -step:652/2285 train_time:39171ms step_avg:60.08ms -step:653/2285 train_time:39232ms step_avg:60.08ms -step:654/2285 train_time:39292ms step_avg:60.08ms -step:655/2285 train_time:39353ms step_avg:60.08ms -step:656/2285 train_time:39412ms step_avg:60.08ms -step:657/2285 train_time:39473ms step_avg:60.08ms -step:658/2285 train_time:39532ms step_avg:60.08ms -step:659/2285 train_time:39594ms step_avg:60.08ms -step:660/2285 train_time:39653ms step_avg:60.08ms -step:661/2285 train_time:39714ms step_avg:60.08ms -step:662/2285 train_time:39773ms step_avg:60.08ms -step:663/2285 train_time:39835ms step_avg:60.08ms -step:664/2285 train_time:39893ms step_avg:60.08ms -step:665/2285 train_time:39955ms step_avg:60.08ms -step:666/2285 train_time:40014ms step_avg:60.08ms -step:667/2285 train_time:40075ms step_avg:60.08ms -step:668/2285 train_time:40134ms step_avg:60.08ms -step:669/2285 train_time:40196ms step_avg:60.08ms -step:670/2285 train_time:40255ms step_avg:60.08ms -step:671/2285 train_time:40317ms step_avg:60.08ms -step:672/2285 train_time:40376ms step_avg:60.08ms -step:673/2285 train_time:40437ms step_avg:60.08ms -step:674/2285 train_time:40495ms step_avg:60.08ms -step:675/2285 train_time:40557ms step_avg:60.08ms -step:676/2285 train_time:40616ms step_avg:60.08ms -step:677/2285 train_time:40677ms step_avg:60.08ms -step:678/2285 train_time:40736ms step_avg:60.08ms -step:679/2285 train_time:40797ms step_avg:60.08ms -step:680/2285 train_time:40857ms step_avg:60.08ms -step:681/2285 train_time:40918ms step_avg:60.09ms -step:682/2285 train_time:40977ms step_avg:60.08ms -step:683/2285 train_time:41039ms step_avg:60.09ms -step:684/2285 train_time:41097ms step_avg:60.08ms -step:685/2285 train_time:41159ms step_avg:60.09ms -step:686/2285 train_time:41217ms step_avg:60.08ms -step:687/2285 train_time:41278ms step_avg:60.08ms -step:688/2285 train_time:41337ms step_avg:60.08ms -step:689/2285 train_time:41398ms step_avg:60.08ms -step:690/2285 train_time:41457ms step_avg:60.08ms -step:691/2285 train_time:41518ms step_avg:60.08ms -step:692/2285 train_time:41577ms step_avg:60.08ms -step:693/2285 train_time:41638ms step_avg:60.08ms -step:694/2285 train_time:41697ms step_avg:60.08ms -step:695/2285 train_time:41758ms step_avg:60.08ms -step:696/2285 train_time:41817ms step_avg:60.08ms -step:697/2285 train_time:41879ms step_avg:60.08ms -step:698/2285 train_time:41938ms step_avg:60.08ms -step:699/2285 train_time:41999ms step_avg:60.08ms -step:700/2285 train_time:42058ms step_avg:60.08ms -step:701/2285 train_time:42120ms step_avg:60.08ms -step:702/2285 train_time:42178ms step_avg:60.08ms -step:703/2285 train_time:42239ms step_avg:60.08ms -step:704/2285 train_time:42298ms step_avg:60.08ms -step:705/2285 train_time:42360ms step_avg:60.08ms -step:706/2285 train_time:42418ms step_avg:60.08ms -step:707/2285 train_time:42480ms step_avg:60.08ms -step:708/2285 train_time:42538ms step_avg:60.08ms -step:709/2285 train_time:42600ms step_avg:60.08ms -step:710/2285 train_time:42659ms step_avg:60.08ms -step:711/2285 train_time:42720ms step_avg:60.08ms -step:712/2285 train_time:42779ms step_avg:60.08ms -step:713/2285 train_time:42840ms step_avg:60.08ms -step:714/2285 train_time:42899ms step_avg:60.08ms -step:715/2285 
train_time:42961ms step_avg:60.09ms
step:716/2285 train_time:43020ms step_avg:60.08ms
[... per-step entries for steps 717-749 omitted; train_time climbs 43081ms to 45015ms, step_avg steady at ~60.09ms ...]
step:750/2285 train_time:45074ms step_avg:60.10ms
step:750/2285 val_loss:3.6583 train_time:45136ms step_avg:60.18ms
step:751/2285 train_time:45155ms step_avg:60.13ms
[... per-step entries for steps 752-999 omitted; train_time climbs 45196ms to 60195ms, step_avg drifts 60.10ms to 60.26ms ...]
step:1000/2285 train_time:60255ms step_avg:60.25ms
step:1000/2285 val_loss:3.5692 train_time:60318ms step_avg:60.32ms
step:1001/2285 train_time:60337ms step_avg:60.28ms
[... per-step entries for steps 1002-1249 omitted; train_time climbs 60380ms to 75428ms, step_avg drifts 60.26ms to 60.39ms ...]
step:1250/2285 train_time:75487ms step_avg:60.39ms
step:1250/2285 val_loss:3.4960 train_time:75551ms step_avg:60.44ms
step:1251/2285 train_time:75570ms step_avg:60.41ms
[... per-step entries for steps 1252-1499 omitted; train_time climbs 75610ms to 90687ms, step_avg drifts 60.39ms to 60.50ms ...]
step:1500/2285 train_time:90746ms step_avg:60.50ms
step:1500/2285 val_loss:3.4273 train_time:90810ms step_avg:60.54ms
step:1501/2285 train_time:90828ms step_avg:60.51ms
[... per-step entries for steps 1502-1749 omitted; train_time climbs 90870ms to 105982ms, step_avg drifts 60.50ms to 60.60ms ...]
step:1750/2285 train_time:106043ms step_avg:60.60ms
step:1750/2285 val_loss:3.3663 train_time:106108ms step_avg:60.63ms
step:1751/2285 train_time:106129ms step_avg:60.61ms
[... per-step entries for steps 1752-1999 omitted; train_time climbs 106167ms to 121285ms, step_avg drifts 60.60ms to 60.67ms ...]
step:2000/2285 train_time:121345ms step_avg:60.67ms
step:2000/2285 val_loss:3.3174 train_time:121410ms step_avg:60.70ms
step:2001/2285 train_time:121428ms step_avg:60.68ms
[... per-step entries for steps 2002-2136 omitted; train_time climbs 121470ms to 129678ms, step_avg drifts 60.67ms to 60.71ms ...]
step:2137/2285 train_time:129740ms
step_avg:60.71ms -step:2138/2285 train_time:129800ms step_avg:60.71ms -step:2139/2285 train_time:129863ms step_avg:60.71ms -step:2140/2285 train_time:129923ms step_avg:60.71ms -step:2141/2285 train_time:129985ms step_avg:60.71ms -step:2142/2285 train_time:130045ms step_avg:60.71ms -step:2143/2285 train_time:130108ms step_avg:60.71ms -step:2144/2285 train_time:130168ms step_avg:60.71ms -step:2145/2285 train_time:130230ms step_avg:60.71ms -step:2146/2285 train_time:130290ms step_avg:60.71ms -step:2147/2285 train_time:130352ms step_avg:60.71ms -step:2148/2285 train_time:130412ms step_avg:60.71ms -step:2149/2285 train_time:130475ms step_avg:60.71ms -step:2150/2285 train_time:130535ms step_avg:60.71ms -step:2151/2285 train_time:130599ms step_avg:60.72ms -step:2152/2285 train_time:130658ms step_avg:60.71ms -step:2153/2285 train_time:130720ms step_avg:60.72ms -step:2154/2285 train_time:130781ms step_avg:60.72ms -step:2155/2285 train_time:130844ms step_avg:60.72ms -step:2156/2285 train_time:130904ms step_avg:60.72ms -step:2157/2285 train_time:130966ms step_avg:60.72ms -step:2158/2285 train_time:131026ms step_avg:60.72ms -step:2159/2285 train_time:131088ms step_avg:60.72ms -step:2160/2285 train_time:131148ms step_avg:60.72ms -step:2161/2285 train_time:131210ms step_avg:60.72ms -step:2162/2285 train_time:131270ms step_avg:60.72ms -step:2163/2285 train_time:131333ms step_avg:60.72ms -step:2164/2285 train_time:131393ms step_avg:60.72ms -step:2165/2285 train_time:131455ms step_avg:60.72ms -step:2166/2285 train_time:131515ms step_avg:60.72ms -step:2167/2285 train_time:131579ms step_avg:60.72ms -step:2168/2285 train_time:131638ms step_avg:60.72ms -step:2169/2285 train_time:131701ms step_avg:60.72ms -step:2170/2285 train_time:131761ms step_avg:60.72ms -step:2171/2285 train_time:131824ms step_avg:60.72ms -step:2172/2285 train_time:131884ms step_avg:60.72ms -step:2173/2285 train_time:131946ms step_avg:60.72ms -step:2174/2285 train_time:132006ms step_avg:60.72ms -step:2175/2285 train_time:132068ms step_avg:60.72ms -step:2176/2285 train_time:132128ms step_avg:60.72ms -step:2177/2285 train_time:132190ms step_avg:60.72ms -step:2178/2285 train_time:132250ms step_avg:60.72ms -step:2179/2285 train_time:132313ms step_avg:60.72ms -step:2180/2285 train_time:132375ms step_avg:60.72ms -step:2181/2285 train_time:132437ms step_avg:60.72ms -step:2182/2285 train_time:132497ms step_avg:60.72ms -step:2183/2285 train_time:132559ms step_avg:60.72ms -step:2184/2285 train_time:132619ms step_avg:60.72ms -step:2185/2285 train_time:132682ms step_avg:60.72ms -step:2186/2285 train_time:132741ms step_avg:60.72ms -step:2187/2285 train_time:132804ms step_avg:60.72ms -step:2188/2285 train_time:132864ms step_avg:60.72ms -step:2189/2285 train_time:132926ms step_avg:60.72ms -step:2190/2285 train_time:132987ms step_avg:60.72ms -step:2191/2285 train_time:133049ms step_avg:60.73ms -step:2192/2285 train_time:133109ms step_avg:60.72ms -step:2193/2285 train_time:133171ms step_avg:60.73ms -step:2194/2285 train_time:133231ms step_avg:60.73ms -step:2195/2285 train_time:133293ms step_avg:60.73ms -step:2196/2285 train_time:133354ms step_avg:60.73ms -step:2197/2285 train_time:133417ms step_avg:60.73ms -step:2198/2285 train_time:133477ms step_avg:60.73ms -step:2199/2285 train_time:133539ms step_avg:60.73ms -step:2200/2285 train_time:133599ms step_avg:60.73ms -step:2201/2285 train_time:133661ms step_avg:60.73ms -step:2202/2285 train_time:133721ms step_avg:60.73ms -step:2203/2285 train_time:133784ms step_avg:60.73ms -step:2204/2285 train_time:133844ms 
step_avg:60.73ms -step:2205/2285 train_time:133907ms step_avg:60.73ms -step:2206/2285 train_time:133967ms step_avg:60.73ms -step:2207/2285 train_time:134029ms step_avg:60.73ms -step:2208/2285 train_time:134089ms step_avg:60.73ms -step:2209/2285 train_time:134151ms step_avg:60.73ms -step:2210/2285 train_time:134212ms step_avg:60.73ms -step:2211/2285 train_time:134276ms step_avg:60.73ms -step:2212/2285 train_time:134336ms step_avg:60.73ms -step:2213/2285 train_time:134398ms step_avg:60.73ms -step:2214/2285 train_time:134458ms step_avg:60.73ms -step:2215/2285 train_time:134520ms step_avg:60.73ms -step:2216/2285 train_time:134581ms step_avg:60.73ms -step:2217/2285 train_time:134643ms step_avg:60.73ms -step:2218/2285 train_time:134703ms step_avg:60.73ms -step:2219/2285 train_time:134765ms step_avg:60.73ms -step:2220/2285 train_time:134825ms step_avg:60.73ms -step:2221/2285 train_time:134888ms step_avg:60.73ms -step:2222/2285 train_time:134947ms step_avg:60.73ms -step:2223/2285 train_time:135009ms step_avg:60.73ms -step:2224/2285 train_time:135070ms step_avg:60.73ms -step:2225/2285 train_time:135132ms step_avg:60.73ms -step:2226/2285 train_time:135192ms step_avg:60.73ms -step:2227/2285 train_time:135255ms step_avg:60.73ms -step:2228/2285 train_time:135316ms step_avg:60.73ms -step:2229/2285 train_time:135378ms step_avg:60.74ms -step:2230/2285 train_time:135438ms step_avg:60.73ms -step:2231/2285 train_time:135501ms step_avg:60.74ms -step:2232/2285 train_time:135561ms step_avg:60.74ms -step:2233/2285 train_time:135624ms step_avg:60.74ms -step:2234/2285 train_time:135684ms step_avg:60.74ms -step:2235/2285 train_time:135746ms step_avg:60.74ms -step:2236/2285 train_time:135806ms step_avg:60.74ms -step:2237/2285 train_time:135868ms step_avg:60.74ms -step:2238/2285 train_time:135927ms step_avg:60.74ms -step:2239/2285 train_time:135990ms step_avg:60.74ms -step:2240/2285 train_time:136050ms step_avg:60.74ms -step:2241/2285 train_time:136112ms step_avg:60.74ms -step:2242/2285 train_time:136172ms step_avg:60.74ms -step:2243/2285 train_time:136235ms step_avg:60.74ms -step:2244/2285 train_time:136296ms step_avg:60.74ms -step:2245/2285 train_time:136359ms step_avg:60.74ms -step:2246/2285 train_time:136420ms step_avg:60.74ms -step:2247/2285 train_time:136482ms step_avg:60.74ms -step:2248/2285 train_time:136542ms step_avg:60.74ms -step:2249/2285 train_time:136605ms step_avg:60.74ms -step:2250/2285 train_time:136665ms step_avg:60.74ms -step:2250/2285 val_loss:3.2822 train_time:136728ms step_avg:60.77ms -step:2251/2285 train_time:136747ms step_avg:60.75ms -step:2252/2285 train_time:136789ms step_avg:60.74ms -step:2253/2285 train_time:136855ms step_avg:60.74ms -step:2254/2285 train_time:136917ms step_avg:60.74ms -step:2255/2285 train_time:136980ms step_avg:60.74ms -step:2256/2285 train_time:137040ms step_avg:60.74ms -step:2257/2285 train_time:137102ms step_avg:60.75ms -step:2258/2285 train_time:137162ms step_avg:60.74ms -step:2259/2285 train_time:137224ms step_avg:60.75ms -step:2260/2285 train_time:137283ms step_avg:60.74ms -step:2261/2285 train_time:137345ms step_avg:60.75ms -step:2262/2285 train_time:137404ms step_avg:60.74ms -step:2263/2285 train_time:137466ms step_avg:60.75ms -step:2264/2285 train_time:137527ms step_avg:60.74ms -step:2265/2285 train_time:137589ms step_avg:60.75ms -step:2266/2285 train_time:137649ms step_avg:60.75ms -step:2267/2285 train_time:137713ms step_avg:60.75ms -step:2268/2285 train_time:137773ms step_avg:60.75ms -step:2269/2285 train_time:137837ms step_avg:60.75ms -step:2270/2285 
train_time:137897ms step_avg:60.75ms -step:2271/2285 train_time:137960ms step_avg:60.75ms -step:2272/2285 train_time:138020ms step_avg:60.75ms -step:2273/2285 train_time:138082ms step_avg:60.75ms -step:2274/2285 train_time:138142ms step_avg:60.75ms -step:2275/2285 train_time:138204ms step_avg:60.75ms -step:2276/2285 train_time:138263ms step_avg:60.75ms -step:2277/2285 train_time:138325ms step_avg:60.75ms -step:2278/2285 train_time:138384ms step_avg:60.75ms -step:2279/2285 train_time:138446ms step_avg:60.75ms -step:2280/2285 train_time:138506ms step_avg:60.75ms -step:2281/2285 train_time:138569ms step_avg:60.75ms -step:2282/2285 train_time:138630ms step_avg:60.75ms -step:2283/2285 train_time:138693ms step_avg:60.75ms -step:2284/2285 train_time:138753ms step_avg:60.75ms -step:2285/2285 train_time:138816ms step_avg:60.75ms -step:2285/2285 val_loss:3.2770 train_time:138877ms step_avg:60.78ms -peak memory allocated: 29626 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt b/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt
deleted file mode 100644
index 558ac579d..000000000
--- a/records/track_1_short/2025-10-27_FixMuonLR/72231598-c098-4e79-94f2-26952a4bbdc6.txt
+++ /dev/null
@@ -1,3814 +0,0 @@
-import os
-import sys
-
-with open(sys.argv[0]) as f:
-    code = f.read() # read the code of this file ASAP, for logging
-import copy
-import glob
-import math
-import threading
-import time
-import uuid
-from dataclasses import dataclass
-from collections import defaultdict
-from itertools import accumulate
-from pathlib import Path
-
-os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
-import torch
-
-torch.empty(
-    1, device="cuda", requires_grad=True
-).backward() # prevents a bug on some systems
-import torch._dynamo as dynamo
-import torch.distributed as dist
-import torch.nn.functional as F
-
-# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
-import triton
-import triton.language as tl
-from kernels import get_kernel
-from torch import Tensor, nn
-
-dynamo.config.recompile_limit = 64
-
-# -----------------------------------------------------------------------------
-# Custom operators: FP8 matmul by @YouJiacheng
-
-
-@torch.library.custom_op("nanogpt::mm", mutates_args=())
-def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
-    @torch.compile
-    def impl(x: Tensor, w: Tensor):
-        assert x.is_contiguous() and w.is_contiguous()
-        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
-        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
-        out = torch._scaled_mm(
-            x_f8,
-            w_f8.T,
-            out_dtype=torch.bfloat16,
-            scale_a=x.new_tensor(x_s, dtype=torch.float32),
-            scale_b=x.new_tensor(w_s, dtype=torch.float32),
-            use_fast_accum=True,
-        )
-        return out, x_f8, w_f8
-
-    return impl(x, w)
-
-@mm_op.register_fake
-def _(x: Tensor, w: Tensor, *_):
-    assert x.ndim == w.ndim == 2
-    assert x.shape[1] == w.shape[1]
-    assert x.device == w.device
-    assert x.is_contiguous() and w.is_contiguous()
-    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
-
-@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
-def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
-    @torch.compile
-    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
-        assert grad.is_contiguous()
-        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
-        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
-        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
-        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
-        grad_x = torch._scaled_mm(
-            grad_f8,
-            w_f8.T.contiguous().T,
-            out_dtype=torch.bfloat16,
-            scale_a=grad_inv_s,
-            scale_b=w_inv_s,
-            use_fast_accum=False,
-        )
-        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
-        grad_w = torch._scaled_mm(
-            x_f8.T.contiguous(),
-            grad_f8.T.contiguous().T,
-            out_dtype=torch.float32,
-            scale_a=x_inv_s,
-            scale_b=grad_inv_s,
-            use_fast_accum=False,
-        ).T
-        return grad_x, grad_w
-
-    return impl(g, x_f8, w_f8)
-
-@mm_backward_op.register_fake
-def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
-    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
-
-def backward(ctx, grad_out: Tensor, *_):
-    x_f8, w_f8 = ctx.saved_tensors
-    x_s, w_s, grad_s = ctx.scales
-    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
-        grad_out, x_f8, w_f8, x_s, w_s, grad_s
-    )
-    return grad_x, grad_w, None, None, None
-
-def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
-    *_, x_s, w_s, grad_s = inputs
-    _, x_f8, w_f8 = output
-    ctx.save_for_backward(x_f8, w_f8)
-    ctx.scales = x_s, w_s, grad_s
-    ctx.set_materialize_grads(False)
-
-mm_op.register_autograd(backward, setup_context=setup_context)
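# Illustrative sketch (an editor's aside, not part of the record) of exercising the
# nanogpt::mm op above on an fp8-capable GPU; shapes and unit scales are assumptions.
# The fp8 result should track the bf16 reference up to float8_e4m3fn rounding.
_x = torch.randn(128, 768, device="cuda", dtype=torch.bfloat16).contiguous()
_w = torch.randn(256, 768, device="cuda", dtype=torch.bfloat16).contiguous()
_out_fp8, _, _ = torch.ops.nanogpt.mm(_x, _w, 1.0, 1.0, 1.0)
print((_out_fp8 - _x @ _w.T).abs().mean())  # small but nonzero: fp8 quantization error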
-# -----------------------------------------------------------------------------
-# Triton kernel for symmetric matrix multiplication by @byronxu99
-
-def _get_autotune_configs():
-    return [
-        triton.Config(
-            {
-                "BLOCK_SIZE_M": bm,
-                "BLOCK_SIZE_N": bn,
-                "BLOCK_SIZE_K": bk,
-                "GROUP_SIZE_M": 8,
-                "LOWER_UPPER": 1,
-            },
-            num_stages=stages,
-            num_warps=warps,
-        )
-        for bm in [64, 128]
-        for bn in [64, 128, 256]
-        for bk in [64, 128]
-        for stages, warps in [(3, 4), (3, 8), (4, 4)]
-        if bm // bn <= 2 and bn // bm <= 2
-    ]
-
-@triton.jit
-def _pid_to_block(
-    pid,
-    M,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)
-
-    # Map PID to a single matrix in batch
-    batch_idx = pid // (num_pid_m * num_pid_n)
-    pid = pid % (num_pid_m * num_pid_n)
-
-    # Map PID to 2D grid of blocks
-    pid_m = pid // num_pid_n
-    pid_n = pid % num_pid_n
-    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
-
-    m_idx = pid_m * BLOCK_SIZE_M
-    n_idx = pid_n * BLOCK_SIZE_N
-    return batch_idx, m_idx, n_idx
-
-@triton.autotune(
-    configs=_get_autotune_configs(),
-    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
-)
-@triton.jit
-def XXT_kernel(
-    A_ptr, C_ptr,
-    M, K,
-    a_stride_b, a_stride_r, a_stride_c,
-    c_stride_b, c_stride_r, c_stride_c,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-    LOWER_UPPER: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    batch_idx, m_idx, n_idx = _pid_to_block(
-        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
-    )
-
-    # Skip blocks that don't need to be computed
-    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
-    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
-    if skip_block_below_diag or skip_block_above_diag:
-        return
-
-    # Index into one matrix of batch
-    A_ptr += batch_idx * a_stride_b
-    C_ptr += batch_idx * c_stride_b
-
-    # Create pointer arrays for A and A.T
-    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
-    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-    # Accumulate over blocks of K
-    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
-        accumulator = tl.dot(a, at, accumulator)
-        a_ptrs += BLOCK_SIZE_K * a_stride_c
-        at_ptrs += BLOCK_SIZE_K * a_stride_c
-
-    out_dtype = C_ptr.dtype.element_ty
-    output = accumulator.to(out_dtype)
-
-    # Store block of C
-    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-    tl.store(c_ptrs, output, mask=c_mask)
-
-    # Store block of C mirrored across the diagonal
-    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
-    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
-
-def XXT(A: torch.Tensor, out: torch.Tensor):
-    """
-    Launch Triton kernel to compute C = A @ A.T
-    """
-    assert A.ndim == 2 or A.ndim == 3
-    M, K = A.shape[-2:]
-    assert out.size(-2) == M, "Output matrix has incorrect shape"
-    assert out.size(-1) == M, "Output matrix has incorrect shape"
-
-    batch_size = A.size(0) if A.ndim == 3 else 1
-    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
-    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
-
-    grid = lambda meta: (
-        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
-    )
-    XXT_kernel[grid](
-        A_ptr=A,
-        C_ptr=out,
-        M=M,
-        K=K,
-        a_stride_b=input_batch_stride,
-        a_stride_r=A.stride(-2),
-        a_stride_c=A.stride(-1),
-        c_stride_b=output_batch_stride,
-        c_stride_r=out.stride(-2),
-        c_stride_c=out.stride(-1),
-    )
-    return out
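# A quick illustrative check of XXT above (example shapes assumed): the kernel computes
# one triangle of the symmetric product and mirrors it, so the result should agree with
# a plain A @ A.T up to bf16 rounding.
_A = torch.randn(512, 1024, device="cuda", dtype=torch.bfloat16)
_C = torch.empty(512, 512, device="cuda", dtype=torch.bfloat16)
XXT(_A, _C)
print((_C - _A @ _A.T).abs().max())  # small relative to the ~sqrt(K)-sized entries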
-@triton.autotune(
-    configs=_get_autotune_configs(),
-    key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
-)
-@triton.jit
-def ba_plus_cAA_kernel(
-    A_ptr, C_ptr,
-    M,
-    a_stride_b, a_stride_r, a_stride_c,
-    c_stride_b, c_stride_r, c_stride_c,
-    alpha, beta,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-    LOWER_UPPER: tl.constexpr,
-):
-    # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A
-    # Performance is slightly slower than XXT_kernel, so we use two separate kernels
-    pid = tl.program_id(axis=0)
-    batch_idx, m_idx, n_idx = _pid_to_block(
-        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
-    )
-
-    # Skip blocks that don't need to be computed
-    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
-    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
-    if skip_block_below_diag or skip_block_above_diag:
-        return
-
-    # Index into one matrix of batch
-    A_ptr += batch_idx * a_stride_b
-    C_ptr += batch_idx * c_stride_b
-
-    # Create pointer arrays for A and A.T
-    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
-    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-    # Accumulate over blocks of K
-    for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)):
-        a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0)
-        at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0)
-        accumulator = tl.dot(a, at, accumulator)
-        a_ptrs += BLOCK_SIZE_K * a_stride_c
-        at_ptrs += BLOCK_SIZE_K * a_stride_c
-
-    # Load block of A to add (corresponds to the current block of C)
-    offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M)
-    offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N)
-    a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c)
-    a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M)
-    a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32)
-
-    # Apply alpha and beta
-    accumulator *= alpha
-    accumulator += a_add * beta
-
-    out_dtype = C_ptr.dtype.element_ty
-    output = accumulator.to(out_dtype)
-
-    # Store block of C
-    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
-    tl.store(c_ptrs, output, mask=c_mask)
-
-    # Store block of C mirrored across the diagonal
-    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
-    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
-    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
-
-def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor):
-    """
-    Launch Triton kernel to compute C = alpha * A @ A.T + beta * A
-    """
-    assert A.ndim == 2 or A.ndim == 3
-    M, K = A.shape[-2:]
-    assert M == K, "Input matrix must be square"
-    assert out.size(-2) == M
-    assert out.size(-1) == M
-
-    batch_size = A.size(0) if A.ndim == 3 else 1
-    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
-    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
-
-    grid = lambda meta: (
-        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
-    )
-    ba_plus_cAA_kernel[grid](
-        A_ptr=A,
-        C_ptr=out,
-        M=M,
-        a_stride_b=input_batch_stride,
-        a_stride_r=A.stride(-2),
-        a_stride_c=A.stride(-1),
-        c_stride_b=output_batch_stride,
-        c_stride_r=out.stride(-2),
-        c_stride_c=out.stride(-1),
-        alpha=alpha,
-        beta=beta,
-    )
-    return out
-
-# Computed for num_iters=5, safety_factor=2e-2, cushion=2
-polar_express_coeffs = [
-    (8.156554524902461, -22.48329292557795, 15.878769915207462),
-    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
-    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
-    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
-    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
-]
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
-    @torch.no_grad()
-    def step(self):
-        # Efficient systems-wise implementation of step developed by @YouJiacheng,
-        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
-        # @ryanyang0, @vagrawal, and @varunneal.
-        rank = dist.get_rank()
-        group_infos = []
-        for group in self.param_groups:
-            params: list[Tensor] = group["params"]
-            if not params:
-                continue
-
-            chunk_size = group["chunk_size"]
-            padded_num_params = chunk_size * self.world_size
-
-            stacked_grads = torch.empty(
-                (padded_num_params, *params[0].shape),
-                dtype=params[0].dtype,
-                device=params[0].device
-            )
-            for i, p in enumerate(params):
-                stacked_grads[i].copy_(p.grad, non_blocking=True)
-            if len(params) < padded_num_params:
-                stacked_grads[len(params):].zero_()
-
-            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
-
-            reduce_future = dist.reduce_scatter_tensor(
-                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
-            ).get_future()
-
-            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
-
-        all_gather_infos = []
-        # Second pass: wait for gradients, compute updates for the local shard of parameters,
-        # and launch all async all_gather operations.
-        for group, info in zip(self.param_groups, group_infos):
-            info["reduce_future"].wait()
-
-            params = group["params"]
-            grad_chunk = info["grad_chunk"]
-            chunk_size = group["chunk_size"]
-            padded_num_params = chunk_size * self.world_size
-
-            start_idx = rank * chunk_size
-            module_idx = start_idx if start_idx < len(params) else 0
-
-            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
-
-            if "momentum_buffer" not in group:
-                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
-            momentum_buffer = group["momentum_buffer"]
-            # Apply momentum update to the persistent momentum buffer in-place
-            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
-            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
-
-            grad_shape = updated_grads.shape
-            if params[module_idx].label == 'attn':
-                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
-                for p in params[module_idx:module_idx + num_params]:
-                    assert p.label == 'attn'
-                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
-            ref_param = params[module_idx]
-            param_shape = ref_param.shape
-
-            if "second_momentum_buffer" not in group:
-                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
-                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
-                )
-            second_momentum_buffer = group["second_momentum_buffer"]
-
-            if "param_lr" not in group:
-                group["param_lr"] = (
-                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
-                    * ref_param.new_tensor(
-                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                    ).view(-1, 1, 1)
-                )
-
-                group["param_wd"] = ref_param.new_tensor(
-                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
-                ).view(-1, 1, 1)
-
-            # Determine LR and WD
-            eff_lr = group["lr"] * group["param_lr"]
-            eff_wd = group["weight_decay"] * group["param_wd"]
-
-            # Compute zeropower for the entire chunk in a single, batched call.
-            if num_params == 0:
-                v_chunk = updated_grads
-            elif params[module_idx].label == "smear_gate":
-                # dividing by magnitude is equivalent of SVN for 1d tensors
-                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
-            else:
-                v_chunk = polar_express(updated_grads)
-
-            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
-            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
-            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
-            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
-            v_chunk.mul_(step_size)
-            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
-            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
-
-            v_chunk = v_chunk.view(grad_shape)
-
-            updated_params = torch.empty_like(grad_chunk)
-            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
-            # Apply weight decay directly to the buffer.
-            param_chunk.mul_(1 - eff_wd)
-
-            param_chunk.add_(-eff_lr * v_chunk)
-
-            updated_params[:num_params].copy_(param_chunk)
-            if num_params < chunk_size:
-                updated_params[num_params:].zero_()
-
-            stacked_params = torch.empty(
-                (padded_num_params, *param_shape),
-                dtype=updated_params.dtype,
-                device=updated_params.device,
-            )
-
-            gather_future = dist.all_gather_into_tensor(
-                stacked_params, updated_params, async_op=True
-            ).get_future()
-
-            all_gather_infos.append(
-                {
-                    "gather_future": gather_future,
-                    "stacked_params": stacked_params,
-                    "orig_params": params,
-                }
-            )
-
-        # Final pass: wait for all_gather to complete and copy results back into original parameter tensors.
-        for info in all_gather_infos:
-            info["gather_future"].wait()
-            stacked_params = info["stacked_params"]
-            orig_params = info["orig_params"]
-
-            unstacked_params = torch.unbind(stacked_params)
-            for i, p in enumerate(orig_params):
-                p.copy_(unstacked_params[i], non_blocking=True)
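# Condensed single-tensor sketch (no sharding; sizes assumed) of the NorMuon rescaling
# inside step() above: divide the orthogonalized update by an EMA of its per-row second
# moment, then restore the original Frobenius norm so only the energy distribution changes.
_v = polar_express(torch.randn(3072, 768, device="cuda"))
_buf = torch.zeros(3072, 1, device="cuda").lerp_(_v.float().square().mean(dim=-1, keepdim=True), 1 - 0.95)
_scaled = _v * _buf.clamp_min(1e-10).rsqrt()
_scaled *= _v.norm() / _scaled.norm().clamp_min(1e-10)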
-class DistAdam(torch.optim.Optimizer):
-    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
-        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        params = list(params)
-        sizes = {p.shape for p in params}
-        # create one buffer per unique parameter-size
-        param_groups = []
-        for size in sizes:
-            group_params = [p for p in params if p.shape == size]
-            param_groups.append(dict(params=group_params))
-        super().__init__(param_groups, defaults)
-        # init state
-        for p in params:
-            chunk_size = p.size(0) // self.world_size
-            exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device)
-            exp_avg_sq = torch.zeros_like(exp_avg)
-            self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq)
-        # DistributedAdam implementation by @vagrawal
-
-    @torch.compile
-    @torch.no_grad()
-    def step(self):
-        rank = dist.get_rank()
-        reduce_scatter_futures: list[torch.Future] = []
-        all_gather_futures: list[torch.Future] = []
-        grad_slices = []
-        for group in self.param_groups:
-            params: list[Tensor] = group["params"]
-            for param in params:
-                grad = param.grad
-                rank_size = grad.shape[0] // self.world_size
-                grad_slice = torch.empty_like(grad[:rank_size])
-                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
-                grad_slices.append(grad_slice)
-
-        idx = 0
-        for group in self.param_groups:
-            beta1, beta2 = group['betas']
-            eps = group['eps']
-            wd = group['weight_decay']
-            params = group['params']
-            for param in params:
-                reduce_scatter_futures[idx].wait()
-                rank_size = param.shape[0] // self.world_size
-                p_slice = param[rank * rank_size:(rank + 1) * rank_size]
-                lr = group['lr'] * getattr(param, "lr_mul", 1.0)
-                state = self.state[param]
-                g_slice = grad_slices[idx]
-
-                exp_avg = state["exp_avg"]
-                exp_avg_sq = state["exp_avg_sq"]
-                state["step"] += 1
-                t = state["step"]
-                # weight decay
-                if wd != 0:
-                    eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0)
-                    p_slice.mul_(1 - eff_weight_decay)
-                # update running averages
-                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
-                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
-                # bias corrections
-                bias1 = 1 - beta1 ** t
-                bias2 = 1 - beta2 ** t
-                # compute step
-                denom = exp_avg_sq.sqrt().add_(eps)
-                step_size = lr * (bias2 ** 0.5 / bias1)
-                update = exp_avg.div(denom).mul_(step_size)
-                p_slice.add_(other=update, alpha=-1.0)
-                idx += 1
-                all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future())
-        torch.futures.collect_all(all_gather_futures).wait()
-
-# -----------------------------------------------------------------------------
-# PyTorch nn.Module definitions for the model
-
-def norm(x: Tensor):
-    return F.rms_norm(x, (x.size(-1),))
-
-class CastedLinear(nn.Linear):
-    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
-        super().__init__(in_features, out_features, bias=False)
-        self.use_fp8 = use_fp8
-        self.x_s = x_s
-        self.w_s = w_s
-        self.grad_s = grad_s
-
-    def reset_parameters(self) -> None:
-        with torch.no_grad():
-            self.weight.zero_() # @Grad62304977 and others
-
-    def forward(self, x: Tensor):
-        if self.use_fp8 and self.training:
-            _x = x.flatten(0, -2)
-            out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
-            return out.reshape(*x.shape[:-1], -1)
-        else:
-            return F.linear(x, self.weight.type_as(x))
-
-# yarn implementation @classiclarryd
-class Yarn(nn.Module):
-    def __init__(self, head_dim, max_seq_len):
-        super().__init__()
-        self.head_dim = head_dim
-        self.max_seq_len = max_seq_len
-        self.reset()
-
-    def reset(self):
-        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device)
-        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
-        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)])
-        t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device)
-        theta = torch.outer(t, angular_freq)
-        self.cos = nn.Buffer(
-            theta.cos().to(torch.bfloat16), persistent=False
-        )
-        self.sin = nn.Buffer(
-            theta.sin().to(torch.bfloat16), persistent=False
-        )
-        self.angular_freq = angular_freq
-        # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
-        self.attn_scale = 0.1
-
-    def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32):
-        rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi)
-        scaling_factor = old_window / new_window
-        interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1)
-        self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor)
-        t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device)
-        theta = torch.outer(t, self.angular_freq)
-        self.cos.copy_(theta.cos())
-        self.sin.copy_(theta.sin())
-        self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1
-
-def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor):
-    assert cos.size(0) >= x_BTHD.size(-3)
-    cos, sin = (
-        cos[None, : x_BTHD.size(-3), None, :],
-        sin[None, : x_BTHD.size(-3), None, :],
-    )
-    x1, x2 = x_BTHD.chunk(2, dim=-1)
-    y1 = x1 * cos + x2 * sin
-    y2 = x1 * (-sin) + x2 * cos
-    return torch.cat((y1, y2), 3)
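# Small illustrative check (assumes the script's global `device` has been set, which
# happens later in this file) that rotary() above is norm-preserving: pure rotations
# do not change per-position vector length.
_yarn = Yarn(head_dim=128, max_seq_len=1024)
_q = torch.randn(1, 16, 6, 128, device="cuda", dtype=torch.bfloat16)  # (B, T, H, D)
_qr = rotary(_q, _yarn.cos, _yarn.sin)
print(torch.allclose(_q.float().norm(dim=-1), _qr.float().norm(dim=-1), rtol=1e-2))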
-@dataclass
-class AttnArgs:
-    ve: torch.Tensor
-    sa_lambdas: torch.Tensor
-    seqlens: torch.Tensor
-    bm_size: int
-    cos: torch.Tensor
-    sin: torch.Tensor
-    attn_scale: float
-
-flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface
-
-class CausalSelfAttention(nn.Module):
-    def __init__(self, dim: int, head_dim: int, num_heads: int):
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_dim = head_dim
-        self.dim = dim
-        self.hdim = num_heads * head_dim
-
-        assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim"
-        std = 0.5 * (self.dim ** -0.5)
-        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
-        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
-        # https://x.com/hi_tysam/status/1879699187107033311
-        # make matrices the same shape as MLP to enable batched call in optimizer
-        self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4))
-        # label module to enable custom optimizer sizing
-        self.qkvo_w.label = 'attn'
-
-        with torch.no_grad():
-            self.qkvo_w.view(4, self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights
-            self.qkvo_w.view(4, self.hdim, self.dim)[3].zero_() # init output weights to zero
-
-        # sparse gated attention to enable context based no-op by @classiclarryd
-        self.attn_gate = CastedLinear(12, num_heads)
-        # label module to enable custom optimizer sizing
-        self.attn_gate.weight.label = 'attn_gate'
-
-    def forward(self, x: Tensor, attn_args: AttnArgs):
-        B, T = x.size(0), x.size(1) # batch size, sequence length
-        assert B == 1, "varlen sequences requires B == 1"
-        assert T % 16 == 0
-        # unpack attention args
-        cos, sin = attn_args.cos, attn_args.sin
-        ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas
-        seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size
-
-        q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
-        q, k = norm(q), norm(k) # QK norm @Grad62304977
-        q, k = rotary(q, cos, sin), rotary(k, cos, sin)
-        if ve is not None:
-            v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977
-        else: # skip mid-layers token value embeddings by @YouJiacheng
-            v = sa_lambdas[0] * v
-
-        max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size))
-
-        # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng
-        y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens,
-                                                        max_seqlen_q=max_len, max_seqlen_k=max_len,
-                                                        causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0))
-        y = y.view(B, T, self.num_heads, self.head_dim)
-        y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1)
-        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
-        y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y))
-        return y
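# Illustrative shape walk-through of the merged qkvo_w layout above: one [hdim, 4*dim]
# parameter viewed as four stacked [hdim, dim] matrices (Q, K, V, O) -- the same shape
# the MLP weights use, which is what lets the optimizer batch attn and mlp updates.
_qkvo = torch.empty(768, 4 * 768)
_q_w, _k_w, _v_w, _o_w = _qkvo.view(4, 768, 768)
print(_q_w.shape)  # torch.Size([768, 768])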
-class MLP(nn.Module):
-    def __init__(self, dim: int):
-        super().__init__()
-        hdim = 4 * dim
-        # make matrices the same shape to enable batched call in optimizer
-        self.c_fc = nn.Parameter(torch.empty(dim, hdim))
-        self.c_proj = nn.Parameter(torch.empty(dim, hdim))
-        # label modules to enable custom optimizer sizing
-        self.c_fc.label = 'mlp_up'
-        self.c_proj.label = 'mlp_down'
-        # corrective factor to account for transpose
-        self.c_fc.lr_mul = 2.
-
-        std = 0.5 * (dim ** -0.5)
-        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
-        with torch.no_grad():
-            self.c_fc.uniform_(-bound, bound)
-            self.c_proj.zero_() # zero init suggested by @Grad62304977
-
-    def forward(self, x: Tensor):
-        x = F.linear(x, self.c_fc.T.type_as(x))
-        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
-        x = F.linear(x, self.c_proj.type_as(x))
-        return x
-
-class Block(nn.Module):
-    def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int):
-        super().__init__()
-        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
-        self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None
-        # skip MLP blocks for first MLP layer by @EmelyanenkoK
-        self.mlp = MLP(dim) if layer_idx != 0 else None
-
-    def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs):
-        x = lambdas[0] * x + lambdas[1] * x0
-        if self.attn is not None:
-            x = x + self.attn(norm(x), attn_args)
-        if self.mlp is not None:
-            x = x + self.mlp(norm(x))
-        return x
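# Quick illustration of the ReLU^2 activation in MLP.forward above at a few points:
# zero for x <= 0 and x^2 for x > 0, a cheaper, sharper gate than GELU.
_xs = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
print(F.relu(_xs).square())  # tensor([0.0000, 0.0000, 0.0000, 0.2500, 4.0000])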
-# -----------------------------------------------------------------------------
-# The main model
-
-def next_multiple_of_n(v: float | int, *, n: int):
-    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
-
-class GPT(nn.Module):
-    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int):
-        super().__init__()
-        vocab_size = next_multiple_of_n(vocab_size, n=128)
-        self.embed = nn.Embedding(vocab_size, model_dim)
-        self.smear_gate = CastedLinear(12, 1)
-        # label modules to enable custom optimizer sizing
-        self.smear_gate.weight.label = 'smear_gate'
-        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
-        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
-        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
-        self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)])
-        self.yarn = Yarn(head_dim, max_seq_len)
-        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
-        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
-        use_fp8 = not os.environ.get("DISABLE_FP8", False)
-        self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448)
-        # Add learnable skip connection weights for decoder layers
-        assert num_layers % 2 == 0
-        pad = (-num_layers * 5 - 2) % dist.get_world_size()
-        self.scalars = nn.Parameter(
-            torch.cat(
-                [
-                    -1.5
-                    * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18
-                    *[
-                        torch.tensor([1.0, 0.0]) for _ in range(num_layers)
-                    ], # block lambdas
-                    *[
-                        torch.tensor([0.5, 0.5]) for _ in range(num_layers)
-                    ], # SA lambdas
-                    torch.zeros(1), # smear_lambda
-                    0.5 * torch.ones(1), # backout_lambda
-                    torch.ones(pad),
-                ]
-            )
-        )
-        # set learning rates
-        for param in self.embed.parameters():
-            param.lr_mul = 75.
-        for param in self.value_embeds.parameters():
-            param.lr_mul = 75.
-        self.lm_head.weight.lr_mul = 1.0
-        self.scalars.lr_mul = 5.0
-
-    def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int):
-        assert input_seq.ndim == 1
-
-        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
-        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
-        # dropping first layer updates this to .12 ... 012
-        ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
-        assert len(ve) == len(self.blocks)
-
-        short_bm = ws_short * args.block_size
-        long_bm = ws_long * args.block_size
-        bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm]
-        assert len(bm_sizes) == len(self.blocks)
-
-        x = self.embed(input_seq)
-
-        skip_weights = self.scalars[:(len(self.blocks) // 2)]
-        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
-        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
-        smear_lambda = self.scalars[5 * len(self.blocks)]
-        backout_lambda = self.scalars[5 * len(self.blocks)+1]
-
-        # smear token embed forward 1 position @classiclarryd
-        smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)]))
-        x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]])
-        x = x0 = norm(x[None])
-
-        # U-net design by @brendanh0gan
-        skip_connections = []
-        n = len(self.blocks) // 2
-
-        x_backout = None
-        backout_layer = 8
-        # skip layer zero
-        for i in range(1, len(self.blocks)):
-            attn_args = AttnArgs(
-                ve=ve[i],
-                sa_lambdas=sa_lambdas[i],
-                seqlens=seqlens,
-                bm_size=bm_sizes[i],
-                cos=self.yarn.cos,
-                sin=self.yarn.sin,
-                attn_scale=self.yarn.attn_scale
-            )
-            # since layer 0 is skipped, layer 11 does not have skip_connection
-            if i >= n and i < 11:
-                gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1)
-                x = x + gate * skip_connections.pop()
-            x = self.blocks[i](x, x0, lambdas[i], attn_args)
-            if i < n:
-                skip_connections.append(x)
-            if i == backout_layer:
-                x_backout = x
-
-        # back out contributions from first 8 layers that are only required for downstream context and not direct prediction
-        x -= backout_lambda * x_backout
-        x = norm(x)
-        logits = self.lm_head(x)
-        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
-        logits = 30 * torch.sigmoid(logits / 7.5)
-        logits_for_loss = logits.float() if not self.training else logits
-        loss = F.cross_entropy(
-            logits_for_loss.view(-1, logits_for_loss.size(-1)),
-            target_seq,
-            reduction="sum" if self.training else "mean",
-        )
-        return loss
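# The softcap above uses the identity 30*sigmoid(x/7.5) = 15*(tanh(x/15) + 1), which
# follows from 2*sigmoid(2*z) = tanh(z) + 1 with z = x/15: a tanh cap shifted into (0, 30).
_x = torch.linspace(-100.0, 100.0, 9)
print(torch.allclose(30 * torch.sigmoid(_x / 7.5), 15 * (torch.tanh(_x / 15) + 1), atol=1e-6))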
-# -----------------------------------------------------------------------------
-# Distributed data loader
-
-def _load_data_shard(file: Path):
-    header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32
-    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
-    assert header[1] == 1, "unsupported version"
-    num_tokens = int(header[2]) # number of tokens (claimed)
-    with file.open("rb", buffering=0) as f:
-        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
-        f.seek(256 * 4)
-        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
-        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
-    return tokens
-
-BOS_ID = 50256
-
-class BOSFinder:
-    # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
-    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
-        # Precompute BOS positions once per shard
-        self.tokens = tokens
-        self.size = tokens.numel()
-        self.quickload = quickload
-        if quickload:
-            # only scan first 4 million tokens, then kickoff async thread to scan rest
-            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-            self.thread = None
-            self.ready = threading.Event()
-            self.start()
-        else:
-            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.i = 0
-        self.world_size = world_size
-        self.batch_iter = 0
-
-    def _load(self):
-        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-            self.bos_idx = self.bos_idx_async
-
-    def next_batch(self, num_tokens_local: int, max_seq_len: int):
-        # if quickload was used, repoint to the full dataset after 5 batches
-        if self.quickload and self.batch_iter == 5:
-            self.get()
-        n = len(self.bos_idx)
-        starts = [[] for _ in range(self.world_size)]
-        ends = [[] for _ in range(self.world_size)]
-
-        idx = self.i
-        for r in range(self.world_size):
-            cur_len = 0
-            while cur_len <= num_tokens_local:
-                if idx >= n:
-                    raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
-                cur = self.bos_idx[idx]
-                starts[r].append(cur)
-                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
-                          cur + max_seq_len,
-                          cur + num_tokens_local - cur_len + 1)
-                ends[r].append(end)
-                cur_len += end - cur
-                idx += 1
-
-            assert cur_len == num_tokens_local + 1
-        self.i = idx
-        self.batch_iter += 1
-        return starts, ends
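# Tiny illustrative run of BOSFinder on a made-up token stream: every sequence starts on
# a BOS token and is cut at the next BOS, max_seq_len, or the remaining token budget.
_toks = torch.tensor([50256, 1, 2, 50256, 3, 4, 5, 6, 50256, 7, 8, 9])
_finder = BOSFinder(_toks, world_size=1)
print(_finder.next_batch(num_tokens_local=8, max_seq_len=4))  # ([[0, 3, 8]], [[3, 7, 10]])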
-class DataPreloader:
-    # Helper for asynchronously loading the next shard and indexing its BOS tokens
-    def __init__(self, file_iter, world_size: int = 1):
-        self.file_iter = file_iter
-        self.world_size = world_size
-        self.thread = None
-        self.data = None
-        self.ready = threading.Event()
-
-    def _load(self):
-        tokens = _load_data_shard(next(self.file_iter))
-        self.data = (tokens, BOSFinder(tokens, self.world_size))
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-        return self.data
-
-def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
-    # align_to_bos: each sequence begins with a Beginning of Sequence token; sequences are truncated to max_seq_len
-    rank = dist.get_rank() if dist.is_initialized() else 0
-    world_size = dist.get_world_size() if dist.is_initialized() else 1
-    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world_size * grad_accum_steps"
-    num_tokens = num_tokens // grad_accum_steps
-
-    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
-    if not files:
-        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
-
-    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
-    tokens = _load_data_shard(next(file_iter))
-    if align_to_bos:
-        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
-        preloader = DataPreloader(file_iter, world_size)
-        preloader.start()
-    else:
-        pos = 0  # for the unaligned case
-
-    while True:
-        num_tokens_local = num_tokens // world_size
-        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400
-
-        if align_to_bos:
-            try:
-                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
-                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
-            except StopIteration:
-                # This shard is exhausted; load the next one in the next loop iteration.
-                tokens, finder = preloader.get()
-                preloader.start()
-                continue
-
-            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
-            _inputs = buf[:-1]
-            _targets = buf[1:]
-            end_idxs[-1] -= 1  # the last sequence was one token too long, to account for the _targets offset
-            cum_lengths = (end_idxs - start_idxs).cumsum(0)
-
-        else:
-            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
-                tokens, pos = _load_data_shard(next(file_iter)), 0
-
-            pos_local = pos + rank * num_tokens_local
-            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
-            _inputs = buf[:-1].view(num_tokens_local)
-            _targets = buf[1:].view(num_tokens_local)
-
-            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
-            pos += num_tokens
-
-        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
-        _cum_lengths[0] = 0
-        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
-
-        new_params = yield (
-            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
-            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
-            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
-        )
-
-        if new_params is not None:
-            # lets the generator receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
-            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
-            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "Num tokens must be divisible by world_size * grad_accum_steps"
-            num_tokens = new_num_tokens
-            max_seq_len = new_max_seq_len
-            grad_accum_steps = new_grad_accum_steps
-
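-# Usage sketch, mirroring the training loop below: the generator is driven with
-# next(), and .send() both retargets the batch geometry and yields the next batch:
-#     loader = distributed_data_generator(pattern, num_tokens, max_seq_len, grad_accum_steps=8)
-#     inputs, targets, cum_seqlens = next(loader)  # one micro-batch per call
-#     inputs, targets, cum_seqlens = loader.send((num_tokens2, max_seq_len2, 8))  # hypothetical resize
-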
-# -----------------------------------------------------------------------------
-# int main
-
-@dataclass
-class Hyperparameters:
-    # data
-    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
-    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
-    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
-    train_batch_size: int = 2048 * 16 * 8
-    train_max_seq_len: int = 128 * 16
-    val_batch_size: int = 4 * 64 * 1024 * 8
-    # optimization
-    num_iterations: int = 2285
-    lr_schedule: tuple = (0.5, 0.98)  # breakpoints for the 3-part schedule: (flat, linear decay, flat)
-    lr_min: float = 0.1
-    # evaluation and logging
-    run_id: str = f"{uuid.uuid4()}"
-    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
-    save_checkpoint: bool = False
-    # attention masking
-    block_size: int = 128
-    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
-    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN
-
-args = Hyperparameters()
-
-data_path = os.environ.get("DATA_PATH", ".")
-args.train_files = os.path.join(data_path, args.train_files)
-args.val_files = os.path.join(data_path, args.val_files)
-
-# torchrun sets these env variables
-rank = int(os.environ["RANK"])
-world_size = int(os.environ["WORLD_SIZE"])
-assert 8 % world_size == 0, "world_size must be a divisor of 8"
-grad_accum_steps = 8 // world_size
-assert torch.cuda.is_available()
-device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
-torch.cuda.set_device(device)
-dist.init_process_group(backend="nccl", device_id=device)
-dist.barrier()
-master_process = (rank == 0)  # this process will do logging, checkpointing etc.
-
-# begin logging
-logfile = None
-if master_process:
-    run_id = args.run_id
-    os.makedirs("logs", exist_ok=True)
-    logfile = f"logs/{run_id}.txt"
-    print(logfile)
-def print0(s, console=False):
-    if master_process:
-        with open(logfile, "a") as f:
-            if console:
-                print(s)
-            print(s, file=f)
-
-# begin by printing this file (the Python code)
-print0(code)
-print0("=" * 100)
-# log information about the hardware/software environment this is running on
-print0(f"Running Python {sys.version}")
-print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
-print0(f"Running Triton version {triton.__version__}")
-
-def nvidia_smi():
-    import subprocess  # avoid top level import
-    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
-print0(nvidia_smi())
-print0("=" * 100)
-
-model: nn.Module = GPT(
-    vocab_size=50257,
-    num_layers=12,
-    num_heads=6,
-    head_dim=128,
-    model_dim=768,
-    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
-).cuda()
-for m in model.modules():
-    if isinstance(m, (nn.Embedding, nn.Linear)):
-        m.bfloat16()
-for param in model.parameters():
-    dist.broadcast(param.detach(), 0)
-
-# collect the parameters to optimize
-hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
-embed_params = [p for n, p in model.named_parameters() if "embed" in n]
-scalar_params = [p for p in model.parameters() if p.ndim < 2]
-head_params = [model.lm_head.weight]
-gate_params = [p for n, p in model.named_parameters() if "gate" in n]
-
-# init the optimizer(s)
-# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
-# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
-optimizer1 = DistAdam(
-    scalar_params + head_params + embed_params,
-    lr=0.008,
-    betas=(0.65, 0.95),
-    eps=1e-8,
-    weight_decay=0.0,
-)
-optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
-optimizers = [optimizer1, optimizer2]
-for opt in optimizers:
-    for group in opt.param_groups:
-        group["initial_lr"] = group["lr"]
-
-def get_lr(step: int):
-    assert step < args.num_iterations
-    # Three-part schedule: flat, linear decrease, flat
-    lr_schedule = args.lr_schedule
-    x = step / args.num_iterations
-
-    if x < lr_schedule[0]:
-        return 1.0
-    elif x < lr_schedule[1]:
-        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
-        lr = 1.0 - (1.0 - args.lr_min) * progress
-    else:
-        lr = args.lr_min
-    return lr
-
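-# Worked example of the schedule above (derived from the defaults): with
-# num_iterations=2285 and lr_schedule=(0.5, 0.98), the multiplier is 1.0 for
-# steps 0-1142, decays linearly from 1.0 toward lr_min=0.1 until step ~2239
-# (0.98 * 2285), and then holds at 0.1. Halfway through the decay leg
-# (step ~1691) the multiplier is ~0.55.
-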
-def get_ws(step: int):
-    assert step <= args.num_iterations
-    x = step / (args.num_iterations + 1)
-    ws_idx = int(len(args.ws_schedule) * x)
-    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
-
-def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
-    # warmup phase: linearly increase momentum from min to max
-    # cooldown phase: linearly decrease momentum from max to min
-    momentum_cd_start = args.num_iterations - muon_cooldown_steps
-    if step < muon_warmup_steps:
-        frac = step / muon_warmup_steps
-        momentum = momentum_min + frac * (momentum_max - momentum_min)
-    elif step > momentum_cd_start:
-        frac = (step - momentum_cd_start) / muon_cooldown_steps
-        momentum = momentum_max - frac * (momentum_max - momentum_min)
-    else:
-        momentum = momentum_max
-    return momentum
-
-def step_optimizers(step: int, optimizers, model):
-    # update lr
-    for optimizer in optimizers:
-        for group in optimizer.param_groups:
-            group["lr"] = group["initial_lr"] * get_lr(step)
-
-    # set muon momentum based on step
-    momentum = get_muon_momentum(step)
-    for group in optimizers[1].param_groups:
-        group["momentum"] = momentum
-
-    # on even steps, only step Muon params
-    # on odd steps, step all params
-    if step % 2 == 0:
-        optimizers[1].step()
-        optimizers[1].zero_grad(set_to_none=True)
-    else:
-        for optimizer in optimizers:
-            optimizer.step()
-        model.zero_grad(set_to_none=True)
-
-model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
-
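-# Reference values for the schedules above (derived from the defaults): Muon's
-# momentum climbs linearly from 0.85 at step 0 to 0.95 at step 300 (e.g. 0.90 at
-# step 150), holds at 0.95, then decays back over the final 50 steps. get_ws splits
-# training into six equal phases over ws_schedule=(3, 5, 7, 9, 11, 13); e.g. at
-# step 1200, x ~= 0.525 gives ws_idx=3, i.e. (ws_short, ws_long) = (4, 9). Note the
-# even/odd split in step_optimizers: Muon steps every iteration, while DistAdam only
-# steps (and clears its grads) on odd steps, so it sees gradients accumulated over
-# two iterations.
-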
-########################################
-#            Warmup kernels            #
-########################################
-
-# Warmup the training kernels, then re-initialize the state so we aren't cheating
-warmup_steps = 30
-initial_state = dict(model=copy.deepcopy(model.state_dict()),
-                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-for step in range(warmup_steps):
-    inputs, targets, cum_seqlens = next(train_loader)
-    # each window size is a new graph; warm up each one with its Yarn.attn_scale
-    ws_idx = step % len(args.ws_schedule)
-    if ws_idx == 0:
-        model.yarn.reset()
-        ws_long = args.ws_schedule[0]
-    else:
-        new_ws_long = args.ws_schedule[ws_idx]
-        if new_ws_long > ws_long:
-            model.yarn.apply(ws_long, new_ws_long)
-        ws_long = new_ws_long
-    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
-    for opt in optimizers:
-        opt.step()
-    model.zero_grad(set_to_none=True)
-model.yarn.reset()  # the rotary buffer is not stored in the state_dict
-model.load_state_dict(initial_state["model"])
-optimizer2.reset()  # the momentum buffer is not in the state_dict either
-for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
-    opt.load_state_dict(opt_state)
-del train_loader, initial_state
-
-########################################
-#       Training and validation        #
-########################################
-
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-training_time_ms = 0
-# start the clock
-torch.cuda.synchronize()
-t0 = time.perf_counter()
-# begin training
-train_steps = args.num_iterations
-ws_short, ws_long = get_ws(0)
-for step in range(train_steps + 1):
-    last_step = (step == train_steps)
-    ws_short, new_ws_long = get_ws(step)
-    if new_ws_long != ws_long:
-        model.yarn.apply(ws_long, new_ws_long)
-        ws_long = new_ws_long
-
-    # --------------- VALIDATION SECTION -----------------
-    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
-        if last_step:
-            ws_long = args.ws_validate_post_yarn_ext
-        # stop the clock
-        torch.cuda.synchronize()
-        training_time_ms += 1000 * (time.perf_counter() - t0)
-        model.eval()
-        assert args.val_tokens % args.val_batch_size == 0
-        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
-        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
-        val_loss = 0
-        with torch.no_grad():
-            for _ in range(val_steps):
-                inputs, targets, cum_seqlens = next(val_loader)
-                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
-        val_loss /= val_steps
-        del val_loader
-        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
-        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
-        model.train()
-        # start the clock again
-        torch.cuda.synchronize()
-        t0 = time.perf_counter()
-
-    if last_step:
-        if master_process and args.save_checkpoint:
-            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
-            os.makedirs(f"logs/{run_id}", exist_ok=True)
-            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
-        # the last step only has the validation loop, so break to avoid training
-        break
-
-    # --------------- TRAINING SECTION -----------------
-    loss = 0
-    for _ in range(grad_accum_steps):
-        inputs, targets, cum_seqlens = next(train_loader)
-        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
-    loss.backward()
-    step_optimizers(step, optimizers, model)
-
-    # logging
-    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
-    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
-
-print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
-       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
-dist.destroy_process_group()
-
-====================================================================================================
-Running Python 3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]
-Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
-Running Triton version 3.5.0
-Tue Oct 28 02:04:31 2025
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | -| N/A 40C P0 129W / 700W | 5858MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | -| N/A 33C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | -| N/A 32C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | -| N/A 37C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | -| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | -| N/A 32C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | -| N/A 37C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | -| N/A 31C P0 115W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -+-----------------------------------------------------------------------------------------+ - -==================================================================================================== -step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.04ms -step:1/2285 train_time:119ms step_avg:119.32ms -step:2/2285 train_time:141ms step_avg:70.40ms -step:3/2285 train_time:178ms step_avg:59.44ms -step:4/2285 train_time:234ms step_avg:58.60ms -step:5/2285 train_time:294ms step_avg:58.77ms -step:6/2285 train_time:352ms step_avg:58.61ms -step:7/2285 train_time:412ms step_avg:58.87ms -step:8/2285 train_time:471ms step_avg:58.83ms -step:9/2285 train_time:531ms step_avg:59.02ms -step:10/2285 train_time:590ms step_avg:59.00ms -step:11/2285 train_time:651ms step_avg:59.15ms -step:12/2285 train_time:709ms step_avg:59.08ms -step:13/2285 train_time:770ms step_avg:59.20ms -step:14/2285 train_time:828ms step_avg:59.17ms -step:15/2285 train_time:889ms step_avg:59.27ms -step:16/2285 train_time:948ms step_avg:59.23ms -step:17/2285 
train_time:1012ms step_avg:59.55ms -step:18/2285 train_time:1076ms step_avg:59.78ms -step:19/2285 train_time:1140ms step_avg:60.03ms -step:20/2285 train_time:1201ms step_avg:60.03ms -step:21/2285 train_time:1262ms step_avg:60.07ms -step:22/2285 train_time:1321ms step_avg:60.03ms -step:23/2285 train_time:1381ms step_avg:60.06ms -step:24/2285 train_time:1440ms step_avg:60.01ms -step:25/2285 train_time:1501ms step_avg:60.06ms -step:26/2285 train_time:1561ms step_avg:60.03ms -step:27/2285 train_time:1622ms step_avg:60.07ms -step:28/2285 train_time:1681ms step_avg:60.05ms -step:29/2285 train_time:1743ms step_avg:60.09ms -step:30/2285 train_time:1802ms step_avg:60.07ms -step:31/2285 train_time:1863ms step_avg:60.09ms -step:32/2285 train_time:1922ms step_avg:60.06ms -step:33/2285 train_time:1985ms step_avg:60.14ms -step:34/2285 train_time:2044ms step_avg:60.13ms -step:35/2285 train_time:2106ms step_avg:60.18ms -step:36/2285 train_time:2165ms step_avg:60.15ms -step:37/2285 train_time:2227ms step_avg:60.18ms -step:38/2285 train_time:2286ms step_avg:60.15ms -step:39/2285 train_time:2347ms step_avg:60.18ms -step:40/2285 train_time:2406ms step_avg:60.15ms -step:41/2285 train_time:2467ms step_avg:60.18ms -step:42/2285 train_time:2526ms step_avg:60.15ms -step:43/2285 train_time:2587ms step_avg:60.17ms -step:44/2285 train_time:2647ms step_avg:60.16ms -step:45/2285 train_time:2709ms step_avg:60.20ms -step:46/2285 train_time:2768ms step_avg:60.17ms -step:47/2285 train_time:2830ms step_avg:60.21ms -step:48/2285 train_time:2889ms step_avg:60.19ms -step:49/2285 train_time:2951ms step_avg:60.23ms -step:50/2285 train_time:3011ms step_avg:60.22ms -step:51/2285 train_time:3073ms step_avg:60.25ms -step:52/2285 train_time:3132ms step_avg:60.24ms -step:53/2285 train_time:3194ms step_avg:60.27ms -step:54/2285 train_time:3253ms step_avg:60.24ms -step:55/2285 train_time:3315ms step_avg:60.27ms -step:56/2285 train_time:3373ms step_avg:60.24ms -step:57/2285 train_time:3435ms step_avg:60.26ms -step:58/2285 train_time:3494ms step_avg:60.24ms -step:59/2285 train_time:3555ms step_avg:60.26ms -step:60/2285 train_time:3615ms step_avg:60.25ms -step:61/2285 train_time:3677ms step_avg:60.27ms -step:62/2285 train_time:3736ms step_avg:60.25ms -step:63/2285 train_time:3797ms step_avg:60.27ms -step:64/2285 train_time:3856ms step_avg:60.25ms -step:65/2285 train_time:3918ms step_avg:60.27ms -step:66/2285 train_time:3977ms step_avg:60.26ms -step:67/2285 train_time:4038ms step_avg:60.28ms -step:68/2285 train_time:4098ms step_avg:60.26ms -step:69/2285 train_time:4159ms step_avg:60.28ms -step:70/2285 train_time:4218ms step_avg:60.26ms -step:71/2285 train_time:4279ms step_avg:60.27ms -step:72/2285 train_time:4338ms step_avg:60.25ms -step:73/2285 train_time:4399ms step_avg:60.26ms -step:74/2285 train_time:4458ms step_avg:60.24ms -step:75/2285 train_time:4519ms step_avg:60.25ms -step:76/2285 train_time:4578ms step_avg:60.24ms -step:77/2285 train_time:4641ms step_avg:60.27ms -step:78/2285 train_time:4700ms step_avg:60.25ms -step:79/2285 train_time:4761ms step_avg:60.26ms -step:80/2285 train_time:4820ms step_avg:60.25ms -step:81/2285 train_time:4881ms step_avg:60.26ms -step:82/2285 train_time:4940ms step_avg:60.24ms -step:83/2285 train_time:5002ms step_avg:60.26ms -step:84/2285 train_time:5060ms step_avg:60.24ms -step:85/2285 train_time:5122ms step_avg:60.25ms -step:86/2285 train_time:5180ms step_avg:60.24ms -step:87/2285 train_time:5242ms step_avg:60.25ms -step:88/2285 train_time:5301ms step_avg:60.23ms -step:89/2285 train_time:5362ms 
step_avg:60.24ms -step:90/2285 train_time:5421ms step_avg:60.23ms -step:91/2285 train_time:5482ms step_avg:60.24ms -step:92/2285 train_time:5541ms step_avg:60.23ms -step:93/2285 train_time:5602ms step_avg:60.24ms -step:94/2285 train_time:5661ms step_avg:60.22ms -step:95/2285 train_time:5722ms step_avg:60.23ms -step:96/2285 train_time:5780ms step_avg:60.21ms -step:97/2285 train_time:5841ms step_avg:60.22ms -step:98/2285 train_time:5900ms step_avg:60.20ms -step:99/2285 train_time:5961ms step_avg:60.22ms -step:100/2285 train_time:6020ms step_avg:60.20ms -step:101/2285 train_time:6081ms step_avg:60.21ms -step:102/2285 train_time:6140ms step_avg:60.19ms -step:103/2285 train_time:6201ms step_avg:60.20ms -step:104/2285 train_time:6259ms step_avg:60.19ms -step:105/2285 train_time:6320ms step_avg:60.19ms -step:106/2285 train_time:6379ms step_avg:60.18ms -step:107/2285 train_time:6441ms step_avg:60.19ms -step:108/2285 train_time:6500ms step_avg:60.18ms -step:109/2285 train_time:6561ms step_avg:60.19ms -step:110/2285 train_time:6619ms step_avg:60.18ms -step:111/2285 train_time:6681ms step_avg:60.19ms -step:112/2285 train_time:6739ms step_avg:60.17ms -step:113/2285 train_time:6800ms step_avg:60.18ms -step:114/2285 train_time:6859ms step_avg:60.16ms -step:115/2285 train_time:6920ms step_avg:60.17ms -step:116/2285 train_time:6979ms step_avg:60.16ms -step:117/2285 train_time:7040ms step_avg:60.17ms -step:118/2285 train_time:7100ms step_avg:60.17ms -step:119/2285 train_time:7161ms step_avg:60.17ms -step:120/2285 train_time:7219ms step_avg:60.16ms -step:121/2285 train_time:7280ms step_avg:60.17ms -step:122/2285 train_time:7339ms step_avg:60.15ms -step:123/2285 train_time:7400ms step_avg:60.16ms -step:124/2285 train_time:7459ms step_avg:60.15ms -step:125/2285 train_time:7519ms step_avg:60.15ms -step:126/2285 train_time:7578ms step_avg:60.14ms -step:127/2285 train_time:7639ms step_avg:60.15ms -step:128/2285 train_time:7698ms step_avg:60.14ms -step:129/2285 train_time:7759ms step_avg:60.15ms -step:130/2285 train_time:7818ms step_avg:60.14ms -step:131/2285 train_time:7879ms step_avg:60.14ms -step:132/2285 train_time:7937ms step_avg:60.13ms -step:133/2285 train_time:7998ms step_avg:60.13ms -step:134/2285 train_time:8057ms step_avg:60.12ms -step:135/2285 train_time:8118ms step_avg:60.13ms -step:136/2285 train_time:8177ms step_avg:60.12ms -step:137/2285 train_time:8238ms step_avg:60.13ms -step:138/2285 train_time:8297ms step_avg:60.13ms -step:139/2285 train_time:8358ms step_avg:60.13ms -step:140/2285 train_time:8417ms step_avg:60.12ms -step:141/2285 train_time:8478ms step_avg:60.13ms -step:142/2285 train_time:8537ms step_avg:60.12ms -step:143/2285 train_time:8598ms step_avg:60.13ms -step:144/2285 train_time:8657ms step_avg:60.12ms -step:145/2285 train_time:8717ms step_avg:60.12ms -step:146/2285 train_time:8776ms step_avg:60.11ms -step:147/2285 train_time:8838ms step_avg:60.12ms -step:148/2285 train_time:8896ms step_avg:60.11ms -step:149/2285 train_time:8958ms step_avg:60.12ms -step:150/2285 train_time:9016ms step_avg:60.11ms -step:151/2285 train_time:9078ms step_avg:60.12ms -step:152/2285 train_time:9136ms step_avg:60.11ms -step:153/2285 train_time:9197ms step_avg:60.11ms -step:154/2285 train_time:9256ms step_avg:60.10ms -step:155/2285 train_time:9317ms step_avg:60.11ms -step:156/2285 train_time:9376ms step_avg:60.10ms -step:157/2285 train_time:9437ms step_avg:60.11ms -step:158/2285 train_time:9496ms step_avg:60.10ms -step:159/2285 train_time:9557ms step_avg:60.11ms -step:160/2285 train_time:9615ms 
step_avg:60.10ms -step:161/2285 train_time:9677ms step_avg:60.10ms -step:162/2285 train_time:9736ms step_avg:60.10ms -step:163/2285 train_time:9797ms step_avg:60.10ms -step:164/2285 train_time:9855ms step_avg:60.09ms -step:165/2285 train_time:9916ms step_avg:60.10ms -step:166/2285 train_time:9975ms step_avg:60.09ms -step:167/2285 train_time:10036ms step_avg:60.10ms -step:168/2285 train_time:10095ms step_avg:60.09ms -step:169/2285 train_time:10156ms step_avg:60.09ms -step:170/2285 train_time:10215ms step_avg:60.09ms -step:171/2285 train_time:10276ms step_avg:60.09ms -step:172/2285 train_time:10334ms step_avg:60.08ms -step:173/2285 train_time:10395ms step_avg:60.09ms -step:174/2285 train_time:10454ms step_avg:60.08ms -step:175/2285 train_time:10515ms step_avg:60.09ms -step:176/2285 train_time:10574ms step_avg:60.08ms -step:177/2285 train_time:10635ms step_avg:60.09ms -step:178/2285 train_time:10694ms step_avg:60.08ms -step:179/2285 train_time:10755ms step_avg:60.08ms -step:180/2285 train_time:10814ms step_avg:60.08ms -step:181/2285 train_time:10875ms step_avg:60.08ms -step:182/2285 train_time:10934ms step_avg:60.08ms -step:183/2285 train_time:10996ms step_avg:60.09ms -step:184/2285 train_time:11055ms step_avg:60.08ms -step:185/2285 train_time:11116ms step_avg:60.09ms -step:186/2285 train_time:11175ms step_avg:60.08ms -step:187/2285 train_time:11235ms step_avg:60.08ms -step:188/2285 train_time:11294ms step_avg:60.08ms -step:189/2285 train_time:11355ms step_avg:60.08ms -step:190/2285 train_time:11414ms step_avg:60.07ms -step:191/2285 train_time:11474ms step_avg:60.08ms -step:192/2285 train_time:11533ms step_avg:60.07ms -step:193/2285 train_time:11595ms step_avg:60.08ms -step:194/2285 train_time:11654ms step_avg:60.07ms -step:195/2285 train_time:11715ms step_avg:60.08ms -step:196/2285 train_time:11773ms step_avg:60.07ms -step:197/2285 train_time:11834ms step_avg:60.07ms -step:198/2285 train_time:11893ms step_avg:60.07ms -step:199/2285 train_time:11954ms step_avg:60.07ms -step:200/2285 train_time:12013ms step_avg:60.07ms -step:201/2285 train_time:12074ms step_avg:60.07ms -step:202/2285 train_time:12133ms step_avg:60.06ms -step:203/2285 train_time:12195ms step_avg:60.07ms -step:204/2285 train_time:12253ms step_avg:60.06ms -step:205/2285 train_time:12315ms step_avg:60.07ms -step:206/2285 train_time:12373ms step_avg:60.07ms -step:207/2285 train_time:12434ms step_avg:60.07ms -step:208/2285 train_time:12493ms step_avg:60.06ms -step:209/2285 train_time:12555ms step_avg:60.07ms -step:210/2285 train_time:12614ms step_avg:60.06ms -step:211/2285 train_time:12674ms step_avg:60.07ms -step:212/2285 train_time:12733ms step_avg:60.06ms -step:213/2285 train_time:12794ms step_avg:60.07ms -step:214/2285 train_time:12853ms step_avg:60.06ms -step:215/2285 train_time:12914ms step_avg:60.06ms -step:216/2285 train_time:12973ms step_avg:60.06ms -step:217/2285 train_time:13034ms step_avg:60.06ms -step:218/2285 train_time:13094ms step_avg:60.06ms -step:219/2285 train_time:13155ms step_avg:60.07ms -step:220/2285 train_time:13214ms step_avg:60.06ms -step:221/2285 train_time:13275ms step_avg:60.07ms -step:222/2285 train_time:13334ms step_avg:60.06ms -step:223/2285 train_time:13395ms step_avg:60.07ms -step:224/2285 train_time:13454ms step_avg:60.06ms -step:225/2285 train_time:13515ms step_avg:60.07ms -step:226/2285 train_time:13573ms step_avg:60.06ms -step:227/2285 train_time:13634ms step_avg:60.06ms -step:228/2285 train_time:13694ms step_avg:60.06ms -step:229/2285 train_time:13755ms step_avg:60.07ms -step:230/2285 
train_time:13814ms step_avg:60.06ms -step:231/2285 train_time:13875ms step_avg:60.07ms -step:232/2285 train_time:13934ms step_avg:60.06ms -step:233/2285 train_time:13994ms step_avg:60.06ms -step:234/2285 train_time:14053ms step_avg:60.05ms -step:235/2285 train_time:14114ms step_avg:60.06ms -step:236/2285 train_time:14172ms step_avg:60.05ms -step:237/2285 train_time:14233ms step_avg:60.06ms -step:238/2285 train_time:14292ms step_avg:60.05ms -step:239/2285 train_time:14353ms step_avg:60.05ms -step:240/2285 train_time:14412ms step_avg:60.05ms -step:241/2285 train_time:14472ms step_avg:60.05ms -step:242/2285 train_time:14531ms step_avg:60.05ms -step:243/2285 train_time:14592ms step_avg:60.05ms -step:244/2285 train_time:14651ms step_avg:60.04ms -step:245/2285 train_time:14711ms step_avg:60.05ms -step:246/2285 train_time:14770ms step_avg:60.04ms -step:247/2285 train_time:14831ms step_avg:60.04ms -step:248/2285 train_time:14890ms step_avg:60.04ms -step:249/2285 train_time:14951ms step_avg:60.04ms -step:250/2285 train_time:15009ms step_avg:60.04ms -step:250/2285 val_loss:4.0723 train_time:15071ms step_avg:60.29ms -step:251/2285 train_time:15089ms step_avg:60.12ms -step:252/2285 train_time:15130ms step_avg:60.04ms -step:253/2285 train_time:15197ms step_avg:60.07ms -step:254/2285 train_time:15259ms step_avg:60.08ms -step:255/2285 train_time:15321ms step_avg:60.08ms -step:256/2285 train_time:15380ms step_avg:60.08ms -step:257/2285 train_time:15440ms step_avg:60.08ms -step:258/2285 train_time:15499ms step_avg:60.07ms -step:259/2285 train_time:15559ms step_avg:60.07ms -step:260/2285 train_time:15618ms step_avg:60.07ms -step:261/2285 train_time:15678ms step_avg:60.07ms -step:262/2285 train_time:15735ms step_avg:60.06ms -step:263/2285 train_time:15796ms step_avg:60.06ms -step:264/2285 train_time:15853ms step_avg:60.05ms -step:265/2285 train_time:15913ms step_avg:60.05ms -step:266/2285 train_time:15971ms step_avg:60.04ms -step:267/2285 train_time:16031ms step_avg:60.04ms -step:268/2285 train_time:16090ms step_avg:60.04ms -step:269/2285 train_time:16152ms step_avg:60.04ms -step:270/2285 train_time:16212ms step_avg:60.04ms -step:271/2285 train_time:16273ms step_avg:60.05ms -step:272/2285 train_time:16333ms step_avg:60.05ms -step:273/2285 train_time:16394ms step_avg:60.05ms -step:274/2285 train_time:16453ms step_avg:60.05ms -step:275/2285 train_time:16514ms step_avg:60.05ms -step:276/2285 train_time:16572ms step_avg:60.04ms -step:277/2285 train_time:16633ms step_avg:60.05ms -step:278/2285 train_time:16692ms step_avg:60.04ms -step:279/2285 train_time:16752ms step_avg:60.04ms -step:280/2285 train_time:16810ms step_avg:60.04ms -step:281/2285 train_time:16870ms step_avg:60.04ms -step:282/2285 train_time:16928ms step_avg:60.03ms -step:283/2285 train_time:16988ms step_avg:60.03ms -step:284/2285 train_time:17046ms step_avg:60.02ms -step:285/2285 train_time:17107ms step_avg:60.03ms -step:286/2285 train_time:17166ms step_avg:60.02ms -step:287/2285 train_time:17228ms step_avg:60.03ms -step:288/2285 train_time:17287ms step_avg:60.02ms -step:289/2285 train_time:17349ms step_avg:60.03ms -step:290/2285 train_time:17407ms step_avg:60.03ms -step:291/2285 train_time:17469ms step_avg:60.03ms -step:292/2285 train_time:17528ms step_avg:60.03ms -step:293/2285 train_time:17589ms step_avg:60.03ms -step:294/2285 train_time:17648ms step_avg:60.03ms -step:295/2285 train_time:17709ms step_avg:60.03ms -step:296/2285 train_time:17767ms step_avg:60.02ms -step:297/2285 train_time:17828ms step_avg:60.03ms -step:298/2285 train_time:17886ms 
step_avg:60.02ms -step:299/2285 train_time:17946ms step_avg:60.02ms -step:300/2285 train_time:18005ms step_avg:60.02ms -step:301/2285 train_time:18065ms step_avg:60.02ms -step:302/2285 train_time:18124ms step_avg:60.01ms -step:303/2285 train_time:18185ms step_avg:60.02ms -step:304/2285 train_time:18244ms step_avg:60.01ms -step:305/2285 train_time:18306ms step_avg:60.02ms -step:306/2285 train_time:18365ms step_avg:60.02ms -step:307/2285 train_time:18427ms step_avg:60.02ms -step:308/2285 train_time:18486ms step_avg:60.02ms -step:309/2285 train_time:18547ms step_avg:60.02ms -step:310/2285 train_time:18606ms step_avg:60.02ms -step:311/2285 train_time:18666ms step_avg:60.02ms -step:312/2285 train_time:18724ms step_avg:60.01ms -step:313/2285 train_time:18785ms step_avg:60.02ms -step:314/2285 train_time:18843ms step_avg:60.01ms -step:315/2285 train_time:18904ms step_avg:60.01ms -step:316/2285 train_time:18962ms step_avg:60.01ms -step:317/2285 train_time:19023ms step_avg:60.01ms -step:318/2285 train_time:19081ms step_avg:60.00ms -step:319/2285 train_time:19142ms step_avg:60.01ms -step:320/2285 train_time:19200ms step_avg:60.00ms -step:321/2285 train_time:19262ms step_avg:60.01ms -step:322/2285 train_time:19320ms step_avg:60.00ms -step:323/2285 train_time:19381ms step_avg:60.00ms -step:324/2285 train_time:19440ms step_avg:60.00ms -step:325/2285 train_time:19502ms step_avg:60.01ms -step:326/2285 train_time:19560ms step_avg:60.00ms -step:327/2285 train_time:19621ms step_avg:60.00ms -step:328/2285 train_time:19680ms step_avg:60.00ms -step:329/2285 train_time:19741ms step_avg:60.00ms -step:330/2285 train_time:19800ms step_avg:60.00ms -step:331/2285 train_time:19860ms step_avg:60.00ms -step:332/2285 train_time:19918ms step_avg:59.99ms -step:333/2285 train_time:19979ms step_avg:60.00ms -step:334/2285 train_time:20037ms step_avg:59.99ms -step:335/2285 train_time:20097ms step_avg:59.99ms -step:336/2285 train_time:20156ms step_avg:59.99ms -step:337/2285 train_time:20216ms step_avg:59.99ms -step:338/2285 train_time:20275ms step_avg:59.98ms -step:339/2285 train_time:20336ms step_avg:59.99ms -step:340/2285 train_time:20394ms step_avg:59.98ms -step:341/2285 train_time:20456ms step_avg:59.99ms -step:342/2285 train_time:20514ms step_avg:59.98ms -step:343/2285 train_time:20575ms step_avg:59.99ms -step:344/2285 train_time:20634ms step_avg:59.98ms -step:345/2285 train_time:20695ms step_avg:59.98ms -step:346/2285 train_time:20753ms step_avg:59.98ms -step:347/2285 train_time:20814ms step_avg:59.98ms -step:348/2285 train_time:20872ms step_avg:59.98ms -step:349/2285 train_time:20933ms step_avg:59.98ms -step:350/2285 train_time:20991ms step_avg:59.98ms -step:351/2285 train_time:21052ms step_avg:59.98ms -step:352/2285 train_time:21110ms step_avg:59.97ms -step:353/2285 train_time:21171ms step_avg:59.97ms -step:354/2285 train_time:21229ms step_avg:59.97ms -step:355/2285 train_time:21290ms step_avg:59.97ms -step:356/2285 train_time:21349ms step_avg:59.97ms -step:357/2285 train_time:21410ms step_avg:59.97ms -step:358/2285 train_time:21469ms step_avg:59.97ms -step:359/2285 train_time:21529ms step_avg:59.97ms -step:360/2285 train_time:21588ms step_avg:59.97ms -step:361/2285 train_time:21649ms step_avg:59.97ms -step:362/2285 train_time:21707ms step_avg:59.97ms -step:363/2285 train_time:21768ms step_avg:59.97ms -step:364/2285 train_time:21827ms step_avg:59.96ms -step:365/2285 train_time:21888ms step_avg:59.97ms -step:366/2285 train_time:21946ms step_avg:59.96ms -step:367/2285 train_time:22007ms step_avg:59.96ms -step:368/2285 
train_time:22065ms step_avg:59.96ms -step:369/2285 train_time:22126ms step_avg:59.96ms -step:370/2285 train_time:22185ms step_avg:59.96ms -step:371/2285 train_time:22247ms step_avg:59.97ms -step:372/2285 train_time:22306ms step_avg:59.96ms -step:373/2285 train_time:22367ms step_avg:59.97ms -step:374/2285 train_time:22426ms step_avg:59.96ms -step:375/2285 train_time:22487ms step_avg:59.96ms -step:376/2285 train_time:22546ms step_avg:59.96ms -step:377/2285 train_time:22607ms step_avg:59.97ms -step:378/2285 train_time:22666ms step_avg:59.96ms -step:379/2285 train_time:22727ms step_avg:59.97ms -step:380/2285 train_time:22786ms step_avg:59.96ms -step:381/2285 train_time:22847ms step_avg:59.97ms -step:382/2285 train_time:22906ms step_avg:59.96ms -step:383/2285 train_time:22967ms step_avg:59.97ms -step:384/2285 train_time:23026ms step_avg:59.96ms -step:385/2285 train_time:23087ms step_avg:59.97ms -step:386/2285 train_time:23146ms step_avg:59.96ms -step:387/2285 train_time:23208ms step_avg:59.97ms -step:388/2285 train_time:23267ms step_avg:59.97ms -step:389/2285 train_time:23328ms step_avg:59.97ms -step:390/2285 train_time:23387ms step_avg:59.97ms -step:391/2285 train_time:23449ms step_avg:59.97ms -step:392/2285 train_time:23508ms step_avg:59.97ms -step:393/2285 train_time:23570ms step_avg:59.97ms -step:394/2285 train_time:23628ms step_avg:59.97ms -step:395/2285 train_time:23690ms step_avg:59.97ms -step:396/2285 train_time:23749ms step_avg:59.97ms -step:397/2285 train_time:23810ms step_avg:59.98ms -step:398/2285 train_time:23869ms step_avg:59.97ms -step:399/2285 train_time:23930ms step_avg:59.98ms -step:400/2285 train_time:23989ms step_avg:59.97ms -step:401/2285 train_time:24051ms step_avg:59.98ms -step:402/2285 train_time:24110ms step_avg:59.98ms -step:403/2285 train_time:24172ms step_avg:59.98ms -step:404/2285 train_time:24231ms step_avg:59.98ms -step:405/2285 train_time:24292ms step_avg:59.98ms -step:406/2285 train_time:24350ms step_avg:59.98ms -step:407/2285 train_time:24411ms step_avg:59.98ms -step:408/2285 train_time:24470ms step_avg:59.98ms -step:409/2285 train_time:24532ms step_avg:59.98ms -step:410/2285 train_time:24592ms step_avg:59.98ms -step:411/2285 train_time:24653ms step_avg:59.98ms -step:412/2285 train_time:24712ms step_avg:59.98ms -step:413/2285 train_time:24773ms step_avg:59.98ms -step:414/2285 train_time:24832ms step_avg:59.98ms -step:415/2285 train_time:24893ms step_avg:59.98ms -step:416/2285 train_time:24952ms step_avg:59.98ms -step:417/2285 train_time:25014ms step_avg:59.98ms -step:418/2285 train_time:25073ms step_avg:59.98ms -step:419/2285 train_time:25133ms step_avg:59.98ms -step:420/2285 train_time:25192ms step_avg:59.98ms -step:421/2285 train_time:25253ms step_avg:59.98ms -step:422/2285 train_time:25313ms step_avg:59.98ms -step:423/2285 train_time:25374ms step_avg:59.99ms -step:424/2285 train_time:25433ms step_avg:59.98ms -step:425/2285 train_time:25494ms step_avg:59.99ms -step:426/2285 train_time:25553ms step_avg:59.98ms -step:427/2285 train_time:25614ms step_avg:59.99ms -step:428/2285 train_time:25673ms step_avg:59.98ms -step:429/2285 train_time:25734ms step_avg:59.99ms -step:430/2285 train_time:25794ms step_avg:59.99ms -step:431/2285 train_time:25856ms step_avg:59.99ms -step:432/2285 train_time:25915ms step_avg:59.99ms -step:433/2285 train_time:25976ms step_avg:59.99ms -step:434/2285 train_time:26035ms step_avg:59.99ms -step:435/2285 train_time:26096ms step_avg:59.99ms -step:436/2285 train_time:26155ms step_avg:59.99ms -step:437/2285 train_time:26217ms step_avg:59.99ms 
-step:438/2285 train_time:26276ms step_avg:59.99ms -step:439/2285 train_time:26337ms step_avg:59.99ms -step:440/2285 train_time:26396ms step_avg:59.99ms -step:441/2285 train_time:26458ms step_avg:59.99ms -step:442/2285 train_time:26517ms step_avg:59.99ms -step:443/2285 train_time:26578ms step_avg:59.99ms -step:444/2285 train_time:26636ms step_avg:59.99ms -step:445/2285 train_time:26697ms step_avg:59.99ms -step:446/2285 train_time:26756ms step_avg:59.99ms -step:447/2285 train_time:26818ms step_avg:59.99ms -step:448/2285 train_time:26876ms step_avg:59.99ms -step:449/2285 train_time:26937ms step_avg:59.99ms -step:450/2285 train_time:26996ms step_avg:59.99ms -step:451/2285 train_time:27057ms step_avg:59.99ms -step:452/2285 train_time:27117ms step_avg:59.99ms -step:453/2285 train_time:27178ms step_avg:59.99ms -step:454/2285 train_time:27236ms step_avg:59.99ms -step:455/2285 train_time:27297ms step_avg:59.99ms -step:456/2285 train_time:27357ms step_avg:59.99ms -step:457/2285 train_time:27418ms step_avg:60.00ms -step:458/2285 train_time:27477ms step_avg:59.99ms -step:459/2285 train_time:27538ms step_avg:60.00ms -step:460/2285 train_time:27596ms step_avg:59.99ms -step:461/2285 train_time:27658ms step_avg:59.99ms -step:462/2285 train_time:27717ms step_avg:59.99ms -step:463/2285 train_time:27778ms step_avg:60.00ms -step:464/2285 train_time:27837ms step_avg:59.99ms -step:465/2285 train_time:27898ms step_avg:59.99ms -step:466/2285 train_time:27957ms step_avg:59.99ms -step:467/2285 train_time:28018ms step_avg:60.00ms -step:468/2285 train_time:28077ms step_avg:59.99ms -step:469/2285 train_time:28138ms step_avg:60.00ms -step:470/2285 train_time:28197ms step_avg:59.99ms -step:471/2285 train_time:28258ms step_avg:60.00ms -step:472/2285 train_time:28318ms step_avg:59.99ms -step:473/2285 train_time:28378ms step_avg:60.00ms -step:474/2285 train_time:28437ms step_avg:59.99ms -step:475/2285 train_time:28498ms step_avg:60.00ms -step:476/2285 train_time:28557ms step_avg:59.99ms -step:477/2285 train_time:28618ms step_avg:60.00ms -step:478/2285 train_time:28677ms step_avg:59.99ms -step:479/2285 train_time:28738ms step_avg:60.00ms -step:480/2285 train_time:28798ms step_avg:60.00ms -step:481/2285 train_time:28859ms step_avg:60.00ms -step:482/2285 train_time:28918ms step_avg:60.00ms -step:483/2285 train_time:28979ms step_avg:60.00ms -step:484/2285 train_time:29037ms step_avg:59.99ms -step:485/2285 train_time:29099ms step_avg:60.00ms -step:486/2285 train_time:29158ms step_avg:60.00ms -step:487/2285 train_time:29219ms step_avg:60.00ms -step:488/2285 train_time:29277ms step_avg:59.99ms -step:489/2285 train_time:29338ms step_avg:60.00ms -step:490/2285 train_time:29397ms step_avg:59.99ms -step:491/2285 train_time:29458ms step_avg:60.00ms -step:492/2285 train_time:29517ms step_avg:59.99ms -step:493/2285 train_time:29578ms step_avg:60.00ms -step:494/2285 train_time:29637ms step_avg:59.99ms -step:495/2285 train_time:29699ms step_avg:60.00ms -step:496/2285 train_time:29758ms step_avg:60.00ms -step:497/2285 train_time:29819ms step_avg:60.00ms -step:498/2285 train_time:29877ms step_avg:59.99ms -step:499/2285 train_time:29938ms step_avg:60.00ms -step:500/2285 train_time:29997ms step_avg:59.99ms -step:500/2285 val_loss:3.7842 train_time:30060ms step_avg:60.12ms -step:501/2285 train_time:30083ms step_avg:60.05ms -step:502/2285 train_time:30120ms step_avg:60.00ms -step:503/2285 train_time:30180ms step_avg:60.00ms -step:504/2285 train_time:30238ms step_avg:60.00ms -step:505/2285 train_time:30299ms step_avg:60.00ms -step:506/2285 
train_time:30358ms step_avg:60.00ms -step:507/2285 train_time:30418ms step_avg:60.00ms -step:508/2285 train_time:30476ms step_avg:59.99ms -step:509/2285 train_time:30537ms step_avg:59.99ms -step:510/2285 train_time:30595ms step_avg:59.99ms -step:511/2285 train_time:30656ms step_avg:59.99ms -step:512/2285 train_time:30714ms step_avg:59.99ms -step:513/2285 train_time:30774ms step_avg:59.99ms -step:514/2285 train_time:30833ms step_avg:59.99ms -step:515/2285 train_time:30893ms step_avg:59.99ms -step:516/2285 train_time:30955ms step_avg:59.99ms -step:517/2285 train_time:31023ms step_avg:60.01ms -step:518/2285 train_time:31085ms step_avg:60.01ms -step:519/2285 train_time:31146ms step_avg:60.01ms -step:520/2285 train_time:31205ms step_avg:60.01ms -step:521/2285 train_time:31266ms step_avg:60.01ms -step:522/2285 train_time:31325ms step_avg:60.01ms -step:523/2285 train_time:31386ms step_avg:60.01ms -step:524/2285 train_time:31445ms step_avg:60.01ms -step:525/2285 train_time:31506ms step_avg:60.01ms -step:526/2285 train_time:31565ms step_avg:60.01ms -step:527/2285 train_time:31627ms step_avg:60.01ms -step:528/2285 train_time:31686ms step_avg:60.01ms -step:529/2285 train_time:31747ms step_avg:60.01ms -step:530/2285 train_time:31806ms step_avg:60.01ms -step:531/2285 train_time:31867ms step_avg:60.01ms -step:532/2285 train_time:31927ms step_avg:60.01ms -step:533/2285 train_time:31990ms step_avg:60.02ms -step:534/2285 train_time:32049ms step_avg:60.02ms -step:535/2285 train_time:32111ms step_avg:60.02ms -step:536/2285 train_time:32171ms step_avg:60.02ms -step:537/2285 train_time:32233ms step_avg:60.02ms -step:538/2285 train_time:32292ms step_avg:60.02ms -step:539/2285 train_time:32353ms step_avg:60.02ms -step:540/2285 train_time:32412ms step_avg:60.02ms -step:541/2285 train_time:32473ms step_avg:60.02ms -step:542/2285 train_time:32533ms step_avg:60.02ms -step:543/2285 train_time:32594ms step_avg:60.03ms -step:544/2285 train_time:32653ms step_avg:60.02ms -step:545/2285 train_time:32714ms step_avg:60.03ms -step:546/2285 train_time:32773ms step_avg:60.02ms -step:547/2285 train_time:32834ms step_avg:60.03ms -step:548/2285 train_time:32893ms step_avg:60.02ms -step:549/2285 train_time:32955ms step_avg:60.03ms -step:550/2285 train_time:33014ms step_avg:60.02ms -step:551/2285 train_time:33076ms step_avg:60.03ms -step:552/2285 train_time:33135ms step_avg:60.03ms -step:553/2285 train_time:33197ms step_avg:60.03ms -step:554/2285 train_time:33257ms step_avg:60.03ms -step:555/2285 train_time:33319ms step_avg:60.03ms -step:556/2285 train_time:33378ms step_avg:60.03ms -step:557/2285 train_time:33439ms step_avg:60.03ms -step:558/2285 train_time:33498ms step_avg:60.03ms -step:559/2285 train_time:33560ms step_avg:60.04ms -step:560/2285 train_time:33619ms step_avg:60.03ms -step:561/2285 train_time:33679ms step_avg:60.03ms -step:562/2285 train_time:33739ms step_avg:60.03ms -step:563/2285 train_time:33800ms step_avg:60.04ms -step:564/2285 train_time:33859ms step_avg:60.03ms -step:565/2285 train_time:33920ms step_avg:60.04ms -step:566/2285 train_time:33980ms step_avg:60.04ms -step:567/2285 train_time:34042ms step_avg:60.04ms -step:568/2285 train_time:34101ms step_avg:60.04ms -step:569/2285 train_time:34163ms step_avg:60.04ms -step:570/2285 train_time:34222ms step_avg:60.04ms -step:571/2285 train_time:34284ms step_avg:60.04ms -step:572/2285 train_time:34342ms step_avg:60.04ms -step:573/2285 train_time:34404ms step_avg:60.04ms -step:574/2285 train_time:34463ms step_avg:60.04ms -step:575/2285 train_time:34525ms step_avg:60.04ms 
-step:576/2285 train_time:34584ms step_avg:60.04ms -step:577/2285 train_time:34645ms step_avg:60.04ms -step:578/2285 train_time:34704ms step_avg:60.04ms -step:579/2285 train_time:34765ms step_avg:60.04ms -step:580/2285 train_time:34824ms step_avg:60.04ms -step:581/2285 train_time:34886ms step_avg:60.04ms -step:582/2285 train_time:34945ms step_avg:60.04ms -step:583/2285 train_time:35006ms step_avg:60.04ms -step:584/2285 train_time:35065ms step_avg:60.04ms -step:585/2285 train_time:35126ms step_avg:60.04ms -step:586/2285 train_time:35185ms step_avg:60.04ms -step:587/2285 train_time:35246ms step_avg:60.04ms -step:588/2285 train_time:35305ms step_avg:60.04ms -step:589/2285 train_time:35367ms step_avg:60.05ms -step:590/2285 train_time:35426ms step_avg:60.04ms -step:591/2285 train_time:35487ms step_avg:60.05ms -step:592/2285 train_time:35546ms step_avg:60.04ms -step:593/2285 train_time:35607ms step_avg:60.05ms -step:594/2285 train_time:35667ms step_avg:60.04ms -step:595/2285 train_time:35728ms step_avg:60.05ms -step:596/2285 train_time:35787ms step_avg:60.05ms -step:597/2285 train_time:35849ms step_avg:60.05ms -step:598/2285 train_time:35908ms step_avg:60.05ms -step:599/2285 train_time:35970ms step_avg:60.05ms -step:600/2285 train_time:36029ms step_avg:60.05ms -step:601/2285 train_time:36091ms step_avg:60.05ms -step:602/2285 train_time:36150ms step_avg:60.05ms -step:603/2285 train_time:36211ms step_avg:60.05ms -step:604/2285 train_time:36270ms step_avg:60.05ms -step:605/2285 train_time:36331ms step_avg:60.05ms -step:606/2285 train_time:36391ms step_avg:60.05ms -step:607/2285 train_time:36452ms step_avg:60.05ms -step:608/2285 train_time:36511ms step_avg:60.05ms -step:609/2285 train_time:36572ms step_avg:60.05ms -step:610/2285 train_time:36631ms step_avg:60.05ms -step:611/2285 train_time:36692ms step_avg:60.05ms -step:612/2285 train_time:36751ms step_avg:60.05ms -step:613/2285 train_time:36813ms step_avg:60.05ms -step:614/2285 train_time:36871ms step_avg:60.05ms -step:615/2285 train_time:36933ms step_avg:60.05ms -step:616/2285 train_time:36992ms step_avg:60.05ms -step:617/2285 train_time:37054ms step_avg:60.05ms -step:618/2285 train_time:37113ms step_avg:60.05ms -step:619/2285 train_time:37175ms step_avg:60.06ms -step:620/2285 train_time:37234ms step_avg:60.06ms -step:621/2285 train_time:37296ms step_avg:60.06ms -step:622/2285 train_time:37355ms step_avg:60.06ms -step:623/2285 train_time:37416ms step_avg:60.06ms -step:624/2285 train_time:37476ms step_avg:60.06ms -step:625/2285 train_time:37538ms step_avg:60.06ms -step:626/2285 train_time:37597ms step_avg:60.06ms -step:627/2285 train_time:37659ms step_avg:60.06ms -step:628/2285 train_time:37718ms step_avg:60.06ms -step:629/2285 train_time:37780ms step_avg:60.06ms -step:630/2285 train_time:37839ms step_avg:60.06ms -step:631/2285 train_time:37901ms step_avg:60.07ms -step:632/2285 train_time:37960ms step_avg:60.06ms -step:633/2285 train_time:38022ms step_avg:60.07ms -step:634/2285 train_time:38081ms step_avg:60.06ms -step:635/2285 train_time:38143ms step_avg:60.07ms -step:636/2285 train_time:38202ms step_avg:60.07ms -step:637/2285 train_time:38263ms step_avg:60.07ms -step:638/2285 train_time:38322ms step_avg:60.07ms -step:639/2285 train_time:38384ms step_avg:60.07ms -step:640/2285 train_time:38444ms step_avg:60.07ms -step:641/2285 train_time:38505ms step_avg:60.07ms -step:642/2285 train_time:38564ms step_avg:60.07ms -step:643/2285 train_time:38626ms step_avg:60.07ms -step:644/2285 train_time:38685ms step_avg:60.07ms -step:645/2285 train_time:38747ms 
step_avg:60.07ms -step:646/2285 train_time:38806ms step_avg:60.07ms -step:647/2285 train_time:38867ms step_avg:60.07ms -step:648/2285 train_time:38926ms step_avg:60.07ms -step:649/2285 train_time:38988ms step_avg:60.07ms -step:650/2285 train_time:39047ms step_avg:60.07ms -step:651/2285 train_time:39108ms step_avg:60.07ms -step:652/2285 train_time:39167ms step_avg:60.07ms -step:653/2285 train_time:39229ms step_avg:60.07ms -step:654/2285 train_time:39288ms step_avg:60.07ms -step:655/2285 train_time:39350ms step_avg:60.08ms -step:656/2285 train_time:39409ms step_avg:60.07ms -step:657/2285 train_time:39470ms step_avg:60.08ms -step:658/2285 train_time:39529ms step_avg:60.07ms -step:659/2285 train_time:39591ms step_avg:60.08ms -step:660/2285 train_time:39650ms step_avg:60.08ms -step:661/2285 train_time:39712ms step_avg:60.08ms -step:662/2285 train_time:39770ms step_avg:60.08ms -step:663/2285 train_time:39832ms step_avg:60.08ms -step:664/2285 train_time:39891ms step_avg:60.08ms -step:665/2285 train_time:39952ms step_avg:60.08ms -step:666/2285 train_time:40011ms step_avg:60.08ms -step:667/2285 train_time:40072ms step_avg:60.08ms -step:668/2285 train_time:40131ms step_avg:60.08ms -step:669/2285 train_time:40193ms step_avg:60.08ms -step:670/2285 train_time:40252ms step_avg:60.08ms -step:671/2285 train_time:40314ms step_avg:60.08ms -step:672/2285 train_time:40373ms step_avg:60.08ms -step:673/2285 train_time:40434ms step_avg:60.08ms -step:674/2285 train_time:40493ms step_avg:60.08ms -step:675/2285 train_time:40555ms step_avg:60.08ms -step:676/2285 train_time:40614ms step_avg:60.08ms -step:677/2285 train_time:40675ms step_avg:60.08ms -step:678/2285 train_time:40734ms step_avg:60.08ms -step:679/2285 train_time:40796ms step_avg:60.08ms -step:680/2285 train_time:40855ms step_avg:60.08ms -step:681/2285 train_time:40916ms step_avg:60.08ms -step:682/2285 train_time:40975ms step_avg:60.08ms -step:683/2285 train_time:41037ms step_avg:60.08ms -step:684/2285 train_time:41097ms step_avg:60.08ms -step:685/2285 train_time:41159ms step_avg:60.09ms -step:686/2285 train_time:41218ms step_avg:60.08ms -step:687/2285 train_time:41280ms step_avg:60.09ms -step:688/2285 train_time:41339ms step_avg:60.09ms -step:689/2285 train_time:41400ms step_avg:60.09ms -step:690/2285 train_time:41460ms step_avg:60.09ms -step:691/2285 train_time:41521ms step_avg:60.09ms -step:692/2285 train_time:41580ms step_avg:60.09ms -step:693/2285 train_time:41642ms step_avg:60.09ms -step:694/2285 train_time:41701ms step_avg:60.09ms -step:695/2285 train_time:41762ms step_avg:60.09ms -step:696/2285 train_time:41822ms step_avg:60.09ms -step:697/2285 train_time:41883ms step_avg:60.09ms -step:698/2285 train_time:41942ms step_avg:60.09ms -step:699/2285 train_time:42004ms step_avg:60.09ms -step:700/2285 train_time:42063ms step_avg:60.09ms -step:701/2285 train_time:42124ms step_avg:60.09ms -step:702/2285 train_time:42183ms step_avg:60.09ms -step:703/2285 train_time:42245ms step_avg:60.09ms -step:704/2285 train_time:42304ms step_avg:60.09ms -step:705/2285 train_time:42365ms step_avg:60.09ms -step:706/2285 train_time:42424ms step_avg:60.09ms -step:707/2285 train_time:42486ms step_avg:60.09ms -step:708/2285 train_time:42545ms step_avg:60.09ms -step:709/2285 train_time:42607ms step_avg:60.09ms -step:710/2285 train_time:42667ms step_avg:60.09ms -step:711/2285 train_time:42728ms step_avg:60.10ms -step:712/2285 train_time:42787ms step_avg:60.09ms -step:713/2285 train_time:42849ms step_avg:60.10ms -step:714/2285 train_time:42908ms step_avg:60.10ms -step:715/2285 
train_time:42969ms step_avg:60.10ms
step:716/2285 train_time:43028ms step_avg:60.10ms
...
step:750/2285 train_time:45082ms step_avg:60.11ms
step:750/2285 val_loss:3.6546 train_time:45145ms step_avg:60.19ms
...
step:1000/2285 train_time:60274ms step_avg:60.27ms
step:1000/2285 val_loss:3.5659 train_time:60338ms step_avg:60.34ms
...
step:1250/2285 train_time:75500ms step_avg:60.40ms
step:1250/2285 val_loss:3.4966 train_time:75563ms step_avg:60.45ms
...
step:1500/2285 train_time:90767ms step_avg:60.51ms
step:1500/2285 val_loss:3.4283 train_time:90830ms step_avg:60.55ms
...
step:1750/2285 train_time:106071ms step_avg:60.61ms
step:1750/2285 val_loss:3.3684 train_time:106135ms step_avg:60.65ms
...
step:2000/2285 train_time:121374ms step_avg:60.69ms
step:2000/2285 val_loss:3.3190 train_time:121438ms step_avg:60.72ms
...
step:2137/2285 train_time:129775ms
step_avg:60.73ms -step:2138/2285 train_time:129835ms step_avg:60.73ms -step:2139/2285 train_time:129898ms step_avg:60.73ms -step:2140/2285 train_time:129958ms step_avg:60.73ms -step:2141/2285 train_time:130020ms step_avg:60.73ms -step:2142/2285 train_time:130080ms step_avg:60.73ms -step:2143/2285 train_time:130142ms step_avg:60.73ms -step:2144/2285 train_time:130202ms step_avg:60.73ms -step:2145/2285 train_time:130265ms step_avg:60.73ms -step:2146/2285 train_time:130325ms step_avg:60.73ms -step:2147/2285 train_time:130387ms step_avg:60.73ms -step:2148/2285 train_time:130447ms step_avg:60.73ms -step:2149/2285 train_time:130509ms step_avg:60.73ms -step:2150/2285 train_time:130569ms step_avg:60.73ms -step:2151/2285 train_time:130632ms step_avg:60.73ms -step:2152/2285 train_time:130692ms step_avg:60.73ms -step:2153/2285 train_time:130754ms step_avg:60.73ms -step:2154/2285 train_time:130815ms step_avg:60.73ms -step:2155/2285 train_time:130877ms step_avg:60.73ms -step:2156/2285 train_time:130937ms step_avg:60.73ms -step:2157/2285 train_time:130999ms step_avg:60.73ms -step:2158/2285 train_time:131060ms step_avg:60.73ms -step:2159/2285 train_time:131123ms step_avg:60.73ms -step:2160/2285 train_time:131182ms step_avg:60.73ms -step:2161/2285 train_time:131245ms step_avg:60.73ms -step:2162/2285 train_time:131305ms step_avg:60.73ms -step:2163/2285 train_time:131368ms step_avg:60.73ms -step:2164/2285 train_time:131428ms step_avg:60.73ms -step:2165/2285 train_time:131490ms step_avg:60.73ms -step:2166/2285 train_time:131550ms step_avg:60.73ms -step:2167/2285 train_time:131612ms step_avg:60.73ms -step:2168/2285 train_time:131672ms step_avg:60.73ms -step:2169/2285 train_time:131734ms step_avg:60.73ms -step:2170/2285 train_time:131794ms step_avg:60.73ms -step:2171/2285 train_time:131856ms step_avg:60.74ms -step:2172/2285 train_time:131917ms step_avg:60.74ms -step:2173/2285 train_time:131979ms step_avg:60.74ms -step:2174/2285 train_time:132039ms step_avg:60.74ms -step:2175/2285 train_time:132102ms step_avg:60.74ms -step:2176/2285 train_time:132163ms step_avg:60.74ms -step:2177/2285 train_time:132225ms step_avg:60.74ms -step:2178/2285 train_time:132285ms step_avg:60.74ms -step:2179/2285 train_time:132347ms step_avg:60.74ms -step:2180/2285 train_time:132407ms step_avg:60.74ms -step:2181/2285 train_time:132470ms step_avg:60.74ms -step:2182/2285 train_time:132530ms step_avg:60.74ms -step:2183/2285 train_time:132592ms step_avg:60.74ms -step:2184/2285 train_time:132652ms step_avg:60.74ms -step:2185/2285 train_time:132715ms step_avg:60.74ms -step:2186/2285 train_time:132775ms step_avg:60.74ms -step:2187/2285 train_time:132837ms step_avg:60.74ms -step:2188/2285 train_time:132898ms step_avg:60.74ms -step:2189/2285 train_time:132960ms step_avg:60.74ms -step:2190/2285 train_time:133020ms step_avg:60.74ms -step:2191/2285 train_time:133083ms step_avg:60.74ms -step:2192/2285 train_time:133143ms step_avg:60.74ms -step:2193/2285 train_time:133205ms step_avg:60.74ms -step:2194/2285 train_time:133266ms step_avg:60.74ms -step:2195/2285 train_time:133328ms step_avg:60.74ms -step:2196/2285 train_time:133388ms step_avg:60.74ms -step:2197/2285 train_time:133450ms step_avg:60.74ms -step:2198/2285 train_time:133510ms step_avg:60.74ms -step:2199/2285 train_time:133573ms step_avg:60.74ms -step:2200/2285 train_time:133632ms step_avg:60.74ms -step:2201/2285 train_time:133694ms step_avg:60.74ms -step:2202/2285 train_time:133754ms step_avg:60.74ms -step:2203/2285 train_time:133817ms step_avg:60.74ms -step:2204/2285 train_time:133878ms 
step_avg:60.74ms -step:2205/2285 train_time:133940ms step_avg:60.74ms -step:2206/2285 train_time:134000ms step_avg:60.74ms -step:2207/2285 train_time:134063ms step_avg:60.74ms -step:2208/2285 train_time:134123ms step_avg:60.74ms -step:2209/2285 train_time:134186ms step_avg:60.75ms -step:2210/2285 train_time:134246ms step_avg:60.74ms -step:2211/2285 train_time:134309ms step_avg:60.75ms -step:2212/2285 train_time:134369ms step_avg:60.75ms -step:2213/2285 train_time:134432ms step_avg:60.75ms -step:2214/2285 train_time:134492ms step_avg:60.75ms -step:2215/2285 train_time:134555ms step_avg:60.75ms -step:2216/2285 train_time:134614ms step_avg:60.75ms -step:2217/2285 train_time:134677ms step_avg:60.75ms -step:2218/2285 train_time:134737ms step_avg:60.75ms -step:2219/2285 train_time:134799ms step_avg:60.75ms -step:2220/2285 train_time:134859ms step_avg:60.75ms -step:2221/2285 train_time:134922ms step_avg:60.75ms -step:2222/2285 train_time:134982ms step_avg:60.75ms -step:2223/2285 train_time:135044ms step_avg:60.75ms -step:2224/2285 train_time:135105ms step_avg:60.75ms -step:2225/2285 train_time:135167ms step_avg:60.75ms -step:2226/2285 train_time:135227ms step_avg:60.75ms -step:2227/2285 train_time:135289ms step_avg:60.75ms -step:2228/2285 train_time:135349ms step_avg:60.75ms -step:2229/2285 train_time:135411ms step_avg:60.75ms -step:2230/2285 train_time:135472ms step_avg:60.75ms -step:2231/2285 train_time:135534ms step_avg:60.75ms -step:2232/2285 train_time:135594ms step_avg:60.75ms -step:2233/2285 train_time:135656ms step_avg:60.75ms -step:2234/2285 train_time:135716ms step_avg:60.75ms -step:2235/2285 train_time:135778ms step_avg:60.75ms -step:2236/2285 train_time:135838ms step_avg:60.75ms -step:2237/2285 train_time:135901ms step_avg:60.75ms -step:2238/2285 train_time:135961ms step_avg:60.75ms -step:2239/2285 train_time:136023ms step_avg:60.75ms -step:2240/2285 train_time:136083ms step_avg:60.75ms -step:2241/2285 train_time:136145ms step_avg:60.75ms -step:2242/2285 train_time:136206ms step_avg:60.75ms -step:2243/2285 train_time:136268ms step_avg:60.75ms -step:2244/2285 train_time:136328ms step_avg:60.75ms -step:2245/2285 train_time:136391ms step_avg:60.75ms -step:2246/2285 train_time:136450ms step_avg:60.75ms -step:2247/2285 train_time:136513ms step_avg:60.75ms -step:2248/2285 train_time:136573ms step_avg:60.75ms -step:2249/2285 train_time:136635ms step_avg:60.75ms -step:2250/2285 train_time:136695ms step_avg:60.75ms -step:2250/2285 val_loss:3.2836 train_time:136759ms step_avg:60.78ms -step:2251/2285 train_time:136778ms step_avg:60.76ms -step:2252/2285 train_time:136819ms step_avg:60.75ms -step:2253/2285 train_time:136884ms step_avg:60.76ms -step:2254/2285 train_time:136945ms step_avg:60.76ms -step:2255/2285 train_time:137007ms step_avg:60.76ms -step:2256/2285 train_time:137067ms step_avg:60.76ms -step:2257/2285 train_time:137128ms step_avg:60.76ms -step:2258/2285 train_time:137188ms step_avg:60.76ms -step:2259/2285 train_time:137250ms step_avg:60.76ms -step:2260/2285 train_time:137309ms step_avg:60.76ms -step:2261/2285 train_time:137372ms step_avg:60.76ms -step:2262/2285 train_time:137432ms step_avg:60.76ms -step:2263/2285 train_time:137494ms step_avg:60.76ms -step:2264/2285 train_time:137553ms step_avg:60.76ms -step:2265/2285 train_time:137615ms step_avg:60.76ms -step:2266/2285 train_time:137675ms step_avg:60.76ms -step:2267/2285 train_time:137740ms step_avg:60.76ms -step:2268/2285 train_time:137802ms step_avg:60.76ms -step:2269/2285 train_time:137864ms step_avg:60.76ms -step:2270/2285 
train_time:137925ms step_avg:60.76ms -step:2271/2285 train_time:137988ms step_avg:60.76ms -step:2272/2285 train_time:138048ms step_avg:60.76ms -step:2273/2285 train_time:138110ms step_avg:60.76ms -step:2274/2285 train_time:138169ms step_avg:60.76ms -step:2275/2285 train_time:138231ms step_avg:60.76ms -step:2276/2285 train_time:138290ms step_avg:60.76ms -step:2277/2285 train_time:138352ms step_avg:60.76ms -step:2278/2285 train_time:138412ms step_avg:60.76ms -step:2279/2285 train_time:138474ms step_avg:60.76ms -step:2280/2285 train_time:138534ms step_avg:60.76ms -step:2281/2285 train_time:138596ms step_avg:60.76ms -step:2282/2285 train_time:138656ms step_avg:60.76ms -step:2283/2285 train_time:138720ms step_avg:60.76ms -step:2284/2285 train_time:138780ms step_avg:60.76ms -step:2285/2285 train_time:138843ms step_avg:60.76ms -step:2285/2285 val_loss:3.2776 train_time:138905ms step_avg:60.79ms -peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt b/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt deleted file mode 100644 index 628b9ca0e..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/74ef00d7-4030-46f2-a269-bea707f0f0bd.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) - w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - -@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
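For reference, the iteration those coefficient triples drive is the odd matrix polynomial p(X) = a*X + b*(X@X.T)@X + c*(X@X.T)@(X@X.T)@X, applied once per triple; it pushes every singular value of the pre-normalized input toward 1, yielding an approximate polar factor. A dense plain-torch sketch of the same recurrence, useful for checking the fused Triton path that follows but not a replacement for it:

import torch

def polar_express_reference(G: torch.Tensor) -> torch.Tensor:
    # Dense equivalent of polar_express below: no Triton kernels, no buffer
    # reuse, same math. Assumes polar_express_coeffs (above) is in scope.
    X = G.bfloat16()
    transposed = G.size(-2) > G.size(-1)
    if transposed:
        X = X.mT
    # bound the spectral norm by ~1 via the Frobenius norm, as below
    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
    for a, b, c in polar_express_coeffs:
        A = X @ X.mT              # what XXT(X, out=A) computes
        B = b * A + c * (A @ A)   # what ba_plus_cAA(A, alpha=c, beta=b) computes
        X = a * X + B @ X         # what aX_plus_BX(X, B, X, beta=a) computes
    return X.mT if transposed else X

Outputs should agree with the compiled version up to bfloat16 rounding; the Triton kernels exist to exploit the symmetry of X@X.T and to avoid intermediate allocations.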
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
- for group, info in zip(self.param_groups, group_infos): - info["reduce_future"].wait() - - params = group["params"] - grad_chunk = info["grad_chunk"] - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - start_idx = rank * chunk_size - module_idx = start_idx if start_idx < len(params) else 0 - - num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank - - if "momentum_buffer" not in group: - group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) - momentum_buffer = group["momentum_buffer"] - # Apply momentum update to the persistent momentum buffer in-place - momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) - updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) - - grad_shape = updated_grads.shape - if params[module_idx].label == 'attn': - # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] - for p in params[module_idx:module_idx + num_params]: - assert p.label == 'attn' - updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) - ref_param = params[module_idx] - param_shape = ref_param.shape - - if "second_momentum_buffer" not in group: - group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) - if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) - ) - second_momentum_buffer = group["second_momentum_buffer"] - - if "param_lr" not in group: - group["param_lr"] = ( - max(1., param_shape[-2] / param_shape[-1]) ** 0.5 - * ref_param.new_tensor( - [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - ) - - group["param_wd"] = ref_param.new_tensor( - [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - - # Determine LR and WR - eff_lr = group["lr"] * group["param_lr"] - eff_wd = group["weight_decay"] * group["param_wd"] - - # Compute zeropower for the entire chunk in a single, batched call. - if num_params == 0: - v_chunk = updated_grads - elif params[module_idx].label == "smear_gate": - # dividing by magnitude is equivalent of SVN for 1d tensors - v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) - else: - v_chunk = polar_express(updated_grads) - - # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) - v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) - second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) - step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() - v_chunk.mul_(step_size) - v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) - - v_chunk = v_chunk.view(grad_shape) - - updated_params = torch.empty_like(grad_chunk) - param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) - # Apply weight decay directly to the buffer. 
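The NorMuon block above (second_momentum_buffer through the two mul_ calls) adapts per-row step sizes on top of the orthogonalized update, then restores the update's original Frobenius norm so only the relative row scales change. Isolated for a single tall matrix, as a sketch with illustrative names:

import torch

def normuon_rescale(v, second_moment, beta2=0.95, eps=1e-10):
    # v: orthogonalized update with rows >= cols; second_moment: (rows, 1) state
    v_norm = v.norm()
    row_mean_sq = v.square().mean(dim=-1, keepdim=True)
    second_moment.lerp_(row_mean_sq.to(second_moment.dtype), 1 - beta2)
    v = v * second_moment.clamp_min(eps).rsqrt()
    # renormalize so the overall magnitude matches the pre-rescale update
    return v * (v_norm / v.norm().clamp_min(eps))

For wide matrices the mean is taken over dim=-2 instead, which is why the buffer above is shaped (..., :, :1) or (..., :1, :) depending on the parameter's aspect ratio.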
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
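One storage detail worth unpacking from the attention and MLP modules above: qkvo_w packs Q, K, V, and the output projection into a single (hdim, 4*dim) matrix that is re-viewed per use, and with dim = hdim = 768 it has exactly the same (768, 3072) shape as each MLP weight, which is what lets Muon stack attn and mlp parameters into one reduce_scatter. The shape arithmetic, standalone:

import torch

dim = hdim = 768  # model_dim and num_heads * head_dim in this record
qkvo_w = torch.empty(hdim, 4 * dim)

q_w, k_w, v_w, o_w = qkvo_w.view(4, hdim, dim)       # four logical weights, one storage
qkv_w = qkvo_w.view(4, hdim, dim)[:3].flatten(end_dim=1)
assert qkv_w.shape == (3 * hdim, dim)                # fused QKV projection weight

mlp_c_fc = torch.empty(dim, 4 * dim)                 # MLP.c_fc from above
assert mlp_c_fc.shape == qkvo_w.shape                # same shape -> batchable in Muon

The optimizer then reshapes a stacked chunk of such (768, 3072) matrices to (4*n, 768, 768) when the chunk holds attn weights, so polar_express orthogonalizes Q, K, V, and O independently. c_fc, stored as (dim, hdim) but applied via F.linear(x, c_fc.T), carries lr_mul = 2. for the same reason: Muon's max(1, rows/cols)**0.5 scale sees the stored shape rather than the effective (3072, 768) one, and sqrt(3072/768) = 2.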
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
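The flat self.scalars parameter built above is decoded by fixed offsets in forward below; with num_layers = 12 (and world_size = 8, giving pad = (-12*5 - 2) % 8 = 2) the 64-entry layout works out as sketched here:

num_layers = 12
# scalars[0:12]   skip weights, init -1.5 so sigmoid(-1.5) ~ 0.18
# scalars[12:36]  block lambdas, (x, x0) pairs, init (1.0, 0.0)
# scalars[36:60]  SA lambdas, (v, ve) pairs, init (0.5, 0.5)
# scalars[60]     smear_lambda, init 0.0
# scalars[61]     backout_lambda, init 0.5
# scalars[62:64]  padding so the length divides evenly across 8 GPUs
lambdas_slice    = slice(1 * num_layers, 3 * num_layers)  # viewed as (-1, 2)
sa_lambdas_slice = slice(3 * num_layers, 5 * num_layers)  # viewed as (-1, 2)
smear_idx, backout_idx = 5 * num_layers, 5 * num_layers + 1

Forward reads only skip_weights = scalars[:6], and because layer 0 is skipped and layer 11 takes no skip input, not even all of those are consumed; the remaining skip entries are initialized but unused.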
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd - def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): - # Precompute BOS positions once per shard - self.tokens=tokens - self.size = tokens.numel() - self.quickload = quickload - if quickload: - # only scan first 4 million tokens, then kickoff async thread to scan rest - self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.thread = None - self.ready = threading.Event() - self.start() - else: - self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.i = 0 - self.world_size = world_size - self.batch_iter = 0 - - def _load(self): - self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - self.bos_idx = self.bos_idx_async - - def next_batch(self, num_tokens_local: int, max_seq_len: int): - # if quickload was used, repoint to the full dataset after 5 batches - if self.quickload and self.batch_iter==5: - self.get() - n = len(self.bos_idx) - starts = [[] for _ in range(self.world_size)] - ends = [[] for _ in range(self.world_size)] - - idx = self.i - for r in range(self.world_size): - cur_len = 0 - while cur_len <= num_tokens_local: - if idx >= n: - raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") - cur = self.bos_idx[idx] - starts[r].append(cur) - end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, - cur + max_seq_len, - cur + num_tokens_local - cur_len + 1) - ends[r].append(end) - cur_len += end - cur - idx += 1 - - assert cur_len == num_tokens_local + 1 - self.i = idx - self.batch_iter+=1 - return starts, ends - -class DataPreloader: - # Helper for asynchronously loading next shard and indexing bos tokens - def __init__(self, file_iter, world_size: int = 1): - self.file_iter = file_iter - self.world_size = world_size - self.thread = None - self.data = None - self.ready = threading.Event() - - def _load(self): - tokens = _load_data_shard(next(self.file_iter)) - self.data = (tokens, BOSFinder(tokens, self.world_size)) - self.ready.set() - - def start(self): - self.ready.clear() - self.thread = threading.Thread(target=self._load) - self.thread.start() - - def get(self): - if self.thread: - self.ready.wait() - self.thread.join() - return self.data - -def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True): - # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" - num_tokens = num_tokens // grad_accum_steps - - files = [Path(file) for file in sorted(glob.glob(filename_pattern))] - if not files: - raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") - - file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training - tokens = _load_data_shard(next(file_iter)) - if align_to_bos: - finder = BOSFinder(tokens, world_size=world_size, quickload=True) - preloader = 
def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
    # align_to_bos: each sequence begins with a Beginning of Sequence token; sequences are truncated to max_seq_len
    rank = dist.get_rank() if dist.is_initialized() else 0
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world size times grad_accum_steps"
    num_tokens = num_tokens // grad_accum_steps

    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    if not files:
        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")

    file_iter = iter(files)  # use itertools.cycle(files) for multi-epoch training
    tokens = _load_data_shard(next(file_iter))
    if align_to_bos:
        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
        preloader = DataPreloader(file_iter, world_size)
        preloader.start()
    else:
        pos = 0  # for the unaligned case

    while True:
        num_tokens_local = num_tokens // world_size
        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400

        if align_to_bos:
            try:
                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
            except StopIteration:
                # this shard is exhausted; swap in the preloaded one and retry
                tokens, finder = preloader.get()
                preloader.start()
                continue

            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
            _inputs = buf[:-1]
            _targets = buf[1:]
            end_idxs[-1] -= 1  # trim the last document by one token to account for the _targets offset
            cum_lengths = (end_idxs - start_idxs).cumsum(0)
        else:
            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
                tokens, pos = _load_data_shard(next(file_iter)), 0

            pos_local = pos + rank * num_tokens_local
            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
            _inputs = buf[:-1].view(num_tokens_local)
            _targets = buf[1:].view(num_tokens_local)

            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
            pos += num_tokens

        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
        _cum_lengths[0] = 0
        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths

        new_params = yield (
            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True),
        )

        if new_params is not None:
            # lets the caller pass new (num_tokens, max_seq_len, grad_accum_steps) via .send()
            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
            assert new_num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world size"
            num_tokens = new_num_tokens
            max_seq_len = new_max_seq_len
            grad_accum_steps = new_grad_accum_steps
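# Usage sketch for the .send() hook on the generator above (illustrative only;
# this run keeps a fixed batch geometry, and the new_* names are hypothetical):
#
#   loader = distributed_data_generator(args.train_files, args.train_batch_size,
#                                       args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
#   inputs, targets, cum_seqlens = next(loader)  # plain consumption
#   inputs, targets, cum_seqlens = loader.send(
#       (new_num_tokens, new_max_seq_len, new_grad_accum_steps))  # retune, then get the next batch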
# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data
    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_batch_size: int = 2048 * 16 * 8
    train_max_seq_len: int = 128 * 16
    val_batch_size: int = 4 * 64 * 1024 * 8
    # optimization
    num_iterations: int = 2285
    lr_schedule = (0.5, 0.98)  # breakpoints for 3-part schedule: (flat, linear decay, flat)
    lr_min = 0.1
    # evaluation and logging
    run_id: str = f"{uuid.uuid4()}"
    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint: bool = False
    # attention masking
    block_size: int = 128
    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN

args = Hyperparameters()

data_path = os.environ.get("DATA_PATH", ".")
args.train_files = os.path.join(data_path, args.train_files)
args.val_files = os.path.join(data_path, args.val_files)

# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert 8 % world_size == 0, "world_size must be a divisor of 8"
grad_accum_steps = 8 // world_size
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0)  # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = args.run_id
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)

def print0(s, console=False):
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0("=" * 100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
print0(f"Running Triton version {triton.__version__}")

def nvidia_smi():
    import subprocess  # avoid top level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout

print0(nvidia_smi())
print0("=" * 100)

model: nn.Module = GPT(
    vocab_size=50257,
    num_layers=12,
    num_heads=6,
    head_dim=128,
    model_dim=768,
    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size),
).cuda()
for m in model.modules():
    if isinstance(m, (nn.Embedding, nn.Linear)):
        m.bfloat16()
for param in model.parameters():
    dist.broadcast(param.detach(), 0)

# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
gate_params = [p for n, p in model.named_parameters() if "gate" in n]
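# Sanity-check sketch for the split above (illustrative, not part of the
# training run): every parameter should be stepped by exactly one optimizer.
# Adam gets the scalars, head, and embeddings; Muon gets the hidden matrices
# and gates. This assumes the gate modules are bias-free, so no gate parameter
# is also swept up by the ndim < 2 scalar filter.
def _check_param_split():
    adam_ids = {id(p) for p in scalar_params + head_params + embed_params}
    muon_ids = {id(p) for p in hidden_matrix_params + gate_params}
    assert not (adam_ids & muon_ids), "a parameter would be stepped by both optimizers"
    assert adam_ids | muon_ids == {id(p) for p in model.parameters()}, "a parameter is left unoptimized"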
# init the optimizer(s)
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = DistAdam(
    scalar_params + head_params + embed_params,
    lr=0.008,
    betas=(0.65, 0.95),
    eps=1e-8,
    weight_decay=0.0,
)
optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]

def get_lr(step: int):
    assert step < args.num_iterations
    # three-part schedule: flat, linear decrease, flat
    lr_schedule = args.lr_schedule
    x = step / args.num_iterations

    if x < lr_schedule[0]:
        return 1.0
    elif x < lr_schedule[1]:
        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
        lr = 1.0 - (1.0 - args.lr_min) * progress
    else:
        lr = args.lr_min
    return lr

def get_ws(step: int):
    assert step <= args.num_iterations
    x = step / (args.num_iterations + 1)
    ws_idx = int(len(args.ws_schedule) * x)
    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]

def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
    # warmup phase: linearly increase momentum from min to max
    # cooldown phase: linearly decrease momentum from max to min
    momentum_cd_start = args.num_iterations - muon_cooldown_steps
    if step < muon_warmup_steps:
        frac = step / muon_warmup_steps
        momentum = momentum_min + frac * (momentum_max - momentum_min)
    elif step > momentum_cd_start:
        frac = (step - momentum_cd_start) / muon_cooldown_steps
        momentum = momentum_max - frac * (momentum_max - momentum_min)
    else:
        momentum = momentum_max
    return momentum

def step_optimizers(step: int, optimizers, model):
    # update lr
    for optimizer in optimizers:
        for group in optimizer.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)

    # set muon momentum based on step
    momentum = get_muon_momentum(step)
    for group in optimizers[1].param_groups:
        group["momentum"] = momentum

    # on even steps, only step Muon params
    # on odd steps, step all params
    if step % 2 == 0:
        optimizers[1].step()
        optimizers[1].zero_grad(set_to_none=True)
    else:
        for optimizer in optimizers:
            optimizer.step()
        model.zero_grad(set_to_none=True)

model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
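# Spot-check sketch for the schedules above (illustrative, not part of the
# training run). Note also that step_optimizers steps Muon every iteration but
# Adam only on odd iterations, i.e. Adam runs at half frequency on gradients
# accumulated over two steps.
import math  # stdlib only

def _check_schedules():
    assert get_lr(0) == 1.0                               # flat phase (x < 0.5)
    assert math.isclose(get_lr(2239), 0.1, abs_tol=5e-3)  # end of the linear decay (x ~ 0.98)
    assert get_lr(2280) == 0.1                            # final flat phase
    assert get_muon_momentum(0) == 0.85                   # start of the 300-step warmup
    assert math.isclose(get_muon_momentum(150), 0.90)     # halfway through the warmup
    assert get_muon_momentum(1000) == 0.95                # plateau
    assert math.isclose(get_muon_momentum(2260), 0.90)    # halfway through the 50-step cooldown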
########################################
#            Warmup kernels            #
########################################

# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 30
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
for step in range(warmup_steps):
    inputs, targets, cum_seqlens = next(train_loader)
    # each window size is a new graph, need to warm up each with Yarn.attn_scale
    ws_idx = step % len(args.ws_schedule)
    if ws_idx == 0:
        model.yarn.reset()
        ws_long = args.ws_schedule[0]
    else:
        new_ws_long = args.ws_schedule[ws_idx]
        if new_ws_long > ws_long:
            model.yarn.apply(ws_long, new_ws_long)
            ws_long = new_ws_long
    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
model.yarn.reset()  # rotary buffer is not stored in state_dict
model.load_state_dict(initial_state["model"])
optimizer2.reset()  # momentum buffer not in state dict
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del train_loader, initial_state

########################################
#        Training and validation       #
########################################

train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
ws_short, ws_long = get_ws(0)
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    ws_short, new_ws_long = get_ws(step)
    if new_ws_long != ws_long:
        model.yarn.apply(ws_long, new_ws_long)
        ws_long = new_ws_long

    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        if last_step:
            ws_long = args.ws_validate_post_yarn_ext
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        assert args.val_tokens % args.val_batch_size == 0
        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets, cum_seqlens = next(val_loader)
                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION -----------------
    loss = 0
    for _ in range(grad_accum_steps):
        inputs, targets, cum_seqlens = next(train_loader)
        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
    loss.backward()
    step_optimizers(step, optimizers, model)

    # logging
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)

print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
dist.destroy_process_group()
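# The run log below consists of lines in the two print0 formats above. A small
# parsing sketch (illustrative only; the regex and field names are assumptions,
# not part of the training run):
import re

_STEP_RE = re.compile(r"step:(\d+)/(\d+)(?: val_loss:([\d.]+))? train_time:(\d+)ms step_avg:([\d.]+)ms")

def _parse_log_line(line: str):
    m = _STEP_RE.match(line)
    if m is None:
        return None  # e.g. the nvidia-smi block or the "=" separators
    step, total, val_loss, train_ms, step_avg = m.groups()
    return dict(step=int(step), total=int(total),
                val_loss=float(val_loss) if val_loss else None,
                train_time_ms=int(train_ms), step_avg_ms=float(step_avg))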
====================================================================================================
Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
Running Triton version 3.5.0
Tue Oct 28 02:13:13 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:19:00.0 Off |                    0 |
| N/A   40C    P0            128W /  700W |    5858MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  |   00000000:3B:00.0 Off |                    0 |
| N/A   33C    P0            127W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  |   00000000:4C:00.0 Off |                    0 |
| N/A   32C    P0            121W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  |   00000000:5D:00.0 Off |                    0 |
| N/A   38C    P0            124W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  |   00000000:9B:00.0 Off |                    0 |
| N/A   39C    P0            120W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  |   00000000:BB:00.0 Off |                    0 |
| N/A   32C    P0            120W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  |   00000000:CB:00.0 Off |                    0 |
| N/A   38C    P0            124W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  |   00000000:DB:00.0 Off |                    0 |
| N/A   31C    P0            114W /  700W |    1520MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+

====================================================================================================
step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms
step:1/2285 train_time:120ms step_avg:119.52ms
step:2/2285 train_time:140ms step_avg:70.10ms
step:3/2285 train_time:179ms step_avg:59.70ms
step:4/2285 train_time:235ms step_avg:58.79ms
step:5/2285 train_time:294ms step_avg:58.87ms
step:6/2285 train_time:353ms step_avg:58.79ms
step:7/2285 train_time:414ms step_avg:59.12ms
step:8/2285 train_time:472ms step_avg:59.00ms
step:9/2285 train_time:532ms step_avg:59.16ms
step:10/2285 train_time:591ms step_avg:59.06ms
step:11/2285 train_time:651ms step_avg:59.21ms
step:12/2285 train_time:710ms step_avg:59.14ms
step:13/2285 train_time:770ms step_avg:59.22ms
step:14/2285 train_time:828ms step_avg:59.16ms
step:15/2285 train_time:889ms step_avg:59.24ms
step:16/2285 train_time:947ms step_avg:59.19ms
step:17/2285 train_time:1009ms step_avg:59.33ms
step:18/2285 train_time:1071ms step_avg:59.49ms
step:19/2285 train_time:1137ms step_avg:59.82ms
step:20/2285 train_time:1198ms step_avg:59.89ms
step:21/2285 train_time:1259ms step_avg:59.96ms
step:22/2285 train_time:1318ms step_avg:59.91ms
step:23/2285 train_time:1379ms step_avg:59.97ms
step:24/2285 train_time:1438ms step_avg:59.91ms
step:25/2285 train_time:1499ms step_avg:59.95ms
step:26/2285 train_time:1558ms step_avg:59.92ms
step:27/2285 train_time:1619ms step_avg:59.95ms
step:28/2285 train_time:1678ms step_avg:59.91ms
step:29/2285 train_time:1739ms step_avg:59.95ms
step:30/2285 train_time:1797ms step_avg:59.91ms
step:31/2285 train_time:1858ms step_avg:59.95ms
step:32/2285 train_time:1917ms step_avg:59.90ms
step:33/2285 train_time:1978ms step_avg:59.94ms
step:34/2285 train_time:2038ms step_avg:59.93ms
step:35/2285 train_time:2101ms step_avg:60.03ms
step:36/2285 train_time:2162ms step_avg:60.04ms
step:37/2285 train_time:2223ms step_avg:60.09ms
step:38/2285 train_time:2283ms step_avg:60.08ms
step:39/2285 train_time:2345ms step_avg:60.12ms
step:40/2285 train_time:2403ms step_avg:60.08ms
step:41/2285 train_time:2465ms step_avg:60.13ms
step:42/2285 train_time:2524ms step_avg:60.10ms
step:43/2285 train_time:2586ms step_avg:60.13ms
step:44/2285 train_time:2645ms step_avg:60.11ms
step:45/2285 train_time:2706ms step_avg:60.13ms
step:46/2285 train_time:2765ms step_avg:60.11ms
step:47/2285 train_time:2826ms step_avg:60.14ms
step:48/2285 train_time:2886ms step_avg:60.12ms
step:49/2285 train_time:2948ms step_avg:60.16ms
step:50/2285 train_time:3007ms step_avg:60.14ms
step:51/2285 train_time:3070ms step_avg:60.20ms
step:52/2285 train_time:3130ms step_avg:60.19ms
step:53/2285 train_time:3191ms step_avg:60.21ms
step:54/2285 train_time:3250ms step_avg:60.19ms
step:55/2285 train_time:3312ms step_avg:60.21ms
step:56/2285 train_time:3371ms step_avg:60.19ms
step:57/2285 train_time:3432ms step_avg:60.21ms
step:58/2285 train_time:3491ms step_avg:60.19ms
step:59/2285 train_time:3553ms step_avg:60.23ms
step:60/2285 train_time:3612ms step_avg:60.20ms
step:61/2285 train_time:3673ms step_avg:60.21ms
step:62/2285 train_time:3731ms step_avg:60.18ms
step:63/2285 train_time:3792ms step_avg:60.20ms
step:64/2285 train_time:3852ms step_avg:60.18ms
step:65/2285 train_time:3913ms step_avg:60.20ms
step:66/2285 train_time:3972ms step_avg:60.18ms
step:67/2285 train_time:4033ms step_avg:60.19ms
step:68/2285 train_time:4092ms step_avg:60.18ms
step:69/2285 train_time:4154ms step_avg:60.20ms
step:70/2285 train_time:4213ms step_avg:60.18ms
step:71/2285 train_time:4274ms step_avg:60.20ms
step:72/2285 train_time:4332ms step_avg:60.17ms
step:73/2285 train_time:4394ms step_avg:60.19ms
step:74/2285 train_time:4454ms step_avg:60.19ms
step:75/2285 train_time:4514ms step_avg:60.18ms
step:76/2285 train_time:4572ms step_avg:60.16ms
step:77/2285 train_time:4634ms step_avg:60.18ms
step:78/2285 train_time:4693ms step_avg:60.17ms
step:79/2285 train_time:4755ms step_avg:60.19ms
step:80/2285 train_time:4814ms step_avg:60.17ms
step:81/2285 train_time:4876ms step_avg:60.20ms
step:82/2285 train_time:4935ms step_avg:60.18ms
step:83/2285 train_time:4996ms step_avg:60.19ms
step:84/2285 train_time:5055ms step_avg:60.18ms
step:85/2285 train_time:5116ms step_avg:60.19ms
step:86/2285 train_time:5174ms step_avg:60.17ms
step:87/2285 train_time:5236ms step_avg:60.18ms
step:88/2285 train_time:5294ms step_avg:60.16ms
step:89/2285 train_time:5355ms step_avg:60.17ms
step:90/2285 train_time:5414ms step_avg:60.16ms
step:91/2285 train_time:5475ms step_avg:60.16ms
step:92/2285 train_time:5533ms step_avg:60.15ms
step:93/2285 train_time:5595ms step_avg:60.16ms
step:94/2285 train_time:5653ms step_avg:60.14ms
step:95/2285 train_time:5715ms step_avg:60.16ms
step:96/2285 train_time:5774ms step_avg:60.14ms
step:97/2285 train_time:5835ms step_avg:60.15ms
step:98/2285 train_time:5894ms step_avg:60.14ms
step:99/2285 train_time:5955ms step_avg:60.15ms
step:100/2285 train_time:6014ms step_avg:60.14ms
step:101/2285 train_time:6074ms step_avg:60.14ms
step:102/2285 train_time:6133ms step_avg:60.13ms
step:103/2285 train_time:6194ms step_avg:60.14ms
step:104/2285 train_time:6253ms step_avg:60.12ms
step:105/2285 train_time:6314ms step_avg:60.14ms
step:106/2285 train_time:6373ms step_avg:60.12ms
step:107/2285 train_time:6434ms step_avg:60.13ms
step:108/2285 train_time:6492ms step_avg:60.11ms
step:109/2285 train_time:6553ms step_avg:60.12ms
step:110/2285 train_time:6611ms step_avg:60.10ms
step:111/2285 train_time:6673ms step_avg:60.12ms
step:112/2285 train_time:6731ms step_avg:60.10ms
step:113/2285 train_time:6793ms step_avg:60.12ms
step:114/2285 train_time:6852ms step_avg:60.11ms
step:115/2285 train_time:6913ms step_avg:60.11ms
step:116/2285 train_time:6971ms step_avg:60.10ms
step:117/2285 train_time:7032ms step_avg:60.11ms
step:118/2285 train_time:7091ms step_avg:60.09ms
step:119/2285 train_time:7152ms step_avg:60.10ms
step:120/2285 train_time:7210ms step_avg:60.09ms
step:121/2285 train_time:7272ms step_avg:60.10ms
step:122/2285 train_time:7331ms step_avg:60.09ms
step:123/2285 train_time:7392ms step_avg:60.10ms
step:124/2285 train_time:7451ms step_avg:60.09ms
step:125/2285 train_time:7512ms step_avg:60.09ms
step:126/2285 train_time:7570ms step_avg:60.08ms
step:127/2285 train_time:7631ms step_avg:60.08ms
step:128/2285 train_time:7690ms step_avg:60.08ms
step:129/2285 train_time:7751ms step_avg:60.08ms
step:130/2285 train_time:7809ms step_avg:60.07ms
step:131/2285 train_time:7870ms step_avg:60.08ms
step:132/2285 train_time:7929ms step_avg:60.07ms
step:133/2285 train_time:7990ms step_avg:60.07ms
step:134/2285 train_time:8048ms step_avg:60.06ms
step:135/2285 train_time:8109ms step_avg:60.07ms
step:136/2285 train_time:8168ms step_avg:60.06ms
step:137/2285 train_time:8229ms step_avg:60.07ms
step:138/2285 train_time:8288ms step_avg:60.06ms
step:139/2285 train_time:8349ms step_avg:60.06ms
step:140/2285 train_time:8407ms step_avg:60.05ms
step:141/2285 train_time:8468ms step_avg:60.06ms
step:142/2285 train_time:8527ms step_avg:60.05ms
step:143/2285 train_time:8588ms step_avg:60.06ms
step:144/2285 train_time:8647ms step_avg:60.05ms
step:145/2285 train_time:8708ms step_avg:60.06ms
step:146/2285 train_time:8767ms step_avg:60.05ms
step:147/2285 train_time:8827ms step_avg:60.05ms
step:148/2285 train_time:8886ms step_avg:60.04ms
step:149/2285 train_time:8948ms step_avg:60.05ms
step:150/2285 train_time:9007ms step_avg:60.04ms
step:151/2285 train_time:9068ms step_avg:60.06ms
step:152/2285 train_time:9127ms step_avg:60.04ms
step:153/2285 train_time:9188ms step_avg:60.06ms
step:154/2285 train_time:9247ms step_avg:60.05ms
step:155/2285 train_time:9308ms step_avg:60.05ms
step:156/2285 train_time:9367ms step_avg:60.04ms
step:157/2285 train_time:9428ms step_avg:60.05ms
step:158/2285 train_time:9486ms step_avg:60.04ms
step:159/2285 train_time:9548ms step_avg:60.05ms
step:160/2285 train_time:9607ms step_avg:60.04ms
step:161/2285 train_time:9669ms step_avg:60.05ms
step:162/2285 train_time:9727ms step_avg:60.04ms
step:163/2285 train_time:9788ms step_avg:60.05ms
step:164/2285 train_time:9847ms step_avg:60.04ms
step:165/2285 train_time:9908ms step_avg:60.05ms
step:166/2285 train_time:9966ms step_avg:60.04ms
step:167/2285 train_time:10027ms step_avg:60.04ms
step:168/2285 train_time:10086ms step_avg:60.04ms
step:169/2285 train_time:10148ms step_avg:60.05ms
step:170/2285 train_time:10206ms step_avg:60.04ms
step:171/2285 train_time:10268ms step_avg:60.05ms
step:172/2285 train_time:10327ms step_avg:60.04ms
step:173/2285 train_time:10388ms step_avg:60.04ms
step:174/2285 train_time:10447ms step_avg:60.04ms
step:175/2285 train_time:10508ms step_avg:60.05ms
step:176/2285 train_time:10567ms step_avg:60.04ms
step:177/2285 train_time:10627ms step_avg:60.04ms
step:178/2285 train_time:10686ms step_avg:60.03ms
step:179/2285 train_time:10748ms step_avg:60.04ms
step:180/2285 train_time:10807ms step_avg:60.04ms
step:181/2285 train_time:10867ms step_avg:60.04ms
step:182/2285 train_time:10926ms step_avg:60.03ms
step:183/2285 train_time:10987ms step_avg:60.04ms
step:184/2285 train_time:11046ms step_avg:60.03ms
step:185/2285 train_time:11107ms step_avg:60.04ms
step:186/2285 train_time:11166ms step_avg:60.03ms
step:187/2285 train_time:11227ms step_avg:60.04ms
step:188/2285 train_time:11285ms step_avg:60.03ms
step:189/2285 train_time:11346ms step_avg:60.03ms
step:190/2285 train_time:11405ms step_avg:60.03ms
step:191/2285 train_time:11467ms step_avg:60.04ms
step:192/2285 train_time:11525ms step_avg:60.03ms
step:193/2285 train_time:11586ms step_avg:60.03ms
step:194/2285 train_time:11645ms step_avg:60.03ms
step:195/2285 train_time:11706ms step_avg:60.03ms
step:196/2285 train_time:11765ms step_avg:60.03ms
step:197/2285 train_time:11826ms step_avg:60.03ms
step:198/2285 train_time:11885ms step_avg:60.03ms
step:199/2285 train_time:11946ms step_avg:60.03ms
step:200/2285 train_time:12005ms step_avg:60.03ms
step:201/2285 train_time:12067ms step_avg:60.04ms
step:202/2285 train_time:12126ms step_avg:60.03ms
step:203/2285 train_time:12187ms step_avg:60.04ms
step:204/2285 train_time:12246ms step_avg:60.03ms
step:205/2285 train_time:12307ms step_avg:60.04ms
step:206/2285 train_time:12366ms step_avg:60.03ms
step:207/2285 train_time:12427ms step_avg:60.04ms
step:208/2285 train_time:12486ms step_avg:60.03ms
step:209/2285 train_time:12548ms step_avg:60.04ms
step:210/2285 train_time:12607ms step_avg:60.03ms
step:211/2285 train_time:12668ms step_avg:60.04ms
step:212/2285 train_time:12727ms step_avg:60.03ms
step:213/2285 train_time:12788ms step_avg:60.04ms
step:214/2285 train_time:12847ms step_avg:60.03ms
step:215/2285 train_time:12908ms step_avg:60.04ms
step:216/2285 train_time:12966ms step_avg:60.03ms
step:217/2285 train_time:13028ms step_avg:60.04ms
step:218/2285 train_time:13087ms step_avg:60.03ms
step:219/2285 train_time:13149ms step_avg:60.04ms
step:220/2285 train_time:13207ms step_avg:60.03ms
step:221/2285 train_time:13268ms step_avg:60.04ms
step:222/2285 train_time:13327ms step_avg:60.03ms
step:223/2285 train_time:13388ms step_avg:60.03ms
step:224/2285 train_time:13447ms step_avg:60.03ms
step:225/2285 train_time:13508ms step_avg:60.04ms
step:226/2285 train_time:13567ms step_avg:60.03ms
step:227/2285 train_time:13628ms step_avg:60.03ms
step:228/2285 train_time:13686ms step_avg:60.03ms
step:229/2285 train_time:13747ms step_avg:60.03ms
step:230/2285 train_time:13806ms step_avg:60.03ms
step:231/2285 train_time:13868ms step_avg:60.03ms
step:232/2285 train_time:13926ms step_avg:60.03ms
step:233/2285 train_time:13987ms step_avg:60.03ms
step:234/2285 train_time:14046ms step_avg:60.02ms
step:235/2285 train_time:14107ms step_avg:60.03ms
step:236/2285 train_time:14166ms step_avg:60.02ms
step:237/2285 train_time:14227ms step_avg:60.03ms
step:238/2285 train_time:14286ms step_avg:60.02ms
step:239/2285 train_time:14348ms step_avg:60.03ms
step:240/2285 train_time:14406ms step_avg:60.03ms
step:241/2285 train_time:14468ms step_avg:60.03ms
step:242/2285 train_time:14526ms step_avg:60.03ms
step:243/2285 train_time:14588ms step_avg:60.03ms
step:244/2285 train_time:14647ms step_avg:60.03ms
step:245/2285 train_time:14707ms step_avg:60.03ms
step:246/2285 train_time:14766ms step_avg:60.02ms
step:247/2285 train_time:14828ms step_avg:60.03ms
step:248/2285 train_time:14886ms step_avg:60.03ms
step:249/2285 train_time:14948ms step_avg:60.03ms
step:250/2285 train_time:15007ms step_avg:60.03ms
step:250/2285 val_loss:4.0735 train_time:15069ms step_avg:60.28ms
step:251/2285 train_time:15089ms step_avg:60.11ms
step:252/2285 train_time:15129ms step_avg:60.04ms
step:253/2285 train_time:15194ms step_avg:60.05ms
step:254/2285 train_time:15257ms step_avg:60.07ms
step:255/2285 train_time:15319ms step_avg:60.07ms
step:256/2285 train_time:15378ms step_avg:60.07ms
step:257/2285 train_time:15438ms step_avg:60.07ms
step:258/2285 train_time:15497ms step_avg:60.06ms
step:259/2285 train_time:15556ms step_avg:60.06ms
step:260/2285 train_time:15614ms step_avg:60.05ms
step:261/2285 train_time:15675ms step_avg:60.06ms
step:262/2285 train_time:15732ms step_avg:60.05ms
step:263/2285 train_time:15792ms step_avg:60.05ms
step:264/2285 train_time:15850ms step_avg:60.04ms
step:265/2285 train_time:15910ms step_avg:60.04ms
step:266/2285 train_time:15968ms step_avg:60.03ms
step:267/2285 train_time:16029ms step_avg:60.03ms
step:268/2285 train_time:16088ms step_avg:60.03ms
step:269/2285 train_time:16152ms step_avg:60.04ms
step:270/2285 train_time:16213ms step_avg:60.05ms
step:271/2285 train_time:16276ms step_avg:60.06ms
step:272/2285 train_time:16335ms step_avg:60.06ms
step:273/2285 train_time:16397ms step_avg:60.06ms
step:274/2285 train_time:16455ms step_avg:60.05ms
step:275/2285 train_time:16516ms step_avg:60.06ms
step:276/2285 train_time:16574ms step_avg:60.05ms
step:277/2285 train_time:16634ms step_avg:60.05ms
step:278/2285 train_time:16692ms step_avg:60.04ms
step:279/2285 train_time:16752ms step_avg:60.04ms
step:280/2285 train_time:16810ms step_avg:60.04ms
step:281/2285 train_time:16871ms step_avg:60.04ms
step:282/2285 train_time:16929ms step_avg:60.03ms
step:283/2285 train_time:16989ms step_avg:60.03ms
step:284/2285 train_time:17048ms step_avg:60.03ms
step:285/2285 train_time:17110ms step_avg:60.03ms
step:286/2285 train_time:17169ms step_avg:60.03ms
step:287/2285 train_time:17232ms step_avg:60.04ms
step:288/2285 train_time:17292ms step_avg:60.04ms
step:289/2285 train_time:17353ms step_avg:60.05ms
step:290/2285 train_time:17413ms step_avg:60.04ms
step:291/2285 train_time:17474ms step_avg:60.05ms
step:292/2285 train_time:17532ms step_avg:60.04ms
step:293/2285 train_time:17593ms step_avg:60.04ms
step:294/2285 train_time:17651ms step_avg:60.04ms
step:295/2285 train_time:17712ms step_avg:60.04ms
step:296/2285 train_time:17769ms step_avg:60.03ms
step:297/2285 train_time:17830ms step_avg:60.03ms
step:298/2285 train_time:17888ms step_avg:60.03ms
step:299/2285 train_time:17949ms step_avg:60.03ms
step:300/2285 train_time:18008ms step_avg:60.03ms
step:301/2285 train_time:18069ms step_avg:60.03ms
step:302/2285 train_time:18128ms step_avg:60.03ms
step:303/2285 train_time:18189ms step_avg:60.03ms
step:304/2285 train_time:18249ms step_avg:60.03ms
step:305/2285 train_time:18310ms step_avg:60.03ms
step:306/2285 train_time:18370ms step_avg:60.03ms
step:307/2285 train_time:18432ms step_avg:60.04ms
step:308/2285 train_time:18491ms step_avg:60.04ms
step:309/2285 train_time:18552ms step_avg:60.04ms
step:310/2285 train_time:18611ms step_avg:60.04ms
step:311/2285 train_time:18671ms step_avg:60.04ms
step:312/2285 train_time:18729ms step_avg:60.03ms
step:313/2285 train_time:18790ms step_avg:60.03ms
step:314/2285 train_time:18848ms step_avg:60.03ms
step:315/2285 train_time:18908ms step_avg:60.03ms
step:316/2285 train_time:18966ms step_avg:60.02ms
step:317/2285 train_time:19027ms step_avg:60.02ms
step:318/2285 train_time:19085ms step_avg:60.02ms
step:319/2285 train_time:19147ms step_avg:60.02ms
step:320/2285 train_time:19205ms step_avg:60.02ms
step:321/2285 train_time:19266ms step_avg:60.02ms
step:322/2285 train_time:19325ms step_avg:60.02ms
step:323/2285 train_time:19386ms step_avg:60.02ms
step:324/2285 train_time:19446ms step_avg:60.02ms
step:325/2285 train_time:19507ms step_avg:60.02ms
step:326/2285 train_time:19565ms step_avg:60.02ms
step:327/2285 train_time:19626ms step_avg:60.02ms
step:328/2285 train_time:19684ms step_avg:60.01ms
step:329/2285 train_time:19744ms step_avg:60.01ms
step:330/2285 train_time:19802ms step_avg:60.01ms
step:331/2285 train_time:19863ms step_avg:60.01ms
step:332/2285 train_time:19921ms step_avg:60.00ms
step:333/2285 train_time:19982ms step_avg:60.01ms
step:334/2285 train_time:20040ms step_avg:60.00ms
step:335/2285 train_time:20101ms step_avg:60.00ms
step:336/2285 train_time:20159ms step_avg:60.00ms
step:337/2285 train_time:20220ms step_avg:60.00ms
step:338/2285 train_time:20279ms step_avg:60.00ms
step:339/2285 train_time:20340ms step_avg:60.00ms
step:340/2285 train_time:20400ms step_avg:60.00ms
step:341/2285 train_time:20461ms step_avg:60.00ms
step:342/2285 train_time:20519ms step_avg:60.00ms
step:343/2285 train_time:20580ms step_avg:60.00ms
step:344/2285 train_time:20639ms step_avg:60.00ms
step:345/2285 train_time:20700ms step_avg:60.00ms
step:346/2285 train_time:20758ms step_avg:60.00ms
step:347/2285 train_time:20820ms step_avg:60.00ms
step:348/2285 train_time:20878ms step_avg:59.99ms
step:349/2285 train_time:20938ms step_avg:60.00ms
step:350/2285 train_time:20997ms step_avg:59.99ms
step:351/2285 train_time:21057ms step_avg:59.99ms
step:352/2285 train_time:21115ms step_avg:59.99ms
step:353/2285 train_time:21176ms step_avg:59.99ms
step:354/2285 train_time:21234ms step_avg:59.98ms
step:355/2285 train_time:21295ms step_avg:59.99ms
step:356/2285 train_time:21354ms step_avg:59.98ms
step:357/2285 train_time:21416ms step_avg:59.99ms
step:358/2285 train_time:21475ms step_avg:59.99ms
step:359/2285 train_time:21536ms step_avg:59.99ms
step:360/2285 train_time:21595ms step_avg:59.99ms
step:361/2285 train_time:21656ms step_avg:59.99ms
step:362/2285 train_time:21714ms step_avg:59.98ms
step:363/2285 train_time:21775ms step_avg:59.99ms
step:364/2285 train_time:21833ms step_avg:59.98ms
step:365/2285 train_time:21894ms step_avg:59.98ms
step:366/2285 train_time:21953ms step_avg:59.98ms
step:367/2285 train_time:22013ms step_avg:59.98ms
step:368/2285 train_time:22072ms step_avg:59.98ms
step:369/2285 train_time:22132ms step_avg:59.98ms
step:370/2285 train_time:22191ms step_avg:59.98ms
step:371/2285 train_time:22252ms step_avg:59.98ms
step:372/2285 train_time:22310ms step_avg:59.97ms
step:373/2285 train_time:22372ms step_avg:59.98ms
step:374/2285 train_time:22431ms step_avg:59.97ms
step:375/2285 train_time:22492ms step_avg:59.98ms
step:376/2285 train_time:22551ms step_avg:59.98ms
step:377/2285 train_time:22612ms step_avg:59.98ms
step:378/2285 train_time:22671ms step_avg:59.98ms
step:379/2285 train_time:22732ms step_avg:59.98ms
step:380/2285 train_time:22791ms step_avg:59.98ms
step:381/2285 train_time:22852ms step_avg:59.98ms
step:382/2285 train_time:22910ms step_avg:59.97ms
step:383/2285 train_time:22971ms step_avg:59.98ms
step:384/2285 train_time:23030ms step_avg:59.97ms
step:385/2285 train_time:23091ms step_avg:59.98ms
step:386/2285 train_time:23150ms step_avg:59.97ms
step:387/2285 train_time:23211ms step_avg:59.98ms
step:388/2285 train_time:23270ms step_avg:59.97ms
step:389/2285 train_time:23332ms step_avg:59.98ms
step:390/2285 train_time:23391ms step_avg:59.98ms
step:391/2285 train_time:23453ms step_avg:59.98ms
step:392/2285 train_time:23512ms step_avg:59.98ms
step:393/2285 train_time:23574ms step_avg:59.98ms
step:394/2285 train_time:23633ms step_avg:59.98ms
step:395/2285 train_time:23694ms step_avg:59.99ms
step:396/2285 train_time:23754ms step_avg:59.98ms
step:397/2285 train_time:23815ms step_avg:59.99ms
step:398/2285 train_time:23874ms step_avg:59.98ms
step:399/2285 train_time:23935ms step_avg:59.99ms
step:400/2285 train_time:23994ms step_avg:59.99ms
step:401/2285 train_time:24055ms step_avg:59.99ms
step:402/2285 train_time:24114ms step_avg:59.99ms
step:403/2285 train_time:24176ms step_avg:59.99ms
step:404/2285 train_time:24235ms step_avg:59.99ms
step:405/2285 train_time:24298ms step_avg:59.99ms
step:406/2285 train_time:24355ms step_avg:59.99ms
step:407/2285 train_time:24416ms step_avg:59.99ms
step:408/2285 train_time:24475ms step_avg:59.99ms
step:409/2285 train_time:24537ms step_avg:59.99ms
step:410/2285 train_time:24596ms step_avg:59.99ms
step:411/2285 train_time:24657ms step_avg:59.99ms
step:412/2285 train_time:24716ms step_avg:59.99ms
step:413/2285 train_time:24778ms step_avg:59.99ms
step:414/2285 train_time:24837ms step_avg:59.99ms
step:415/2285 train_time:24898ms step_avg:59.99ms
step:416/2285 train_time:24956ms step_avg:59.99ms
step:417/2285 train_time:25018ms step_avg:59.99ms
step:418/2285 train_time:25077ms step_avg:59.99ms
step:419/2285 train_time:25138ms step_avg:60.00ms
step:420/2285 train_time:25197ms step_avg:59.99ms
step:421/2285 train_time:25259ms step_avg:60.00ms
step:422/2285 train_time:25317ms step_avg:59.99ms
step:423/2285 train_time:25379ms step_avg:60.00ms
step:424/2285 train_time:25437ms step_avg:59.99ms
step:425/2285 train_time:25499ms step_avg:60.00ms
step:426/2285 train_time:25558ms step_avg:60.00ms
step:427/2285 train_time:25620ms step_avg:60.00ms
step:428/2285 train_time:25679ms step_avg:60.00ms
step:429/2285 train_time:25740ms step_avg:60.00ms
step:430/2285 train_time:25800ms step_avg:60.00ms
step:431/2285 train_time:25860ms step_avg:60.00ms
step:432/2285 train_time:25919ms step_avg:60.00ms
step:433/2285 train_time:25980ms step_avg:60.00ms
step:434/2285 train_time:26039ms step_avg:60.00ms
step:435/2285 train_time:26100ms step_avg:60.00ms
step:436/2285 train_time:26159ms step_avg:60.00ms
step:437/2285 train_time:26220ms step_avg:60.00ms
step:438/2285 train_time:26279ms step_avg:60.00ms
step:439/2285 train_time:26340ms step_avg:60.00ms
step:440/2285 train_time:26399ms step_avg:60.00ms
step:441/2285 train_time:26460ms step_avg:60.00ms
step:442/2285 train_time:26519ms step_avg:60.00ms
step:443/2285 train_time:26580ms step_avg:60.00ms
step:444/2285 train_time:26640ms step_avg:60.00ms
step:445/2285 train_time:26701ms step_avg:60.00ms
step:446/2285 train_time:26760ms step_avg:60.00ms
step:447/2285 train_time:26822ms step_avg:60.00ms
step:448/2285 train_time:26880ms step_avg:60.00ms
step:449/2285 train_time:26941ms step_avg:60.00ms
step:450/2285 train_time:27000ms step_avg:60.00ms
step:451/2285 train_time:27062ms step_avg:60.00ms
step:452/2285 train_time:27121ms step_avg:60.00ms
step:453/2285 train_time:27182ms step_avg:60.00ms
step:454/2285 train_time:27241ms step_avg:60.00ms
step:455/2285 train_time:27302ms step_avg:60.00ms
step:456/2285 train_time:27361ms step_avg:60.00ms
step:457/2285 train_time:27423ms step_avg:60.01ms
step:458/2285 train_time:27481ms step_avg:60.00ms
step:459/2285 train_time:27542ms step_avg:60.01ms
step:460/2285 train_time:27602ms step_avg:60.00ms
step:461/2285 train_time:27663ms step_avg:60.01ms
step:462/2285 train_time:27722ms step_avg:60.00ms
step:463/2285 train_time:27782ms step_avg:60.01ms
step:464/2285 train_time:27841ms step_avg:60.00ms
step:465/2285 train_time:27902ms step_avg:60.00ms
step:466/2285 train_time:27961ms step_avg:60.00ms
step:467/2285 train_time:28022ms step_avg:60.00ms
step:468/2285 train_time:28081ms step_avg:60.00ms
step:469/2285 train_time:28142ms step_avg:60.00ms
step:470/2285 train_time:28201ms step_avg:60.00ms
step:471/2285 train_time:28263ms step_avg:60.01ms
step:472/2285 train_time:28322ms step_avg:60.00ms
step:473/2285 train_time:28383ms step_avg:60.01ms
step:474/2285 train_time:28442ms step_avg:60.00ms
step:475/2285 train_time:28503ms step_avg:60.01ms
step:476/2285 train_time:28562ms step_avg:60.01ms
step:477/2285 train_time:28624ms step_avg:60.01ms
step:478/2285 train_time:28683ms step_avg:60.01ms
step:479/2285 train_time:28744ms step_avg:60.01ms
step:480/2285 train_time:28803ms step_avg:60.01ms
step:481/2285 train_time:28864ms step_avg:60.01ms
step:482/2285 train_time:28922ms step_avg:60.01ms
step:483/2285 train_time:28983ms step_avg:60.01ms
step:484/2285 train_time:29042ms step_avg:60.00ms
step:485/2285 train_time:29104ms step_avg:60.01ms
step:486/2285 train_time:29163ms step_avg:60.01ms
step:487/2285 train_time:29224ms step_avg:60.01ms
step:488/2285 train_time:29282ms step_avg:60.00ms
step:489/2285 train_time:29344ms step_avg:60.01ms
step:490/2285 train_time:29403ms step_avg:60.01ms
step:491/2285 train_time:29464ms step_avg:60.01ms
step:492/2285 train_time:29523ms step_avg:60.01ms
step:493/2285 train_time:29584ms step_avg:60.01ms
step:494/2285 train_time:29644ms step_avg:60.01ms
step:495/2285 train_time:29705ms step_avg:60.01ms
step:496/2285 train_time:29764ms step_avg:60.01ms
step:497/2285 train_time:29825ms step_avg:60.01ms
step:498/2285 train_time:29884ms step_avg:60.01ms
step:499/2285 train_time:29945ms step_avg:60.01ms
step:500/2285 train_time:30004ms step_avg:60.01ms
step:500/2285 val_loss:3.7835 train_time:30067ms step_avg:60.13ms
step:501/2285 train_time:30085ms step_avg:60.05ms
step:502/2285 train_time:30127ms step_avg:60.01ms
step:503/2285 train_time:30189ms step_avg:60.02ms
step:504/2285 train_time:30251ms step_avg:60.02ms
step:505/2285 train_time:30314ms step_avg:60.03ms
step:506/2285 train_time:30374ms step_avg:60.03ms
step:507/2285 train_time:30435ms step_avg:60.03ms
step:508/2285 train_time:30494ms step_avg:60.03ms
step:509/2285 train_time:30555ms step_avg:60.03ms
step:510/2285 train_time:30613ms step_avg:60.03ms
step:511/2285 train_time:30674ms step_avg:60.03ms
step:512/2285 train_time:30733ms step_avg:60.02ms
step:513/2285 train_time:30794ms step_avg:60.03ms
step:514/2285 train_time:30853ms step_avg:60.03ms
step:515/2285 train_time:30914ms step_avg:60.03ms
step:516/2285 train_time:30974ms step_avg:60.03ms
step:517/2285 train_time:31038ms step_avg:60.04ms
step:518/2285 train_time:31098ms step_avg:60.03ms
step:519/2285 train_time:31160ms step_avg:60.04ms
step:520/2285 train_time:31220ms step_avg:60.04ms
step:521/2285 train_time:31281ms step_avg:60.04ms
step:522/2285 train_time:31340ms step_avg:60.04ms
step:523/2285 train_time:31402ms step_avg:60.04ms
step:524/2285 train_time:31462ms step_avg:60.04ms
step:525/2285 train_time:31523ms step_avg:60.04ms
step:526/2285 train_time:31581ms step_avg:60.04ms
step:527/2285 train_time:31643ms step_avg:60.04ms
step:528/2285 train_time:31702ms step_avg:60.04ms
step:529/2285 train_time:31764ms step_avg:60.05ms
step:530/2285 train_time:31823ms step_avg:60.04ms
step:531/2285 train_time:31885ms step_avg:60.05ms
step:532/2285 train_time:31944ms step_avg:60.05ms
step:533/2285 train_time:32005ms step_avg:60.05ms
step:534/2285 train_time:32065ms step_avg:60.05ms
step:535/2285 train_time:32127ms step_avg:60.05ms
step:536/2285 train_time:32187ms step_avg:60.05ms
step:537/2285 train_time:32249ms step_avg:60.05ms
step:538/2285 train_time:32309ms step_avg:60.05ms
step:539/2285 train_time:32371ms step_avg:60.06ms
step:540/2285 train_time:32430ms step_avg:60.06ms
step:541/2285 train_time:32492ms step_avg:60.06ms
step:542/2285 train_time:32551ms step_avg:60.06ms
step:543/2285 train_time:32613ms step_avg:60.06ms
step:544/2285 train_time:32672ms step_avg:60.06ms
step:545/2285 train_time:32733ms step_avg:60.06ms
step:546/2285 train_time:32792ms step_avg:60.06ms
step:547/2285 train_time:32854ms step_avg:60.06ms
step:548/2285 train_time:32913ms step_avg:60.06ms
step:549/2285 train_time:32974ms step_avg:60.06ms
step:550/2285 train_time:33033ms step_avg:60.06ms
step:551/2285 train_time:33095ms step_avg:60.06ms
step:552/2285 train_time:33154ms step_avg:60.06ms
step:553/2285 train_time:33216ms step_avg:60.06ms
step:554/2285 train_time:33275ms step_avg:60.06ms
step:555/2285 train_time:33337ms step_avg:60.07ms
step:556/2285 train_time:33395ms step_avg:60.06ms
step:557/2285 train_time:33457ms step_avg:60.07ms
step:558/2285 train_time:33516ms step_avg:60.06ms
step:559/2285 train_time:33577ms step_avg:60.07ms
step:560/2285 train_time:33636ms step_avg:60.06ms
step:561/2285 train_time:33697ms step_avg:60.07ms
step:562/2285 train_time:33756ms step_avg:60.06ms
step:563/2285 train_time:33817ms step_avg:60.07ms
step:564/2285 train_time:33876ms step_avg:60.06ms
step:565/2285 train_time:33938ms step_avg:60.07ms
step:566/2285 train_time:33997ms step_avg:60.07ms
step:567/2285 train_time:34058ms step_avg:60.07ms
step:568/2285 train_time:34117ms step_avg:60.07ms
step:569/2285 train_time:34179ms step_avg:60.07ms
step:570/2285 train_time:34238ms step_avg:60.07ms
step:571/2285 train_time:34300ms step_avg:60.07ms
step:572/2285 train_time:34359ms step_avg:60.07ms
step:573/2285 train_time:34420ms step_avg:60.07ms
step:574/2285 train_time:34479ms step_avg:60.07ms
step:575/2285 train_time:34540ms step_avg:60.07ms
step:576/2285 train_time:34599ms step_avg:60.07ms
step:577/2285 train_time:34661ms step_avg:60.07ms
step:578/2285 train_time:34720ms step_avg:60.07ms
step:579/2285 train_time:34782ms step_avg:60.07ms
step:580/2285 train_time:34841ms step_avg:60.07ms
step:581/2285 train_time:34903ms step_avg:60.07ms
step:582/2285 train_time:34962ms step_avg:60.07ms
step:583/2285 train_time:35023ms step_avg:60.07ms
step:584/2285 train_time:35082ms step_avg:60.07ms
step:585/2285 train_time:35144ms step_avg:60.07ms
step:586/2285 train_time:35203ms step_avg:60.07ms
step:587/2285 train_time:35265ms step_avg:60.08ms
step:588/2285 train_time:35324ms step_avg:60.08ms
step:589/2285 train_time:35386ms step_avg:60.08ms
step:590/2285 train_time:35446ms step_avg:60.08ms
step:591/2285 train_time:35508ms step_avg:60.08ms
step:592/2285 train_time:35567ms step_avg:60.08ms
step:593/2285 train_time:35628ms step_avg:60.08ms
step:594/2285 train_time:35687ms step_avg:60.08ms
step:595/2285 train_time:35749ms step_avg:60.08ms
step:596/2285 train_time:35808ms step_avg:60.08ms
step:597/2285 train_time:35870ms step_avg:60.08ms
step:598/2285 train_time:35929ms step_avg:60.08ms
step:599/2285 train_time:35990ms step_avg:60.08ms
step:600/2285 train_time:36049ms step_avg:60.08ms
step:601/2285 train_time:36111ms step_avg:60.08ms
step:602/2285 train_time:36170ms step_avg:60.08ms
step:603/2285 train_time:36232ms step_avg:60.09ms
step:604/2285 train_time:36291ms step_avg:60.08ms
step:605/2285 train_time:36352ms step_avg:60.09ms
step:606/2285 train_time:36412ms step_avg:60.08ms
step:607/2285 train_time:36473ms step_avg:60.09ms
step:608/2285 train_time:36532ms step_avg:60.09ms
step:609/2285 train_time:36594ms step_avg:60.09ms
step:610/2285 train_time:36653ms step_avg:60.09ms
step:611/2285 train_time:36715ms step_avg:60.09ms
step:612/2285 train_time:36774ms step_avg:60.09ms
step:613/2285 train_time:36835ms step_avg:60.09ms
step:614/2285 train_time:36894ms step_avg:60.09ms
step:615/2285 train_time:36955ms step_avg:60.09ms
step:616/2285 train_time:37014ms step_avg:60.09ms
step:617/2285 train_time:37075ms step_avg:60.09ms
step:618/2285 train_time:37134ms step_avg:60.09ms
step:619/2285 train_time:37195ms step_avg:60.09ms
step:620/2285 train_time:37254ms step_avg:60.09ms
step:621/2285 train_time:37316ms step_avg:60.09ms
step:622/2285 train_time:37375ms step_avg:60.09ms
step:623/2285 train_time:37436ms step_avg:60.09ms
step:624/2285 train_time:37496ms step_avg:60.09ms
step:625/2285 train_time:37558ms step_avg:60.09ms
step:626/2285 train_time:37616ms step_avg:60.09ms
step:627/2285 train_time:37678ms step_avg:60.09ms
step:628/2285 train_time:37737ms step_avg:60.09ms
step:629/2285 train_time:37798ms step_avg:60.09ms
step:630/2285 train_time:37858ms step_avg:60.09ms
step:631/2285 train_time:37918ms step_avg:60.09ms
step:632/2285 train_time:37976ms step_avg:60.09ms
step:633/2285 train_time:38038ms step_avg:60.09ms
step:634/2285 train_time:38096ms step_avg:60.09ms
step:635/2285 train_time:38158ms step_avg:60.09ms
step:636/2285 train_time:38217ms step_avg:60.09ms
step:637/2285 train_time:38278ms step_avg:60.09ms
step:638/2285 train_time:38337ms step_avg:60.09ms
step:639/2285 train_time:38399ms step_avg:60.09ms
step:640/2285 train_time:38458ms step_avg:60.09ms
step:641/2285 train_time:38519ms step_avg:60.09ms
step:642/2285 train_time:38578ms step_avg:60.09ms
step:643/2285 train_time:38639ms step_avg:60.09ms
step:644/2285 train_time:38698ms step_avg:60.09ms
step:645/2285 train_time:38759ms step_avg:60.09ms
step:646/2285 train_time:38818ms step_avg:60.09ms
step:647/2285 train_time:38879ms step_avg:60.09ms
step:648/2285 train_time:38938ms step_avg:60.09ms
step:649/2285 train_time:38999ms step_avg:60.09ms
step:650/2285 train_time:39058ms step_avg:60.09ms
step:651/2285 train_time:39120ms step_avg:60.09ms
step:652/2285 train_time:39179ms step_avg:60.09ms
step:653/2285 train_time:39240ms step_avg:60.09ms
step:654/2285 train_time:39299ms step_avg:60.09ms
step:655/2285 train_time:39360ms step_avg:60.09ms
step:656/2285 train_time:39420ms step_avg:60.09ms
step:657/2285 train_time:39481ms step_avg:60.09ms
step:658/2285 train_time:39539ms step_avg:60.09ms
step:659/2285 train_time:39601ms step_avg:60.09ms
step:660/2285 train_time:39660ms step_avg:60.09ms
step:661/2285 train_time:39722ms step_avg:60.09ms
step:662/2285 train_time:39781ms step_avg:60.09ms
step:663/2285 train_time:39842ms step_avg:60.09ms
step:664/2285 train_time:39902ms step_avg:60.09ms
step:665/2285 train_time:39963ms step_avg:60.09ms
step:666/2285 train_time:40022ms step_avg:60.09ms
step:667/2285 train_time:40083ms step_avg:60.09ms
step:668/2285 train_time:40142ms step_avg:60.09ms
step:669/2285 train_time:40203ms step_avg:60.09ms
step:670/2285 train_time:40263ms step_avg:60.09ms
step:671/2285 train_time:40325ms step_avg:60.10ms
step:672/2285 train_time:40384ms step_avg:60.10ms
step:673/2285 train_time:40445ms step_avg:60.10ms
step:674/2285 train_time:40505ms step_avg:60.10ms
step:675/2285 train_time:40567ms step_avg:60.10ms
step:676/2285 train_time:40625ms step_avg:60.10ms
step:677/2285 train_time:40687ms step_avg:60.10ms
step:678/2285 train_time:40747ms step_avg:60.10ms
step:679/2285 train_time:40809ms step_avg:60.10ms
step:680/2285 train_time:40868ms step_avg:60.10ms
step:681/2285 train_time:40930ms step_avg:60.10ms
step:682/2285 train_time:40989ms step_avg:60.10ms
step:683/2285 train_time:41051ms step_avg:60.10ms
step:684/2285 train_time:41111ms step_avg:60.10ms
step:685/2285 train_time:41173ms step_avg:60.11ms
step:686/2285 train_time:41233ms step_avg:60.11ms
step:687/2285 train_time:41294ms step_avg:60.11ms
step:688/2285 train_time:41353ms step_avg:60.11ms
step:689/2285 train_time:41416ms step_avg:60.11ms
step:690/2285 train_time:41475ms step_avg:60.11ms
step:691/2285 train_time:41536ms step_avg:60.11ms
step:692/2285 train_time:41595ms step_avg:60.11ms
step:693/2285 train_time:41656ms step_avg:60.11ms
step:694/2285 train_time:41715ms step_avg:60.11ms
step:695/2285 train_time:41777ms step_avg:60.11ms
step:696/2285 train_time:41835ms step_avg:60.11ms
step:697/2285 train_time:41897ms step_avg:60.11ms
step:698/2285 train_time:41956ms step_avg:60.11ms
step:699/2285 train_time:42018ms step_avg:60.11ms
step:700/2285 train_time:42076ms step_avg:60.11ms
step:701/2285 train_time:42138ms step_avg:60.11ms
step:702/2285 train_time:42197ms step_avg:60.11ms
step:703/2285 train_time:42258ms step_avg:60.11ms
step:704/2285 train_time:42317ms step_avg:60.11ms
step:705/2285 train_time:42379ms step_avg:60.11ms
step:706/2285 train_time:42437ms step_avg:60.11ms
step:707/2285 train_time:42499ms step_avg:60.11ms
step:708/2285 train_time:42558ms step_avg:60.11ms
step:709/2285 train_time:42619ms step_avg:60.11ms
step:710/2285 train_time:42678ms step_avg:60.11ms
step:711/2285 train_time:42739ms step_avg:60.11ms
step:712/2285 train_time:42798ms step_avg:60.11ms
step:713/2285 train_time:42860ms step_avg:60.11ms
step:714/2285 train_time:42919ms step_avg:60.11ms
step:715/2285 train_time:42981ms step_avg:60.11ms
step:716/2285 train_time:43040ms step_avg:60.11ms
step:717/2285 train_time:43101ms step_avg:60.11ms
step:718/2285 train_time:43161ms step_avg:60.11ms
step:719/2285 train_time:43223ms step_avg:60.12ms
step:720/2285 train_time:43282ms step_avg:60.11ms
step:721/2285 train_time:43343ms step_avg:60.12ms
step:722/2285 train_time:43402ms step_avg:60.11ms
step:723/2285 train_time:43464ms step_avg:60.12ms
step:724/2285 train_time:43523ms step_avg:60.11ms
step:725/2285 train_time:43584ms step_avg:60.12ms
step:726/2285 train_time:43643ms step_avg:60.11ms
step:727/2285 train_time:43704ms step_avg:60.12ms
step:728/2285 train_time:43765ms step_avg:60.12ms
step:729/2285 train_time:43826ms step_avg:60.12ms
step:730/2285 train_time:43885ms step_avg:60.12ms
step:731/2285 train_time:43947ms step_avg:60.12ms
step:732/2285 train_time:44006ms step_avg:60.12ms
step:733/2285 train_time:44067ms step_avg:60.12ms
step:734/2285 train_time:44126ms step_avg:60.12ms
step:735/2285 train_time:44189ms step_avg:60.12ms
step:736/2285 train_time:44248ms step_avg:60.12ms
step:737/2285 train_time:44309ms step_avg:60.12ms
step:738/2285 train_time:44369ms step_avg:60.12ms
step:739/2285 train_time:44430ms step_avg:60.12ms
step:740/2285 train_time:44490ms step_avg:60.12ms
step:741/2285 train_time:44552ms step_avg:60.12ms
step:742/2285 train_time:44611ms step_avg:60.12ms
step:743/2285 train_time:44672ms step_avg:60.12ms
step:744/2285 train_time:44731ms step_avg:60.12ms
step:745/2285 train_time:44793ms step_avg:60.12ms
step:746/2285 train_time:44852ms step_avg:60.12ms
step:747/2285 train_time:44914ms step_avg:60.13ms
step:748/2285 train_time:44973ms step_avg:60.12ms
step:749/2285 train_time:45035ms step_avg:60.13ms
step:750/2285 train_time:45095ms step_avg:60.13ms
step:750/2285 val_loss:3.6604 train_time:45158ms step_avg:60.21ms
step:751/2285 train_time:45177ms step_avg:60.16ms
step:752/2285 train_time:45218ms step_avg:60.13ms
step:753/2285 train_time:45282ms step_avg:60.14ms
step:754/2285 train_time:45343ms step_avg:60.14ms
step:755/2285 train_time:45405ms step_avg:60.14ms
step:756/2285 train_time:45464ms step_avg:60.14ms
step:757/2285 train_time:45525ms step_avg:60.14ms
step:758/2285 train_time:45583ms step_avg:60.14ms
step:759/2285 train_time:45644ms step_avg:60.14ms
step:760/2285 train_time:45703ms step_avg:60.14ms
step:761/2285 train_time:45764ms step_avg:60.14ms
step:762/2285 train_time:45823ms step_avg:60.13ms
step:763/2285 train_time:45884ms step_avg:60.14ms
step:764/2285 train_time:45943ms step_avg:60.13ms
step:765/2285 train_time:46004ms step_avg:60.14ms
step:766/2285 train_time:46064ms step_avg:60.14ms
step:767/2285 train_time:46127ms step_avg:60.14ms
step:768/2285 train_time:46188ms step_avg:60.14ms
step:769/2285 train_time:46251ms step_avg:60.14ms
step:770/2285 train_time:46311ms step_avg:60.14ms
step:771/2285 train_time:46374ms step_avg:60.15ms
step:772/2285 train_time:46433ms step_avg:60.15ms
step:773/2285 train_time:46494ms step_avg:60.15ms
step:774/2285 train_time:46554ms step_avg:60.15ms
step:775/2285 train_time:46615ms step_avg:60.15ms
step:776/2285 train_time:46674ms step_avg:60.15ms
step:777/2285 train_time:46736ms step_avg:60.15ms
step:778/2285 train_time:46795ms step_avg:60.15ms
step:779/2285 train_time:46857ms step_avg:60.15ms
step:780/2285 train_time:46916ms step_avg:60.15ms
step:781/2285 train_time:46978ms step_avg:60.15ms
step:782/2285 train_time:47037ms step_avg:60.15ms
step:783/2285 train_time:47099ms step_avg:60.15ms
step:784/2285 train_time:47160ms step_avg:60.15ms
step:785/2285 train_time:47222ms step_avg:60.16ms
step:786/2285 train_time:47282ms step_avg:60.16ms
step:787/2285 train_time:47344ms step_avg:60.16ms
step:788/2285 train_time:47405ms step_avg:60.16ms
step:789/2285 train_time:47467ms step_avg:60.16ms
step:790/2285 train_time:47526ms step_avg:60.16ms
step:791/2285 train_time:47588ms step_avg:60.16ms
step:792/2285 train_time:47647ms step_avg:60.16ms
step:793/2285 train_time:47709ms step_avg:60.16ms
step:794/2285 train_time:47768ms step_avg:60.16ms
step:795/2285 train_time:47830ms step_avg:60.16ms
step:796/2285 train_time:47889ms step_avg:60.16ms
step:797/2285 train_time:47951ms step_avg:60.16ms
step:798/2285 train_time:48010ms step_avg:60.16ms
step:799/2285 train_time:48072ms step_avg:60.17ms
step:800/2285 train_time:48132ms step_avg:60.17ms
step:801/2285 train_time:48194ms step_avg:60.17ms
step:802/2285 train_time:48253ms step_avg:60.17ms
step:803/2285 train_time:48315ms step_avg:60.17ms
step:804/2285 train_time:48374ms step_avg:60.17ms
step:805/2285 train_time:48436ms step_avg:60.17ms
step:806/2285 train_time:48496ms step_avg:60.17ms
step:807/2285 train_time:48557ms step_avg:60.17ms
step:808/2285 train_time:48617ms step_avg:60.17ms
step:809/2285 train_time:48679ms step_avg:60.17ms
step:810/2285 train_time:48738ms step_avg:60.17ms
step:811/2285 train_time:48800ms step_avg:60.17ms
step:812/2285 train_time:48859ms step_avg:60.17ms
step:813/2285 train_time:48920ms step_avg:60.17ms
step:814/2285 train_time:48980ms step_avg:60.17ms
step:815/2285 train_time:49042ms step_avg:60.17ms
step:816/2285 train_time:49103ms step_avg:60.17ms
step:817/2285 train_time:49165ms step_avg:60.18ms
step:818/2285 train_time:49225ms step_avg:60.18ms
step:819/2285 train_time:49287ms step_avg:60.18ms
step:820/2285 train_time:49347ms step_avg:60.18ms
step:821/2285 train_time:49409ms step_avg:60.18ms
step:822/2285 train_time:49469ms step_avg:60.18ms
step:823/2285 train_time:49531ms step_avg:60.18ms
step:824/2285 train_time:49590ms step_avg:60.18ms
step:825/2285 train_time:49652ms step_avg:60.18ms
step:826/2285 train_time:49712ms step_avg:60.18ms
step:827/2285 train_time:49774ms step_avg:60.19ms
step:828/2285 train_time:49833ms step_avg:60.18ms
step:829/2285 train_time:49894ms step_avg:60.19ms
step:830/2285 train_time:49954ms step_avg:60.19ms
step:831/2285 train_time:50016ms step_avg:60.19ms
step:832/2285 train_time:50075ms step_avg:60.19ms
step:833/2285 train_time:50137ms step_avg:60.19ms
step:834/2285 train_time:50197ms step_avg:60.19ms
step:835/2285 train_time:50258ms step_avg:60.19ms
step:836/2285 train_time:50318ms step_avg:60.19ms
step:837/2285 train_time:50380ms step_avg:60.19ms
step:838/2285 train_time:50439ms step_avg:60.19ms
step:839/2285 train_time:50501ms step_avg:60.19ms
step:840/2285 train_time:50561ms step_avg:60.19ms
step:841/2285 train_time:50623ms step_avg:60.19ms
step:842/2285 train_time:50683ms step_avg:60.19ms
step:843/2285 train_time:50745ms step_avg:60.20ms
step:844/2285 train_time:50804ms step_avg:60.19ms
step:845/2285 train_time:50866ms step_avg:60.20ms
step:846/2285 train_time:50927ms step_avg:60.20ms
step:847/2285 train_time:50988ms step_avg:60.20ms
step:848/2285 train_time:51047ms step_avg:60.20ms
step:849/2285 train_time:51110ms step_avg:60.20ms
step:850/2285 train_time:51169ms step_avg:60.20ms
step:851/2285 train_time:51231ms step_avg:60.20ms
step:852/2285 train_time:51291ms step_avg:60.20ms
step:853/2285 train_time:51352ms step_avg:60.20ms
step:854/2285 train_time:51412ms step_avg:60.20ms
step:855/2285 train_time:51473ms step_avg:60.20ms
step:856/2285 train_time:51533ms step_avg:60.20ms
step:857/2285 train_time:51595ms step_avg:60.20ms
step:858/2285 train_time:51654ms step_avg:60.20ms
step:859/2285 train_time:51716ms step_avg:60.20ms
step:860/2285 train_time:51775ms step_avg:60.20ms
step:861/2285 train_time:51837ms step_avg:60.21ms
step:862/2285 train_time:51897ms step_avg:60.20ms
step:863/2285 train_time:51958ms step_avg:60.21ms
step:864/2285 train_time:52018ms step_avg:60.21ms
step:865/2285 train_time:52079ms step_avg:60.21ms
step:866/2285 train_time:52139ms step_avg:60.21ms
step:867/2285 train_time:52201ms step_avg:60.21ms
step:868/2285 train_time:52260ms step_avg:60.21ms
step:869/2285 train_time:52322ms step_avg:60.21ms
step:870/2285 train_time:52382ms step_avg:60.21ms
step:871/2285 train_time:52444ms step_avg:60.21ms
step:872/2285 train_time:52505ms step_avg:60.21ms
step:873/2285 train_time:52567ms step_avg:60.21ms
step:874/2285 train_time:52627ms step_avg:60.21ms
step:875/2285 train_time:52689ms step_avg:60.22ms
step:876/2285 train_time:52748ms step_avg:60.21ms
step:877/2285 train_time:52810ms step_avg:60.22ms
step:878/2285 train_time:52869ms step_avg:60.22ms
step:879/2285 train_time:52931ms step_avg:60.22ms
step:880/2285 train_time:52991ms step_avg:60.22ms
step:881/2285 train_time:53054ms step_avg:60.22ms
step:882/2285 train_time:53113ms step_avg:60.22ms
step:883/2285 train_time:53175ms step_avg:60.22ms
step:884/2285 train_time:53234ms step_avg:60.22ms
step:885/2285 train_time:53296ms step_avg:60.22ms
step:886/2285 train_time:53356ms step_avg:60.22ms
step:887/2285 train_time:53417ms step_avg:60.22ms
step:888/2285 train_time:53477ms step_avg:60.22ms
step:889/2285 train_time:53539ms step_avg:60.22ms
step:890/2285 train_time:53598ms step_avg:60.22ms
step:891/2285 train_time:53660ms step_avg:60.22ms
step:892/2285 train_time:53720ms step_avg:60.22ms
step:893/2285 train_time:53781ms step_avg:60.23ms
step:894/2285 train_time:53841ms step_avg:60.22ms
step:895/2285 train_time:53904ms step_avg:60.23ms
step:896/2285 train_time:53963ms step_avg:60.23ms
step:897/2285 train_time:54025ms step_avg:60.23ms
step:898/2285 train_time:54085ms step_avg:60.23ms
step:899/2285 train_time:54147ms step_avg:60.23ms
step:900/2285 train_time:54207ms step_avg:60.23ms
step:901/2285 train_time:54269ms step_avg:60.23ms
step:902/2285 train_time:54329ms step_avg:60.23ms
step:903/2285 train_time:54390ms step_avg:60.23ms
step:904/2285 train_time:54450ms step_avg:60.23ms
step:905/2285 train_time:54511ms step_avg:60.23ms
step:906/2285 train_time:54571ms step_avg:60.23ms
step:907/2285 train_time:54632ms step_avg:60.23ms
step:908/2285 train_time:54692ms step_avg:60.23ms
step:909/2285 train_time:54753ms step_avg:60.23ms
step:910/2285 train_time:54813ms step_avg:60.23ms
step:911/2285 train_time:54875ms step_avg:60.24ms
step:912/2285 train_time:54935ms step_avg:60.24ms
step:913/2285 train_time:54997ms step_avg:60.24ms
step:914/2285 train_time:55056ms step_avg:60.24ms
step:915/2285 train_time:55118ms step_avg:60.24ms
step:916/2285 train_time:55178ms step_avg:60.24ms
step:917/2285 train_time:55240ms step_avg:60.24ms
step:918/2285 train_time:55299ms step_avg:60.24ms
step:919/2285 train_time:55361ms step_avg:60.24ms
step:920/2285 train_time:55421ms step_avg:60.24ms
step:921/2285 train_time:55483ms step_avg:60.24ms
step:922/2285 train_time:55542ms step_avg:60.24ms
-step:923/2285 train_time:55604ms step_avg:60.24ms -step:924/2285 train_time:55663ms step_avg:60.24ms -step:925/2285 train_time:55725ms step_avg:60.24ms -step:926/2285 train_time:55784ms step_avg:60.24ms -step:927/2285 train_time:55846ms step_avg:60.24ms -step:928/2285 train_time:55907ms step_avg:60.24ms -step:929/2285 train_time:55969ms step_avg:60.25ms -step:930/2285 train_time:56028ms step_avg:60.25ms -step:931/2285 train_time:56090ms step_avg:60.25ms -step:932/2285 train_time:56150ms step_avg:60.25ms -step:933/2285 train_time:56212ms step_avg:60.25ms -step:934/2285 train_time:56271ms step_avg:60.25ms -step:935/2285 train_time:56334ms step_avg:60.25ms -step:936/2285 train_time:56393ms step_avg:60.25ms -step:937/2285 train_time:56455ms step_avg:60.25ms -step:938/2285 train_time:56514ms step_avg:60.25ms -step:939/2285 train_time:56576ms step_avg:60.25ms -step:940/2285 train_time:56635ms step_avg:60.25ms -step:941/2285 train_time:56696ms step_avg:60.25ms -step:942/2285 train_time:56756ms step_avg:60.25ms -step:943/2285 train_time:56818ms step_avg:60.25ms -step:944/2285 train_time:56878ms step_avg:60.25ms -step:945/2285 train_time:56939ms step_avg:60.25ms -step:946/2285 train_time:56999ms step_avg:60.25ms -step:947/2285 train_time:57061ms step_avg:60.25ms -step:948/2285 train_time:57121ms step_avg:60.25ms -step:949/2285 train_time:57183ms step_avg:60.26ms -step:950/2285 train_time:57243ms step_avg:60.26ms -step:951/2285 train_time:57305ms step_avg:60.26ms -step:952/2285 train_time:57365ms step_avg:60.26ms -step:953/2285 train_time:57427ms step_avg:60.26ms -step:954/2285 train_time:57487ms step_avg:60.26ms -step:955/2285 train_time:57548ms step_avg:60.26ms -step:956/2285 train_time:57608ms step_avg:60.26ms -step:957/2285 train_time:57670ms step_avg:60.26ms -step:958/2285 train_time:57730ms step_avg:60.26ms -step:959/2285 train_time:57792ms step_avg:60.26ms -step:960/2285 train_time:57852ms step_avg:60.26ms -step:961/2285 train_time:57913ms step_avg:60.26ms -step:962/2285 train_time:57973ms step_avg:60.26ms -step:963/2285 train_time:58034ms step_avg:60.26ms -step:964/2285 train_time:58094ms step_avg:60.26ms -step:965/2285 train_time:58156ms step_avg:60.27ms -step:966/2285 train_time:58216ms step_avg:60.27ms -step:967/2285 train_time:58278ms step_avg:60.27ms -step:968/2285 train_time:58337ms step_avg:60.27ms -step:969/2285 train_time:58399ms step_avg:60.27ms -step:970/2285 train_time:58459ms step_avg:60.27ms -step:971/2285 train_time:58521ms step_avg:60.27ms -step:972/2285 train_time:58580ms step_avg:60.27ms -step:973/2285 train_time:58641ms step_avg:60.27ms -step:974/2285 train_time:58701ms step_avg:60.27ms -step:975/2285 train_time:58763ms step_avg:60.27ms -step:976/2285 train_time:58823ms step_avg:60.27ms -step:977/2285 train_time:58885ms step_avg:60.27ms -step:978/2285 train_time:58945ms step_avg:60.27ms -step:979/2285 train_time:59008ms step_avg:60.27ms -step:980/2285 train_time:59067ms step_avg:60.27ms -step:981/2285 train_time:59129ms step_avg:60.27ms -step:982/2285 train_time:59189ms step_avg:60.27ms -step:983/2285 train_time:59251ms step_avg:60.28ms -step:984/2285 train_time:59310ms step_avg:60.27ms -step:985/2285 train_time:59372ms step_avg:60.28ms -step:986/2285 train_time:59432ms step_avg:60.28ms -step:987/2285 train_time:59494ms step_avg:60.28ms -step:988/2285 train_time:59553ms step_avg:60.28ms -step:989/2285 train_time:59615ms step_avg:60.28ms -step:990/2285 train_time:59675ms step_avg:60.28ms -step:991/2285 train_time:59737ms step_avg:60.28ms -step:992/2285 train_time:59796ms 
step_avg:60.28ms -step:993/2285 train_time:59858ms step_avg:60.28ms -step:994/2285 train_time:59918ms step_avg:60.28ms -step:995/2285 train_time:59980ms step_avg:60.28ms -step:996/2285 train_time:60039ms step_avg:60.28ms -step:997/2285 train_time:60101ms step_avg:60.28ms -step:998/2285 train_time:60161ms step_avg:60.28ms -step:999/2285 train_time:60222ms step_avg:60.28ms -step:1000/2285 train_time:60281ms step_avg:60.28ms -step:1000/2285 val_loss:3.5649 train_time:60345ms step_avg:60.35ms -step:1001/2285 train_time:60364ms step_avg:60.30ms -step:1002/2285 train_time:60406ms step_avg:60.29ms -step:1003/2285 train_time:60468ms step_avg:60.29ms -step:1004/2285 train_time:60528ms step_avg:60.29ms -step:1005/2285 train_time:60593ms step_avg:60.29ms -step:1006/2285 train_time:60654ms step_avg:60.29ms -step:1007/2285 train_time:60715ms step_avg:60.29ms -step:1008/2285 train_time:60774ms step_avg:60.29ms -step:1009/2285 train_time:60835ms step_avg:60.29ms -step:1010/2285 train_time:60894ms step_avg:60.29ms -step:1011/2285 train_time:60955ms step_avg:60.29ms -step:1012/2285 train_time:61013ms step_avg:60.29ms -step:1013/2285 train_time:61074ms step_avg:60.29ms -step:1014/2285 train_time:61133ms step_avg:60.29ms -step:1015/2285 train_time:61194ms step_avg:60.29ms -step:1016/2285 train_time:61254ms step_avg:60.29ms -step:1017/2285 train_time:61317ms step_avg:60.29ms -step:1018/2285 train_time:61377ms step_avg:60.29ms -step:1019/2285 train_time:61440ms step_avg:60.29ms -step:1020/2285 train_time:61501ms step_avg:60.29ms -step:1021/2285 train_time:61563ms step_avg:60.30ms -step:1022/2285 train_time:61623ms step_avg:60.30ms -step:1023/2285 train_time:61685ms step_avg:60.30ms -step:1024/2285 train_time:61744ms step_avg:60.30ms -step:1025/2285 train_time:61806ms step_avg:60.30ms -step:1026/2285 train_time:61866ms step_avg:60.30ms -step:1027/2285 train_time:61927ms step_avg:60.30ms -step:1028/2285 train_time:61986ms step_avg:60.30ms -step:1029/2285 train_time:62048ms step_avg:60.30ms -step:1030/2285 train_time:62107ms step_avg:60.30ms -step:1031/2285 train_time:62168ms step_avg:60.30ms -step:1032/2285 train_time:62228ms step_avg:60.30ms -step:1033/2285 train_time:62290ms step_avg:60.30ms -step:1034/2285 train_time:62351ms step_avg:60.30ms -step:1035/2285 train_time:62413ms step_avg:60.30ms -step:1036/2285 train_time:62474ms step_avg:60.30ms -step:1037/2285 train_time:62536ms step_avg:60.30ms -step:1038/2285 train_time:62596ms step_avg:60.30ms -step:1039/2285 train_time:62658ms step_avg:60.31ms -step:1040/2285 train_time:62717ms step_avg:60.31ms -step:1041/2285 train_time:62780ms step_avg:60.31ms -step:1042/2285 train_time:62838ms step_avg:60.31ms -step:1043/2285 train_time:62900ms step_avg:60.31ms -step:1044/2285 train_time:62959ms step_avg:60.31ms -step:1045/2285 train_time:63021ms step_avg:60.31ms -step:1046/2285 train_time:63081ms step_avg:60.31ms -step:1047/2285 train_time:63142ms step_avg:60.31ms -step:1048/2285 train_time:63202ms step_avg:60.31ms -step:1049/2285 train_time:63264ms step_avg:60.31ms -step:1050/2285 train_time:63324ms step_avg:60.31ms -step:1051/2285 train_time:63386ms step_avg:60.31ms -step:1052/2285 train_time:63446ms step_avg:60.31ms -step:1053/2285 train_time:63508ms step_avg:60.31ms -step:1054/2285 train_time:63568ms step_avg:60.31ms -step:1055/2285 train_time:63630ms step_avg:60.31ms -step:1056/2285 train_time:63691ms step_avg:60.31ms -step:1057/2285 train_time:63754ms step_avg:60.32ms -step:1058/2285 train_time:63814ms step_avg:60.32ms -step:1059/2285 train_time:63875ms 
step_avg:60.32ms -step:1060/2285 train_time:63935ms step_avg:60.32ms -step:1061/2285 train_time:63996ms step_avg:60.32ms -step:1062/2285 train_time:64056ms step_avg:60.32ms -step:1063/2285 train_time:64117ms step_avg:60.32ms -step:1064/2285 train_time:64177ms step_avg:60.32ms -step:1065/2285 train_time:64239ms step_avg:60.32ms -step:1066/2285 train_time:64299ms step_avg:60.32ms -step:1067/2285 train_time:64361ms step_avg:60.32ms -step:1068/2285 train_time:64420ms step_avg:60.32ms -step:1069/2285 train_time:64482ms step_avg:60.32ms -step:1070/2285 train_time:64541ms step_avg:60.32ms -step:1071/2285 train_time:64603ms step_avg:60.32ms -step:1072/2285 train_time:64663ms step_avg:60.32ms -step:1073/2285 train_time:64726ms step_avg:60.32ms -step:1074/2285 train_time:64786ms step_avg:60.32ms -step:1075/2285 train_time:64848ms step_avg:60.32ms -step:1076/2285 train_time:64907ms step_avg:60.32ms -step:1077/2285 train_time:64969ms step_avg:60.32ms -step:1078/2285 train_time:65028ms step_avg:60.32ms -step:1079/2285 train_time:65091ms step_avg:60.32ms -step:1080/2285 train_time:65150ms step_avg:60.32ms -step:1081/2285 train_time:65212ms step_avg:60.33ms -step:1082/2285 train_time:65272ms step_avg:60.33ms -step:1083/2285 train_time:65334ms step_avg:60.33ms -step:1084/2285 train_time:65393ms step_avg:60.33ms -step:1085/2285 train_time:65456ms step_avg:60.33ms -step:1086/2285 train_time:65515ms step_avg:60.33ms -step:1087/2285 train_time:65577ms step_avg:60.33ms -step:1088/2285 train_time:65637ms step_avg:60.33ms -step:1089/2285 train_time:65699ms step_avg:60.33ms -step:1090/2285 train_time:65758ms step_avg:60.33ms -step:1091/2285 train_time:65821ms step_avg:60.33ms -step:1092/2285 train_time:65880ms step_avg:60.33ms -step:1093/2285 train_time:65942ms step_avg:60.33ms -step:1094/2285 train_time:66001ms step_avg:60.33ms -step:1095/2285 train_time:66063ms step_avg:60.33ms -step:1096/2285 train_time:66123ms step_avg:60.33ms -step:1097/2285 train_time:66184ms step_avg:60.33ms -step:1098/2285 train_time:66244ms step_avg:60.33ms -step:1099/2285 train_time:66305ms step_avg:60.33ms -step:1100/2285 train_time:66365ms step_avg:60.33ms -step:1101/2285 train_time:66427ms step_avg:60.33ms -step:1102/2285 train_time:66486ms step_avg:60.33ms -step:1103/2285 train_time:66548ms step_avg:60.33ms -step:1104/2285 train_time:66608ms step_avg:60.33ms -step:1105/2285 train_time:66672ms step_avg:60.34ms -step:1106/2285 train_time:66730ms step_avg:60.33ms -step:1107/2285 train_time:66792ms step_avg:60.34ms -step:1108/2285 train_time:66852ms step_avg:60.34ms -step:1109/2285 train_time:66914ms step_avg:60.34ms -step:1110/2285 train_time:66974ms step_avg:60.34ms -step:1111/2285 train_time:67035ms step_avg:60.34ms -step:1112/2285 train_time:67095ms step_avg:60.34ms -step:1113/2285 train_time:67157ms step_avg:60.34ms -step:1114/2285 train_time:67216ms step_avg:60.34ms -step:1115/2285 train_time:67277ms step_avg:60.34ms -step:1116/2285 train_time:67336ms step_avg:60.34ms -step:1117/2285 train_time:67398ms step_avg:60.34ms -step:1118/2285 train_time:67458ms step_avg:60.34ms -step:1119/2285 train_time:67520ms step_avg:60.34ms -step:1120/2285 train_time:67579ms step_avg:60.34ms -step:1121/2285 train_time:67642ms step_avg:60.34ms -step:1122/2285 train_time:67701ms step_avg:60.34ms -step:1123/2285 train_time:67763ms step_avg:60.34ms -step:1124/2285 train_time:67822ms step_avg:60.34ms -step:1125/2285 train_time:67884ms step_avg:60.34ms -step:1126/2285 train_time:67943ms step_avg:60.34ms -step:1127/2285 train_time:68005ms step_avg:60.34ms 
-step:1128/2285 train_time:68066ms step_avg:60.34ms -step:1129/2285 train_time:68128ms step_avg:60.34ms -step:1130/2285 train_time:68187ms step_avg:60.34ms -step:1131/2285 train_time:68249ms step_avg:60.34ms -step:1132/2285 train_time:68309ms step_avg:60.34ms -step:1133/2285 train_time:68371ms step_avg:60.34ms -step:1134/2285 train_time:68430ms step_avg:60.34ms -step:1135/2285 train_time:68492ms step_avg:60.35ms -step:1136/2285 train_time:68553ms step_avg:60.35ms -step:1137/2285 train_time:68614ms step_avg:60.35ms -step:1138/2285 train_time:68674ms step_avg:60.35ms -step:1139/2285 train_time:68736ms step_avg:60.35ms -step:1140/2285 train_time:68795ms step_avg:60.35ms -step:1141/2285 train_time:68857ms step_avg:60.35ms -step:1142/2285 train_time:68917ms step_avg:60.35ms -step:1143/2285 train_time:68980ms step_avg:60.35ms -step:1144/2285 train_time:69039ms step_avg:60.35ms -step:1145/2285 train_time:69101ms step_avg:60.35ms -step:1146/2285 train_time:69160ms step_avg:60.35ms -step:1147/2285 train_time:69222ms step_avg:60.35ms -step:1148/2285 train_time:69282ms step_avg:60.35ms -step:1149/2285 train_time:69345ms step_avg:60.35ms -step:1150/2285 train_time:69404ms step_avg:60.35ms -step:1151/2285 train_time:69467ms step_avg:60.35ms -step:1152/2285 train_time:69526ms step_avg:60.35ms -step:1153/2285 train_time:69588ms step_avg:60.35ms -step:1154/2285 train_time:69649ms step_avg:60.35ms -step:1155/2285 train_time:69711ms step_avg:60.36ms -step:1156/2285 train_time:69771ms step_avg:60.36ms -step:1157/2285 train_time:69834ms step_avg:60.36ms -step:1158/2285 train_time:69894ms step_avg:60.36ms -step:1159/2285 train_time:69956ms step_avg:60.36ms -step:1160/2285 train_time:70016ms step_avg:60.36ms -step:1161/2285 train_time:70078ms step_avg:60.36ms -step:1162/2285 train_time:70138ms step_avg:60.36ms -step:1163/2285 train_time:70200ms step_avg:60.36ms -step:1164/2285 train_time:70260ms step_avg:60.36ms -step:1165/2285 train_time:70322ms step_avg:60.36ms -step:1166/2285 train_time:70382ms step_avg:60.36ms -step:1167/2285 train_time:70444ms step_avg:60.36ms -step:1168/2285 train_time:70504ms step_avg:60.36ms -step:1169/2285 train_time:70567ms step_avg:60.37ms -step:1170/2285 train_time:70627ms step_avg:60.37ms -step:1171/2285 train_time:70689ms step_avg:60.37ms -step:1172/2285 train_time:70749ms step_avg:60.37ms -step:1173/2285 train_time:70811ms step_avg:60.37ms -step:1174/2285 train_time:70871ms step_avg:60.37ms -step:1175/2285 train_time:70934ms step_avg:60.37ms -step:1176/2285 train_time:70994ms step_avg:60.37ms -step:1177/2285 train_time:71056ms step_avg:60.37ms -step:1178/2285 train_time:71116ms step_avg:60.37ms -step:1179/2285 train_time:71178ms step_avg:60.37ms -step:1180/2285 train_time:71238ms step_avg:60.37ms -step:1181/2285 train_time:71300ms step_avg:60.37ms -step:1182/2285 train_time:71360ms step_avg:60.37ms -step:1183/2285 train_time:71422ms step_avg:60.37ms -step:1184/2285 train_time:71481ms step_avg:60.37ms -step:1185/2285 train_time:71543ms step_avg:60.37ms -step:1186/2285 train_time:71603ms step_avg:60.37ms -step:1187/2285 train_time:71666ms step_avg:60.38ms -step:1188/2285 train_time:71726ms step_avg:60.38ms -step:1189/2285 train_time:71789ms step_avg:60.38ms -step:1190/2285 train_time:71850ms step_avg:60.38ms -step:1191/2285 train_time:71913ms step_avg:60.38ms -step:1192/2285 train_time:71973ms step_avg:60.38ms -step:1193/2285 train_time:72035ms step_avg:60.38ms -step:1194/2285 train_time:72094ms step_avg:60.38ms -step:1195/2285 train_time:72156ms step_avg:60.38ms -step:1196/2285 
train_time:72216ms step_avg:60.38ms -step:1197/2285 train_time:72278ms step_avg:60.38ms -step:1198/2285 train_time:72338ms step_avg:60.38ms -step:1199/2285 train_time:72400ms step_avg:60.38ms -step:1200/2285 train_time:72460ms step_avg:60.38ms -step:1201/2285 train_time:72522ms step_avg:60.38ms -step:1202/2285 train_time:72583ms step_avg:60.38ms -step:1203/2285 train_time:72644ms step_avg:60.39ms -step:1204/2285 train_time:72703ms step_avg:60.38ms -step:1205/2285 train_time:72766ms step_avg:60.39ms -step:1206/2285 train_time:72826ms step_avg:60.39ms -step:1207/2285 train_time:72888ms step_avg:60.39ms -step:1208/2285 train_time:72948ms step_avg:60.39ms -step:1209/2285 train_time:73010ms step_avg:60.39ms -step:1210/2285 train_time:73070ms step_avg:60.39ms -step:1211/2285 train_time:73133ms step_avg:60.39ms -step:1212/2285 train_time:73193ms step_avg:60.39ms -step:1213/2285 train_time:73256ms step_avg:60.39ms -step:1214/2285 train_time:73316ms step_avg:60.39ms -step:1215/2285 train_time:73378ms step_avg:60.39ms -step:1216/2285 train_time:73438ms step_avg:60.39ms -step:1217/2285 train_time:73500ms step_avg:60.39ms -step:1218/2285 train_time:73560ms step_avg:60.39ms -step:1219/2285 train_time:73622ms step_avg:60.40ms -step:1220/2285 train_time:73682ms step_avg:60.39ms -step:1221/2285 train_time:73745ms step_avg:60.40ms -step:1222/2285 train_time:73805ms step_avg:60.40ms -step:1223/2285 train_time:73867ms step_avg:60.40ms -step:1224/2285 train_time:73927ms step_avg:60.40ms -step:1225/2285 train_time:73989ms step_avg:60.40ms -step:1226/2285 train_time:74050ms step_avg:60.40ms -step:1227/2285 train_time:74113ms step_avg:60.40ms -step:1228/2285 train_time:74173ms step_avg:60.40ms -step:1229/2285 train_time:74235ms step_avg:60.40ms -step:1230/2285 train_time:74295ms step_avg:60.40ms -step:1231/2285 train_time:74357ms step_avg:60.40ms -step:1232/2285 train_time:74418ms step_avg:60.40ms -step:1233/2285 train_time:74481ms step_avg:60.41ms -step:1234/2285 train_time:74540ms step_avg:60.40ms -step:1235/2285 train_time:74601ms step_avg:60.41ms -step:1236/2285 train_time:74661ms step_avg:60.41ms -step:1237/2285 train_time:74723ms step_avg:60.41ms -step:1238/2285 train_time:74782ms step_avg:60.41ms -step:1239/2285 train_time:74844ms step_avg:60.41ms -step:1240/2285 train_time:74904ms step_avg:60.41ms -step:1241/2285 train_time:74966ms step_avg:60.41ms -step:1242/2285 train_time:75026ms step_avg:60.41ms -step:1243/2285 train_time:75089ms step_avg:60.41ms -step:1244/2285 train_time:75149ms step_avg:60.41ms -step:1245/2285 train_time:75211ms step_avg:60.41ms -step:1246/2285 train_time:75272ms step_avg:60.41ms -step:1247/2285 train_time:75335ms step_avg:60.41ms -step:1248/2285 train_time:75395ms step_avg:60.41ms -step:1249/2285 train_time:75458ms step_avg:60.41ms -step:1250/2285 train_time:75517ms step_avg:60.41ms -step:1250/2285 val_loss:3.4939 train_time:75581ms step_avg:60.46ms -step:1251/2285 train_time:75600ms step_avg:60.43ms -step:1252/2285 train_time:75641ms step_avg:60.42ms -step:1253/2285 train_time:75703ms step_avg:60.42ms -step:1254/2285 train_time:75762ms step_avg:60.42ms -step:1255/2285 train_time:75823ms step_avg:60.42ms -step:1256/2285 train_time:75882ms step_avg:60.42ms -step:1257/2285 train_time:75943ms step_avg:60.42ms -step:1258/2285 train_time:76002ms step_avg:60.41ms -step:1259/2285 train_time:76063ms step_avg:60.42ms -step:1260/2285 train_time:76122ms step_avg:60.41ms -step:1261/2285 train_time:76183ms step_avg:60.41ms -step:1262/2285 train_time:76241ms step_avg:60.41ms -step:1263/2285 
train_time:76302ms step_avg:60.41ms -step:1264/2285 train_time:76363ms step_avg:60.41ms -step:1265/2285 train_time:76422ms step_avg:60.41ms -step:1266/2285 train_time:76485ms step_avg:60.41ms -step:1267/2285 train_time:76553ms step_avg:60.42ms -step:1268/2285 train_time:76614ms step_avg:60.42ms -step:1269/2285 train_time:76676ms step_avg:60.42ms -step:1270/2285 train_time:76735ms step_avg:60.42ms -step:1271/2285 train_time:76798ms step_avg:60.42ms -step:1272/2285 train_time:76857ms step_avg:60.42ms -step:1273/2285 train_time:76918ms step_avg:60.42ms -step:1274/2285 train_time:76978ms step_avg:60.42ms -step:1275/2285 train_time:77040ms step_avg:60.42ms -step:1276/2285 train_time:77098ms step_avg:60.42ms -step:1277/2285 train_time:77160ms step_avg:60.42ms -step:1278/2285 train_time:77219ms step_avg:60.42ms -step:1279/2285 train_time:77280ms step_avg:60.42ms -step:1280/2285 train_time:77339ms step_avg:60.42ms -step:1281/2285 train_time:77401ms step_avg:60.42ms -step:1282/2285 train_time:77463ms step_avg:60.42ms -step:1283/2285 train_time:77527ms step_avg:60.43ms -step:1284/2285 train_time:77587ms step_avg:60.43ms -step:1285/2285 train_time:77649ms step_avg:60.43ms -step:1286/2285 train_time:77709ms step_avg:60.43ms -step:1287/2285 train_time:77771ms step_avg:60.43ms -step:1288/2285 train_time:77830ms step_avg:60.43ms -step:1289/2285 train_time:77893ms step_avg:60.43ms -step:1290/2285 train_time:77953ms step_avg:60.43ms -step:1291/2285 train_time:78014ms step_avg:60.43ms -step:1292/2285 train_time:78073ms step_avg:60.43ms -step:1293/2285 train_time:78135ms step_avg:60.43ms -step:1294/2285 train_time:78194ms step_avg:60.43ms -step:1295/2285 train_time:78256ms step_avg:60.43ms -step:1296/2285 train_time:78316ms step_avg:60.43ms -step:1297/2285 train_time:78380ms step_avg:60.43ms -step:1298/2285 train_time:78440ms step_avg:60.43ms -step:1299/2285 train_time:78503ms step_avg:60.43ms -step:1300/2285 train_time:78564ms step_avg:60.43ms -step:1301/2285 train_time:78626ms step_avg:60.43ms -step:1302/2285 train_time:78685ms step_avg:60.43ms -step:1303/2285 train_time:78747ms step_avg:60.44ms -step:1304/2285 train_time:78806ms step_avg:60.43ms -step:1305/2285 train_time:78868ms step_avg:60.44ms -step:1306/2285 train_time:78927ms step_avg:60.43ms -step:1307/2285 train_time:78988ms step_avg:60.43ms -step:1308/2285 train_time:79048ms step_avg:60.43ms -step:1309/2285 train_time:79110ms step_avg:60.44ms -step:1310/2285 train_time:79170ms step_avg:60.44ms -step:1311/2285 train_time:79232ms step_avg:60.44ms -step:1312/2285 train_time:79292ms step_avg:60.44ms -step:1313/2285 train_time:79354ms step_avg:60.44ms -step:1314/2285 train_time:79415ms step_avg:60.44ms -step:1315/2285 train_time:79477ms step_avg:60.44ms -step:1316/2285 train_time:79538ms step_avg:60.44ms -step:1317/2285 train_time:79601ms step_avg:60.44ms -step:1318/2285 train_time:79661ms step_avg:60.44ms -step:1319/2285 train_time:79723ms step_avg:60.44ms -step:1320/2285 train_time:79782ms step_avg:60.44ms -step:1321/2285 train_time:79844ms step_avg:60.44ms -step:1322/2285 train_time:79903ms step_avg:60.44ms -step:1323/2285 train_time:79965ms step_avg:60.44ms -step:1324/2285 train_time:80025ms step_avg:60.44ms -step:1325/2285 train_time:80087ms step_avg:60.44ms -step:1326/2285 train_time:80147ms step_avg:60.44ms -step:1327/2285 train_time:80208ms step_avg:60.44ms -step:1328/2285 train_time:80272ms step_avg:60.45ms -step:1329/2285 train_time:80331ms step_avg:60.44ms -step:1330/2285 train_time:80391ms step_avg:60.44ms -step:1331/2285 train_time:80453ms 
step_avg:60.45ms -step:1332/2285 train_time:80513ms step_avg:60.44ms -step:1333/2285 train_time:80575ms step_avg:60.45ms -step:1334/2285 train_time:80636ms step_avg:60.45ms -step:1335/2285 train_time:80699ms step_avg:60.45ms -step:1336/2285 train_time:80758ms step_avg:60.45ms -step:1337/2285 train_time:80820ms step_avg:60.45ms -step:1338/2285 train_time:80880ms step_avg:60.45ms -step:1339/2285 train_time:80943ms step_avg:60.45ms -step:1340/2285 train_time:81002ms step_avg:60.45ms -step:1341/2285 train_time:81064ms step_avg:60.45ms -step:1342/2285 train_time:81124ms step_avg:60.45ms -step:1343/2285 train_time:81186ms step_avg:60.45ms -step:1344/2285 train_time:81245ms step_avg:60.45ms -step:1345/2285 train_time:81308ms step_avg:60.45ms -step:1346/2285 train_time:81369ms step_avg:60.45ms -step:1347/2285 train_time:81430ms step_avg:60.45ms -step:1348/2285 train_time:81490ms step_avg:60.45ms -step:1349/2285 train_time:81553ms step_avg:60.45ms -step:1350/2285 train_time:81613ms step_avg:60.45ms -step:1351/2285 train_time:81676ms step_avg:60.46ms -step:1352/2285 train_time:81737ms step_avg:60.46ms -step:1353/2285 train_time:81799ms step_avg:60.46ms -step:1354/2285 train_time:81859ms step_avg:60.46ms -step:1355/2285 train_time:81920ms step_avg:60.46ms -step:1356/2285 train_time:81980ms step_avg:60.46ms -step:1357/2285 train_time:82042ms step_avg:60.46ms -step:1358/2285 train_time:82101ms step_avg:60.46ms -step:1359/2285 train_time:82163ms step_avg:60.46ms -step:1360/2285 train_time:82223ms step_avg:60.46ms -step:1361/2285 train_time:82285ms step_avg:60.46ms -step:1362/2285 train_time:82345ms step_avg:60.46ms -step:1363/2285 train_time:82407ms step_avg:60.46ms -step:1364/2285 train_time:82467ms step_avg:60.46ms -step:1365/2285 train_time:82529ms step_avg:60.46ms -step:1366/2285 train_time:82589ms step_avg:60.46ms -step:1367/2285 train_time:82652ms step_avg:60.46ms -step:1368/2285 train_time:82712ms step_avg:60.46ms -step:1369/2285 train_time:82774ms step_avg:60.46ms -step:1370/2285 train_time:82835ms step_avg:60.46ms -step:1371/2285 train_time:82897ms step_avg:60.46ms -step:1372/2285 train_time:82957ms step_avg:60.46ms -step:1373/2285 train_time:83020ms step_avg:60.47ms -step:1374/2285 train_time:83079ms step_avg:60.47ms -step:1375/2285 train_time:83141ms step_avg:60.47ms -step:1376/2285 train_time:83201ms step_avg:60.47ms -step:1377/2285 train_time:83264ms step_avg:60.47ms -step:1378/2285 train_time:83323ms step_avg:60.47ms -step:1379/2285 train_time:83385ms step_avg:60.47ms -step:1380/2285 train_time:83445ms step_avg:60.47ms -step:1381/2285 train_time:83507ms step_avg:60.47ms -step:1382/2285 train_time:83567ms step_avg:60.47ms -step:1383/2285 train_time:83629ms step_avg:60.47ms -step:1384/2285 train_time:83690ms step_avg:60.47ms -step:1385/2285 train_time:83753ms step_avg:60.47ms -step:1386/2285 train_time:83813ms step_avg:60.47ms -step:1387/2285 train_time:83875ms step_avg:60.47ms -step:1388/2285 train_time:83936ms step_avg:60.47ms -step:1389/2285 train_time:83998ms step_avg:60.47ms -step:1390/2285 train_time:84058ms step_avg:60.47ms -step:1391/2285 train_time:84120ms step_avg:60.47ms -step:1392/2285 train_time:84180ms step_avg:60.47ms -step:1393/2285 train_time:84242ms step_avg:60.47ms -step:1394/2285 train_time:84302ms step_avg:60.47ms -step:1395/2285 train_time:84364ms step_avg:60.48ms -step:1396/2285 train_time:84424ms step_avg:60.48ms -step:1397/2285 train_time:84486ms step_avg:60.48ms -step:1398/2285 train_time:84546ms step_avg:60.48ms -step:1399/2285 train_time:84608ms step_avg:60.48ms 
-step:1400/2285 train_time:84668ms step_avg:60.48ms -step:1401/2285 train_time:84730ms step_avg:60.48ms -step:1402/2285 train_time:84790ms step_avg:60.48ms -step:1403/2285 train_time:84853ms step_avg:60.48ms -step:1404/2285 train_time:84913ms step_avg:60.48ms -step:1405/2285 train_time:84974ms step_avg:60.48ms -step:1406/2285 train_time:85034ms step_avg:60.48ms -step:1407/2285 train_time:85097ms step_avg:60.48ms -step:1408/2285 train_time:85156ms step_avg:60.48ms -step:1409/2285 train_time:85219ms step_avg:60.48ms -step:1410/2285 train_time:85278ms step_avg:60.48ms -step:1411/2285 train_time:85341ms step_avg:60.48ms -step:1412/2285 train_time:85401ms step_avg:60.48ms -step:1413/2285 train_time:85463ms step_avg:60.48ms -step:1414/2285 train_time:85523ms step_avg:60.48ms -step:1415/2285 train_time:85585ms step_avg:60.48ms -step:1416/2285 train_time:85645ms step_avg:60.48ms -step:1417/2285 train_time:85707ms step_avg:60.49ms -step:1418/2285 train_time:85767ms step_avg:60.48ms -step:1419/2285 train_time:85829ms step_avg:60.49ms -step:1420/2285 train_time:85889ms step_avg:60.49ms -step:1421/2285 train_time:85952ms step_avg:60.49ms -step:1422/2285 train_time:86012ms step_avg:60.49ms -step:1423/2285 train_time:86074ms step_avg:60.49ms -step:1424/2285 train_time:86134ms step_avg:60.49ms -step:1425/2285 train_time:86196ms step_avg:60.49ms -step:1426/2285 train_time:86256ms step_avg:60.49ms -step:1427/2285 train_time:86318ms step_avg:60.49ms -step:1428/2285 train_time:86378ms step_avg:60.49ms -step:1429/2285 train_time:86440ms step_avg:60.49ms -step:1430/2285 train_time:86500ms step_avg:60.49ms -step:1431/2285 train_time:86563ms step_avg:60.49ms -step:1432/2285 train_time:86622ms step_avg:60.49ms -step:1433/2285 train_time:86685ms step_avg:60.49ms -step:1434/2285 train_time:86744ms step_avg:60.49ms -step:1435/2285 train_time:86806ms step_avg:60.49ms -step:1436/2285 train_time:86866ms step_avg:60.49ms -step:1437/2285 train_time:86928ms step_avg:60.49ms -step:1438/2285 train_time:86988ms step_avg:60.49ms -step:1439/2285 train_time:87050ms step_avg:60.49ms -step:1440/2285 train_time:87111ms step_avg:60.49ms -step:1441/2285 train_time:87173ms step_avg:60.49ms -step:1442/2285 train_time:87233ms step_avg:60.49ms -step:1443/2285 train_time:87295ms step_avg:60.50ms -step:1444/2285 train_time:87355ms step_avg:60.50ms -step:1445/2285 train_time:87417ms step_avg:60.50ms -step:1446/2285 train_time:87477ms step_avg:60.50ms -step:1447/2285 train_time:87540ms step_avg:60.50ms -step:1448/2285 train_time:87600ms step_avg:60.50ms -step:1449/2285 train_time:87663ms step_avg:60.50ms -step:1450/2285 train_time:87723ms step_avg:60.50ms -step:1451/2285 train_time:87785ms step_avg:60.50ms -step:1452/2285 train_time:87844ms step_avg:60.50ms -step:1453/2285 train_time:87906ms step_avg:60.50ms -step:1454/2285 train_time:87966ms step_avg:60.50ms -step:1455/2285 train_time:88027ms step_avg:60.50ms -step:1456/2285 train_time:88088ms step_avg:60.50ms -step:1457/2285 train_time:88150ms step_avg:60.50ms -step:1458/2285 train_time:88210ms step_avg:60.50ms -step:1459/2285 train_time:88273ms step_avg:60.50ms -step:1460/2285 train_time:88332ms step_avg:60.50ms -step:1461/2285 train_time:88395ms step_avg:60.50ms -step:1462/2285 train_time:88456ms step_avg:60.50ms -step:1463/2285 train_time:88519ms step_avg:60.51ms -step:1464/2285 train_time:88579ms step_avg:60.50ms -step:1465/2285 train_time:88641ms step_avg:60.51ms -step:1466/2285 train_time:88701ms step_avg:60.51ms -step:1467/2285 train_time:88763ms step_avg:60.51ms -step:1468/2285 
train_time:88823ms step_avg:60.51ms -step:1469/2285 train_time:88886ms step_avg:60.51ms -step:1470/2285 train_time:88945ms step_avg:60.51ms -step:1471/2285 train_time:89007ms step_avg:60.51ms -step:1472/2285 train_time:89067ms step_avg:60.51ms -step:1473/2285 train_time:89129ms step_avg:60.51ms -step:1474/2285 train_time:89188ms step_avg:60.51ms -step:1475/2285 train_time:89250ms step_avg:60.51ms -step:1476/2285 train_time:89311ms step_avg:60.51ms -step:1477/2285 train_time:89374ms step_avg:60.51ms -step:1478/2285 train_time:89434ms step_avg:60.51ms -step:1479/2285 train_time:89496ms step_avg:60.51ms -step:1480/2285 train_time:89557ms step_avg:60.51ms -step:1481/2285 train_time:89620ms step_avg:60.51ms -step:1482/2285 train_time:89679ms step_avg:60.51ms -step:1483/2285 train_time:89742ms step_avg:60.51ms -step:1484/2285 train_time:89801ms step_avg:60.51ms -step:1485/2285 train_time:89863ms step_avg:60.51ms -step:1486/2285 train_time:89922ms step_avg:60.51ms -step:1487/2285 train_time:89985ms step_avg:60.51ms -step:1488/2285 train_time:90045ms step_avg:60.51ms -step:1489/2285 train_time:90107ms step_avg:60.51ms -step:1490/2285 train_time:90168ms step_avg:60.52ms -step:1491/2285 train_time:90228ms step_avg:60.52ms -step:1492/2285 train_time:90288ms step_avg:60.51ms -step:1493/2285 train_time:90350ms step_avg:60.52ms -step:1494/2285 train_time:90410ms step_avg:60.52ms -step:1495/2285 train_time:90473ms step_avg:60.52ms -step:1496/2285 train_time:90533ms step_avg:60.52ms -step:1497/2285 train_time:90595ms step_avg:60.52ms -step:1498/2285 train_time:90656ms step_avg:60.52ms -step:1499/2285 train_time:90718ms step_avg:60.52ms -step:1500/2285 train_time:90778ms step_avg:60.52ms -step:1500/2285 val_loss:3.4262 train_time:90841ms step_avg:60.56ms -step:1501/2285 train_time:90860ms step_avg:60.53ms -step:1502/2285 train_time:90902ms step_avg:60.52ms -step:1503/2285 train_time:90968ms step_avg:60.52ms -step:1504/2285 train_time:91027ms step_avg:60.52ms -step:1505/2285 train_time:91089ms step_avg:60.52ms -step:1506/2285 train_time:91148ms step_avg:60.52ms -step:1507/2285 train_time:91211ms step_avg:60.52ms -step:1508/2285 train_time:91268ms step_avg:60.52ms -step:1509/2285 train_time:91330ms step_avg:60.52ms -step:1510/2285 train_time:91389ms step_avg:60.52ms -step:1511/2285 train_time:91451ms step_avg:60.52ms -step:1512/2285 train_time:91510ms step_avg:60.52ms -step:1513/2285 train_time:91572ms step_avg:60.52ms -step:1514/2285 train_time:91632ms step_avg:60.52ms -step:1515/2285 train_time:91694ms step_avg:60.52ms -step:1516/2285 train_time:91755ms step_avg:60.52ms -step:1517/2285 train_time:91819ms step_avg:60.53ms -step:1518/2285 train_time:91879ms step_avg:60.53ms -step:1519/2285 train_time:91943ms step_avg:60.53ms -step:1520/2285 train_time:92004ms step_avg:60.53ms -step:1521/2285 train_time:92067ms step_avg:60.53ms -step:1522/2285 train_time:92126ms step_avg:60.53ms -step:1523/2285 train_time:92188ms step_avg:60.53ms -step:1524/2285 train_time:92247ms step_avg:60.53ms -step:1525/2285 train_time:92310ms step_avg:60.53ms -step:1526/2285 train_time:92369ms step_avg:60.53ms -step:1527/2285 train_time:92431ms step_avg:60.53ms -step:1528/2285 train_time:92491ms step_avg:60.53ms -step:1529/2285 train_time:92553ms step_avg:60.53ms -step:1530/2285 train_time:92612ms step_avg:60.53ms -step:1531/2285 train_time:92674ms step_avg:60.53ms -step:1532/2285 train_time:92735ms step_avg:60.53ms -step:1533/2285 train_time:92799ms step_avg:60.53ms -step:1534/2285 train_time:92859ms step_avg:60.53ms -step:1535/2285 
train_time:92922ms step_avg:60.54ms -step:1536/2285 train_time:92983ms step_avg:60.54ms -step:1537/2285 train_time:93047ms step_avg:60.54ms -step:1538/2285 train_time:93107ms step_avg:60.54ms -step:1539/2285 train_time:93168ms step_avg:60.54ms -step:1540/2285 train_time:93228ms step_avg:60.54ms -step:1541/2285 train_time:93290ms step_avg:60.54ms -step:1542/2285 train_time:93350ms step_avg:60.54ms -step:1543/2285 train_time:93412ms step_avg:60.54ms -step:1544/2285 train_time:93471ms step_avg:60.54ms -step:1545/2285 train_time:93533ms step_avg:60.54ms -step:1546/2285 train_time:93593ms step_avg:60.54ms -step:1547/2285 train_time:93655ms step_avg:60.54ms -step:1548/2285 train_time:93715ms step_avg:60.54ms -step:1549/2285 train_time:93778ms step_avg:60.54ms -step:1550/2285 train_time:93838ms step_avg:60.54ms -step:1551/2285 train_time:93901ms step_avg:60.54ms -step:1552/2285 train_time:93962ms step_avg:60.54ms -step:1553/2285 train_time:94026ms step_avg:60.54ms -step:1554/2285 train_time:94086ms step_avg:60.54ms -step:1555/2285 train_time:94148ms step_avg:60.55ms -step:1556/2285 train_time:94207ms step_avg:60.54ms -step:1557/2285 train_time:94270ms step_avg:60.55ms -step:1558/2285 train_time:94330ms step_avg:60.55ms -step:1559/2285 train_time:94391ms step_avg:60.55ms -step:1560/2285 train_time:94451ms step_avg:60.55ms -step:1561/2285 train_time:94512ms step_avg:60.55ms -step:1562/2285 train_time:94572ms step_avg:60.55ms -step:1563/2285 train_time:94634ms step_avg:60.55ms -step:1564/2285 train_time:94694ms step_avg:60.55ms -step:1565/2285 train_time:94756ms step_avg:60.55ms -step:1566/2285 train_time:94816ms step_avg:60.55ms -step:1567/2285 train_time:94880ms step_avg:60.55ms -step:1568/2285 train_time:94940ms step_avg:60.55ms -step:1569/2285 train_time:95003ms step_avg:60.55ms -step:1570/2285 train_time:95064ms step_avg:60.55ms -step:1571/2285 train_time:95126ms step_avg:60.55ms -step:1572/2285 train_time:95186ms step_avg:60.55ms -step:1573/2285 train_time:95248ms step_avg:60.55ms -step:1574/2285 train_time:95308ms step_avg:60.55ms -step:1575/2285 train_time:95371ms step_avg:60.55ms -step:1576/2285 train_time:95430ms step_avg:60.55ms -step:1577/2285 train_time:95492ms step_avg:60.55ms -step:1578/2285 train_time:95552ms step_avg:60.55ms -step:1579/2285 train_time:95614ms step_avg:60.55ms -step:1580/2285 train_time:95674ms step_avg:60.55ms -step:1581/2285 train_time:95736ms step_avg:60.55ms -step:1582/2285 train_time:95796ms step_avg:60.55ms -step:1583/2285 train_time:95859ms step_avg:60.56ms -step:1584/2285 train_time:95919ms step_avg:60.56ms -step:1585/2285 train_time:95982ms step_avg:60.56ms -step:1586/2285 train_time:96043ms step_avg:60.56ms -step:1587/2285 train_time:96105ms step_avg:60.56ms -step:1588/2285 train_time:96164ms step_avg:60.56ms -step:1589/2285 train_time:96227ms step_avg:60.56ms -step:1590/2285 train_time:96286ms step_avg:60.56ms -step:1591/2285 train_time:96348ms step_avg:60.56ms -step:1592/2285 train_time:96408ms step_avg:60.56ms -step:1593/2285 train_time:96471ms step_avg:60.56ms -step:1594/2285 train_time:96530ms step_avg:60.56ms -step:1595/2285 train_time:96593ms step_avg:60.56ms -step:1596/2285 train_time:96652ms step_avg:60.56ms -step:1597/2285 train_time:96714ms step_avg:60.56ms -step:1598/2285 train_time:96774ms step_avg:60.56ms -step:1599/2285 train_time:96837ms step_avg:60.56ms -step:1600/2285 train_time:96898ms step_avg:60.56ms -step:1601/2285 train_time:96961ms step_avg:60.56ms -step:1602/2285 train_time:97020ms step_avg:60.56ms -step:1603/2285 train_time:97083ms 
step_avg:60.56ms -step:1604/2285 train_time:97144ms step_avg:60.56ms -step:1605/2285 train_time:97207ms step_avg:60.56ms -step:1606/2285 train_time:97266ms step_avg:60.56ms -step:1607/2285 train_time:97329ms step_avg:60.57ms -step:1608/2285 train_time:97388ms step_avg:60.56ms -step:1609/2285 train_time:97450ms step_avg:60.57ms -step:1610/2285 train_time:97510ms step_avg:60.57ms -step:1611/2285 train_time:97572ms step_avg:60.57ms -step:1612/2285 train_time:97632ms step_avg:60.57ms -step:1613/2285 train_time:97694ms step_avg:60.57ms -step:1614/2285 train_time:97754ms step_avg:60.57ms -step:1615/2285 train_time:97817ms step_avg:60.57ms -step:1616/2285 train_time:97877ms step_avg:60.57ms -step:1617/2285 train_time:97939ms step_avg:60.57ms -step:1618/2285 train_time:97999ms step_avg:60.57ms -step:1619/2285 train_time:98062ms step_avg:60.57ms -step:1620/2285 train_time:98122ms step_avg:60.57ms -step:1621/2285 train_time:98185ms step_avg:60.57ms -step:1622/2285 train_time:98245ms step_avg:60.57ms -step:1623/2285 train_time:98307ms step_avg:60.57ms -step:1624/2285 train_time:98367ms step_avg:60.57ms -step:1625/2285 train_time:98430ms step_avg:60.57ms -step:1626/2285 train_time:98490ms step_avg:60.57ms -step:1627/2285 train_time:98552ms step_avg:60.57ms -step:1628/2285 train_time:98611ms step_avg:60.57ms -step:1629/2285 train_time:98673ms step_avg:60.57ms -step:1630/2285 train_time:98733ms step_avg:60.57ms -step:1631/2285 train_time:98795ms step_avg:60.57ms -step:1632/2285 train_time:98855ms step_avg:60.57ms -step:1633/2285 train_time:98918ms step_avg:60.57ms -step:1634/2285 train_time:98977ms step_avg:60.57ms -step:1635/2285 train_time:99039ms step_avg:60.57ms -step:1636/2285 train_time:99100ms step_avg:60.57ms -step:1637/2285 train_time:99162ms step_avg:60.58ms -step:1638/2285 train_time:99224ms step_avg:60.58ms -step:1639/2285 train_time:99287ms step_avg:60.58ms -step:1640/2285 train_time:99346ms step_avg:60.58ms -step:1641/2285 train_time:99408ms step_avg:60.58ms -step:1642/2285 train_time:99468ms step_avg:60.58ms -step:1643/2285 train_time:99531ms step_avg:60.58ms -step:1644/2285 train_time:99591ms step_avg:60.58ms -step:1645/2285 train_time:99653ms step_avg:60.58ms -step:1646/2285 train_time:99712ms step_avg:60.58ms -step:1647/2285 train_time:99774ms step_avg:60.58ms -step:1648/2285 train_time:99834ms step_avg:60.58ms -step:1649/2285 train_time:99896ms step_avg:60.58ms -step:1650/2285 train_time:99957ms step_avg:60.58ms -step:1651/2285 train_time:100019ms step_avg:60.58ms -step:1652/2285 train_time:100079ms step_avg:60.58ms -step:1653/2285 train_time:100142ms step_avg:60.58ms -step:1654/2285 train_time:100202ms step_avg:60.58ms -step:1655/2285 train_time:100265ms step_avg:60.58ms -step:1656/2285 train_time:100325ms step_avg:60.58ms -step:1657/2285 train_time:100389ms step_avg:60.58ms -step:1658/2285 train_time:100449ms step_avg:60.58ms -step:1659/2285 train_time:100511ms step_avg:60.59ms -step:1660/2285 train_time:100571ms step_avg:60.59ms -step:1661/2285 train_time:100633ms step_avg:60.59ms -step:1662/2285 train_time:100693ms step_avg:60.59ms -step:1663/2285 train_time:100755ms step_avg:60.59ms -step:1664/2285 train_time:100816ms step_avg:60.59ms -step:1665/2285 train_time:100877ms step_avg:60.59ms -step:1666/2285 train_time:100937ms step_avg:60.59ms -step:1667/2285 train_time:101000ms step_avg:60.59ms -step:1668/2285 train_time:101059ms step_avg:60.59ms -step:1669/2285 train_time:101122ms step_avg:60.59ms -step:1670/2285 train_time:101182ms step_avg:60.59ms -step:1671/2285 
train_time:101245ms step_avg:60.59ms -step:1672/2285 train_time:101304ms step_avg:60.59ms -step:1673/2285 train_time:101367ms step_avg:60.59ms -step:1674/2285 train_time:101427ms step_avg:60.59ms -step:1675/2285 train_time:101490ms step_avg:60.59ms -step:1676/2285 train_time:101550ms step_avg:60.59ms -step:1677/2285 train_time:101612ms step_avg:60.59ms -step:1678/2285 train_time:101672ms step_avg:60.59ms -step:1679/2285 train_time:101734ms step_avg:60.59ms -step:1680/2285 train_time:101793ms step_avg:60.59ms -step:1681/2285 train_time:101856ms step_avg:60.59ms -step:1682/2285 train_time:101916ms step_avg:60.59ms -step:1683/2285 train_time:101978ms step_avg:60.59ms -step:1684/2285 train_time:102038ms step_avg:60.59ms -step:1685/2285 train_time:102102ms step_avg:60.59ms -step:1686/2285 train_time:102162ms step_avg:60.59ms -step:1687/2285 train_time:102224ms step_avg:60.60ms -step:1688/2285 train_time:102285ms step_avg:60.60ms -step:1689/2285 train_time:102348ms step_avg:60.60ms -step:1690/2285 train_time:102407ms step_avg:60.60ms -step:1691/2285 train_time:102470ms step_avg:60.60ms -step:1692/2285 train_time:102530ms step_avg:60.60ms -step:1693/2285 train_time:102592ms step_avg:60.60ms -step:1694/2285 train_time:102651ms step_avg:60.60ms -step:1695/2285 train_time:102714ms step_avg:60.60ms -step:1696/2285 train_time:102773ms step_avg:60.60ms -step:1697/2285 train_time:102835ms step_avg:60.60ms -step:1698/2285 train_time:102895ms step_avg:60.60ms -step:1699/2285 train_time:102957ms step_avg:60.60ms -step:1700/2285 train_time:103019ms step_avg:60.60ms -step:1701/2285 train_time:103080ms step_avg:60.60ms -step:1702/2285 train_time:103140ms step_avg:60.60ms -step:1703/2285 train_time:103203ms step_avg:60.60ms -step:1704/2285 train_time:103263ms step_avg:60.60ms -step:1705/2285 train_time:103327ms step_avg:60.60ms -step:1706/2285 train_time:103387ms step_avg:60.60ms -step:1707/2285 train_time:103450ms step_avg:60.60ms -step:1708/2285 train_time:103510ms step_avg:60.60ms -step:1709/2285 train_time:103573ms step_avg:60.60ms -step:1710/2285 train_time:103633ms step_avg:60.60ms -step:1711/2285 train_time:103695ms step_avg:60.60ms -step:1712/2285 train_time:103754ms step_avg:60.60ms -step:1713/2285 train_time:103817ms step_avg:60.61ms -step:1714/2285 train_time:103877ms step_avg:60.60ms -step:1715/2285 train_time:103939ms step_avg:60.61ms -step:1716/2285 train_time:104000ms step_avg:60.61ms -step:1717/2285 train_time:104061ms step_avg:60.61ms -step:1718/2285 train_time:104121ms step_avg:60.61ms -step:1719/2285 train_time:104184ms step_avg:60.61ms -step:1720/2285 train_time:104244ms step_avg:60.61ms -step:1721/2285 train_time:104307ms step_avg:60.61ms -step:1722/2285 train_time:104367ms step_avg:60.61ms -step:1723/2285 train_time:104429ms step_avg:60.61ms -step:1724/2285 train_time:104490ms step_avg:60.61ms -step:1725/2285 train_time:104552ms step_avg:60.61ms -step:1726/2285 train_time:104612ms step_avg:60.61ms -step:1727/2285 train_time:104674ms step_avg:60.61ms -step:1728/2285 train_time:104733ms step_avg:60.61ms -step:1729/2285 train_time:104795ms step_avg:60.61ms -step:1730/2285 train_time:104854ms step_avg:60.61ms -step:1731/2285 train_time:104917ms step_avg:60.61ms -step:1732/2285 train_time:104977ms step_avg:60.61ms -step:1733/2285 train_time:105039ms step_avg:60.61ms -step:1734/2285 train_time:105100ms step_avg:60.61ms -step:1735/2285 train_time:105162ms step_avg:60.61ms -step:1736/2285 train_time:105222ms step_avg:60.61ms -step:1737/2285 train_time:105286ms step_avg:60.61ms -step:1738/2285 
train_time:105346ms step_avg:60.61ms -step:1739/2285 train_time:105408ms step_avg:60.61ms -step:1740/2285 train_time:105468ms step_avg:60.61ms -step:1741/2285 train_time:105530ms step_avg:60.61ms -step:1742/2285 train_time:105591ms step_avg:60.61ms -step:1743/2285 train_time:105653ms step_avg:60.62ms -step:1744/2285 train_time:105712ms step_avg:60.61ms -step:1745/2285 train_time:105774ms step_avg:60.62ms -step:1746/2285 train_time:105834ms step_avg:60.61ms -step:1747/2285 train_time:105896ms step_avg:60.62ms -step:1748/2285 train_time:105956ms step_avg:60.62ms -step:1749/2285 train_time:106019ms step_avg:60.62ms -step:1750/2285 train_time:106079ms step_avg:60.62ms -step:1750/2285 val_loss:3.3662 train_time:106143ms step_avg:60.65ms -step:1751/2285 train_time:106162ms step_avg:60.63ms -step:1752/2285 train_time:106206ms step_avg:60.62ms -step:1753/2285 train_time:106269ms step_avg:60.62ms -step:1754/2285 train_time:106330ms step_avg:60.62ms -step:1755/2285 train_time:106393ms step_avg:60.62ms -step:1756/2285 train_time:106453ms step_avg:60.62ms -step:1757/2285 train_time:106514ms step_avg:60.62ms -step:1758/2285 train_time:106574ms step_avg:60.62ms -step:1759/2285 train_time:106635ms step_avg:60.62ms -step:1760/2285 train_time:106694ms step_avg:60.62ms -step:1761/2285 train_time:106756ms step_avg:60.62ms -step:1762/2285 train_time:106815ms step_avg:60.62ms -step:1763/2285 train_time:106876ms step_avg:60.62ms -step:1764/2285 train_time:106936ms step_avg:60.62ms -step:1765/2285 train_time:106998ms step_avg:60.62ms -step:1766/2285 train_time:107060ms step_avg:60.62ms -step:1767/2285 train_time:107125ms step_avg:60.63ms -step:1768/2285 train_time:107185ms step_avg:60.63ms -step:1769/2285 train_time:107247ms step_avg:60.63ms -step:1770/2285 train_time:107307ms step_avg:60.63ms -step:1771/2285 train_time:107370ms step_avg:60.63ms -step:1772/2285 train_time:107430ms step_avg:60.63ms -step:1773/2285 train_time:107492ms step_avg:60.63ms -step:1774/2285 train_time:107551ms step_avg:60.63ms -step:1775/2285 train_time:107614ms step_avg:60.63ms -step:1776/2285 train_time:107673ms step_avg:60.63ms -step:1777/2285 train_time:107735ms step_avg:60.63ms -step:1778/2285 train_time:107794ms step_avg:60.63ms -step:1779/2285 train_time:107856ms step_avg:60.63ms -step:1780/2285 train_time:107915ms step_avg:60.63ms -step:1781/2285 train_time:107977ms step_avg:60.63ms -step:1782/2285 train_time:108038ms step_avg:60.63ms -step:1783/2285 train_time:108102ms step_avg:60.63ms -step:1784/2285 train_time:108162ms step_avg:60.63ms -step:1785/2285 train_time:108225ms step_avg:60.63ms -step:1786/2285 train_time:108285ms step_avg:60.63ms -step:1787/2285 train_time:108347ms step_avg:60.63ms -step:1788/2285 train_time:108407ms step_avg:60.63ms -step:1789/2285 train_time:108469ms step_avg:60.63ms -step:1790/2285 train_time:108529ms step_avg:60.63ms -step:1791/2285 train_time:108591ms step_avg:60.63ms -step:1792/2285 train_time:108651ms step_avg:60.63ms -step:1793/2285 train_time:108713ms step_avg:60.63ms -step:1794/2285 train_time:108772ms step_avg:60.63ms -step:1795/2285 train_time:108834ms step_avg:60.63ms -step:1796/2285 train_time:108894ms step_avg:60.63ms -step:1797/2285 train_time:108956ms step_avg:60.63ms -step:1798/2285 train_time:109016ms step_avg:60.63ms -step:1799/2285 train_time:109079ms step_avg:60.63ms -step:1800/2285 train_time:109139ms step_avg:60.63ms -step:1801/2285 train_time:109203ms step_avg:60.63ms -step:1802/2285 train_time:109263ms step_avg:60.63ms -step:1803/2285 train_time:109325ms step_avg:60.63ms 
-step:1804/2285 train_time:109385ms step_avg:60.63ms -step:1805/2285 train_time:109446ms step_avg:60.64ms -step:1806/2285 train_time:109506ms step_avg:60.63ms -step:1807/2285 train_time:109568ms step_avg:60.64ms -step:1808/2285 train_time:109628ms step_avg:60.63ms -step:1809/2285 train_time:109690ms step_avg:60.64ms -step:1810/2285 train_time:109750ms step_avg:60.64ms -step:1811/2285 train_time:109813ms step_avg:60.64ms -step:1812/2285 train_time:109873ms step_avg:60.64ms -step:1813/2285 train_time:109936ms step_avg:60.64ms -step:1814/2285 train_time:109995ms step_avg:60.64ms -step:1815/2285 train_time:110058ms step_avg:60.64ms -step:1816/2285 train_time:110119ms step_avg:60.64ms -step:1817/2285 train_time:110182ms step_avg:60.64ms -step:1818/2285 train_time:110241ms step_avg:60.64ms -step:1819/2285 train_time:110304ms step_avg:60.64ms -step:1820/2285 train_time:110363ms step_avg:60.64ms -step:1821/2285 train_time:110425ms step_avg:60.64ms -step:1822/2285 train_time:110485ms step_avg:60.64ms -step:1823/2285 train_time:110547ms step_avg:60.64ms -step:1824/2285 train_time:110607ms step_avg:60.64ms -step:1825/2285 train_time:110669ms step_avg:60.64ms -step:1826/2285 train_time:110730ms step_avg:60.64ms -step:1827/2285 train_time:110792ms step_avg:60.64ms -step:1828/2285 train_time:110852ms step_avg:60.64ms -step:1829/2285 train_time:110915ms step_avg:60.64ms -step:1830/2285 train_time:110975ms step_avg:60.64ms -step:1831/2285 train_time:111037ms step_avg:60.64ms -step:1832/2285 train_time:111097ms step_avg:60.64ms -step:1833/2285 train_time:111160ms step_avg:60.64ms -step:1834/2285 train_time:111220ms step_avg:60.64ms -step:1835/2285 train_time:111282ms step_avg:60.64ms -step:1836/2285 train_time:111342ms step_avg:60.64ms -step:1837/2285 train_time:111404ms step_avg:60.64ms -step:1838/2285 train_time:111464ms step_avg:60.64ms -step:1839/2285 train_time:111526ms step_avg:60.64ms -step:1840/2285 train_time:111586ms step_avg:60.64ms -step:1841/2285 train_time:111648ms step_avg:60.65ms -step:1842/2285 train_time:111709ms step_avg:60.65ms -step:1843/2285 train_time:111771ms step_avg:60.65ms -step:1844/2285 train_time:111832ms step_avg:60.65ms -step:1845/2285 train_time:111894ms step_avg:60.65ms -step:1846/2285 train_time:111954ms step_avg:60.65ms -step:1847/2285 train_time:112017ms step_avg:60.65ms -step:1848/2285 train_time:112077ms step_avg:60.65ms -step:1849/2285 train_time:112140ms step_avg:60.65ms -step:1850/2285 train_time:112199ms step_avg:60.65ms -step:1851/2285 train_time:112262ms step_avg:60.65ms -step:1852/2285 train_time:112321ms step_avg:60.65ms -step:1853/2285 train_time:112384ms step_avg:60.65ms -step:1854/2285 train_time:112443ms step_avg:60.65ms -step:1855/2285 train_time:112505ms step_avg:60.65ms -step:1856/2285 train_time:112565ms step_avg:60.65ms -step:1857/2285 train_time:112627ms step_avg:60.65ms -step:1858/2285 train_time:112687ms step_avg:60.65ms -step:1859/2285 train_time:112750ms step_avg:60.65ms -step:1860/2285 train_time:112810ms step_avg:60.65ms -step:1861/2285 train_time:112872ms step_avg:60.65ms -step:1862/2285 train_time:112933ms step_avg:60.65ms -step:1863/2285 train_time:112995ms step_avg:60.65ms -step:1864/2285 train_time:113055ms step_avg:60.65ms -step:1865/2285 train_time:113117ms step_avg:60.65ms -step:1866/2285 train_time:113177ms step_avg:60.65ms -step:1867/2285 train_time:113240ms step_avg:60.65ms -step:1868/2285 train_time:113300ms step_avg:60.65ms -step:1869/2285 train_time:113362ms step_avg:60.65ms -step:1870/2285 train_time:113422ms step_avg:60.65ms 
-step:1871/2285 train_time:113484ms step_avg:60.65ms -step:1872/2285 train_time:113543ms step_avg:60.65ms -step:1873/2285 train_time:113605ms step_avg:60.65ms -step:1874/2285 train_time:113666ms step_avg:60.65ms -step:1875/2285 train_time:113728ms step_avg:60.66ms -step:1876/2285 train_time:113788ms step_avg:60.65ms -step:1877/2285 train_time:113850ms step_avg:60.66ms -step:1878/2285 train_time:113911ms step_avg:60.66ms -step:1879/2285 train_time:113975ms step_avg:60.66ms -step:1880/2285 train_time:114035ms step_avg:60.66ms -step:1881/2285 train_time:114097ms step_avg:60.66ms -step:1882/2285 train_time:114157ms step_avg:60.66ms -step:1883/2285 train_time:114220ms step_avg:60.66ms -step:1884/2285 train_time:114280ms step_avg:60.66ms -step:1885/2285 train_time:114342ms step_avg:60.66ms -step:1886/2285 train_time:114401ms step_avg:60.66ms -step:1887/2285 train_time:114463ms step_avg:60.66ms -step:1888/2285 train_time:114523ms step_avg:60.66ms -step:1889/2285 train_time:114586ms step_avg:60.66ms -step:1890/2285 train_time:114646ms step_avg:60.66ms -step:1891/2285 train_time:114708ms step_avg:60.66ms -step:1892/2285 train_time:114769ms step_avg:60.66ms -step:1893/2285 train_time:114831ms step_avg:60.66ms -step:1894/2285 train_time:114891ms step_avg:60.66ms -step:1895/2285 train_time:114954ms step_avg:60.66ms -step:1896/2285 train_time:115015ms step_avg:60.66ms -step:1897/2285 train_time:115077ms step_avg:60.66ms -step:1898/2285 train_time:115137ms step_avg:60.66ms -step:1899/2285 train_time:115200ms step_avg:60.66ms -step:1900/2285 train_time:115260ms step_avg:60.66ms -step:1901/2285 train_time:115322ms step_avg:60.66ms -step:1902/2285 train_time:115382ms step_avg:60.66ms -step:1903/2285 train_time:115443ms step_avg:60.66ms -step:1904/2285 train_time:115503ms step_avg:60.66ms -step:1905/2285 train_time:115565ms step_avg:60.66ms -step:1906/2285 train_time:115625ms step_avg:60.66ms -step:1907/2285 train_time:115687ms step_avg:60.66ms -step:1908/2285 train_time:115748ms step_avg:60.66ms -step:1909/2285 train_time:115810ms step_avg:60.67ms -step:1910/2285 train_time:115871ms step_avg:60.67ms -step:1911/2285 train_time:115933ms step_avg:60.67ms -step:1912/2285 train_time:115993ms step_avg:60.67ms -step:1913/2285 train_time:116056ms step_avg:60.67ms -step:1914/2285 train_time:116116ms step_avg:60.67ms -step:1915/2285 train_time:116178ms step_avg:60.67ms -step:1916/2285 train_time:116238ms step_avg:60.67ms -step:1917/2285 train_time:116301ms step_avg:60.67ms -step:1918/2285 train_time:116361ms step_avg:60.67ms -step:1919/2285 train_time:116423ms step_avg:60.67ms -step:1920/2285 train_time:116483ms step_avg:60.67ms -step:1921/2285 train_time:116545ms step_avg:60.67ms -step:1922/2285 train_time:116605ms step_avg:60.67ms -step:1923/2285 train_time:116667ms step_avg:60.67ms -step:1924/2285 train_time:116728ms step_avg:60.67ms -step:1925/2285 train_time:116790ms step_avg:60.67ms -step:1926/2285 train_time:116850ms step_avg:60.67ms -step:1927/2285 train_time:116913ms step_avg:60.67ms -step:1928/2285 train_time:116974ms step_avg:60.67ms -step:1929/2285 train_time:117037ms step_avg:60.67ms -step:1930/2285 train_time:117097ms step_avg:60.67ms -step:1931/2285 train_time:117159ms step_avg:60.67ms -step:1932/2285 train_time:117220ms step_avg:60.67ms -step:1933/2285 train_time:117282ms step_avg:60.67ms -step:1934/2285 train_time:117342ms step_avg:60.67ms -step:1935/2285 train_time:117404ms step_avg:60.67ms -step:1936/2285 train_time:117464ms step_avg:60.67ms -step:1937/2285 train_time:117526ms step_avg:60.67ms 
-step:1938/2285 train_time:117585ms step_avg:60.67ms -step:1939/2285 train_time:117648ms step_avg:60.67ms -step:1940/2285 train_time:117708ms step_avg:60.67ms -step:1941/2285 train_time:117771ms step_avg:60.68ms -step:1942/2285 train_time:117831ms step_avg:60.67ms -step:1943/2285 train_time:117893ms step_avg:60.68ms -step:1944/2285 train_time:117953ms step_avg:60.68ms -step:1945/2285 train_time:118015ms step_avg:60.68ms -step:1946/2285 train_time:118075ms step_avg:60.68ms -step:1947/2285 train_time:118138ms step_avg:60.68ms -step:1948/2285 train_time:118198ms step_avg:60.68ms -step:1949/2285 train_time:118262ms step_avg:60.68ms -step:1950/2285 train_time:118322ms step_avg:60.68ms -step:1951/2285 train_time:118383ms step_avg:60.68ms -step:1952/2285 train_time:118443ms step_avg:60.68ms -step:1953/2285 train_time:118505ms step_avg:60.68ms -step:1954/2285 train_time:118565ms step_avg:60.68ms -step:1955/2285 train_time:118627ms step_avg:60.68ms -step:1956/2285 train_time:118687ms step_avg:60.68ms -step:1957/2285 train_time:118750ms step_avg:60.68ms -step:1958/2285 train_time:118810ms step_avg:60.68ms -step:1959/2285 train_time:118873ms step_avg:60.68ms -step:1960/2285 train_time:118933ms step_avg:60.68ms -step:1961/2285 train_time:118996ms step_avg:60.68ms -step:1962/2285 train_time:119056ms step_avg:60.68ms -step:1963/2285 train_time:119119ms step_avg:60.68ms -step:1964/2285 train_time:119179ms step_avg:60.68ms -step:1965/2285 train_time:119242ms step_avg:60.68ms -step:1966/2285 train_time:119302ms step_avg:60.68ms -step:1967/2285 train_time:119364ms step_avg:60.68ms -step:1968/2285 train_time:119425ms step_avg:60.68ms -step:1969/2285 train_time:119486ms step_avg:60.68ms -step:1970/2285 train_time:119546ms step_avg:60.68ms -step:1971/2285 train_time:119609ms step_avg:60.68ms -step:1972/2285 train_time:119669ms step_avg:60.68ms -step:1973/2285 train_time:119731ms step_avg:60.68ms -step:1974/2285 train_time:119791ms step_avg:60.68ms -step:1975/2285 train_time:119853ms step_avg:60.69ms -step:1976/2285 train_time:119914ms step_avg:60.69ms -step:1977/2285 train_time:119976ms step_avg:60.69ms -step:1978/2285 train_time:120036ms step_avg:60.69ms -step:1979/2285 train_time:120098ms step_avg:60.69ms -step:1980/2285 train_time:120158ms step_avg:60.69ms -step:1981/2285 train_time:120220ms step_avg:60.69ms -step:1982/2285 train_time:120280ms step_avg:60.69ms -step:1983/2285 train_time:120343ms step_avg:60.69ms -step:1984/2285 train_time:120402ms step_avg:60.69ms -step:1985/2285 train_time:120464ms step_avg:60.69ms -step:1986/2285 train_time:120525ms step_avg:60.69ms -step:1987/2285 train_time:120587ms step_avg:60.69ms -step:1988/2285 train_time:120647ms step_avg:60.69ms -step:1989/2285 train_time:120709ms step_avg:60.69ms -step:1990/2285 train_time:120769ms step_avg:60.69ms -step:1991/2285 train_time:120832ms step_avg:60.69ms -step:1992/2285 train_time:120892ms step_avg:60.69ms -step:1993/2285 train_time:120955ms step_avg:60.69ms -step:1994/2285 train_time:121015ms step_avg:60.69ms -step:1995/2285 train_time:121078ms step_avg:60.69ms -step:1996/2285 train_time:121137ms step_avg:60.69ms -step:1997/2285 train_time:121200ms step_avg:60.69ms -step:1998/2285 train_time:121260ms step_avg:60.69ms -step:1999/2285 train_time:121323ms step_avg:60.69ms -step:2000/2285 train_time:121382ms step_avg:60.69ms -step:2000/2285 val_loss:3.3172 train_time:121446ms step_avg:60.72ms -step:2001/2285 train_time:121464ms step_avg:60.70ms -step:2002/2285 train_time:121507ms step_avg:60.69ms -step:2003/2285 train_time:121569ms 
step_avg:60.69ms -step:2004/2285 train_time:121630ms step_avg:60.69ms -step:2005/2285 train_time:121693ms step_avg:60.69ms -step:2006/2285 train_time:121754ms step_avg:60.69ms -step:2007/2285 train_time:121815ms step_avg:60.70ms -step:2008/2285 train_time:121874ms step_avg:60.69ms -step:2009/2285 train_time:121936ms step_avg:60.69ms -step:2010/2285 train_time:121995ms step_avg:60.69ms -step:2011/2285 train_time:122057ms step_avg:60.69ms -step:2012/2285 train_time:122116ms step_avg:60.69ms -step:2013/2285 train_time:122178ms step_avg:60.69ms -step:2014/2285 train_time:122238ms step_avg:60.69ms -step:2015/2285 train_time:122300ms step_avg:60.69ms -step:2016/2285 train_time:122362ms step_avg:60.70ms -step:2017/2285 train_time:122426ms step_avg:60.70ms -step:2018/2285 train_time:122488ms step_avg:60.70ms -step:2019/2285 train_time:122551ms step_avg:60.70ms -step:2020/2285 train_time:122611ms step_avg:60.70ms -step:2021/2285 train_time:122674ms step_avg:60.70ms -step:2022/2285 train_time:122734ms step_avg:60.70ms -step:2023/2285 train_time:122795ms step_avg:60.70ms -step:2024/2285 train_time:122855ms step_avg:60.70ms -step:2025/2285 train_time:122916ms step_avg:60.70ms -step:2026/2285 train_time:122979ms step_avg:60.70ms -step:2027/2285 train_time:123038ms step_avg:60.70ms -step:2028/2285 train_time:123098ms step_avg:60.70ms -step:2029/2285 train_time:123159ms step_avg:60.70ms -step:2030/2285 train_time:123219ms step_avg:60.70ms -step:2031/2285 train_time:123281ms step_avg:60.70ms -step:2032/2285 train_time:123342ms step_avg:60.70ms -step:2033/2285 train_time:123406ms step_avg:60.70ms -step:2034/2285 train_time:123467ms step_avg:60.70ms -step:2035/2285 train_time:123530ms step_avg:60.70ms -step:2036/2285 train_time:123590ms step_avg:60.70ms -step:2037/2285 train_time:123654ms step_avg:60.70ms -step:2038/2285 train_time:123713ms step_avg:60.70ms -step:2039/2285 train_time:123776ms step_avg:60.70ms -step:2040/2285 train_time:123836ms step_avg:60.70ms -step:2041/2285 train_time:123897ms step_avg:60.70ms -step:2042/2285 train_time:123957ms step_avg:60.70ms -step:2043/2285 train_time:124019ms step_avg:60.70ms -step:2044/2285 train_time:124079ms step_avg:60.70ms -step:2045/2285 train_time:124141ms step_avg:60.70ms -step:2046/2285 train_time:124201ms step_avg:60.70ms -step:2047/2285 train_time:124264ms step_avg:60.71ms -step:2048/2285 train_time:124325ms step_avg:60.71ms -step:2049/2285 train_time:124388ms step_avg:60.71ms -step:2050/2285 train_time:124448ms step_avg:60.71ms -step:2051/2285 train_time:124511ms step_avg:60.71ms -step:2052/2285 train_time:124571ms step_avg:60.71ms -step:2053/2285 train_time:124634ms step_avg:60.71ms -step:2054/2285 train_time:124694ms step_avg:60.71ms -step:2055/2285 train_time:124756ms step_avg:60.71ms -step:2056/2285 train_time:124816ms step_avg:60.71ms -step:2057/2285 train_time:124878ms step_avg:60.71ms -step:2058/2285 train_time:124937ms step_avg:60.71ms -step:2059/2285 train_time:124999ms step_avg:60.71ms -step:2060/2285 train_time:125059ms step_avg:60.71ms -step:2061/2285 train_time:125121ms step_avg:60.71ms -step:2062/2285 train_time:125182ms step_avg:60.71ms -step:2063/2285 train_time:125245ms step_avg:60.71ms -step:2064/2285 train_time:125305ms step_avg:60.71ms -step:2065/2285 train_time:125368ms step_avg:60.71ms -step:2066/2285 train_time:125428ms step_avg:60.71ms -step:2067/2285 train_time:125491ms step_avg:60.71ms -step:2068/2285 train_time:125551ms step_avg:60.71ms -step:2069/2285 train_time:125613ms step_avg:60.71ms -step:2070/2285 train_time:125673ms 
step_avg:60.71ms -step:2071/2285 train_time:125735ms step_avg:60.71ms -step:2072/2285 train_time:125796ms step_avg:60.71ms -step:2073/2285 train_time:125858ms step_avg:60.71ms -step:2074/2285 train_time:125918ms step_avg:60.71ms -step:2075/2285 train_time:125980ms step_avg:60.71ms -step:2076/2285 train_time:126039ms step_avg:60.71ms -step:2077/2285 train_time:126102ms step_avg:60.71ms -step:2078/2285 train_time:126162ms step_avg:60.71ms -step:2079/2285 train_time:126225ms step_avg:60.71ms -step:2080/2285 train_time:126286ms step_avg:60.71ms -step:2081/2285 train_time:126348ms step_avg:60.71ms -step:2082/2285 train_time:126408ms step_avg:60.71ms -step:2083/2285 train_time:126471ms step_avg:60.72ms -step:2084/2285 train_time:126531ms step_avg:60.72ms -step:2085/2285 train_time:126593ms step_avg:60.72ms -step:2086/2285 train_time:126653ms step_avg:60.72ms -step:2087/2285 train_time:126715ms step_avg:60.72ms -step:2088/2285 train_time:126775ms step_avg:60.72ms -step:2089/2285 train_time:126837ms step_avg:60.72ms -step:2090/2285 train_time:126897ms step_avg:60.72ms -step:2091/2285 train_time:126960ms step_avg:60.72ms -step:2092/2285 train_time:127019ms step_avg:60.72ms -step:2093/2285 train_time:127081ms step_avg:60.72ms -step:2094/2285 train_time:127142ms step_avg:60.72ms -step:2095/2285 train_time:127205ms step_avg:60.72ms -step:2096/2285 train_time:127265ms step_avg:60.72ms -step:2097/2285 train_time:127328ms step_avg:60.72ms -step:2098/2285 train_time:127387ms step_avg:60.72ms -step:2099/2285 train_time:127449ms step_avg:60.72ms -step:2100/2285 train_time:127509ms step_avg:60.72ms -step:2101/2285 train_time:127572ms step_avg:60.72ms -step:2102/2285 train_time:127632ms step_avg:60.72ms -step:2103/2285 train_time:127694ms step_avg:60.72ms -step:2104/2285 train_time:127754ms step_avg:60.72ms -step:2105/2285 train_time:127817ms step_avg:60.72ms -step:2106/2285 train_time:127876ms step_avg:60.72ms -step:2107/2285 train_time:127939ms step_avg:60.72ms -step:2108/2285 train_time:127999ms step_avg:60.72ms -step:2109/2285 train_time:128061ms step_avg:60.72ms -step:2110/2285 train_time:128120ms step_avg:60.72ms -step:2111/2285 train_time:128184ms step_avg:60.72ms -step:2112/2285 train_time:128244ms step_avg:60.72ms -step:2113/2285 train_time:128307ms step_avg:60.72ms -step:2114/2285 train_time:128367ms step_avg:60.72ms -step:2115/2285 train_time:128430ms step_avg:60.72ms -step:2116/2285 train_time:128490ms step_avg:60.72ms -step:2117/2285 train_time:128552ms step_avg:60.72ms -step:2118/2285 train_time:128612ms step_avg:60.72ms -step:2119/2285 train_time:128677ms step_avg:60.73ms -step:2120/2285 train_time:128735ms step_avg:60.72ms -step:2121/2285 train_time:128797ms step_avg:60.72ms -step:2122/2285 train_time:128857ms step_avg:60.72ms -step:2123/2285 train_time:128920ms step_avg:60.73ms -step:2124/2285 train_time:128979ms step_avg:60.72ms -step:2125/2285 train_time:129041ms step_avg:60.73ms -step:2126/2285 train_time:129102ms step_avg:60.73ms -step:2127/2285 train_time:129164ms step_avg:60.73ms -step:2128/2285 train_time:129224ms step_avg:60.73ms -step:2129/2285 train_time:129287ms step_avg:60.73ms -step:2130/2285 train_time:129348ms step_avg:60.73ms -step:2131/2285 train_time:129410ms step_avg:60.73ms -step:2132/2285 train_time:129471ms step_avg:60.73ms -step:2133/2285 train_time:129533ms step_avg:60.73ms -step:2134/2285 train_time:129593ms step_avg:60.73ms -step:2135/2285 train_time:129655ms step_avg:60.73ms -step:2136/2285 train_time:129715ms step_avg:60.73ms -step:2137/2285 train_time:129777ms 
step_avg:60.73ms -step:2138/2285 train_time:129837ms step_avg:60.73ms -step:2139/2285 train_time:129900ms step_avg:60.73ms -step:2140/2285 train_time:129960ms step_avg:60.73ms -step:2141/2285 train_time:130022ms step_avg:60.73ms -step:2142/2285 train_time:130082ms step_avg:60.73ms -step:2143/2285 train_time:130145ms step_avg:60.73ms -step:2144/2285 train_time:130205ms step_avg:60.73ms -step:2145/2285 train_time:130268ms step_avg:60.73ms -step:2146/2285 train_time:130328ms step_avg:60.73ms -step:2147/2285 train_time:130390ms step_avg:60.73ms -step:2148/2285 train_time:130451ms step_avg:60.73ms -step:2149/2285 train_time:130513ms step_avg:60.73ms -step:2150/2285 train_time:130573ms step_avg:60.73ms -step:2151/2285 train_time:130635ms step_avg:60.73ms -step:2152/2285 train_time:130695ms step_avg:60.73ms -step:2153/2285 train_time:130757ms step_avg:60.73ms -step:2154/2285 train_time:130818ms step_avg:60.73ms -step:2155/2285 train_time:130881ms step_avg:60.73ms -step:2156/2285 train_time:130940ms step_avg:60.73ms -step:2157/2285 train_time:131002ms step_avg:60.73ms -step:2158/2285 train_time:131063ms step_avg:60.73ms -step:2159/2285 train_time:131125ms step_avg:60.73ms -step:2160/2285 train_time:131185ms step_avg:60.73ms -step:2161/2285 train_time:131248ms step_avg:60.73ms -step:2162/2285 train_time:131308ms step_avg:60.73ms -step:2163/2285 train_time:131371ms step_avg:60.74ms -step:2164/2285 train_time:131431ms step_avg:60.74ms -step:2165/2285 train_time:131493ms step_avg:60.74ms -step:2166/2285 train_time:131553ms step_avg:60.74ms -step:2167/2285 train_time:131615ms step_avg:60.74ms -step:2168/2285 train_time:131676ms step_avg:60.74ms -step:2169/2285 train_time:131738ms step_avg:60.74ms -step:2170/2285 train_time:131798ms step_avg:60.74ms -step:2171/2285 train_time:131861ms step_avg:60.74ms -step:2172/2285 train_time:131921ms step_avg:60.74ms -step:2173/2285 train_time:131983ms step_avg:60.74ms -step:2174/2285 train_time:132043ms step_avg:60.74ms -step:2175/2285 train_time:132105ms step_avg:60.74ms -step:2176/2285 train_time:132165ms step_avg:60.74ms -step:2177/2285 train_time:132228ms step_avg:60.74ms -step:2178/2285 train_time:132288ms step_avg:60.74ms -step:2179/2285 train_time:132351ms step_avg:60.74ms -step:2180/2285 train_time:132412ms step_avg:60.74ms -step:2181/2285 train_time:132474ms step_avg:60.74ms -step:2182/2285 train_time:132535ms step_avg:60.74ms -step:2183/2285 train_time:132597ms step_avg:60.74ms -step:2184/2285 train_time:132657ms step_avg:60.74ms -step:2185/2285 train_time:132720ms step_avg:60.74ms -step:2186/2285 train_time:132780ms step_avg:60.74ms -step:2187/2285 train_time:132843ms step_avg:60.74ms -step:2188/2285 train_time:132903ms step_avg:60.74ms -step:2189/2285 train_time:132965ms step_avg:60.74ms -step:2190/2285 train_time:133025ms step_avg:60.74ms -step:2191/2285 train_time:133087ms step_avg:60.74ms -step:2192/2285 train_time:133147ms step_avg:60.74ms -step:2193/2285 train_time:133210ms step_avg:60.74ms -step:2194/2285 train_time:133270ms step_avg:60.74ms -step:2195/2285 train_time:133332ms step_avg:60.74ms -step:2196/2285 train_time:133392ms step_avg:60.74ms -step:2197/2285 train_time:133454ms step_avg:60.74ms -step:2198/2285 train_time:133516ms step_avg:60.74ms -step:2199/2285 train_time:133578ms step_avg:60.74ms -step:2200/2285 train_time:133638ms step_avg:60.74ms -step:2201/2285 train_time:133701ms step_avg:60.75ms -step:2202/2285 train_time:133761ms step_avg:60.75ms -step:2203/2285 train_time:133823ms step_avg:60.75ms -step:2204/2285 train_time:133882ms 
step_avg:60.75ms -step:2205/2285 train_time:133945ms step_avg:60.75ms -step:2206/2285 train_time:134005ms step_avg:60.75ms -step:2207/2285 train_time:134067ms step_avg:60.75ms -step:2208/2285 train_time:134127ms step_avg:60.75ms -step:2209/2285 train_time:134190ms step_avg:60.75ms -step:2210/2285 train_time:134251ms step_avg:60.75ms -step:2211/2285 train_time:134313ms step_avg:60.75ms -step:2212/2285 train_time:134373ms step_avg:60.75ms -step:2213/2285 train_time:134436ms step_avg:60.75ms -step:2214/2285 train_time:134496ms step_avg:60.75ms -step:2215/2285 train_time:134558ms step_avg:60.75ms -step:2216/2285 train_time:134618ms step_avg:60.75ms -step:2217/2285 train_time:134680ms step_avg:60.75ms -step:2218/2285 train_time:134740ms step_avg:60.75ms -step:2219/2285 train_time:134803ms step_avg:60.75ms -step:2220/2285 train_time:134863ms step_avg:60.75ms -step:2221/2285 train_time:134926ms step_avg:60.75ms -step:2222/2285 train_time:134986ms step_avg:60.75ms -step:2223/2285 train_time:135048ms step_avg:60.75ms -step:2224/2285 train_time:135109ms step_avg:60.75ms -step:2225/2285 train_time:135171ms step_avg:60.75ms -step:2226/2285 train_time:135231ms step_avg:60.75ms -step:2227/2285 train_time:135293ms step_avg:60.75ms -step:2228/2285 train_time:135354ms step_avg:60.75ms -step:2229/2285 train_time:135416ms step_avg:60.75ms -step:2230/2285 train_time:135477ms step_avg:60.75ms -step:2231/2285 train_time:135539ms step_avg:60.75ms -step:2232/2285 train_time:135600ms step_avg:60.75ms -step:2233/2285 train_time:135662ms step_avg:60.75ms -step:2234/2285 train_time:135721ms step_avg:60.75ms -step:2235/2285 train_time:135783ms step_avg:60.75ms -step:2236/2285 train_time:135844ms step_avg:60.75ms -step:2237/2285 train_time:135906ms step_avg:60.75ms -step:2238/2285 train_time:135966ms step_avg:60.75ms -step:2239/2285 train_time:136029ms step_avg:60.75ms -step:2240/2285 train_time:136089ms step_avg:60.75ms -step:2241/2285 train_time:136151ms step_avg:60.75ms -step:2242/2285 train_time:136211ms step_avg:60.75ms -step:2243/2285 train_time:136273ms step_avg:60.75ms -step:2244/2285 train_time:136333ms step_avg:60.75ms -step:2245/2285 train_time:136396ms step_avg:60.76ms -step:2246/2285 train_time:136456ms step_avg:60.76ms -step:2247/2285 train_time:136519ms step_avg:60.76ms -step:2248/2285 train_time:136579ms step_avg:60.76ms -step:2249/2285 train_time:136642ms step_avg:60.76ms -step:2250/2285 train_time:136702ms step_avg:60.76ms -step:2250/2285 val_loss:3.2821 train_time:136766ms step_avg:60.78ms -step:2251/2285 train_time:136784ms step_avg:60.77ms -step:2252/2285 train_time:136830ms step_avg:60.76ms -step:2253/2285 train_time:136894ms step_avg:60.76ms -step:2254/2285 train_time:136955ms step_avg:60.76ms -step:2255/2285 train_time:137016ms step_avg:60.76ms -step:2256/2285 train_time:137076ms step_avg:60.76ms -step:2257/2285 train_time:137138ms step_avg:60.76ms -step:2258/2285 train_time:137198ms step_avg:60.76ms -step:2259/2285 train_time:137260ms step_avg:60.76ms -step:2260/2285 train_time:137320ms step_avg:60.76ms -step:2261/2285 train_time:137383ms step_avg:60.76ms -step:2262/2285 train_time:137442ms step_avg:60.76ms -step:2263/2285 train_time:137504ms step_avg:60.76ms -step:2264/2285 train_time:137564ms step_avg:60.76ms -step:2265/2285 train_time:137626ms step_avg:60.76ms -step:2266/2285 train_time:137686ms step_avg:60.76ms -step:2267/2285 train_time:137749ms step_avg:60.76ms -step:2268/2285 train_time:137811ms step_avg:60.76ms -step:2269/2285 train_time:137874ms step_avg:60.76ms -step:2270/2285 
train_time:137935ms step_avg:60.76ms -step:2271/2285 train_time:137998ms step_avg:60.77ms -step:2272/2285 train_time:138058ms step_avg:60.76ms -step:2273/2285 train_time:138120ms step_avg:60.77ms -step:2274/2285 train_time:138180ms step_avg:60.77ms -step:2275/2285 train_time:138242ms step_avg:60.77ms -step:2276/2285 train_time:138302ms step_avg:60.77ms -step:2277/2285 train_time:138363ms step_avg:60.77ms -step:2278/2285 train_time:138423ms step_avg:60.77ms -step:2279/2285 train_time:138485ms step_avg:60.77ms -step:2280/2285 train_time:138545ms step_avg:60.77ms -step:2281/2285 train_time:138607ms step_avg:60.77ms -step:2282/2285 train_time:138667ms step_avg:60.77ms -step:2283/2285 train_time:138730ms step_avg:60.77ms -step:2284/2285 train_time:138791ms step_avg:60.77ms -step:2285/2285 train_time:138854ms step_avg:60.77ms -step:2285/2285 val_loss:3.2760 train_time:138916ms step_avg:60.79ms -peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/README.md b/records/track_1_short/2025-10-27_FixMuonLR/README.md deleted file mode 100644 index 504ed2120..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Faster Muon step, corrected learning rates - -This record improves the step time of Muon and fixes some bugs in our current effective learning rate calculation. It incorporates the results from [PR#144](https://github.com/KellerJordan/modded-nanogpt/pull/144). - -## Timing and Validation - -This record shortens the final training run by 30 steps and decreases per-step time by around 1%. - -This PR: - -``` -import scipy.stats -import torch - -losses = [3.2766, 3.2794, 3.2770, 3.2776, 3.2760, 3.2802, 3.2757] -times = [138.986, 138.838, 138.877, 138.905, 138.916, 138.846, 138.937] - -print("p=%.4f" % scipy.stats.ttest_1samp(losses, 3.28, alternative="less").pvalue) -# p=0.0041 - -print("losses:", torch.std_mean(torch.tensor(losses))) -# losses: (std=0.001706, mean=3.277500) - -print("time:", torch.std_mean(torch.tensor(times))) -# time: (std=0.052171, mean=138.900711) -``` - -Previous PR (timed on same machine): - -``` -import scipy.stats -import torch - -times = [142.379, 142.156, 141.391, 142.374, 142.316] - -print("time:", torch.std_mean(torch.tensor(times))) -# time: (std=0.419136, mean=142.123200) -``` - -In total, this corresponds to roughly a $3.22$ second decrease in training time. On a faster machine (the one used for official timing), this will probably be $\approx 3.17$ seconds. - -Thank you to Prime Intellect for sponsoring my research. - -## Changes - - -### (1) Vectorized Muon Step - -I vectorized several loops inside the Muon `step`, which slightly decreases step time. I suspect we can apply `torch.compile` to a subpart of `step` for further gains as well. I moved the momentum buffers to be properties of groups rather than of per-parameter states, though this requires adding a `reset()` method (similar to `Yarn`). - -### (2) Corrected learning rate - -In the previous Muon step, `eff_lr_val` was scaling the learning rate on the attention parameters by `1/2`, since they were treated as `[dim, 4 * dim]`-shaped parameters. However, in practice they are square parameters, so we should not do this. Moving the attention reshape within `step`, so that the LR is computed on the square view, corrects this issue. - -Similarly, the MLP up-projection was also being scaled down. Following the theory that the effective learning rate should be proportional to $\sqrt{\text{output_dim}}$, I have increased `lr_mul` on the MLP up-projection to `2.0`.
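-For illustration, the corrected rule reduces to the following sketch (`eff_lr` here is a hypothetical helper written for exposition, not code from this PR; the real logic lives in the Muon `step`):
-
-```
-def eff_lr(base_lr, shape, lr_mul=1.0):
-    # effective LR = base LR * sqrt(max(1, rows / cols)) * per-parameter lr_mul
-    return base_lr * max(1.0, shape[0] / shape[1]) ** 0.5 * lr_mul
-
-print(eff_lr(0.03, (1024, 1024)))              # square attn view: 0.03 (no spurious 1/2)
-print(eff_lr(0.03, (1024, 4096), lr_mul=2.0))  # transposed MLP up-projection: 0.06
-```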
I have removed the logic that requires all parameters in the same group to share the same learning rate and weight decay. - -Both of these changes meant that our previous Muon learning rate was ~twice as high as it should be, so I've decreased it to `0.03`. I have not further tuned this value. - -### (3) LR refactoring + WS Schedule tweak - -I removed the logic for iteration extension and instead changed `get_lr` to account for a "flat" section at the end. The learning rate hyperparameters have instead been changed to fractional breakpoints, which helps when testing lower step counts. I believe that the LR schedule can be further improved. - -Since the WS schedule was also impacted by the iteration extension, I updated it from 3 parts to 6 parts. This schedule is different from the previous three-part schedule, though it performs essentially the same as the version with iteration extension. - -Additionally, I corrected a subtle bug where gradients were being summed over `grad_accum_steps` but averaged over ranks. In practice this is mostly irrelevant due to magnitude invariance; however, it causes minor precision issues for $<8$ devices. diff --git a/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt b/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt deleted file mode 100644 index 183ae8822..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/f196cb62-827b-4bb1-94f0-4169eb1c9375.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor,
x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) - w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - -@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - 
if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - 
offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
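-    Each iteration applies X <- a*X + b*(X @ X.T) @ X + c*(X @ X.T)^2 @ X with the
-    coefficients above, pushing the singular values of X toward 1.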
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
- for group, info in zip(self.param_groups, group_infos): - info["reduce_future"].wait() - - params = group["params"] - grad_chunk = info["grad_chunk"] - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - start_idx = rank * chunk_size - module_idx = start_idx if start_idx < len(params) else 0 - - num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank - - if "momentum_buffer" not in group: - group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) - momentum_buffer = group["momentum_buffer"] - # Apply momentum update to the persistent momentum buffer in-place - momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) - updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) - - grad_shape = updated_grads.shape - if params[module_idx].label == 'attn': - # Reshape attn params from [hdim, dim*4] to [4, hdim, dim] - for p in params[module_idx:module_idx + num_params]: - assert p.label == 'attn' - updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) - ref_param = params[module_idx] - param_shape = ref_param.shape - - if "second_momentum_buffer" not in group: - group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) - if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) - ) - second_momentum_buffer = group["second_momentum_buffer"] - - if "param_lr" not in group: - group["param_lr"] = ( - max(1., param_shape[-2] / param_shape[-1]) ** 0.5 - * ref_param.new_tensor( - [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - ) - - group["param_wd"] = ref_param.new_tensor( - [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - - # Determine LR and WD - eff_lr = group["lr"] * group["param_lr"] - eff_wd = group["weight_decay"] * group["param_wd"] - - # Compute zeropower for the entire chunk in a single, batched call. - if num_params == 0: - v_chunk = updated_grads - elif params[module_idx].label == "smear_gate": - # dividing by magnitude is the equivalent of SVD orthogonalization for 1D tensors - v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) - else: - v_chunk = polar_express(updated_grads) - - # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) - v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) - second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) - step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() - v_chunk.mul_(step_size) - v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) - - v_chunk = v_chunk.view(grad_shape) - - updated_params = torch.empty_like(grad_chunk) - param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) - # Apply weight decay directly to the buffer.
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
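-        # (c_fc is stored transposed as [dim, 4*dim], so Muon's sqrt(rows/cols) LR
-        # scaling floors at 1; lr_mul=2.0 restores the sqrt(4)=2 of the upright view)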
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
+
+# -----------------------------------------------------------------------------
+# Distributed data loader
+
+def _load_data_shard(file: Path):
+    header = torch.from_file(str(file), False, 256, dtype=torch.int32)  # header is 256 int32
+    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
+    assert header[1] == 1, "unsupported version"
+    num_tokens = int(header[2])  # number of tokens (claimed)
+    with file.open("rb", buffering=0) as f:
+        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)  # avoid pin_memory copy by @YouJiacheng
+        f.seek(256 * 4)
+        nbytes = f.readinto(tokens.numpy())  # avoid bytes->array copy by @YouJiacheng
+        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
+    return tokens
+
+BOS_ID = 50256
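+
+# Illustrative sketch of the shard layout expected above (hypothetical writer, not
+# part of the run): a 256-int32 header (magic, version, token count), then uint16 ids.
+# def write_shard(path, tokens):  # tokens: 1D torch.uint16 tensor
+#     header = torch.zeros(256, dtype=torch.int32)
+#     header[0], header[1], header[2] = 20240520, 1, tokens.numel()
+#     with open(path, "wb") as f:
+#         f.write(header.numpy().tobytes())
+#         f.write(tokens.numpy().tobytes())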
+
+class BOSFinder:
+    # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
+    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+        # Precompute BOS positions once per shard
+        self.tokens = tokens
+        self.size = tokens.numel()
+        self.quickload = quickload
+        if quickload:
+            # only scan the first 4 million tokens, then kick off an async thread to scan the rest
+            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+            self.thread = None
+            self.ready = threading.Event()
+            self.start()
+        else:
+            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.i = 0
+        self.world_size = world_size
+        self.batch_iter = 0
+
+    def _load(self):
+        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+            self.bos_idx = self.bos_idx_async
+
+    def next_batch(self, num_tokens_local: int, max_seq_len: int):
+        # if quickload was used, repoint to the full dataset after 5 batches
+        if self.quickload and self.batch_iter == 5:
+            self.get()
+        n = len(self.bos_idx)
+        starts = [[] for _ in range(self.world_size)]
+        ends = [[] for _ in range(self.world_size)]
+
+        idx = self.i
+        for r in range(self.world_size):
+            cur_len = 0
+            while cur_len <= num_tokens_local:
+                if idx >= n:
+                    raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
+                cur = self.bos_idx[idx]
+                starts[r].append(cur)
+                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+                          cur + max_seq_len,
+                          cur + num_tokens_local - cur_len + 1)
+                ends[r].append(end)
+                cur_len += end - cur
+                idx += 1
+
+            assert cur_len == num_tokens_local + 1
+        self.i = idx
+        self.batch_iter += 1
+        return starts, ends
+
+class DataPreloader:
+    # Helper for asynchronously loading the next shard and indexing its BOS tokens
+    def __init__(self, file_iter, world_size: int = 1):
+        self.file_iter = file_iter
+        self.world_size = world_size
+        self.thread = None
+        self.data = None
+        self.ready = threading.Event()
+
+    def _load(self):
+        tokens = _load_data_shard(next(self.file_iter))
+        self.data = (tokens, BOSFinder(tokens, self.world_size))
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+        return self.data
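+
+# Illustrative walk of how next_batch packs documents (toy numbers, not from the run):
+# with BOS at positions [0, 4, 7, 12, ...], num_tokens_local=8, max_seq_len=4, one rank:
+#   doc 0: start=0, end=min(4, 0+4, 0+8-0+1)=4   -> 4 tokens
+#   doc 1: start=4, end=min(7, 4+4, 4+8-4+1)=7   -> 3 tokens
+#   doc 2: start=7, end=min(12, 7+4, 7+8-7+1)=9  -> 2 tokens (cut short by the budget)
+# cur_len = 9 == num_tokens_local + 1, giving exactly 8 input/target pairs after the
+# one-token shift in distributed_data_generator below.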
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a Beginning-of-Sequence token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0  # for the unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted, load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1  # shorten the last document by one to account for the _targets offset
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local)
+            _targets = buf[1:].view(num_tokens_local)
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # makes it possible for the generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
+    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
+    train_batch_size: int = 2048 * 16 * 8
+    train_max_seq_len: int = 128 * 16
+    val_batch_size: int = 4 * 64 * 1024 * 8
+    # optimization
+    num_iterations: int = 2285
+    lr_schedule = (0.5, 0.98)  # breakpoints for the 3-part schedule: (flat, linear decay, flat)
+    lr_min = 0.1
+    # evaluation and logging
+    run_id: str = f"{uuid.uuid4()}"
+    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
+    save_checkpoint: bool = False
+    # attention masking
+    block_size: int = 128
+    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
+    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN
+
+args = Hyperparameters()
+
+data_path = os.environ.get("DATA_PATH", ".")
+args.train_files = os.path.join(data_path, args.train_files)
+args.val_files = os.path.join(data_path, args.val_files)
+
+# torchrun sets these env variables
+rank = int(os.environ["RANK"])
+world_size = int(os.environ["WORLD_SIZE"])
+assert 8 % world_size == 0, "world_size must be a divisor of 8"
+grad_accum_steps = 8 // world_size
+assert torch.cuda.is_available()
+device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
+torch.cuda.set_device(device)
+dist.init_process_group(backend="nccl", device_id=device)
+dist.barrier()
+master_process = (rank == 0)  # this process will do logging, checkpointing etc.
+
+# begin logging
+logfile = None
+if master_process:
+    run_id = args.run_id
+    os.makedirs("logs", exist_ok=True)
+    logfile = f"logs/{run_id}.txt"
+    print(logfile)
+def print0(s, console=False):
+    if master_process:
+        with open(logfile, "a") as f:
+            if console:
+                print(s)
+            print(s, file=f)
+
+# begin by printing this file (the Python code)
+print0(code)
+print0("="*100)
+# log information about the hardware/software environment this is running on
+print0(f"Running Python {sys.version}")
+print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
+print0(f"Running Triton version {triton.__version__}")
+
+def nvidia_smi():
+    import subprocess  # avoid top level import
+    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
+print0(nvidia_smi())
+print0("="*100)
+
+model: nn.Module = GPT(
+    vocab_size=50257,
+    num_layers=12,
+    num_heads=6,
+    head_dim=128,
+    model_dim=768,
+    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
+).cuda()
+for m in model.modules():
+    if isinstance(m, (nn.Embedding, nn.Linear)):
+        m.bfloat16()
+for param in model.parameters():
+    dist.broadcast(param.detach(), 0)
+
+# collect the parameters to optimize
+hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
+embed_params = [p for n, p in model.named_parameters() if "embed" in n]
+scalar_params = [p for p in model.parameters() if p.ndim < 2]
+head_params = [model.lm_head.weight]
+gate_params = [p for n, p in model.named_parameters() if "gate" in n]
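+
+# Illustrative arithmetic from the hyperparameters above (no new settings):
+# each optimizer step consumes train_batch_size = 2048 * 16 * 8 = 262,144 tokens
+# (split across ranks and grad-accum microbatches), so num_iterations = 2285 steps
+# cover 262144 * 2285 ≈ 5.99e8 ≈ 0.6B tokens, with individual sequences capped at
+# train_max_seq_len = 128 * 16 = 2048 tokens.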
+
+# init the optimizer(s)
+# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+def get_lr(step: int):
+    assert step < args.num_iterations
+    # Three-part schedule: flat, linear decrease, flat
+    lr_schedule = args.lr_schedule
+    x = step / args.num_iterations
+
+    if x < lr_schedule[0]:
+        return 1.0
+    elif x < lr_schedule[1]:
+        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
+        lr = 1.0 - (1.0 - args.lr_min) * progress
+    else:
+        lr = args.lr_min
+    return lr
+
+def get_ws(step: int):
+    assert step <= args.num_iterations
+    x = step / (args.num_iterations + 1)
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set muon momentum based on step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only step Muon params
+    # on odd steps, step all params
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
+
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
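+
+# Illustrative schedule values (computed from the functions above with the default
+# hyperparameters; shown for orientation only):
+#   get_lr(0)    -> 1.0    (flat until x = 0.5)
+#   get_lr(1691) -> ~0.55  (x = 1691/2285 ≈ 0.740, midpoint of the linear decay)
+#   get_lr(2284) -> 0.1    (flat tail after x = 0.98)
+#   get_ws(0)    -> (1, 3); get_ws(2285) -> (6, 13)
+#   get_muon_momentum(150) -> 0.90 (halfway through the 300-step warmup)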
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph, need to warm up each with Yarn.attn_scale
+    ws_idx = step % len(args.ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = args.ws_schedule[0]
+    else:
+        new_ws_long = args.ws_schedule[ws_idx]
+        if new_ws_long > ws_long:
+            model.yarn.apply(ws_long, new_ws_long)
+            ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # rotary buffer is not stored in state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # momentum buffer not in state dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#        Training and validation       #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    loss = 0
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
+    loss.backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
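+
+# Illustrative check of the validation accounting (assuming the 8-GPU launch recorded
+# below, so grad_accum_steps = 1): val_batch_size = 4 * 64 * 1024 * 8 = 2,097,152
+# tokens per next(val_loader), and val_steps = 1 * 10485760 // 2097152 = 5, so exactly
+# args.val_tokens = 10,485,760 tokens are scored before dist.all_reduce averages the
+# per-rank mean losses.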
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Tue Oct 28 01:51:26 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.6     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA H100 80GB HBM3          On  |   00000000:19:00.0 Off |                    0 |
+| N/A   33C    P0            122W /  700W |    5858MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   1  NVIDIA H100 80GB HBM3          On  |   00000000:3B:00.0 Off |                    0 |
+| N/A   31C    P0            125W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   2  NVIDIA H100 80GB HBM3          On  |   00000000:4C:00.0 Off |                    0 |
+| N/A   29C    P0            119W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   3  NVIDIA H100 80GB HBM3          On  |   00000000:5D:00.0 Off |                    0 |
+| N/A   31C    P0            119W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   4  NVIDIA H100 80GB HBM3          On  |   00000000:9B:00.0 Off |                    0 |
+| N/A   32C    P0            117W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   5  NVIDIA H100 80GB HBM3          On  |   00000000:BB:00.0 Off |                    0 |
+| N/A   29C    P0            118W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   6  NVIDIA H100 80GB HBM3          On  |   00000000:CB:00.0 Off |                    0 |
+| N/A   31C    P0            119W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   7  NVIDIA H100 80GB HBM3          On  |   00000000:DB:00.0 Off |                    0 |
+| N/A   28C    P0            113W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
++-----------------------------------------------------------------------------------------+
+
+====================================================================================================
+step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms
+step:1/2285 train_time:106ms step_avg:105.97ms
+step:2/2285 train_time:128ms step_avg:64.03ms
+step:3/2285 train_time:166ms step_avg:55.27ms
+step:4/2285 train_time:222ms step_avg:55.49ms
+step:5/2285 train_time:281ms step_avg:56.27ms
+step:6/2285 train_time:339ms step_avg:56.55ms
+step:7/2285 train_time:400ms step_avg:57.17ms
+step:8/2285 train_time:458ms step_avg:57.31ms
+step:9/2285 train_time:519ms step_avg:57.67ms
+step:10/2285 train_time:578ms step_avg:57.76ms
+step:11/2285 train_time:638ms step_avg:58.04ms
+step:12/2285 train_time:697ms step_avg:58.08ms
+step:13/2285 train_time:758ms step_avg:58.30ms
+step:14/2285 train_time:816ms step_avg:58.32ms
+step:15/2285 train_time:877ms step_avg:58.46ms
+step:16/2285 train_time:935ms step_avg:58.47ms
+step:17/2285
train_time:998ms step_avg:58.68ms -step:18/2285 train_time:1060ms step_avg:58.87ms -step:19/2285 train_time:1125ms step_avg:59.20ms -step:20/2285 train_time:1185ms step_avg:59.26ms -step:21/2285 train_time:1247ms step_avg:59.38ms -step:22/2285 train_time:1306ms step_avg:59.37ms -step:23/2285 train_time:1367ms step_avg:59.44ms -step:24/2285 train_time:1426ms step_avg:59.42ms -step:25/2285 train_time:1487ms step_avg:59.47ms -step:26/2285 train_time:1545ms step_avg:59.43ms -step:27/2285 train_time:1606ms step_avg:59.48ms -step:28/2285 train_time:1665ms step_avg:59.45ms -step:29/2285 train_time:1727ms step_avg:59.54ms -step:30/2285 train_time:1785ms step_avg:59.51ms -step:31/2285 train_time:1847ms step_avg:59.57ms -step:32/2285 train_time:1906ms step_avg:59.55ms -step:33/2285 train_time:1967ms step_avg:59.61ms -step:34/2285 train_time:2027ms step_avg:59.61ms -step:35/2285 train_time:2088ms step_avg:59.67ms -step:36/2285 train_time:2148ms step_avg:59.67ms -step:37/2285 train_time:2211ms step_avg:59.74ms -step:38/2285 train_time:2270ms step_avg:59.73ms -step:39/2285 train_time:2331ms step_avg:59.78ms -step:40/2285 train_time:2390ms step_avg:59.76ms -step:41/2285 train_time:2452ms step_avg:59.80ms -step:42/2285 train_time:2511ms step_avg:59.78ms -step:43/2285 train_time:2573ms step_avg:59.83ms -step:44/2285 train_time:2632ms step_avg:59.81ms -step:45/2285 train_time:2694ms step_avg:59.87ms -step:46/2285 train_time:2753ms step_avg:59.85ms -step:47/2285 train_time:2815ms step_avg:59.89ms -step:48/2285 train_time:2874ms step_avg:59.87ms -step:49/2285 train_time:2935ms step_avg:59.90ms -step:50/2285 train_time:2994ms step_avg:59.88ms -step:51/2285 train_time:3056ms step_avg:59.92ms -step:52/2285 train_time:3115ms step_avg:59.91ms -step:53/2285 train_time:3177ms step_avg:59.95ms -step:54/2285 train_time:3237ms step_avg:59.94ms -step:55/2285 train_time:3299ms step_avg:59.97ms -step:56/2285 train_time:3357ms step_avg:59.95ms -step:57/2285 train_time:3419ms step_avg:59.98ms -step:58/2285 train_time:3477ms step_avg:59.95ms -step:59/2285 train_time:3539ms step_avg:59.98ms -step:60/2285 train_time:3598ms step_avg:59.97ms -step:61/2285 train_time:3660ms step_avg:60.00ms -step:62/2285 train_time:3719ms step_avg:59.98ms -step:63/2285 train_time:3780ms step_avg:60.00ms -step:64/2285 train_time:3839ms step_avg:59.99ms -step:65/2285 train_time:3900ms step_avg:60.01ms -step:66/2285 train_time:3959ms step_avg:59.98ms -step:67/2285 train_time:4021ms step_avg:60.01ms -step:68/2285 train_time:4079ms step_avg:59.99ms -step:69/2285 train_time:4141ms step_avg:60.01ms -step:70/2285 train_time:4200ms step_avg:60.00ms -step:71/2285 train_time:4261ms step_avg:60.01ms -step:72/2285 train_time:4320ms step_avg:60.00ms -step:73/2285 train_time:4382ms step_avg:60.03ms -step:74/2285 train_time:4441ms step_avg:60.01ms -step:75/2285 train_time:4502ms step_avg:60.03ms -step:76/2285 train_time:4562ms step_avg:60.02ms -step:77/2285 train_time:4623ms step_avg:60.04ms -step:78/2285 train_time:4681ms step_avg:60.02ms -step:79/2285 train_time:4743ms step_avg:60.04ms -step:80/2285 train_time:4801ms step_avg:60.02ms -step:81/2285 train_time:4863ms step_avg:60.03ms -step:82/2285 train_time:4921ms step_avg:60.02ms -step:83/2285 train_time:4982ms step_avg:60.02ms -step:84/2285 train_time:5041ms step_avg:60.01ms -step:85/2285 train_time:5101ms step_avg:60.02ms -step:86/2285 train_time:5161ms step_avg:60.01ms -step:87/2285 train_time:5221ms step_avg:60.01ms -step:88/2285 train_time:5280ms step_avg:60.00ms -step:89/2285 train_time:5342ms 
step_avg:60.02ms -step:90/2285 train_time:5400ms step_avg:60.00ms -step:91/2285 train_time:5461ms step_avg:60.01ms -step:92/2285 train_time:5520ms step_avg:60.00ms -step:93/2285 train_time:5581ms step_avg:60.01ms -step:94/2285 train_time:5639ms step_avg:59.99ms -step:95/2285 train_time:5701ms step_avg:60.01ms -step:96/2285 train_time:5759ms step_avg:59.99ms -step:97/2285 train_time:5821ms step_avg:60.01ms -step:98/2285 train_time:5879ms step_avg:59.99ms -step:99/2285 train_time:5940ms step_avg:60.00ms -step:100/2285 train_time:5999ms step_avg:59.99ms -step:101/2285 train_time:6060ms step_avg:60.00ms -step:102/2285 train_time:6119ms step_avg:59.99ms -step:103/2285 train_time:6180ms step_avg:60.00ms -step:104/2285 train_time:6238ms step_avg:59.98ms -step:105/2285 train_time:6300ms step_avg:60.00ms -step:106/2285 train_time:6359ms step_avg:59.99ms -step:107/2285 train_time:6420ms step_avg:60.00ms -step:108/2285 train_time:6479ms step_avg:59.99ms -step:109/2285 train_time:6541ms step_avg:60.01ms -step:110/2285 train_time:6599ms step_avg:59.99ms -step:111/2285 train_time:6660ms step_avg:60.00ms -step:112/2285 train_time:6719ms step_avg:59.99ms -step:113/2285 train_time:6780ms step_avg:60.00ms -step:114/2285 train_time:6839ms step_avg:59.99ms -step:115/2285 train_time:6900ms step_avg:60.00ms -step:116/2285 train_time:6958ms step_avg:59.99ms -step:117/2285 train_time:7020ms step_avg:60.00ms -step:118/2285 train_time:7078ms step_avg:59.99ms -step:119/2285 train_time:7140ms step_avg:60.00ms -step:120/2285 train_time:7199ms step_avg:59.99ms -step:121/2285 train_time:7260ms step_avg:60.00ms -step:122/2285 train_time:7318ms step_avg:59.99ms -step:123/2285 train_time:7380ms step_avg:60.00ms -step:124/2285 train_time:7439ms step_avg:59.99ms -step:125/2285 train_time:7501ms step_avg:60.00ms -step:126/2285 train_time:7559ms step_avg:59.99ms -step:127/2285 train_time:7621ms step_avg:60.01ms -step:128/2285 train_time:7680ms step_avg:60.00ms -step:129/2285 train_time:7741ms step_avg:60.01ms -step:130/2285 train_time:7799ms step_avg:60.00ms -step:131/2285 train_time:7860ms step_avg:60.00ms -step:132/2285 train_time:7919ms step_avg:59.99ms -step:133/2285 train_time:7980ms step_avg:60.00ms -step:134/2285 train_time:8038ms step_avg:59.99ms -step:135/2285 train_time:8099ms step_avg:60.00ms -step:136/2285 train_time:8158ms step_avg:59.99ms -step:137/2285 train_time:8219ms step_avg:59.99ms -step:138/2285 train_time:8278ms step_avg:59.99ms -step:139/2285 train_time:8341ms step_avg:60.00ms -step:140/2285 train_time:8399ms step_avg:59.99ms -step:141/2285 train_time:8460ms step_avg:60.00ms -step:142/2285 train_time:8519ms step_avg:59.99ms -step:143/2285 train_time:8580ms step_avg:60.00ms -step:144/2285 train_time:8639ms step_avg:59.99ms -step:145/2285 train_time:8701ms step_avg:60.01ms -step:146/2285 train_time:8759ms step_avg:59.99ms -step:147/2285 train_time:8820ms step_avg:60.00ms -step:148/2285 train_time:8879ms step_avg:60.00ms -step:149/2285 train_time:8940ms step_avg:60.00ms -step:150/2285 train_time:8999ms step_avg:59.99ms -step:151/2285 train_time:9060ms step_avg:60.00ms -step:152/2285 train_time:9118ms step_avg:59.99ms -step:153/2285 train_time:9180ms step_avg:60.00ms -step:154/2285 train_time:9238ms step_avg:59.99ms -step:155/2285 train_time:9299ms step_avg:60.00ms -step:156/2285 train_time:9358ms step_avg:59.99ms -step:157/2285 train_time:9419ms step_avg:59.99ms -step:158/2285 train_time:9478ms step_avg:59.99ms -step:159/2285 train_time:9539ms step_avg:59.99ms -step:160/2285 train_time:9598ms 
step_avg:59.99ms -step:161/2285 train_time:9659ms step_avg:60.00ms -step:162/2285 train_time:9718ms step_avg:59.99ms -step:163/2285 train_time:9779ms step_avg:59.99ms -step:164/2285 train_time:9838ms step_avg:59.99ms -step:165/2285 train_time:9899ms step_avg:59.99ms -step:166/2285 train_time:9958ms step_avg:59.99ms -step:167/2285 train_time:10019ms step_avg:59.99ms -step:168/2285 train_time:10078ms step_avg:59.99ms -step:169/2285 train_time:10140ms step_avg:60.00ms -step:170/2285 train_time:10198ms step_avg:59.99ms -step:171/2285 train_time:10259ms step_avg:60.00ms -step:172/2285 train_time:10318ms step_avg:59.99ms -step:173/2285 train_time:10379ms step_avg:59.99ms -step:174/2285 train_time:10437ms step_avg:59.98ms -step:175/2285 train_time:10499ms step_avg:59.99ms -step:176/2285 train_time:10558ms step_avg:59.99ms -step:177/2285 train_time:10619ms step_avg:59.99ms -step:178/2285 train_time:10678ms step_avg:59.99ms -step:179/2285 train_time:10740ms step_avg:60.00ms -step:180/2285 train_time:10798ms step_avg:59.99ms -step:181/2285 train_time:10859ms step_avg:60.00ms -step:182/2285 train_time:10918ms step_avg:59.99ms -step:183/2285 train_time:10979ms step_avg:59.99ms -step:184/2285 train_time:11038ms step_avg:59.99ms -step:185/2285 train_time:11099ms step_avg:60.00ms -step:186/2285 train_time:11158ms step_avg:59.99ms -step:187/2285 train_time:11219ms step_avg:60.00ms -step:188/2285 train_time:11278ms step_avg:59.99ms -step:189/2285 train_time:11339ms step_avg:59.99ms -step:190/2285 train_time:11398ms step_avg:59.99ms -step:191/2285 train_time:11459ms step_avg:59.99ms -step:192/2285 train_time:11518ms step_avg:59.99ms -step:193/2285 train_time:11579ms step_avg:59.99ms -step:194/2285 train_time:11638ms step_avg:59.99ms -step:195/2285 train_time:11699ms step_avg:60.00ms -step:196/2285 train_time:11758ms step_avg:59.99ms -step:197/2285 train_time:11819ms step_avg:60.00ms -step:198/2285 train_time:11878ms step_avg:59.99ms -step:199/2285 train_time:11939ms step_avg:60.00ms -step:200/2285 train_time:11998ms step_avg:59.99ms -step:201/2285 train_time:12059ms step_avg:60.00ms -step:202/2285 train_time:12118ms step_avg:59.99ms -step:203/2285 train_time:12179ms step_avg:59.99ms -step:204/2285 train_time:12238ms step_avg:59.99ms -step:205/2285 train_time:12299ms step_avg:60.00ms -step:206/2285 train_time:12358ms step_avg:59.99ms -step:207/2285 train_time:12419ms step_avg:59.99ms -step:208/2285 train_time:12477ms step_avg:59.98ms -step:209/2285 train_time:12538ms step_avg:59.99ms -step:210/2285 train_time:12597ms step_avg:59.99ms -step:211/2285 train_time:12658ms step_avg:59.99ms -step:212/2285 train_time:12717ms step_avg:59.98ms -step:213/2285 train_time:12778ms step_avg:59.99ms -step:214/2285 train_time:12837ms step_avg:59.98ms -step:215/2285 train_time:12899ms step_avg:59.99ms -step:216/2285 train_time:12957ms step_avg:59.99ms -step:217/2285 train_time:13019ms step_avg:60.00ms -step:218/2285 train_time:13078ms step_avg:59.99ms -step:219/2285 train_time:13141ms step_avg:60.00ms -step:220/2285 train_time:13198ms step_avg:59.99ms -step:221/2285 train_time:13259ms step_avg:60.00ms -step:222/2285 train_time:13318ms step_avg:59.99ms -step:223/2285 train_time:13379ms step_avg:59.99ms -step:224/2285 train_time:13437ms step_avg:59.99ms -step:225/2285 train_time:13499ms step_avg:59.99ms -step:226/2285 train_time:13558ms step_avg:59.99ms -step:227/2285 train_time:13619ms step_avg:59.99ms -step:228/2285 train_time:13677ms step_avg:59.99ms -step:229/2285 train_time:13739ms step_avg:60.00ms -step:230/2285 
train_time:13798ms step_avg:59.99ms -step:231/2285 train_time:13859ms step_avg:60.00ms -step:232/2285 train_time:13917ms step_avg:59.99ms -step:233/2285 train_time:13979ms step_avg:59.99ms -step:234/2285 train_time:14037ms step_avg:59.99ms -step:235/2285 train_time:14098ms step_avg:59.99ms -step:236/2285 train_time:14157ms step_avg:59.99ms -step:237/2285 train_time:14218ms step_avg:59.99ms -step:238/2285 train_time:14277ms step_avg:59.99ms -step:239/2285 train_time:14337ms step_avg:59.99ms -step:240/2285 train_time:14396ms step_avg:59.98ms -step:241/2285 train_time:14458ms step_avg:59.99ms -step:242/2285 train_time:14516ms step_avg:59.99ms -step:243/2285 train_time:14577ms step_avg:59.99ms -step:244/2285 train_time:14636ms step_avg:59.99ms -step:245/2285 train_time:14698ms step_avg:59.99ms -step:246/2285 train_time:14757ms step_avg:59.99ms -step:247/2285 train_time:14818ms step_avg:59.99ms -step:248/2285 train_time:14877ms step_avg:59.99ms -step:249/2285 train_time:14938ms step_avg:59.99ms -step:250/2285 train_time:14998ms step_avg:59.99ms -step:250/2285 val_loss:4.0701 train_time:15061ms step_avg:60.24ms -step:251/2285 train_time:15080ms step_avg:60.08ms -step:252/2285 train_time:15123ms step_avg:60.01ms -step:253/2285 train_time:15187ms step_avg:60.03ms -step:254/2285 train_time:15251ms step_avg:60.04ms -step:255/2285 train_time:15315ms step_avg:60.06ms -step:256/2285 train_time:15373ms step_avg:60.05ms -step:257/2285 train_time:15434ms step_avg:60.05ms -step:258/2285 train_time:15492ms step_avg:60.05ms -step:259/2285 train_time:15553ms step_avg:60.05ms -step:260/2285 train_time:15611ms step_avg:60.04ms -step:261/2285 train_time:15671ms step_avg:60.04ms -step:262/2285 train_time:15728ms step_avg:60.03ms -step:263/2285 train_time:15788ms step_avg:60.03ms -step:264/2285 train_time:15847ms step_avg:60.03ms -step:265/2285 train_time:15906ms step_avg:60.02ms -step:266/2285 train_time:15964ms step_avg:60.01ms -step:267/2285 train_time:16025ms step_avg:60.02ms -step:268/2285 train_time:16084ms step_avg:60.01ms -step:269/2285 train_time:16146ms step_avg:60.02ms -step:270/2285 train_time:16206ms step_avg:60.02ms -step:271/2285 train_time:16269ms step_avg:60.03ms -step:272/2285 train_time:16328ms step_avg:60.03ms -step:273/2285 train_time:16389ms step_avg:60.03ms -step:274/2285 train_time:16447ms step_avg:60.03ms -step:275/2285 train_time:16508ms step_avg:60.03ms -step:276/2285 train_time:16566ms step_avg:60.02ms -step:277/2285 train_time:16627ms step_avg:60.02ms -step:278/2285 train_time:16685ms step_avg:60.02ms -step:279/2285 train_time:16746ms step_avg:60.02ms -step:280/2285 train_time:16804ms step_avg:60.01ms -step:281/2285 train_time:16864ms step_avg:60.02ms -step:282/2285 train_time:16923ms step_avg:60.01ms -step:283/2285 train_time:16983ms step_avg:60.01ms -step:284/2285 train_time:17042ms step_avg:60.01ms -step:285/2285 train_time:17103ms step_avg:60.01ms -step:286/2285 train_time:17162ms step_avg:60.01ms -step:287/2285 train_time:17224ms step_avg:60.01ms -step:288/2285 train_time:17283ms step_avg:60.01ms -step:289/2285 train_time:17344ms step_avg:60.01ms -step:290/2285 train_time:17403ms step_avg:60.01ms -step:291/2285 train_time:17465ms step_avg:60.02ms -step:292/2285 train_time:17523ms step_avg:60.01ms -step:293/2285 train_time:17584ms step_avg:60.01ms -step:294/2285 train_time:17642ms step_avg:60.01ms -step:295/2285 train_time:17703ms step_avg:60.01ms -step:296/2285 train_time:17761ms step_avg:60.00ms -step:297/2285 train_time:17822ms step_avg:60.01ms -step:298/2285 train_time:17880ms 
step_avg:60.00ms -step:299/2285 train_time:17940ms step_avg:60.00ms -step:300/2285 train_time:17999ms step_avg:60.00ms -step:301/2285 train_time:18060ms step_avg:60.00ms -step:302/2285 train_time:18119ms step_avg:60.00ms -step:303/2285 train_time:18180ms step_avg:60.00ms -step:304/2285 train_time:18242ms step_avg:60.01ms -step:305/2285 train_time:18301ms step_avg:60.00ms -step:306/2285 train_time:18361ms step_avg:60.00ms -step:307/2285 train_time:18422ms step_avg:60.01ms -step:308/2285 train_time:18481ms step_avg:60.00ms -step:309/2285 train_time:18543ms step_avg:60.01ms -step:310/2285 train_time:18601ms step_avg:60.00ms -step:311/2285 train_time:18662ms step_avg:60.01ms -step:312/2285 train_time:18720ms step_avg:60.00ms -step:313/2285 train_time:18781ms step_avg:60.00ms -step:314/2285 train_time:18842ms step_avg:60.00ms -step:315/2285 train_time:18900ms step_avg:60.00ms -step:316/2285 train_time:18959ms step_avg:60.00ms -step:317/2285 train_time:19020ms step_avg:60.00ms -step:318/2285 train_time:19079ms step_avg:60.00ms -step:319/2285 train_time:19140ms step_avg:60.00ms -step:320/2285 train_time:19199ms step_avg:60.00ms -step:321/2285 train_time:19261ms step_avg:60.00ms -step:322/2285 train_time:19320ms step_avg:60.00ms -step:323/2285 train_time:19382ms step_avg:60.00ms -step:324/2285 train_time:19441ms step_avg:60.00ms -step:325/2285 train_time:19503ms step_avg:60.01ms -step:326/2285 train_time:19562ms step_avg:60.00ms -step:327/2285 train_time:19622ms step_avg:60.01ms -step:328/2285 train_time:19681ms step_avg:60.00ms -step:329/2285 train_time:19741ms step_avg:60.00ms -step:330/2285 train_time:19800ms step_avg:60.00ms -step:331/2285 train_time:19861ms step_avg:60.00ms -step:332/2285 train_time:19919ms step_avg:60.00ms -step:333/2285 train_time:19980ms step_avg:60.00ms -step:334/2285 train_time:20039ms step_avg:60.00ms -step:335/2285 train_time:20100ms step_avg:60.00ms -step:336/2285 train_time:20159ms step_avg:60.00ms -step:337/2285 train_time:20220ms step_avg:60.00ms -step:338/2285 train_time:20279ms step_avg:60.00ms -step:339/2285 train_time:20342ms step_avg:60.01ms -step:340/2285 train_time:20400ms step_avg:60.00ms -step:341/2285 train_time:20461ms step_avg:60.00ms -step:342/2285 train_time:20520ms step_avg:60.00ms -step:343/2285 train_time:20581ms step_avg:60.00ms -step:344/2285 train_time:20640ms step_avg:60.00ms -step:345/2285 train_time:20702ms step_avg:60.01ms -step:346/2285 train_time:20761ms step_avg:60.00ms -step:347/2285 train_time:20822ms step_avg:60.01ms -step:348/2285 train_time:20881ms step_avg:60.00ms -step:349/2285 train_time:20944ms step_avg:60.01ms -step:350/2285 train_time:21001ms step_avg:60.00ms -step:351/2285 train_time:21061ms step_avg:60.00ms -step:352/2285 train_time:21120ms step_avg:60.00ms -step:353/2285 train_time:21181ms step_avg:60.00ms -step:354/2285 train_time:21239ms step_avg:60.00ms -step:355/2285 train_time:21300ms step_avg:60.00ms -step:356/2285 train_time:21359ms step_avg:60.00ms -step:357/2285 train_time:21420ms step_avg:60.00ms -step:358/2285 train_time:21479ms step_avg:60.00ms -step:359/2285 train_time:21541ms step_avg:60.00ms -step:360/2285 train_time:21601ms step_avg:60.00ms -step:361/2285 train_time:21661ms step_avg:60.00ms -step:362/2285 train_time:21720ms step_avg:60.00ms -step:363/2285 train_time:21780ms step_avg:60.00ms -step:364/2285 train_time:21839ms step_avg:60.00ms -step:365/2285 train_time:21900ms step_avg:60.00ms -step:366/2285 train_time:21959ms step_avg:60.00ms -step:367/2285 train_time:22020ms step_avg:60.00ms -step:368/2285 
train_time:22078ms step_avg:60.00ms -step:369/2285 train_time:22139ms step_avg:60.00ms -step:370/2285 train_time:22198ms step_avg:59.99ms -step:371/2285 train_time:22258ms step_avg:60.00ms -step:372/2285 train_time:22317ms step_avg:59.99ms -step:373/2285 train_time:22378ms step_avg:59.99ms -step:374/2285 train_time:22437ms step_avg:59.99ms -step:375/2285 train_time:22498ms step_avg:59.99ms -step:376/2285 train_time:22557ms step_avg:59.99ms -step:377/2285 train_time:22618ms step_avg:59.99ms -step:378/2285 train_time:22677ms step_avg:59.99ms -step:379/2285 train_time:22738ms step_avg:59.99ms -step:380/2285 train_time:22797ms step_avg:59.99ms -step:381/2285 train_time:22858ms step_avg:59.99ms -step:382/2285 train_time:22917ms step_avg:59.99ms -step:383/2285 train_time:22978ms step_avg:60.00ms -step:384/2285 train_time:23037ms step_avg:59.99ms -step:385/2285 train_time:23098ms step_avg:59.99ms -step:386/2285 train_time:23157ms step_avg:59.99ms -step:387/2285 train_time:23218ms step_avg:60.00ms -step:388/2285 train_time:23277ms step_avg:59.99ms -step:389/2285 train_time:23338ms step_avg:60.00ms -step:390/2285 train_time:23398ms step_avg:59.99ms -step:391/2285 train_time:23459ms step_avg:60.00ms -step:392/2285 train_time:23519ms step_avg:60.00ms -step:393/2285 train_time:23580ms step_avg:60.00ms -step:394/2285 train_time:23639ms step_avg:60.00ms -step:395/2285 train_time:23700ms step_avg:60.00ms -step:396/2285 train_time:23760ms step_avg:60.00ms -step:397/2285 train_time:23821ms step_avg:60.00ms -step:398/2285 train_time:23880ms step_avg:60.00ms -step:399/2285 train_time:23942ms step_avg:60.01ms -step:400/2285 train_time:24001ms step_avg:60.00ms -step:401/2285 train_time:24062ms step_avg:60.01ms -step:402/2285 train_time:24122ms step_avg:60.00ms -step:403/2285 train_time:24182ms step_avg:60.01ms -step:404/2285 train_time:24242ms step_avg:60.00ms -step:405/2285 train_time:24303ms step_avg:60.01ms -step:406/2285 train_time:24362ms step_avg:60.00ms -step:407/2285 train_time:24423ms step_avg:60.01ms -step:408/2285 train_time:24482ms step_avg:60.00ms -step:409/2285 train_time:24543ms step_avg:60.01ms -step:410/2285 train_time:24602ms step_avg:60.01ms -step:411/2285 train_time:24663ms step_avg:60.01ms -step:412/2285 train_time:24722ms step_avg:60.00ms -step:413/2285 train_time:24784ms step_avg:60.01ms -step:414/2285 train_time:24842ms step_avg:60.01ms -step:415/2285 train_time:24904ms step_avg:60.01ms -step:416/2285 train_time:24963ms step_avg:60.01ms -step:417/2285 train_time:25024ms step_avg:60.01ms -step:418/2285 train_time:25083ms step_avg:60.01ms -step:419/2285 train_time:25144ms step_avg:60.01ms -step:420/2285 train_time:25203ms step_avg:60.01ms -step:421/2285 train_time:25264ms step_avg:60.01ms -step:422/2285 train_time:25323ms step_avg:60.01ms -step:423/2285 train_time:25385ms step_avg:60.01ms -step:424/2285 train_time:25444ms step_avg:60.01ms -step:425/2285 train_time:25506ms step_avg:60.01ms -step:426/2285 train_time:25565ms step_avg:60.01ms -step:427/2285 train_time:25626ms step_avg:60.01ms -step:428/2285 train_time:25685ms step_avg:60.01ms -step:429/2285 train_time:25746ms step_avg:60.01ms -step:430/2285 train_time:25805ms step_avg:60.01ms -step:431/2285 train_time:25867ms step_avg:60.02ms -step:432/2285 train_time:25926ms step_avg:60.01ms -step:433/2285 train_time:25987ms step_avg:60.02ms -step:434/2285 train_time:26046ms step_avg:60.01ms -step:435/2285 train_time:26107ms step_avg:60.02ms -step:436/2285 train_time:26166ms step_avg:60.01ms -step:437/2285 train_time:26227ms step_avg:60.02ms 
-step:438/2285 train_time:26286ms step_avg:60.01ms -step:439/2285 train_time:26348ms step_avg:60.02ms -step:440/2285 train_time:26407ms step_avg:60.02ms -step:441/2285 train_time:26468ms step_avg:60.02ms -step:442/2285 train_time:26527ms step_avg:60.02ms -step:443/2285 train_time:26588ms step_avg:60.02ms -step:444/2285 train_time:26647ms step_avg:60.02ms -step:445/2285 train_time:26708ms step_avg:60.02ms -step:446/2285 train_time:26767ms step_avg:60.02ms -step:447/2285 train_time:26828ms step_avg:60.02ms -step:448/2285 train_time:26887ms step_avg:60.02ms -step:449/2285 train_time:26948ms step_avg:60.02ms -step:450/2285 train_time:27007ms step_avg:60.01ms -step:451/2285 train_time:27068ms step_avg:60.02ms -step:452/2285 train_time:27126ms step_avg:60.01ms -step:453/2285 train_time:27187ms step_avg:60.02ms -step:454/2285 train_time:27246ms step_avg:60.01ms -step:455/2285 train_time:27307ms step_avg:60.02ms -step:456/2285 train_time:27366ms step_avg:60.01ms -step:457/2285 train_time:27428ms step_avg:60.02ms -step:458/2285 train_time:27487ms step_avg:60.01ms -step:459/2285 train_time:27548ms step_avg:60.02ms -step:460/2285 train_time:27607ms step_avg:60.02ms -step:461/2285 train_time:27669ms step_avg:60.02ms -step:462/2285 train_time:27728ms step_avg:60.02ms -step:463/2285 train_time:27790ms step_avg:60.02ms -step:464/2285 train_time:27849ms step_avg:60.02ms -step:465/2285 train_time:27910ms step_avg:60.02ms -step:466/2285 train_time:27970ms step_avg:60.02ms -step:467/2285 train_time:28030ms step_avg:60.02ms -step:468/2285 train_time:28089ms step_avg:60.02ms -step:469/2285 train_time:28152ms step_avg:60.03ms -step:470/2285 train_time:28210ms step_avg:60.02ms -step:471/2285 train_time:28271ms step_avg:60.02ms -step:472/2285 train_time:28330ms step_avg:60.02ms -step:473/2285 train_time:28391ms step_avg:60.02ms -step:474/2285 train_time:28450ms step_avg:60.02ms -step:475/2285 train_time:28512ms step_avg:60.02ms -step:476/2285 train_time:28570ms step_avg:60.02ms -step:477/2285 train_time:28632ms step_avg:60.02ms -step:478/2285 train_time:28690ms step_avg:60.02ms -step:479/2285 train_time:28752ms step_avg:60.03ms -step:480/2285 train_time:28811ms step_avg:60.02ms -step:481/2285 train_time:28873ms step_avg:60.03ms -step:482/2285 train_time:28932ms step_avg:60.02ms -step:483/2285 train_time:28993ms step_avg:60.03ms -step:484/2285 train_time:29052ms step_avg:60.02ms -step:485/2285 train_time:29113ms step_avg:60.03ms -step:486/2285 train_time:29172ms step_avg:60.02ms -step:487/2285 train_time:29233ms step_avg:60.03ms -step:488/2285 train_time:29292ms step_avg:60.02ms -step:489/2285 train_time:29353ms step_avg:60.03ms -step:490/2285 train_time:29412ms step_avg:60.03ms -step:491/2285 train_time:29474ms step_avg:60.03ms -step:492/2285 train_time:29533ms step_avg:60.03ms -step:493/2285 train_time:29595ms step_avg:60.03ms -step:494/2285 train_time:29655ms step_avg:60.03ms -step:495/2285 train_time:29716ms step_avg:60.03ms -step:496/2285 train_time:29775ms step_avg:60.03ms -step:497/2285 train_time:29836ms step_avg:60.03ms -step:498/2285 train_time:29895ms step_avg:60.03ms -step:499/2285 train_time:29956ms step_avg:60.03ms -step:500/2285 train_time:30015ms step_avg:60.03ms -step:500/2285 val_loss:3.7855 train_time:30078ms step_avg:60.16ms -step:501/2285 train_time:30097ms step_avg:60.07ms -step:502/2285 train_time:30139ms step_avg:60.04ms -step:503/2285 train_time:30201ms step_avg:60.04ms -step:504/2285 train_time:30261ms step_avg:60.04ms -step:505/2285 train_time:30324ms step_avg:60.05ms -step:506/2285 
train_time:30384ms step_avg:60.05ms -step:507/2285 train_time:30445ms step_avg:60.05ms -step:508/2285 train_time:30504ms step_avg:60.05ms -step:509/2285 train_time:30565ms step_avg:60.05ms -step:510/2285 train_time:30623ms step_avg:60.05ms -step:511/2285 train_time:30683ms step_avg:60.05ms -step:512/2285 train_time:30741ms step_avg:60.04ms -step:513/2285 train_time:30803ms step_avg:60.05ms -step:514/2285 train_time:30862ms step_avg:60.04ms -step:515/2285 train_time:30923ms step_avg:60.04ms -step:516/2285 train_time:30985ms step_avg:60.05ms -step:517/2285 train_time:31051ms step_avg:60.06ms -step:518/2285 train_time:31111ms step_avg:60.06ms -step:519/2285 train_time:31173ms step_avg:60.06ms -step:520/2285 train_time:31232ms step_avg:60.06ms -step:521/2285 train_time:31294ms step_avg:60.06ms -step:522/2285 train_time:31353ms step_avg:60.06ms -step:523/2285 train_time:31414ms step_avg:60.06ms -step:524/2285 train_time:31472ms step_avg:60.06ms -step:525/2285 train_time:31533ms step_avg:60.06ms -step:526/2285 train_time:31592ms step_avg:60.06ms -step:527/2285 train_time:31653ms step_avg:60.06ms -step:528/2285 train_time:31712ms step_avg:60.06ms -step:529/2285 train_time:31773ms step_avg:60.06ms -step:530/2285 train_time:31831ms step_avg:60.06ms -step:531/2285 train_time:31894ms step_avg:60.06ms -step:532/2285 train_time:31954ms step_avg:60.06ms -step:533/2285 train_time:32016ms step_avg:60.07ms -step:534/2285 train_time:32075ms step_avg:60.07ms -step:535/2285 train_time:32136ms step_avg:60.07ms -step:536/2285 train_time:32195ms step_avg:60.06ms -step:537/2285 train_time:32256ms step_avg:60.07ms -step:538/2285 train_time:32315ms step_avg:60.06ms -step:539/2285 train_time:32376ms step_avg:60.07ms -step:540/2285 train_time:32435ms step_avg:60.06ms -step:541/2285 train_time:32497ms step_avg:60.07ms -step:542/2285 train_time:32555ms step_avg:60.07ms -step:543/2285 train_time:32617ms step_avg:60.07ms -step:544/2285 train_time:32675ms step_avg:60.06ms -step:545/2285 train_time:32737ms step_avg:60.07ms -step:546/2285 train_time:32796ms step_avg:60.07ms -step:547/2285 train_time:32857ms step_avg:60.07ms -step:548/2285 train_time:32916ms step_avg:60.07ms -step:549/2285 train_time:32977ms step_avg:60.07ms -step:550/2285 train_time:33037ms step_avg:60.07ms -step:551/2285 train_time:33098ms step_avg:60.07ms -step:552/2285 train_time:33157ms step_avg:60.07ms -step:553/2285 train_time:33218ms step_avg:60.07ms -step:554/2285 train_time:33277ms step_avg:60.07ms -step:555/2285 train_time:33339ms step_avg:60.07ms -step:556/2285 train_time:33398ms step_avg:60.07ms -step:557/2285 train_time:33460ms step_avg:60.07ms -step:558/2285 train_time:33519ms step_avg:60.07ms -step:559/2285 train_time:33581ms step_avg:60.07ms -step:560/2285 train_time:33640ms step_avg:60.07ms -step:561/2285 train_time:33703ms step_avg:60.08ms -step:562/2285 train_time:33762ms step_avg:60.08ms -step:563/2285 train_time:33824ms step_avg:60.08ms -step:564/2285 train_time:33883ms step_avg:60.08ms -step:565/2285 train_time:33945ms step_avg:60.08ms -step:566/2285 train_time:34005ms step_avg:60.08ms -step:567/2285 train_time:34066ms step_avg:60.08ms -step:568/2285 train_time:34125ms step_avg:60.08ms -step:569/2285 train_time:34186ms step_avg:60.08ms -step:570/2285 train_time:34246ms step_avg:60.08ms -step:571/2285 train_time:34307ms step_avg:60.08ms -step:572/2285 train_time:34367ms step_avg:60.08ms -step:573/2285 train_time:34428ms step_avg:60.08ms -step:574/2285 train_time:34487ms step_avg:60.08ms -step:575/2285 train_time:34548ms step_avg:60.08ms 
-step:576/2285 train_time:34607ms step_avg:60.08ms -step:577/2285 train_time:34669ms step_avg:60.09ms -step:578/2285 train_time:34728ms step_avg:60.08ms -step:579/2285 train_time:34789ms step_avg:60.09ms -step:580/2285 train_time:34848ms step_avg:60.08ms -step:581/2285 train_time:34910ms step_avg:60.09ms -step:582/2285 train_time:34969ms step_avg:60.08ms -step:583/2285 train_time:35031ms step_avg:60.09ms -step:584/2285 train_time:35089ms step_avg:60.08ms -step:585/2285 train_time:35151ms step_avg:60.09ms -step:586/2285 train_time:35210ms step_avg:60.09ms -step:587/2285 train_time:35272ms step_avg:60.09ms -step:588/2285 train_time:35331ms step_avg:60.09ms -step:589/2285 train_time:35392ms step_avg:60.09ms -step:590/2285 train_time:35450ms step_avg:60.09ms -step:591/2285 train_time:35511ms step_avg:60.09ms -step:592/2285 train_time:35570ms step_avg:60.08ms -step:593/2285 train_time:35631ms step_avg:60.09ms -step:594/2285 train_time:35690ms step_avg:60.08ms -step:595/2285 train_time:35752ms step_avg:60.09ms -step:596/2285 train_time:35811ms step_avg:60.08ms -step:597/2285 train_time:35872ms step_avg:60.09ms -step:598/2285 train_time:35930ms step_avg:60.08ms -step:599/2285 train_time:35991ms step_avg:60.09ms -step:600/2285 train_time:36050ms step_avg:60.08ms -step:601/2285 train_time:36112ms step_avg:60.09ms -step:602/2285 train_time:36171ms step_avg:60.08ms -step:603/2285 train_time:36232ms step_avg:60.09ms -step:604/2285 train_time:36291ms step_avg:60.08ms -step:605/2285 train_time:36352ms step_avg:60.09ms -step:606/2285 train_time:36411ms step_avg:60.08ms -step:607/2285 train_time:36472ms step_avg:60.09ms -step:608/2285 train_time:36530ms step_avg:60.08ms -step:609/2285 train_time:36592ms step_avg:60.08ms -step:610/2285 train_time:36650ms step_avg:60.08ms -step:611/2285 train_time:36711ms step_avg:60.08ms -step:612/2285 train_time:36770ms step_avg:60.08ms -step:613/2285 train_time:36832ms step_avg:60.08ms -step:614/2285 train_time:36891ms step_avg:60.08ms -step:615/2285 train_time:36953ms step_avg:60.09ms -step:616/2285 train_time:37012ms step_avg:60.08ms -step:617/2285 train_time:37073ms step_avg:60.09ms -step:618/2285 train_time:37132ms step_avg:60.08ms -step:619/2285 train_time:37192ms step_avg:60.08ms -step:620/2285 train_time:37251ms step_avg:60.08ms -step:621/2285 train_time:37312ms step_avg:60.08ms -step:622/2285 train_time:37371ms step_avg:60.08ms -step:623/2285 train_time:37432ms step_avg:60.08ms -step:624/2285 train_time:37490ms step_avg:60.08ms -step:625/2285 train_time:37553ms step_avg:60.08ms -step:626/2285 train_time:37610ms step_avg:60.08ms -step:627/2285 train_time:37672ms step_avg:60.08ms -step:628/2285 train_time:37730ms step_avg:60.08ms -step:629/2285 train_time:37792ms step_avg:60.08ms -step:630/2285 train_time:37850ms step_avg:60.08ms -step:631/2285 train_time:37912ms step_avg:60.08ms -step:632/2285 train_time:37971ms step_avg:60.08ms -step:633/2285 train_time:38032ms step_avg:60.08ms -step:634/2285 train_time:38092ms step_avg:60.08ms -step:635/2285 train_time:38153ms step_avg:60.08ms -step:636/2285 train_time:38212ms step_avg:60.08ms -step:637/2285 train_time:38273ms step_avg:60.08ms -step:638/2285 train_time:38331ms step_avg:60.08ms -step:639/2285 train_time:38392ms step_avg:60.08ms -step:640/2285 train_time:38451ms step_avg:60.08ms -step:641/2285 train_time:38512ms step_avg:60.08ms -step:642/2285 train_time:38571ms step_avg:60.08ms -step:643/2285 train_time:38632ms step_avg:60.08ms -step:644/2285 train_time:38691ms step_avg:60.08ms -step:645/2285 train_time:38752ms 
step_avg:60.08ms -step:646/2285 train_time:38811ms step_avg:60.08ms -step:647/2285 train_time:38872ms step_avg:60.08ms -step:648/2285 train_time:38931ms step_avg:60.08ms -step:649/2285 train_time:38993ms step_avg:60.08ms -step:650/2285 train_time:39052ms step_avg:60.08ms -step:651/2285 train_time:39113ms step_avg:60.08ms -step:652/2285 train_time:39172ms step_avg:60.08ms -step:653/2285 train_time:39233ms step_avg:60.08ms -step:654/2285 train_time:39292ms step_avg:60.08ms -step:655/2285 train_time:39353ms step_avg:60.08ms -step:656/2285 train_time:39412ms step_avg:60.08ms -step:657/2285 train_time:39474ms step_avg:60.08ms -step:658/2285 train_time:39532ms step_avg:60.08ms -step:659/2285 train_time:39593ms step_avg:60.08ms -step:660/2285 train_time:39652ms step_avg:60.08ms -step:661/2285 train_time:39713ms step_avg:60.08ms -step:662/2285 train_time:39772ms step_avg:60.08ms -step:663/2285 train_time:39833ms step_avg:60.08ms -step:664/2285 train_time:39893ms step_avg:60.08ms -step:665/2285 train_time:39954ms step_avg:60.08ms -step:666/2285 train_time:40013ms step_avg:60.08ms -step:667/2285 train_time:40074ms step_avg:60.08ms -step:668/2285 train_time:40133ms step_avg:60.08ms -step:669/2285 train_time:40194ms step_avg:60.08ms -step:670/2285 train_time:40253ms step_avg:60.08ms -step:671/2285 train_time:40315ms step_avg:60.08ms -step:672/2285 train_time:40373ms step_avg:60.08ms -step:673/2285 train_time:40434ms step_avg:60.08ms -step:674/2285 train_time:40493ms step_avg:60.08ms -step:675/2285 train_time:40554ms step_avg:60.08ms -step:676/2285 train_time:40613ms step_avg:60.08ms -step:677/2285 train_time:40673ms step_avg:60.08ms -step:678/2285 train_time:40732ms step_avg:60.08ms -step:679/2285 train_time:40794ms step_avg:60.08ms -step:680/2285 train_time:40853ms step_avg:60.08ms -step:681/2285 train_time:40915ms step_avg:60.08ms -step:682/2285 train_time:40974ms step_avg:60.08ms -step:683/2285 train_time:41035ms step_avg:60.08ms -step:684/2285 train_time:41094ms step_avg:60.08ms -step:685/2285 train_time:41155ms step_avg:60.08ms -step:686/2285 train_time:41214ms step_avg:60.08ms -step:687/2285 train_time:41275ms step_avg:60.08ms -step:688/2285 train_time:41334ms step_avg:60.08ms -step:689/2285 train_time:41395ms step_avg:60.08ms -step:690/2285 train_time:41454ms step_avg:60.08ms -step:691/2285 train_time:41515ms step_avg:60.08ms -step:692/2285 train_time:41575ms step_avg:60.08ms -step:693/2285 train_time:41636ms step_avg:60.08ms -step:694/2285 train_time:41695ms step_avg:60.08ms -step:695/2285 train_time:41756ms step_avg:60.08ms -step:696/2285 train_time:41816ms step_avg:60.08ms -step:697/2285 train_time:41877ms step_avg:60.08ms -step:698/2285 train_time:41936ms step_avg:60.08ms -step:699/2285 train_time:41997ms step_avg:60.08ms -step:700/2285 train_time:42056ms step_avg:60.08ms -step:701/2285 train_time:42118ms step_avg:60.08ms -step:702/2285 train_time:42177ms step_avg:60.08ms -step:703/2285 train_time:42238ms step_avg:60.08ms -step:704/2285 train_time:42297ms step_avg:60.08ms -step:705/2285 train_time:42359ms step_avg:60.08ms -step:706/2285 train_time:42418ms step_avg:60.08ms -step:707/2285 train_time:42480ms step_avg:60.09ms -step:708/2285 train_time:42539ms step_avg:60.08ms -step:709/2285 train_time:42601ms step_avg:60.09ms -step:710/2285 train_time:42661ms step_avg:60.09ms -step:711/2285 train_time:42722ms step_avg:60.09ms -step:712/2285 train_time:42781ms step_avg:60.09ms -step:713/2285 train_time:42844ms step_avg:60.09ms -step:714/2285 train_time:42903ms step_avg:60.09ms -step:715/2285 
train_time:42964ms step_avg:60.09ms
[... per-step training log elided for steps 716-2069 of 2285: one "+step:N/2285 train_time:Nms step_avg:N.NNms" line per step, with step_avg drifting gradually from 60.09ms up to 60.68ms; the periodic validation checkpoints from this span are kept below ...]
+step:750/2285 val_loss:3.6572 train_time:45133ms step_avg:60.18ms
+step:1000/2285 val_loss:3.5694 train_time:60298ms step_avg:60.30ms
+step:1250/2285 val_loss:3.4998 train_time:75516ms step_avg:60.41ms
+step:1500/2285 val_loss:3.4305 train_time:90773ms step_avg:60.52ms
+step:1750/2285 val_loss:3.3698 train_time:106069ms step_avg:60.61ms
+step:2000/2285 val_loss:3.3212 train_time:121371ms step_avg:60.69ms
+step:2070/2285 train_time:125604ms
step_avg:60.68ms -step:2071/2285 train_time:125667ms step_avg:60.68ms -step:2072/2285 train_time:125726ms step_avg:60.68ms -step:2073/2285 train_time:125788ms step_avg:60.68ms -step:2074/2285 train_time:125848ms step_avg:60.68ms -step:2075/2285 train_time:125911ms step_avg:60.68ms -step:2076/2285 train_time:125971ms step_avg:60.68ms -step:2077/2285 train_time:126033ms step_avg:60.68ms -step:2078/2285 train_time:126093ms step_avg:60.68ms -step:2079/2285 train_time:126155ms step_avg:60.68ms -step:2080/2285 train_time:126214ms step_avg:60.68ms -step:2081/2285 train_time:126277ms step_avg:60.68ms -step:2082/2285 train_time:126337ms step_avg:60.68ms -step:2083/2285 train_time:126401ms step_avg:60.68ms -step:2084/2285 train_time:126461ms step_avg:60.68ms -step:2085/2285 train_time:126524ms step_avg:60.68ms -step:2086/2285 train_time:126584ms step_avg:60.68ms -step:2087/2285 train_time:126646ms step_avg:60.68ms -step:2088/2285 train_time:126706ms step_avg:60.68ms -step:2089/2285 train_time:126768ms step_avg:60.68ms -step:2090/2285 train_time:126828ms step_avg:60.68ms -step:2091/2285 train_time:126890ms step_avg:60.68ms -step:2092/2285 train_time:126950ms step_avg:60.68ms -step:2093/2285 train_time:127012ms step_avg:60.68ms -step:2094/2285 train_time:127073ms step_avg:60.68ms -step:2095/2285 train_time:127135ms step_avg:60.68ms -step:2096/2285 train_time:127194ms step_avg:60.68ms -step:2097/2285 train_time:127257ms step_avg:60.69ms -step:2098/2285 train_time:127316ms step_avg:60.68ms -step:2099/2285 train_time:127379ms step_avg:60.69ms -step:2100/2285 train_time:127439ms step_avg:60.69ms -step:2101/2285 train_time:127502ms step_avg:60.69ms -step:2102/2285 train_time:127563ms step_avg:60.69ms -step:2103/2285 train_time:127625ms step_avg:60.69ms -step:2104/2285 train_time:127685ms step_avg:60.69ms -step:2105/2285 train_time:127748ms step_avg:60.69ms -step:2106/2285 train_time:127808ms step_avg:60.69ms -step:2107/2285 train_time:127870ms step_avg:60.69ms -step:2108/2285 train_time:127930ms step_avg:60.69ms -step:2109/2285 train_time:127992ms step_avg:60.69ms -step:2110/2285 train_time:128052ms step_avg:60.69ms -step:2111/2285 train_time:128114ms step_avg:60.69ms -step:2112/2285 train_time:128174ms step_avg:60.69ms -step:2113/2285 train_time:128236ms step_avg:60.69ms -step:2114/2285 train_time:128296ms step_avg:60.69ms -step:2115/2285 train_time:128359ms step_avg:60.69ms -step:2116/2285 train_time:128419ms step_avg:60.69ms -step:2117/2285 train_time:128482ms step_avg:60.69ms -step:2118/2285 train_time:128542ms step_avg:60.69ms -step:2119/2285 train_time:128605ms step_avg:60.69ms -step:2120/2285 train_time:128666ms step_avg:60.69ms -step:2121/2285 train_time:128728ms step_avg:60.69ms -step:2122/2285 train_time:128788ms step_avg:60.69ms -step:2123/2285 train_time:128850ms step_avg:60.69ms -step:2124/2285 train_time:128911ms step_avg:60.69ms -step:2125/2285 train_time:128973ms step_avg:60.69ms -step:2126/2285 train_time:129032ms step_avg:60.69ms -step:2127/2285 train_time:129095ms step_avg:60.69ms -step:2128/2285 train_time:129154ms step_avg:60.69ms -step:2129/2285 train_time:129217ms step_avg:60.69ms -step:2130/2285 train_time:129277ms step_avg:60.69ms -step:2131/2285 train_time:129340ms step_avg:60.69ms -step:2132/2285 train_time:129400ms step_avg:60.69ms -step:2133/2285 train_time:129463ms step_avg:60.70ms -step:2134/2285 train_time:129523ms step_avg:60.70ms -step:2135/2285 train_time:129586ms step_avg:60.70ms -step:2136/2285 train_time:129646ms step_avg:60.70ms -step:2137/2285 train_time:129708ms 
step_avg:60.70ms -step:2138/2285 train_time:129769ms step_avg:60.70ms -step:2139/2285 train_time:129831ms step_avg:60.70ms -step:2140/2285 train_time:129891ms step_avg:60.70ms -step:2141/2285 train_time:129953ms step_avg:60.70ms -step:2142/2285 train_time:130013ms step_avg:60.70ms -step:2143/2285 train_time:130075ms step_avg:60.70ms -step:2144/2285 train_time:130135ms step_avg:60.70ms -step:2145/2285 train_time:130197ms step_avg:60.70ms -step:2146/2285 train_time:130257ms step_avg:60.70ms -step:2147/2285 train_time:130320ms step_avg:60.70ms -step:2148/2285 train_time:130380ms step_avg:60.70ms -step:2149/2285 train_time:130443ms step_avg:60.70ms -step:2150/2285 train_time:130503ms step_avg:60.70ms -step:2151/2285 train_time:130567ms step_avg:60.70ms -step:2152/2285 train_time:130626ms step_avg:60.70ms -step:2153/2285 train_time:130689ms step_avg:60.70ms -step:2154/2285 train_time:130749ms step_avg:60.70ms -step:2155/2285 train_time:130811ms step_avg:60.70ms -step:2156/2285 train_time:130871ms step_avg:60.70ms -step:2157/2285 train_time:130933ms step_avg:60.70ms -step:2158/2285 train_time:130993ms step_avg:60.70ms -step:2159/2285 train_time:131055ms step_avg:60.70ms -step:2160/2285 train_time:131115ms step_avg:60.70ms -step:2161/2285 train_time:131177ms step_avg:60.70ms -step:2162/2285 train_time:131237ms step_avg:60.70ms -step:2163/2285 train_time:131300ms step_avg:60.70ms -step:2164/2285 train_time:131360ms step_avg:60.70ms -step:2165/2285 train_time:131423ms step_avg:60.70ms -step:2166/2285 train_time:131484ms step_avg:60.70ms -step:2167/2285 train_time:131546ms step_avg:60.70ms -step:2168/2285 train_time:131606ms step_avg:60.70ms -step:2169/2285 train_time:131670ms step_avg:60.71ms -step:2170/2285 train_time:131730ms step_avg:60.71ms -step:2171/2285 train_time:131793ms step_avg:60.71ms -step:2172/2285 train_time:131853ms step_avg:60.71ms -step:2173/2285 train_time:131915ms step_avg:60.71ms -step:2174/2285 train_time:131976ms step_avg:60.71ms -step:2175/2285 train_time:132038ms step_avg:60.71ms -step:2176/2285 train_time:132099ms step_avg:60.71ms -step:2177/2285 train_time:132161ms step_avg:60.71ms -step:2178/2285 train_time:132221ms step_avg:60.71ms -step:2179/2285 train_time:132284ms step_avg:60.71ms -step:2180/2285 train_time:132344ms step_avg:60.71ms -step:2181/2285 train_time:132407ms step_avg:60.71ms -step:2182/2285 train_time:132467ms step_avg:60.71ms -step:2183/2285 train_time:132529ms step_avg:60.71ms -step:2184/2285 train_time:132589ms step_avg:60.71ms -step:2185/2285 train_time:132652ms step_avg:60.71ms -step:2186/2285 train_time:132712ms step_avg:60.71ms -step:2187/2285 train_time:132776ms step_avg:60.71ms -step:2188/2285 train_time:132835ms step_avg:60.71ms -step:2189/2285 train_time:132898ms step_avg:60.71ms -step:2190/2285 train_time:132958ms step_avg:60.71ms -step:2191/2285 train_time:133021ms step_avg:60.71ms -step:2192/2285 train_time:133081ms step_avg:60.71ms -step:2193/2285 train_time:133143ms step_avg:60.71ms -step:2194/2285 train_time:133203ms step_avg:60.71ms -step:2195/2285 train_time:133265ms step_avg:60.71ms -step:2196/2285 train_time:133325ms step_avg:60.71ms -step:2197/2285 train_time:133387ms step_avg:60.71ms -step:2198/2285 train_time:133447ms step_avg:60.71ms -step:2199/2285 train_time:133510ms step_avg:60.71ms -step:2200/2285 train_time:133570ms step_avg:60.71ms -step:2201/2285 train_time:133633ms step_avg:60.71ms -step:2202/2285 train_time:133693ms step_avg:60.71ms -step:2203/2285 train_time:133756ms step_avg:60.72ms -step:2204/2285 train_time:133816ms 
step_avg:60.72ms -step:2205/2285 train_time:133878ms step_avg:60.72ms -step:2206/2285 train_time:133939ms step_avg:60.72ms -step:2207/2285 train_time:134002ms step_avg:60.72ms -step:2208/2285 train_time:134062ms step_avg:60.72ms -step:2209/2285 train_time:134124ms step_avg:60.72ms -step:2210/2285 train_time:134184ms step_avg:60.72ms -step:2211/2285 train_time:134247ms step_avg:60.72ms -step:2212/2285 train_time:134307ms step_avg:60.72ms -step:2213/2285 train_time:134369ms step_avg:60.72ms -step:2214/2285 train_time:134428ms step_avg:60.72ms -step:2215/2285 train_time:134491ms step_avg:60.72ms -step:2216/2285 train_time:134551ms step_avg:60.72ms -step:2217/2285 train_time:134614ms step_avg:60.72ms -step:2218/2285 train_time:134673ms step_avg:60.72ms -step:2219/2285 train_time:134736ms step_avg:60.72ms -step:2220/2285 train_time:134796ms step_avg:60.72ms -step:2221/2285 train_time:134858ms step_avg:60.72ms -step:2222/2285 train_time:134919ms step_avg:60.72ms -step:2223/2285 train_time:134982ms step_avg:60.72ms -step:2224/2285 train_time:135043ms step_avg:60.72ms -step:2225/2285 train_time:135104ms step_avg:60.72ms -step:2226/2285 train_time:135165ms step_avg:60.72ms -step:2227/2285 train_time:135227ms step_avg:60.72ms -step:2228/2285 train_time:135287ms step_avg:60.72ms -step:2229/2285 train_time:135349ms step_avg:60.72ms -step:2230/2285 train_time:135409ms step_avg:60.72ms -step:2231/2285 train_time:135472ms step_avg:60.72ms -step:2232/2285 train_time:135531ms step_avg:60.72ms -step:2233/2285 train_time:135594ms step_avg:60.72ms -step:2234/2285 train_time:135654ms step_avg:60.72ms -step:2235/2285 train_time:135717ms step_avg:60.72ms -step:2236/2285 train_time:135777ms step_avg:60.72ms -step:2237/2285 train_time:135839ms step_avg:60.72ms -step:2238/2285 train_time:135900ms step_avg:60.72ms -step:2239/2285 train_time:135963ms step_avg:60.72ms -step:2240/2285 train_time:136023ms step_avg:60.72ms -step:2241/2285 train_time:136085ms step_avg:60.73ms -step:2242/2285 train_time:136145ms step_avg:60.72ms -step:2243/2285 train_time:136207ms step_avg:60.73ms -step:2244/2285 train_time:136268ms step_avg:60.73ms -step:2245/2285 train_time:136330ms step_avg:60.73ms -step:2246/2285 train_time:136391ms step_avg:60.73ms -step:2247/2285 train_time:136453ms step_avg:60.73ms -step:2248/2285 train_time:136512ms step_avg:60.73ms -step:2249/2285 train_time:136574ms step_avg:60.73ms -step:2250/2285 train_time:136634ms step_avg:60.73ms -step:2250/2285 val_loss:3.2861 train_time:136698ms step_avg:60.75ms -step:2251/2285 train_time:136715ms step_avg:60.74ms -step:2252/2285 train_time:136760ms step_avg:60.73ms -step:2253/2285 train_time:136823ms step_avg:60.73ms -step:2254/2285 train_time:136883ms step_avg:60.73ms -step:2255/2285 train_time:136945ms step_avg:60.73ms -step:2256/2285 train_time:137005ms step_avg:60.73ms -step:2257/2285 train_time:137066ms step_avg:60.73ms -step:2258/2285 train_time:137126ms step_avg:60.73ms -step:2259/2285 train_time:137187ms step_avg:60.73ms -step:2260/2285 train_time:137247ms step_avg:60.73ms -step:2261/2285 train_time:137310ms step_avg:60.73ms -step:2262/2285 train_time:137370ms step_avg:60.73ms -step:2263/2285 train_time:137433ms step_avg:60.73ms -step:2264/2285 train_time:137492ms step_avg:60.73ms -step:2265/2285 train_time:137555ms step_avg:60.73ms -step:2266/2285 train_time:137616ms step_avg:60.73ms -step:2267/2285 train_time:137680ms step_avg:60.73ms -step:2268/2285 train_time:137741ms step_avg:60.73ms -step:2269/2285 train_time:137804ms step_avg:60.73ms -step:2270/2285 
train_time:137864ms step_avg:60.73ms -step:2271/2285 train_time:137927ms step_avg:60.73ms -step:2272/2285 train_time:137987ms step_avg:60.73ms -step:2273/2285 train_time:138049ms step_avg:60.73ms -step:2274/2285 train_time:138109ms step_avg:60.73ms -step:2275/2285 train_time:138171ms step_avg:60.73ms -step:2276/2285 train_time:138231ms step_avg:60.73ms -step:2277/2285 train_time:138293ms step_avg:60.73ms -step:2278/2285 train_time:138353ms step_avg:60.73ms -step:2279/2285 train_time:138416ms step_avg:60.74ms -step:2280/2285 train_time:138476ms step_avg:60.74ms -step:2281/2285 train_time:138539ms step_avg:60.74ms -step:2282/2285 train_time:138599ms step_avg:60.74ms -step:2283/2285 train_time:138662ms step_avg:60.74ms -step:2284/2285 train_time:138722ms step_avg:60.74ms -step:2285/2285 train_time:138785ms step_avg:60.74ms -step:2285/2285 val_loss:3.2802 train_time:138846ms step_avg:60.76ms -peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt b/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt deleted file mode 100644 index 115b6510f..000000000 --- a/records/track_1_short/2025-10-27_FixMuonLR/fc12c205-f953-4028-bfdf-0519c72fb269.txt +++ /dev/null @@ -1,3814 +0,0 @@ -import os -import sys - -with open(sys.argv[0]) as f: - code = f.read() # read the code of this file ASAP, for logging -import copy -import glob -import math -import threading -import time -import uuid -from dataclasses import dataclass -from collections import defaultdict -from itertools import accumulate -from pathlib import Path - -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import torch - -torch.empty( - 1, device="cuda", requires_grad=True -).backward() # prevents a bug on some systems -import torch._dynamo as dynamo -import torch.distributed as dist -import torch.nn.functional as F - -# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min -import triton -import triton.language as tl -from kernels import get_kernel -from torch import Tensor, nn - -dynamo.config.recompile_limit = 64 - -# ----------------------------------------------------------------------------- -# Custom operators: FP8 matmul by @YouJiacheng - - -@torch.library.custom_op("nanogpt::mm", mutates_args=()) -def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: - @torch.compile - def impl(x: Tensor, w: Tensor): - assert x.is_contiguous() and w.is_contiguous() - x_f8 = x.div(x_s).to(torch.float8_e4m3fn) - w_f8 = w.div(w_s).to(torch.float8_e4m3fn) - out = torch._scaled_mm( - x_f8, - w_f8.T, - out_dtype=torch.bfloat16, - scale_a=x.new_tensor(x_s, dtype=torch.float32), - scale_b=x.new_tensor(w_s, dtype=torch.float32), - use_fast_accum=True, - ) - return out, x_f8, w_f8 - - return impl(x, w) - -@mm_op.register_fake -def _(x: Tensor, w: Tensor, *_): - assert x.ndim == w.ndim == 2 - assert x.shape[1] == w.shape[1] - assert x.device == w.device - assert x.is_contiguous() and w.is_contiguous() - return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) - -@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) -def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: - @torch.compile - def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): - assert grad.is_contiguous() - x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) - w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) - grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) - grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) - grad_x = torch._scaled_mm( - grad_f8, - w_f8.T.contiguous().T, - out_dtype=torch.bfloat16, - scale_a=grad_inv_s, - scale_b=w_inv_s, - use_fast_accum=False, - ) - # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) - grad_w = torch._scaled_mm( - x_f8.T.contiguous(), - grad_f8.T.contiguous().T, - out_dtype=torch.float32, - scale_a=x_inv_s, - scale_b=grad_inv_s, - use_fast_accum=False, - ).T - return grad_x, grad_w - - return impl(g, x_f8, w_f8) - -@mm_backward_op.register_fake -def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): - return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) - -def backward(ctx, grad_out: Tensor, *_): - x_f8, w_f8 = ctx.saved_tensors - x_s, w_s, grad_s = ctx.scales - grad_x, grad_w = torch.ops.nanogpt.mm_backward( - grad_out, x_f8, w_f8, x_s, w_s, grad_s - ) - return grad_x, grad_w, None, None, None - -def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): - *_, x_s, w_s, grad_s = inputs - _, x_f8, w_f8 = output - ctx.save_for_backward(x_f8, w_f8) - ctx.scales = x_s, w_s, grad_s - ctx.set_materialize_grads(False) - -mm_op.register_autograd(backward, setup_context=setup_context) - -# ----------------------------------------------------------------------------- -# Triton kernel for symmetric matrix multiplication by @byronxu99 - -def _get_autotune_configs(): - return [ - triton.Config( - { - "BLOCK_SIZE_M": bm, - "BLOCK_SIZE_N": bn, - "BLOCK_SIZE_K": bk, - "GROUP_SIZE_M": 8, - "LOWER_UPPER": 1, - }, - num_stages=stages, - num_warps=warps, - ) - for bm in [64, 128] - for bn in [64, 128, 256] - for bk in [64, 128] - for stages, warps in [(3, 4), (3, 8), (4, 4)] - if bm // bn <= 2 and bn // bm <= 2 - ] - -@triton.jit -def _pid_to_block( - pid, - M, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) - - # Map PID to a single matrix in batch - batch_idx = pid // (num_pid_m * num_pid_n) - pid = pid % (num_pid_m * num_pid_n) - - # Map PID to 2D grid of blocks - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) - - m_idx = pid_m * BLOCK_SIZE_M - n_idx = pid_n * BLOCK_SIZE_N - return batch_idx, m_idx, n_idx - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def XXT_kernel( - A_ptr, C_ptr, - M, K, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def XXT(A: torch.Tensor, out: torch.Tensor): - """ - Launch Triton kernel to compute C = A @ A.T - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert out.size(-2) == M, "Output matrix has incorrect shape" - assert out.size(-1) == M, "Output matrix has incorrect shape" - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - XXT_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - K=K, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - ) - return out - -@triton.autotune( - configs=_get_autotune_configs(), - key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], -) -@triton.jit -def ba_plus_cAA_kernel( - A_ptr, C_ptr, - M, - a_stride_b, a_stride_r, a_stride_c, - c_stride_b, c_stride_r, c_stride_c, - alpha, beta, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - LOWER_UPPER: tl.constexpr, -): - # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A - # Performance is slightly slower than XXT_kernel, so we use two separate kernels - pid = tl.program_id(axis=0) - batch_idx, m_idx, n_idx = _pid_to_block( - pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M - ) - - # Skip blocks that don't need to be computed - skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) - skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) - if skip_block_below_diag or skip_block_above_diag: - return - - # Index into one matrix of batch - A_ptr += batch_idx * a_stride_b - C_ptr += batch_idx * c_stride_b - - # Create pointer arrays for A and A.T - offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) - 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - # Accumulate over blocks of K - for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): - a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) - at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) - accumulator = tl.dot(a, at, accumulator) - a_ptrs += BLOCK_SIZE_K * a_stride_c - at_ptrs += BLOCK_SIZE_K * a_stride_c - - # Load block of A to add (corresponds to the current block of C) - offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) - a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) - a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) - a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) - - # Apply alpha and beta - accumulator *= alpha - accumulator += a_add * beta - - out_dtype = C_ptr.dtype.element_ty - output = accumulator.to(out_dtype) - - # Store block of C - offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) - offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) - tl.store(c_ptrs, output, mask=c_mask) - - # Store block of C mirrored across the diagonal - c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) - c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) - tl.store(c_ptrs_t, output.T, mask=c_mask_t) - -def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): - """ - Launch Triton kernel to compute C = alpha * A @ A.T + beta * A - """ - assert A.ndim == 2 or A.ndim == 3 - M, K = A.shape[-2:] - assert M == K, "Input matrix must be square" - assert out.size(-2) == M - assert out.size(-1) == M - - batch_size = A.size(0) if A.ndim == 3 else 1 - input_batch_stride = A.stride(0) if A.ndim == 3 else 0 - output_batch_stride = out.stride(0) if out.ndim == 3 else 0 - - grid = lambda meta: ( - batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), - ) - ba_plus_cAA_kernel[grid]( - A_ptr=A, - C_ptr=out, - M=M, - a_stride_b=input_batch_stride, - a_stride_r=A.stride(-2), - a_stride_c=A.stride(-1), - c_stride_b=output_batch_stride, - c_stride_r=out.stride(-2), - c_stride_c=out.stride(-1), - alpha=alpha, - beta=beta, - ) - return out - -# Computed for num_iters=5, safety_factor=2e-2, cushion=2 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) -] - -@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower -def polar_express(G: torch.Tensor): - """ - Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal. 
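-
-    Reference sketch (illustrative only, not used by the training run; the
-    tall-matrix transpose handling is omitted): each fused iteration below
-    evaluates the odd polynomial a*X + (b*A + c*A@A) @ X with A = X @ X.mT,
-    so the Triton path should agree, up to bfloat16 noise, with plain PyTorch:
-
-        X = G.bfloat16()
-        X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
-        for a, b, c in polar_express_coeffs:
-            A = X @ X.mT
-            X = a * X + (b * A + c * A @ A) @ X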
- """ - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) - - # Allocate buffers - X = X.contiguous() - A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) - B = torch.empty_like(A) - C = torch.empty_like(X) - - aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm - - # Perform the iterations - for a, b, c in polar_express_coeffs: - XXT(X, out=A) # A = X @ X.mT - ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A - aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X - X, C = C, X # Swap references to avoid unnecessary copies - - if G.size(-2) > G.size(-1): - X = X.mT - return X - -# ----------------------------------------------------------------------------- -# Muon optimizer - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step - - Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - Though empirically small 1D params perform efficiently here: - NS approximately performs a magnitude normalization of the grad - This hyper-optimized class has faster execution time than the current impl of Adam for small params - - Custom distributed sizing: - The model stores all attn and mlp weights in the same shape, and then updates the view as - needed on the forward pass. This enables attn and mlp weights to be contained within the same - dist.reduce_scatter_tensor() call. The model architecture has been customized to enable - (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. - The scheduling is: - 1. reduce scatter smear_gate (1 param 7 padding params) - 2. reduce scatter attn_gate (10 params 6 padding params) - 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) - 4. reduce scatter attn/mlp round 2 (16 mlp params) - 5. wait on step 1, then compute update of 1 and schedule all gather - 6. wait on step 2, then compute update of 2 and schedule all gather - 7. wait on step 3, then compute update of 3 and schedule all gather - GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] - GPUs that receive params of type attn reshape before computing update - 8. wait on 4, then compute update of 4 and schedule all gather - 9. wait for each all gather to complete and update params - Empirically, leading with small params provides an additional 0.2s improvement. 
- """ - def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True): - defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - # custom sizing requires 8 GPUs - if custom_sizing and dist.get_world_size()==8: - param_groups = self.generate_custom_param_groups(params) - else: - param_groups = self.generate_standard_param_groups(params) - super().__init__(param_groups, defaults) - - def reset(self): - # expose a reset for clearing buffers - for group in self.param_groups: - group["momentum_buffer"].zero_() - group["second_momentum_buffer"].zero_() - - def generate_standard_param_groups(self, params): - """ - Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. - Creates one param group per module. - """ - groups = defaultdict(list) - for param in params: - groups[param.label].append(param) - - param_groups = [] - for module_name, group_params in groups.items(): - chunk_size = (len(group_params) + self.world_size - 1) // self.world_size - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - - return param_groups - - def generate_custom_param_groups(self, params): - """ - Implementation requires that a single GPU does not receive both attn - and mlp params when a param group is split across GPUs. - """ - module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down'] - params_list = list(params) - params_list.sort(key=lambda x: module_group_order.index(x.label)) - - idx = 0 - group_sizes = [1, 10, 16, 16] - assert len(params_list) == sum(group_sizes) - param_groups = [] - for size in group_sizes: - chunk_size = (size + self.world_size - 1) // self.world_size - group_params = params_list[idx: idx + size] - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) - idx += size - - return param_groups - - @torch.no_grad() - def step(self): - # Efficient systems-wise implementation of step developed by @YouJiacheng, - # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, - # @ryanyang0, @vagrawal, and @varunneal. - rank = dist.get_rank() - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - stacked_grads = torch.empty( - (padded_num_params, *params[0].shape), - dtype=params[0].dtype, - device=params[0].device - ) - for i, p in enumerate(params): - stacked_grads[i].copy_(p.grad, non_blocking=True) - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) - - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) - - all_gather_infos = [] - # Second pass: wait for gradients, compute updates for the local shard of parameters, - # and launch all async all_gather operations. 
- for group, info in zip(self.param_groups, group_infos): - info["reduce_future"].wait() - - params = group["params"] - grad_chunk = info["grad_chunk"] - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * self.world_size - - start_idx = rank * chunk_size - module_idx = start_idx if start_idx < len(params) else 0 - - num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank - - if "momentum_buffer" not in group: - group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) - momentum_buffer = group["momentum_buffer"] - # Apply momentum update to the persistent momentum buffer in-place - momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) - updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) - - grad_shape = updated_grads.shape - if params[module_idx].label == 'attn': - # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] - for p in params[module_idx:module_idx + num_params]: - assert p.label == 'attn' - updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) - ref_param = params[module_idx] - param_shape = ref_param.shape - - if "second_momentum_buffer" not in group: - group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) - if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) - ) - second_momentum_buffer = group["second_momentum_buffer"] - - if "param_lr" not in group: - group["param_lr"] = ( - max(1., param_shape[-2] / param_shape[-1]) ** 0.5 - * ref_param.new_tensor( - [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - ) - - group["param_wd"] = ref_param.new_tensor( - [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] - ).view(-1, 1, 1) - - # Determine LR and WR - eff_lr = group["lr"] * group["param_lr"] - eff_wd = group["weight_decay"] * group["param_wd"] - - # Compute zeropower for the entire chunk in a single, batched call. - if num_params == 0: - v_chunk = updated_grads - elif params[module_idx].label == "smear_gate": - # dividing by magnitude is equivalent of SVN for 1d tensors - v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10)) - else: - v_chunk = polar_express(updated_grads) - - # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) - v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) - second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) - step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() - v_chunk.mul_(step_size) - v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) - v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) - - v_chunk = v_chunk.view(grad_shape) - - updated_params = torch.empty_like(grad_chunk) - param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) - # Apply weight decay directly to the buffer. 
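-            # For clarity: this is decoupled weight decay, p <- (1 - wd * wd_mul) * p,
-            # applied before the -eff_lr * v step; unlike DistAdam below, the decay
-            # here is not scaled by the learning rate.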
- param_chunk.mul_(1 - eff_wd) - - param_chunk.add_(-eff_lr * v_chunk) - - updated_params[:num_params].copy_(param_chunk) - if num_params < chunk_size: - updated_params[num_params:].zero_() - - stacked_params = torch.empty( - (padded_num_params, *param_shape), - dtype=updated_params.dtype, - device=updated_params.device, - ) - - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_infos.append( - { - "gather_future": gather_future, - "stacked_params": stacked_params, - "orig_params": params, - } - ) - - # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. - for info in all_gather_infos: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - orig_params = info["orig_params"] - - unstacked_params = torch.unbind(stacked_params) - for i, p in enumerate(orig_params): - p.copy_(unstacked_params[i], non_blocking=True) - - -class DistAdam(torch.optim.Optimizer): - def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - params = list(params) - sizes = {p.shape for p in params} - # create one buffer per unique parameter-size - param_groups = [] - for size in sizes: - group_params = [p for p in params if p.shape == size] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # init state - for p in params: - chunk_size = p.size(0) // self.world_size - exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) - exp_avg_sq = torch.zeros_like(exp_avg) - self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) - # DistributedAdam implementation by @vagrawal - - @torch.compile - @torch.no_grad() - def step(self): - rank = dist.get_rank() - reduce_scatter_futures: list[torch.Future] = [] - all_gather_futures: list[torch.Future] = [] - grad_slices = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - for param in params: - grad = param.grad - rank_size = grad.shape[0] // self.world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for param in params: - reduce_scatter_futures[idx].wait() - rank_size = param.shape[0] // self.world_size - p_slice = param[rank * rank_size:(rank + 1) * rank_size] - lr = group['lr'] * getattr(param, "lr_mul", 1.0) - state = self.state[param] - g_slice = grad_slices[idx] - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - state["step"] += 1 - t = state["step"] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (bias2 ** 0.5 / bias1) - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, 
alpha=-1.0) - idx += 1 - all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_gather_futures).wait() - -# ----------------------------------------------------------------------------- -# PyTorch nn.Module definitions for the model - -def norm(x: Tensor): - return F.rms_norm(x, (x.size(-1),)) - -class CastedLinear(nn.Linear): - def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): - super().__init__(in_features, out_features, bias=False) - self.use_fp8 = use_fp8 - self.x_s = x_s - self.w_s = w_s - self.grad_s = grad_s - - def reset_parameters(self) -> None: - with torch.no_grad(): - self.weight.zero_() # @Grad62304977 and others - - def forward(self, x: Tensor): - if self.use_fp8 and self.training: - _x = x.flatten(0, -2) - out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] - return out.reshape(*x.shape[:-1], -1) - else: - return F.linear(x, self.weight.type_as(x)) - -# yarn implementation @classiclarryd -class Yarn(nn.Module): - def __init__(self, head_dim, max_seq_len): - super().__init__() - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.reset() - - def reset(self): - angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) - # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) - angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) - theta = torch.outer(t, angular_freq) - self.cos = nn.Buffer( - theta.cos().to(torch.bfloat16), persistent=False - ) - self.sin = nn.Buffer( - theta.sin().to(torch.bfloat16), persistent=False - ) - self.angular_freq = angular_freq - # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 - self.attn_scale = 0.1 - - def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): - rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) - scaling_factor = old_window / new_window - interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) - self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) - t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) - theta = torch.outer(t, self.angular_freq) - self.cos.copy_(theta.cos()) - self.sin.copy_(theta.sin()) - self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 - -def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): - assert cos.size(0) >= x_BTHD.size(-3) - cos, sin = ( - cos[None, : x_BTHD.size(-3), None, :], - sin[None, : x_BTHD.size(-3), None, :], - ) - x1, x2 = x_BTHD.chunk(2, dim=-1) - y1 = x1 * cos + x2 * sin - y2 = x1 * (-sin) + x2 * cos - return torch.cat((y1, y2), 3) - -@dataclass -class AttnArgs: - ve: torch.Tensor - sa_lambdas: torch.Tensor - seqlens: torch.Tensor - bm_size: int - cos: torch.Tensor - sin: torch.Tensor - attn_scale: float - -flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface - -class CausalSelfAttention(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int): - super().__init__() - self.num_heads = num_heads - self.head_dim = head_dim - self.dim = dim - self.hdim = num_heads * head_dim - - assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" - std = 0.5 
* (self.dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng - # https://x.com/hi_tysam/status/1879699187107033311 - # make matrices the same shape as MLP to enable batched call in optimizer - self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) - # label module to enable custom optimizer sizing - self.qkvo_w.label='attn' - - with torch.no_grad(): - self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights - self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero - - # sparse gated attention to enable context based no-op by @classiclarryd - self.attn_gate = CastedLinear(12, num_heads) - # label module to enable custom optimizer sizing - self.attn_gate.weight.label = 'attn_gate' - - def forward(self, x: Tensor, attn_args: AttnArgs): - B, T = x.size(0), x.size(1) # batch size, sequence length - assert B == 1, "varlen sequences requires B == 1" - assert T % 16 == 0 - # unpack attention args - cos, sin = attn_args.cos, attn_args.sin - ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas - seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size - - q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) - q, k = norm(q), norm(k) # QK norm @Grad62304977 - q, k = rotary(q, cos, sin), rotary(k, cos, sin) - if ve is not None: - v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 - else: # skip mid-layers token value embeddings by @YouJiacheng - v = sa_lambdas[0] * v - - max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) - - # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng - y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, - max_seqlen_q=max_len, max_seqlen_k=max_len, - causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) - y = y.view(B, T, self.num_heads, self.head_dim) - y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) - y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side - y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) - return y - - -class MLP(nn.Module): - def __init__(self, dim: int): - super().__init__() - hdim = 4 * dim - # make matrices the same shape to enable batched call in optimizer - self.c_fc = nn.Parameter(torch.empty(dim, hdim)) - self.c_proj = nn.Parameter(torch.empty(dim, hdim)) - # label modules to enable custom optimizer sizing - self.c_fc.label = 'mlp_up' - self.c_proj.label = 'mlp_down' - # corrective factor to account for transpose - self.c_fc.lr_mul = 2. 
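-        # For clarity: Muon's per-param LR scale is max(1, rows/cols)**0.5 of the
-        # *stored* shape. c_fc is stored transposed as (dim, 4*dim), which gives 1,
-        # whereas the (4*dim, dim) shape it acts in would give 4**0.5 = 2;
-        # lr_mul = 2. restores that factor.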
- - std = 0.5 * (dim ** -0.5) - bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng - with torch.no_grad(): - self.c_fc.uniform_(-bound, bound) - self.c_proj.zero_() # zero init suggested by @Grad62304977 - - def forward(self, x: Tensor): - x = F.linear(x, self.c_fc.T.type_as(x)) - x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 - x = F.linear(x, self.c_proj.type_as(x)) - return x - -class Block(nn.Module): - def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): - super().__init__() - # skip attention of blocks.7 (the 8th layer) by @YouJiacheng - self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None - # skip MLP blocks for first MLP layer by @EmelyanenkoK - self.mlp = MLP(dim) if layer_idx != 0 else None - - def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): - x = lambdas[0] * x + lambdas[1] * x0 - if self.attn is not None: - x = x + self.attn(norm(x), attn_args) - if self.mlp is not None: - x = x + self.mlp(norm(x)) - return x - -# ----------------------------------------------------------------------------- -# The main model - -def next_multiple_of_n(v: float | int, *, n: int): - return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) - -class GPT(nn.Module): - def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): - super().__init__() - vocab_size = next_multiple_of_n(vocab_size, n=128) - self.embed = nn.Embedding(vocab_size, model_dim) - self.smear_gate = CastedLinear(12, 1) - # label modules to enable custom optimizer sizing - self.smear_gate.weight.label = 'smear_gate' - # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 - # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 - self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) - self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) - self.yarn = Yarn(head_dim, max_seq_len) - # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. - # suggested to me by @Grad62304977. this originates from Karpathy's experiments. - use_fp8 = not os.environ.get("DISABLE_FP8", False) - self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) - # Add learnable skip connection weights for decoder layers - assert num_layers % 2 == 0 - pad = (-num_layers * 5 - 2) % dist.get_world_size() - self.scalars = nn.Parameter( - torch.cat( - [ - -1.5 - * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 - *[ - torch.tensor([1.0, 0.0]) for _ in range(num_layers) - ], # block lambdas - *[ - torch.tensor([0.5, 0.5]) for _ in range(num_layers) - ], # SA lambdas - torch.zeros(1), # smear_lambda - 0.5*torch.ones(1), # backout_lambda - torch.ones(pad), - ] - ) - ) - # set learning rates - for param in self.embed.parameters(): - param.lr_mul = 75. - for param in self.value_embeds.parameters(): - param.lr_mul = 75. 
- self.lm_head.weight.lr_mul = 1.0 - self.scalars.lr_mul = 5.0 - - def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): - assert input_seq.ndim == 1 - - ve = [value_embed(input_seq) for value_embed in self.value_embeds] - # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure - # dropping first layer updates this to .12 ... 012 - ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] - assert len(ve) == len(self.blocks) - - short_bm = ws_short * args.block_size - long_bm = ws_long * args.block_size - bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] - assert len(bm_sizes) == len(self.blocks) - - x = self.embed(input_seq) - - skip_weights = self.scalars[:(len(self.blocks) // 2)] - lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) - sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) - smear_lambda = self.scalars[5 * len(self.blocks)] - backout_lambda = self.scalars[5 * len(self.blocks)+1] - - # smear token embed forward 1 position @classiclarryd - smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) - x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) - x = x0 = norm(x[None]) - - # U-net design by @brendanh0gan - skip_connections = [] - n = len(self.blocks) // 2 - - x_backout = None - backout_layer = 8 - # skip layer zero - for i in range(1,len(self.blocks)): - attn_args = AttnArgs( - ve=ve[i], - sa_lambdas=sa_lambdas[i], - seqlens=seqlens, - bm_size=bm_sizes[i], - cos=self.yarn.cos, - sin=self.yarn.sin, - attn_scale=self.yarn.attn_scale - ) - # since layer 0 is skipped, layer 11 does not have skip_connection - if i >= n and i<11: - gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) - x = x + gate * skip_connections.pop() - x = self.blocks[i](x, x0, lambdas[i], attn_args) - if i < n: - skip_connections.append(x) - if i == backout_layer: - x_backout = x - - # back out contributions from first 8 layers that are only required for downstream context and not direct prediction - x -= backout_lambda * x_backout - x = norm(x) - logits = self.lm_head(x) - # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) - logits = 30 * torch.sigmoid(logits / 7.5) - logits_for_loss = logits.float() if not self.training else logits - loss = F.cross_entropy( - logits_for_loss.view(-1, logits_for_loss.size(-1)), - target_seq, - reduction="sum" if self.training else "mean", - ) - return loss - -# ----------------------------------------------------------------------------- -# Distributed data loader - -def _load_data_shard(file: Path): - header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 - assert header[0] == 20240520, "magic number mismatch in the data .bin file" - assert header[1] == 1, "unsupported version" - num_tokens = int(header[2]) # number of tokens (claimed) - with file.open("rb", buffering=0) as f: - tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng - f.seek(256 * 4) - nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng - assert nbytes == 2 * num_tokens, "number of tokens read does not match header" - return tokens - -BOS_ID = 50256 - -class BOSFinder: - # Helper 
-class BOSFinder:
-    # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
-    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
-        # Precompute BOS positions once per shard
-        self.tokens = tokens
-        self.size = tokens.numel()
-        self.quickload = quickload
-        if quickload:
-            # only scan the first 4 million tokens, then kick off an async thread to scan the rest
-            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-            self.thread = None
-            self.ready = threading.Event()
-            self.start()
-        else:
-            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.i = 0
-        self.world_size = world_size
-        self.batch_iter = 0
-
-    def _load(self):
-        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-            self.bos_idx = self.bos_idx_async
-
-    def next_batch(self, num_tokens_local: int, max_seq_len: int):
-        # if quickload was used, repoint to the full dataset after 5 batches
-        if self.quickload and self.batch_iter == 5:
-            self.get()
-        n = len(self.bos_idx)
-        starts = [[] for _ in range(self.world_size)]
-        ends = [[] for _ in range(self.world_size)]
-
-        idx = self.i
-        for r in range(self.world_size):
-            cur_len = 0
-            while cur_len <= num_tokens_local:
-                if idx >= n:
-                    raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
-                cur = self.bos_idx[idx]
-                starts[r].append(cur)
-                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
-                          cur + max_seq_len,
-                          cur + num_tokens_local - cur_len + 1)
-                ends[r].append(end)
-                cur_len += end - cur
-                idx += 1
-
-            assert cur_len == num_tokens_local + 1
-        self.i = idx
-        self.batch_iter += 1
-        return starts, ends
-
-class DataPreloader:
-    # Helper for asynchronously loading the next shard and indexing BOS tokens
-    def __init__(self, file_iter, world_size: int = 1):
-        self.file_iter = file_iter
-        self.world_size = world_size
-        self.thread = None
-        self.data = None
-        self.ready = threading.Event()
-
-    def _load(self):
-        tokens = _load_data_shard(next(self.file_iter))
-        self.data = (tokens, BOSFinder(tokens, self.world_size))
-        self.ready.set()
-
-    def start(self):
-        self.ready.clear()
-        self.thread = threading.Thread(target=self._load)
-        self.thread.start()
-
-    def get(self):
-        if self.thread:
-            self.ready.wait()
-            self.thread.join()
-        return self.data
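-
-# A small smoke test for BOSFinder on synthetic tokens (illustrative sketch, not
-# part of the original run; _bosfinder_smoke_test is hypothetical): every returned
-# sequence should start at a BOS token, and each rank's spans should cover exactly
-# num_tokens_local + 1 tokens (the extra token feeds the input/target shift).
-def _bosfinder_smoke_test():
-    toks = torch.full((10_000,), 7, dtype=torch.uint16)
-    toks[::500] = BOS_ID  # plant a BOS token every 500 positions
-    finder = BOSFinder(toks, world_size=2)
-    starts, ends = finder.next_batch(num_tokens_local=2_000, max_seq_len=1_000)
-    for r in range(2):
-        assert all(int(toks[s]) == BOS_ID for s in starts[r])
-        assert sum(e - s for s, e in zip(starts[r], ends[r])) == 2_000 + 1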
-def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
-    # align_to_bos: each sequence begins with a Beginning of Sequence token; sequences are truncated to max_seq_len
-    rank = dist.get_rank() if dist.is_initialized() else 0
-    world_size = dist.get_world_size() if dist.is_initialized() else 1
-    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size"
-    num_tokens = num_tokens // grad_accum_steps
-
-    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
-    if not files:
-        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
-
-    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
-    tokens = _load_data_shard(next(file_iter))
-    if align_to_bos:
-        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
-        preloader = DataPreloader(file_iter, world_size)
-        preloader.start()
-    else:
-        pos = 0  # for the unaligned case
-
-    while True:
-        num_tokens_local = num_tokens // world_size
-        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400
-
-        if align_to_bos:
-            try:
-                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
-                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
-            except StopIteration:
-                # This shard is exhausted; load the next one in the next loop iteration.
-                tokens, finder = preloader.get()
-                preloader.start()
-                continue
-
-            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
-            _inputs = buf[:-1]
-            _targets = buf[1:]
-            end_idxs[-1] -= 1  # last document was one token too long, to account for the _targets offset
-            cum_lengths = (end_idxs - start_idxs).cumsum(0)
-
-        else:
-            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
-                tokens, pos = _load_data_shard(next(file_iter)), 0
-
-            pos_local = pos + rank * num_tokens_local
-            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
-            _inputs = buf[:-1].view(num_tokens_local)
-            _targets = buf[1:].view(num_tokens_local)
-
-            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
-            pos += num_tokens
-
-        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
-        _cum_lengths[0] = 0
-        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
-
-        new_params = yield (
-            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
-            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
-            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
-        )
-
-        if new_params is not None:
-            # allows the generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
-            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
-            assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size"
-            num_tokens = new_num_tokens
-            max_seq_len = new_max_seq_len
-            grad_accum_steps = new_grad_accum_steps
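-
-# Usage sketch for the generator above (illustrative, not executed here): the
-# first batch comes from next(); after that, .send() can swap in a new
-# (num_tokens, max_seq_len, grad_accum_steps) triple mid-stream, e.g. to grow
-# the batch, while continuing through the same shards. The literal sizes below
-# are examples only:
-#
-#   loader = distributed_data_generator("data/fineweb10B/fineweb_train_*.bin",
-#                                       num_tokens=2048 * 16 * 8, max_seq_len=128 * 16)
-#   inputs, targets, cum_seqlens = next(loader)
-#   inputs, targets, cum_seqlens = loader.send((2048 * 32 * 8, 128 * 16, 1))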
-# -----------------------------------------------------------------------------
-# int main
-
-@dataclass
-class Hyperparameters:
-    # data
-    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
-    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
-    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
-    train_batch_size: int = 2048 * 16 * 8
-    train_max_seq_len: int = 128 * 16
-    val_batch_size: int = 4 * 64 * 1024 * 8
-    # optimization
-    num_iterations: int = 2285
-    lr_schedule = (0.5, 0.98)  # breakpoints for the 3-part schedule: (flat, linear decay, flat)
-    lr_min = 0.1
-    # evaluation and logging
-    run_id: str = f"{uuid.uuid4()}"
-    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
-    save_checkpoint: bool = False
-    # attention masking
-    block_size: int = 128
-    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
-    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN
-
-args = Hyperparameters()
-
-data_path = os.environ.get("DATA_PATH", ".")
-args.train_files = os.path.join(data_path, args.train_files)
-args.val_files = os.path.join(data_path, args.val_files)
-
-# torchrun sets these env variables
-rank = int(os.environ["RANK"])
-world_size = int(os.environ["WORLD_SIZE"])
-assert 8 % world_size == 0, "world_size must be a divisor of 8"
-grad_accum_steps = 8 // world_size
-assert torch.cuda.is_available()
-device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
-torch.cuda.set_device(device)
-dist.init_process_group(backend="nccl", device_id=device)
-dist.barrier()
-master_process = (rank == 0)  # this process will do logging, checkpointing etc.
-
-# begin logging
-logfile = None
-if master_process:
-    run_id = args.run_id
-    os.makedirs("logs", exist_ok=True)
-    logfile = f"logs/{run_id}.txt"
-    print(logfile)
-
-def print0(s, console=False):
-    if master_process:
-        with open(logfile, "a") as f:
-            if console:
-                print(s)
-            print(s, file=f)
-
-# begin by printing this file (the Python code)
-print0(code)
-print0("=" * 100)
-# log information about the hardware/software environment this is running on
-print0(f"Running Python {sys.version}")
-print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
-print0(f"Running Triton version {triton.__version__}")
-
-def nvidia_smi():
-    import subprocess  # avoid top-level import
-    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
-
-print0(nvidia_smi())
-print0("=" * 100)
-
-model: nn.Module = GPT(
-    vocab_size=50257,
-    num_layers=12,
-    num_heads=6,
-    head_dim=128,
-    model_dim=768,
-    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
-).cuda()
-for m in model.modules():
-    if isinstance(m, (nn.Embedding, nn.Linear)):
-        m.bfloat16()
-for param in model.parameters():
-    dist.broadcast(param.detach(), 0)
-
-# collect the parameters to optimize
-hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
-embed_params = [p for n, p in model.named_parameters() if "embed" in n]
-scalar_params = [p for p in model.parameters() if p.ndim < 2]
-head_params = [model.lm_head.weight]
-gate_params = [p for n, p in model.named_parameters() if "gate" in n]
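-
-# Sanity-check sketch for the grouping above (hypothetical helper, not invoked
-# in the run): the five lists should partition model.parameters(), i.e. every
-# parameter lands in exactly one of the two optimizers below.
-def _check_param_groups():
-    groups = [hidden_matrix_params, embed_params, scalar_params, head_params, gate_params]
-    ids = [id(p) for g in groups for p in g]
-    assert len(ids) == len(set(ids)), "a parameter appears in more than one group"
-    assert set(ids) == {id(p) for p in model.parameters()}, "some parameter is unoptimized"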
-# init the optimizer(s)
-# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
-# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
-optimizer1 = DistAdam(
-    scalar_params + head_params + embed_params,
-    lr=0.008,
-    betas=(0.65, 0.95),
-    eps=1e-8,
-    weight_decay=0.0,
-)
-optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
-optimizers = [optimizer1, optimizer2]
-for opt in optimizers:
-    for group in opt.param_groups:
-        group["initial_lr"] = group["lr"]
-
-def get_lr(step: int):
-    assert step < args.num_iterations
-    # Three-part schedule: flat, linear decrease, flat
-    lr_schedule = args.lr_schedule
-    x = step / args.num_iterations
-
-    if x < lr_schedule[0]:
-        return 1.0
-    elif x < lr_schedule[1]:
-        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
-        lr = 1.0 - (1.0 - args.lr_min) * progress
-    else:
-        lr = args.lr_min
-    return lr
-
-def get_ws(step: int):
-    assert step <= args.num_iterations
-    x = step / (args.num_iterations + 1)
-    ws_idx = int(len(args.ws_schedule) * x)
-    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
-
-def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
-    # warmup phase: linearly increase momentum from min to max
-    # cooldown phase: linearly decrease momentum from max to min
-    momentum_cd_start = args.num_iterations - muon_cooldown_steps
-    if step < muon_warmup_steps:
-        frac = step / muon_warmup_steps
-        momentum = momentum_min + frac * (momentum_max - momentum_min)
-    elif step > momentum_cd_start:
-        frac = (step - momentum_cd_start) / muon_cooldown_steps
-        momentum = momentum_max - frac * (momentum_max - momentum_min)
-    else:
-        momentum = momentum_max
-    return momentum
-
-def step_optimizers(step: int, optimizers, model):
-    # update lr
-    for optimizer in optimizers:
-        for group in optimizer.param_groups:
-            group["lr"] = group["initial_lr"] * get_lr(step)
-
-    # set muon momentum based on step
-    momentum = get_muon_momentum(step)
-    for group in optimizers[1].param_groups:
-        group["momentum"] = momentum
-
-    # on even steps, only step the Muon params; on odd steps, step all params
-    if step % 2 == 0:
-        optimizers[1].step()
-        optimizers[1].zero_grad(set_to_none=True)
-    else:
-        for optimizer in optimizers:
-            optimizer.step()
-        model.zero_grad(set_to_none=True)
-
-model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
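-
-# Worked example of the schedules above, using the defaults (num_iterations=2285,
-# lr_schedule=(0.5, 0.98), lr_min=0.1, ws_schedule=(3, 5, 7, 9, 11, 13)); the
-# values below are approximate and shown for orientation only:
-#   get_lr(1000)  == 1.0       # x ~= 0.44 < 0.5: flat phase
-#   get_lr(1700)  ~= 0.54      # x ~= 0.74: about halfway through the linear decay
-#   get_lr(2280)  == 0.1       # x ~= 0.998 >= 0.98: flat tail at lr_min
-#   get_ws(0)     == (1, 3)    # first sixth of training: ws_long=3, ws_short=3//2
-#   get_ws(2285)  == (6, 13)   # final sixth: ws_long=13, ws_short=13//2
-#   get_muon_momentum(150)  == 0.90  # halfway through the 300-step warmup
-#   get_muon_momentum(2260) == 0.90  # halfway through the 50-step cooldown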
-########################################
-#            Warmup kernels            #
-########################################
-
-# Warmup the training kernels, then re-initialize the state so we aren't cheating
-warmup_steps = 30
-initial_state = dict(model=copy.deepcopy(model.state_dict()),
-                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-for step in range(warmup_steps):
-    inputs, targets, cum_seqlens = next(train_loader)
-    # each window size is a new graph, so warm up each one with the matching Yarn.attn_scale
-    ws_idx = step % len(args.ws_schedule)
-    if ws_idx == 0:
-        model.yarn.reset()
-        ws_long = args.ws_schedule[0]
-    else:
-        new_ws_long = args.ws_schedule[ws_idx]
-        if new_ws_long > ws_long:
-            model.yarn.apply(ws_long, new_ws_long)
-            ws_long = new_ws_long
-    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
-    for opt in optimizers:
-        opt.step()
-    model.zero_grad(set_to_none=True)
-model.yarn.reset()  # rotary buffer is not stored in the state_dict
-model.load_state_dict(initial_state["model"])
-optimizer2.reset()  # momentum buffer is not in the state dict
-for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
-    opt.load_state_dict(opt_state)
-del train_loader, initial_state
-
-########################################
-#        Training and validation       #
-########################################
-
-train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
-training_time_ms = 0
-# start the clock
-torch.cuda.synchronize()
-t0 = time.perf_counter()
-# begin training
-train_steps = args.num_iterations
-ws_short, ws_long = get_ws(0)
-for step in range(train_steps + 1):
-    last_step = (step == train_steps)
-    ws_short, new_ws_long = get_ws(step)
-    if new_ws_long != ws_long:
-        model.yarn.apply(ws_long, new_ws_long)
-        ws_long = new_ws_long
-
-    # --------------- VALIDATION SECTION -----------------
-    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
-        if last_step:
-            ws_long = args.ws_validate_post_yarn_ext
-        # stop the clock
-        torch.cuda.synchronize()
-        training_time_ms += 1000 * (time.perf_counter() - t0)
-        model.eval()
-        assert args.val_tokens % args.val_batch_size == 0
-        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
-        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
-        val_loss = 0
-        with torch.no_grad():
-            for _ in range(val_steps):
-                inputs, targets, cum_seqlens = next(val_loader)
-                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
-        val_loss /= val_steps
-        del val_loader
-        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
-        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
-        model.train()
-        # start the clock again
-        torch.cuda.synchronize()
-        t0 = time.perf_counter()
-
-    if last_step:
-        if master_process and args.save_checkpoint:
-            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
-            os.makedirs(f"logs/{run_id}", exist_ok=True)
-            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
-        # the last step only has the validation loop, so break to avoid training
-        break
-
-    # --------------- TRAINING SECTION -----------------
-    loss = 0
-    for _ in range(grad_accum_steps):
-        inputs, targets, cum_seqlens = next(train_loader)
-        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
-    loss.backward()
-    step_optimizers(step, optimizers, model)
-
-    # logging
-    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
-    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
-
-print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
-       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
-dist.destroy_process_group()
-
-====================================================================================================
-Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
-Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
-Running Triton version 3.5.0
-Tue Oct 28 02:08:52 2025
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 
550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | -| N/A 40C P0 128W / 700W | 5858MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | -| N/A 33C P0 127W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | -| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | -| N/A 37C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | -| N/A 39C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | -| N/A 32C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | -| N/A 38C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ -| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | -| N/A 31C P0 114W / 700W | 1520MiB / 81559MiB | 0% Default | -| | | Disabled | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -+-----------------------------------------------------------------------------------------+ - -==================================================================================================== -step:0/2285 val_loss:10.8258 train_time:0ms step_avg:0.02ms -step:1/2285 train_time:109ms step_avg:109.38ms -step:2/2285 train_time:130ms step_avg:64.89ms -step:3/2285 train_time:168ms step_avg:56.07ms -step:4/2285 train_time:224ms step_avg:56.11ms -step:5/2285 train_time:284ms step_avg:56.71ms -step:6/2285 train_time:342ms step_avg:56.96ms -step:7/2285 train_time:402ms step_avg:57.43ms -step:8/2285 train_time:460ms step_avg:57.54ms -step:9/2285 train_time:521ms step_avg:57.89ms -step:10/2285 train_time:580ms step_avg:57.95ms -step:11/2285 train_time:640ms step_avg:58.21ms -step:12/2285 train_time:699ms step_avg:58.22ms -step:13/2285 train_time:759ms step_avg:58.39ms -step:14/2285 train_time:818ms step_avg:58.41ms -step:15/2285 train_time:878ms step_avg:58.55ms -step:16/2285 train_time:937ms step_avg:58.54ms -step:17/2285 
train_time:999ms step_avg:58.76ms -step:18/2285 train_time:1061ms step_avg:58.93ms -step:19/2285 train_time:1126ms step_avg:59.24ms -step:20/2285 train_time:1186ms step_avg:59.32ms -step:21/2285 train_time:1248ms step_avg:59.42ms -step:22/2285 train_time:1307ms step_avg:59.40ms -step:23/2285 train_time:1368ms step_avg:59.46ms -step:24/2285 train_time:1426ms step_avg:59.42ms -step:25/2285 train_time:1487ms step_avg:59.49ms -step:26/2285 train_time:1546ms step_avg:59.46ms -step:27/2285 train_time:1607ms step_avg:59.53ms -step:28/2285 train_time:1666ms step_avg:59.49ms -step:29/2285 train_time:1727ms step_avg:59.55ms -step:30/2285 train_time:1786ms step_avg:59.53ms -step:31/2285 train_time:1847ms step_avg:59.59ms -step:32/2285 train_time:1906ms step_avg:59.56ms -step:33/2285 train_time:1968ms step_avg:59.63ms -step:34/2285 train_time:2027ms step_avg:59.63ms -step:35/2285 train_time:2090ms step_avg:59.71ms -step:36/2285 train_time:2149ms step_avg:59.70ms -step:37/2285 train_time:2211ms step_avg:59.76ms -step:38/2285 train_time:2270ms step_avg:59.73ms -step:39/2285 train_time:2331ms step_avg:59.78ms -step:40/2285 train_time:2390ms step_avg:59.76ms -step:41/2285 train_time:2452ms step_avg:59.81ms -step:42/2285 train_time:2512ms step_avg:59.80ms -step:43/2285 train_time:2573ms step_avg:59.83ms -step:44/2285 train_time:2632ms step_avg:59.82ms -step:45/2285 train_time:2693ms step_avg:59.85ms -step:46/2285 train_time:2752ms step_avg:59.83ms -step:47/2285 train_time:2813ms step_avg:59.86ms -step:48/2285 train_time:2873ms step_avg:59.85ms -step:49/2285 train_time:2934ms step_avg:59.89ms -step:50/2285 train_time:2995ms step_avg:59.89ms -step:51/2285 train_time:3057ms step_avg:59.94ms -step:52/2285 train_time:3117ms step_avg:59.94ms -step:53/2285 train_time:3178ms step_avg:59.97ms -step:54/2285 train_time:3237ms step_avg:59.94ms -step:55/2285 train_time:3299ms step_avg:59.98ms -step:56/2285 train_time:3358ms step_avg:59.96ms -step:57/2285 train_time:3419ms step_avg:59.99ms -step:58/2285 train_time:3479ms step_avg:59.98ms -step:59/2285 train_time:3542ms step_avg:60.03ms -step:60/2285 train_time:3601ms step_avg:60.01ms -step:61/2285 train_time:3662ms step_avg:60.03ms -step:62/2285 train_time:3721ms step_avg:60.01ms -step:63/2285 train_time:3782ms step_avg:60.04ms -step:64/2285 train_time:3842ms step_avg:60.02ms -step:65/2285 train_time:3903ms step_avg:60.05ms -step:66/2285 train_time:3963ms step_avg:60.05ms -step:67/2285 train_time:4025ms step_avg:60.07ms -step:68/2285 train_time:4083ms step_avg:60.05ms -step:69/2285 train_time:4145ms step_avg:60.07ms -step:70/2285 train_time:4204ms step_avg:60.05ms -step:71/2285 train_time:4265ms step_avg:60.07ms -step:72/2285 train_time:4324ms step_avg:60.05ms -step:73/2285 train_time:4385ms step_avg:60.06ms -step:74/2285 train_time:4444ms step_avg:60.06ms -step:75/2285 train_time:4506ms step_avg:60.08ms -step:76/2285 train_time:4565ms step_avg:60.06ms -step:77/2285 train_time:4626ms step_avg:60.08ms -step:78/2285 train_time:4685ms step_avg:60.06ms -step:79/2285 train_time:4747ms step_avg:60.08ms -step:80/2285 train_time:4806ms step_avg:60.08ms -step:81/2285 train_time:4868ms step_avg:60.10ms -step:82/2285 train_time:4927ms step_avg:60.08ms -step:83/2285 train_time:4988ms step_avg:60.10ms -step:84/2285 train_time:5048ms step_avg:60.09ms -step:85/2285 train_time:5109ms step_avg:60.11ms -step:86/2285 train_time:5168ms step_avg:60.10ms -step:87/2285 train_time:5229ms step_avg:60.11ms -step:88/2285 train_time:5288ms step_avg:60.09ms -step:89/2285 train_time:5349ms 
step_avg:60.10ms -step:90/2285 train_time:5408ms step_avg:60.09ms -step:91/2285 train_time:5469ms step_avg:60.10ms -step:92/2285 train_time:5529ms step_avg:60.10ms -step:93/2285 train_time:5590ms step_avg:60.11ms -step:94/2285 train_time:5650ms step_avg:60.11ms -step:95/2285 train_time:5712ms step_avg:60.12ms -step:96/2285 train_time:5770ms step_avg:60.11ms -step:97/2285 train_time:5832ms step_avg:60.12ms -step:98/2285 train_time:5892ms step_avg:60.12ms -step:99/2285 train_time:5953ms step_avg:60.13ms -step:100/2285 train_time:6012ms step_avg:60.12ms -step:101/2285 train_time:6073ms step_avg:60.13ms -step:102/2285 train_time:6131ms step_avg:60.11ms -step:103/2285 train_time:6193ms step_avg:60.13ms -step:104/2285 train_time:6252ms step_avg:60.12ms -step:105/2285 train_time:6313ms step_avg:60.12ms -step:106/2285 train_time:6372ms step_avg:60.11ms -step:107/2285 train_time:6433ms step_avg:60.12ms -step:108/2285 train_time:6493ms step_avg:60.12ms -step:109/2285 train_time:6555ms step_avg:60.14ms -step:110/2285 train_time:6614ms step_avg:60.13ms -step:111/2285 train_time:6675ms step_avg:60.14ms -step:112/2285 train_time:6734ms step_avg:60.13ms -step:113/2285 train_time:6796ms step_avg:60.14ms -step:114/2285 train_time:6855ms step_avg:60.14ms -step:115/2285 train_time:6917ms step_avg:60.15ms -step:116/2285 train_time:6976ms step_avg:60.14ms -step:117/2285 train_time:7037ms step_avg:60.15ms -step:118/2285 train_time:7096ms step_avg:60.13ms -step:119/2285 train_time:7158ms step_avg:60.15ms -step:120/2285 train_time:7217ms step_avg:60.14ms -step:121/2285 train_time:7278ms step_avg:60.15ms -step:122/2285 train_time:7337ms step_avg:60.14ms -step:123/2285 train_time:7398ms step_avg:60.14ms -step:124/2285 train_time:7456ms step_avg:60.13ms -step:125/2285 train_time:7518ms step_avg:60.14ms -step:126/2285 train_time:7577ms step_avg:60.13ms -step:127/2285 train_time:7638ms step_avg:60.14ms -step:128/2285 train_time:7697ms step_avg:60.13ms -step:129/2285 train_time:7758ms step_avg:60.14ms -step:130/2285 train_time:7817ms step_avg:60.13ms -step:131/2285 train_time:7878ms step_avg:60.14ms -step:132/2285 train_time:7937ms step_avg:60.13ms -step:133/2285 train_time:7999ms step_avg:60.14ms -step:134/2285 train_time:8058ms step_avg:60.13ms -step:135/2285 train_time:8118ms step_avg:60.14ms -step:136/2285 train_time:8177ms step_avg:60.12ms -step:137/2285 train_time:8238ms step_avg:60.13ms -step:138/2285 train_time:8296ms step_avg:60.12ms -step:139/2285 train_time:8357ms step_avg:60.12ms -step:140/2285 train_time:8417ms step_avg:60.12ms -step:141/2285 train_time:8479ms step_avg:60.13ms -step:142/2285 train_time:8537ms step_avg:60.12ms -step:143/2285 train_time:8599ms step_avg:60.13ms -step:144/2285 train_time:8658ms step_avg:60.12ms -step:145/2285 train_time:8719ms step_avg:60.13ms -step:146/2285 train_time:8778ms step_avg:60.12ms -step:147/2285 train_time:8839ms step_avg:60.13ms -step:148/2285 train_time:8898ms step_avg:60.12ms -step:149/2285 train_time:8959ms step_avg:60.13ms -step:150/2285 train_time:9018ms step_avg:60.12ms -step:151/2285 train_time:9079ms step_avg:60.13ms -step:152/2285 train_time:9138ms step_avg:60.12ms -step:153/2285 train_time:9199ms step_avg:60.12ms -step:154/2285 train_time:9257ms step_avg:60.11ms -step:155/2285 train_time:9318ms step_avg:60.12ms -step:156/2285 train_time:9377ms step_avg:60.11ms -step:157/2285 train_time:9438ms step_avg:60.11ms -step:158/2285 train_time:9497ms step_avg:60.11ms -step:159/2285 train_time:9558ms step_avg:60.11ms -step:160/2285 train_time:9617ms 
step_avg:60.10ms -step:161/2285 train_time:9678ms step_avg:60.11ms -step:162/2285 train_time:9737ms step_avg:60.10ms -step:163/2285 train_time:9798ms step_avg:60.11ms -step:164/2285 train_time:9856ms step_avg:60.10ms -step:165/2285 train_time:9918ms step_avg:60.11ms -step:166/2285 train_time:9976ms step_avg:60.10ms -step:167/2285 train_time:10037ms step_avg:60.10ms -step:168/2285 train_time:10096ms step_avg:60.10ms -step:169/2285 train_time:10158ms step_avg:60.10ms -step:170/2285 train_time:10216ms step_avg:60.09ms -step:171/2285 train_time:10277ms step_avg:60.10ms -step:172/2285 train_time:10335ms step_avg:60.09ms -step:173/2285 train_time:10397ms step_avg:60.10ms -step:174/2285 train_time:10455ms step_avg:60.09ms -step:175/2285 train_time:10516ms step_avg:60.09ms -step:176/2285 train_time:10575ms step_avg:60.08ms -step:177/2285 train_time:10637ms step_avg:60.09ms -step:178/2285 train_time:10696ms step_avg:60.09ms -step:179/2285 train_time:10757ms step_avg:60.09ms -step:180/2285 train_time:10815ms step_avg:60.09ms -step:181/2285 train_time:10876ms step_avg:60.09ms -step:182/2285 train_time:10935ms step_avg:60.08ms -step:183/2285 train_time:10997ms step_avg:60.09ms -step:184/2285 train_time:11056ms step_avg:60.08ms -step:185/2285 train_time:11117ms step_avg:60.09ms -step:186/2285 train_time:11176ms step_avg:60.08ms -step:187/2285 train_time:11237ms step_avg:60.09ms -step:188/2285 train_time:11296ms step_avg:60.08ms -step:189/2285 train_time:11357ms step_avg:60.09ms -step:190/2285 train_time:11416ms step_avg:60.08ms -step:191/2285 train_time:11477ms step_avg:60.09ms -step:192/2285 train_time:11536ms step_avg:60.08ms -step:193/2285 train_time:11597ms step_avg:60.09ms -step:194/2285 train_time:11655ms step_avg:60.08ms -step:195/2285 train_time:11716ms step_avg:60.08ms -step:196/2285 train_time:11775ms step_avg:60.08ms -step:197/2285 train_time:11837ms step_avg:60.08ms -step:198/2285 train_time:11896ms step_avg:60.08ms -step:199/2285 train_time:11957ms step_avg:60.08ms -step:200/2285 train_time:12015ms step_avg:60.08ms -step:201/2285 train_time:12076ms step_avg:60.08ms -step:202/2285 train_time:12135ms step_avg:60.07ms -step:203/2285 train_time:12196ms step_avg:60.08ms -step:204/2285 train_time:12255ms step_avg:60.07ms -step:205/2285 train_time:12316ms step_avg:60.08ms -step:206/2285 train_time:12375ms step_avg:60.07ms -step:207/2285 train_time:12436ms step_avg:60.08ms -step:208/2285 train_time:12495ms step_avg:60.07ms -step:209/2285 train_time:12556ms step_avg:60.08ms -step:210/2285 train_time:12615ms step_avg:60.07ms -step:211/2285 train_time:12676ms step_avg:60.08ms -step:212/2285 train_time:12735ms step_avg:60.07ms -step:213/2285 train_time:12797ms step_avg:60.08ms -step:214/2285 train_time:12856ms step_avg:60.07ms -step:215/2285 train_time:12917ms step_avg:60.08ms -step:216/2285 train_time:12975ms step_avg:60.07ms -step:217/2285 train_time:13036ms step_avg:60.07ms -step:218/2285 train_time:13095ms step_avg:60.07ms -step:219/2285 train_time:13156ms step_avg:60.07ms -step:220/2285 train_time:13215ms step_avg:60.07ms -step:221/2285 train_time:13276ms step_avg:60.07ms -step:222/2285 train_time:13334ms step_avg:60.06ms -step:223/2285 train_time:13395ms step_avg:60.07ms -step:224/2285 train_time:13454ms step_avg:60.06ms -step:225/2285 train_time:13515ms step_avg:60.07ms -step:226/2285 train_time:13574ms step_avg:60.06ms -step:227/2285 train_time:13635ms step_avg:60.07ms -step:228/2285 train_time:13694ms step_avg:60.06ms -step:229/2285 train_time:13756ms step_avg:60.07ms -step:230/2285 
train_time:13815ms step_avg:60.06ms -step:231/2285 train_time:13876ms step_avg:60.07ms -step:232/2285 train_time:13934ms step_avg:60.06ms -step:233/2285 train_time:13996ms step_avg:60.07ms -step:234/2285 train_time:14054ms step_avg:60.06ms -step:235/2285 train_time:14115ms step_avg:60.06ms -step:236/2285 train_time:14174ms step_avg:60.06ms -step:237/2285 train_time:14235ms step_avg:60.06ms -step:238/2285 train_time:14295ms step_avg:60.06ms -step:239/2285 train_time:14356ms step_avg:60.07ms -step:240/2285 train_time:14415ms step_avg:60.06ms -step:241/2285 train_time:14476ms step_avg:60.07ms -step:242/2285 train_time:14534ms step_avg:60.06ms -step:243/2285 train_time:14596ms step_avg:60.06ms -step:244/2285 train_time:14655ms step_avg:60.06ms -step:245/2285 train_time:14716ms step_avg:60.06ms -step:246/2285 train_time:14774ms step_avg:60.06ms -step:247/2285 train_time:14835ms step_avg:60.06ms -step:248/2285 train_time:14894ms step_avg:60.06ms -step:249/2285 train_time:14955ms step_avg:60.06ms -step:250/2285 train_time:15014ms step_avg:60.05ms -step:250/2285 val_loss:4.0722 train_time:15076ms step_avg:60.30ms -step:251/2285 train_time:15094ms step_avg:60.14ms -step:252/2285 train_time:15135ms step_avg:60.06ms -step:253/2285 train_time:15202ms step_avg:60.09ms -step:254/2285 train_time:15264ms step_avg:60.09ms -step:255/2285 train_time:15327ms step_avg:60.11ms -step:256/2285 train_time:15387ms step_avg:60.10ms -step:257/2285 train_time:15448ms step_avg:60.11ms -step:258/2285 train_time:15506ms step_avg:60.10ms -step:259/2285 train_time:15567ms step_avg:60.10ms -step:260/2285 train_time:15625ms step_avg:60.09ms -step:261/2285 train_time:15684ms step_avg:60.09ms -step:262/2285 train_time:15742ms step_avg:60.08ms -step:263/2285 train_time:15802ms step_avg:60.08ms -step:264/2285 train_time:15860ms step_avg:60.08ms -step:265/2285 train_time:15920ms step_avg:60.07ms -step:266/2285 train_time:15977ms step_avg:60.07ms -step:267/2285 train_time:16037ms step_avg:60.07ms -step:268/2285 train_time:16096ms step_avg:60.06ms -step:269/2285 train_time:16157ms step_avg:60.06ms -step:270/2285 train_time:16216ms step_avg:60.06ms -step:271/2285 train_time:16279ms step_avg:60.07ms -step:272/2285 train_time:16338ms step_avg:60.07ms -step:273/2285 train_time:16399ms step_avg:60.07ms -step:274/2285 train_time:16458ms step_avg:60.07ms -step:275/2285 train_time:16519ms step_avg:60.07ms -step:276/2285 train_time:16578ms step_avg:60.07ms -step:277/2285 train_time:16639ms step_avg:60.07ms -step:278/2285 train_time:16698ms step_avg:60.06ms -step:279/2285 train_time:16758ms step_avg:60.07ms -step:280/2285 train_time:16816ms step_avg:60.06ms -step:281/2285 train_time:16877ms step_avg:60.06ms -step:282/2285 train_time:16936ms step_avg:60.06ms -step:283/2285 train_time:16996ms step_avg:60.06ms -step:284/2285 train_time:17054ms step_avg:60.05ms -step:285/2285 train_time:17115ms step_avg:60.05ms -step:286/2285 train_time:17173ms step_avg:60.05ms -step:287/2285 train_time:17235ms step_avg:60.05ms -step:288/2285 train_time:17293ms step_avg:60.05ms -step:289/2285 train_time:17355ms step_avg:60.05ms -step:290/2285 train_time:17414ms step_avg:60.05ms -step:291/2285 train_time:17476ms step_avg:60.05ms -step:292/2285 train_time:17535ms step_avg:60.05ms -step:293/2285 train_time:17596ms step_avg:60.06ms -step:294/2285 train_time:17655ms step_avg:60.05ms -step:295/2285 train_time:17716ms step_avg:60.05ms -step:296/2285 train_time:17774ms step_avg:60.05ms -step:297/2285 train_time:17835ms step_avg:60.05ms -step:298/2285 train_time:17894ms 
step_avg:60.05ms -step:299/2285 train_time:17954ms step_avg:60.05ms -step:300/2285 train_time:18012ms step_avg:60.04ms -step:301/2285 train_time:18072ms step_avg:60.04ms -step:302/2285 train_time:18130ms step_avg:60.03ms -step:303/2285 train_time:18191ms step_avg:60.04ms -step:304/2285 train_time:18250ms step_avg:60.03ms -step:305/2285 train_time:18311ms step_avg:60.04ms -step:306/2285 train_time:18370ms step_avg:60.03ms -step:307/2285 train_time:18431ms step_avg:60.04ms -step:308/2285 train_time:18490ms step_avg:60.03ms -step:309/2285 train_time:18552ms step_avg:60.04ms -step:310/2285 train_time:18611ms step_avg:60.03ms -step:311/2285 train_time:18672ms step_avg:60.04ms -step:312/2285 train_time:18730ms step_avg:60.03ms -step:313/2285 train_time:18791ms step_avg:60.04ms -step:314/2285 train_time:18850ms step_avg:60.03ms -step:315/2285 train_time:18911ms step_avg:60.03ms -step:316/2285 train_time:18969ms step_avg:60.03ms -step:317/2285 train_time:19030ms step_avg:60.03ms -step:318/2285 train_time:19088ms step_avg:60.02ms -step:319/2285 train_time:19148ms step_avg:60.03ms -step:320/2285 train_time:19207ms step_avg:60.02ms -step:321/2285 train_time:19269ms step_avg:60.03ms -step:322/2285 train_time:19328ms step_avg:60.02ms -step:323/2285 train_time:19390ms step_avg:60.03ms -step:324/2285 train_time:19449ms step_avg:60.03ms -step:325/2285 train_time:19511ms step_avg:60.03ms -step:326/2285 train_time:19570ms step_avg:60.03ms -step:327/2285 train_time:19631ms step_avg:60.03ms -step:328/2285 train_time:19690ms step_avg:60.03ms -step:329/2285 train_time:19752ms step_avg:60.04ms -step:330/2285 train_time:19810ms step_avg:60.03ms -step:331/2285 train_time:19871ms step_avg:60.03ms -step:332/2285 train_time:19930ms step_avg:60.03ms -step:333/2285 train_time:19990ms step_avg:60.03ms -step:334/2285 train_time:20048ms step_avg:60.03ms -step:335/2285 train_time:20109ms step_avg:60.03ms -step:336/2285 train_time:20167ms step_avg:60.02ms -step:337/2285 train_time:20228ms step_avg:60.02ms -step:338/2285 train_time:20286ms step_avg:60.02ms -step:339/2285 train_time:20348ms step_avg:60.02ms -step:340/2285 train_time:20407ms step_avg:60.02ms -step:341/2285 train_time:20468ms step_avg:60.02ms -step:342/2285 train_time:20526ms step_avg:60.02ms -step:343/2285 train_time:20588ms step_avg:60.02ms -step:344/2285 train_time:20647ms step_avg:60.02ms -step:345/2285 train_time:20709ms step_avg:60.03ms -step:346/2285 train_time:20768ms step_avg:60.02ms -step:347/2285 train_time:20829ms step_avg:60.02ms -step:348/2285 train_time:20887ms step_avg:60.02ms -step:349/2285 train_time:20948ms step_avg:60.02ms -step:350/2285 train_time:21007ms step_avg:60.02ms -step:351/2285 train_time:21067ms step_avg:60.02ms -step:352/2285 train_time:21126ms step_avg:60.02ms -step:353/2285 train_time:21186ms step_avg:60.02ms -step:354/2285 train_time:21245ms step_avg:60.01ms -step:355/2285 train_time:21306ms step_avg:60.02ms -step:356/2285 train_time:21365ms step_avg:60.01ms -step:357/2285 train_time:21426ms step_avg:60.02ms -step:358/2285 train_time:21485ms step_avg:60.02ms -step:359/2285 train_time:21547ms step_avg:60.02ms -step:360/2285 train_time:21606ms step_avg:60.02ms -step:361/2285 train_time:21667ms step_avg:60.02ms -step:362/2285 train_time:21726ms step_avg:60.02ms -step:363/2285 train_time:21787ms step_avg:60.02ms -step:364/2285 train_time:21846ms step_avg:60.02ms -step:365/2285 train_time:21907ms step_avg:60.02ms -step:366/2285 train_time:21965ms step_avg:60.01ms -step:367/2285 train_time:22027ms step_avg:60.02ms -step:368/2285 
train_time:22085ms step_avg:60.01ms -step:369/2285 train_time:22146ms step_avg:60.02ms -step:370/2285 train_time:22204ms step_avg:60.01ms -step:371/2285 train_time:22264ms step_avg:60.01ms -step:372/2285 train_time:22323ms step_avg:60.01ms -step:373/2285 train_time:22384ms step_avg:60.01ms -step:374/2285 train_time:22443ms step_avg:60.01ms -step:375/2285 train_time:22504ms step_avg:60.01ms -step:376/2285 train_time:22563ms step_avg:60.01ms -step:377/2285 train_time:22624ms step_avg:60.01ms -step:378/2285 train_time:22683ms step_avg:60.01ms -step:379/2285 train_time:22745ms step_avg:60.01ms -step:380/2285 train_time:22804ms step_avg:60.01ms -step:381/2285 train_time:22865ms step_avg:60.01ms -step:382/2285 train_time:22924ms step_avg:60.01ms -step:383/2285 train_time:22985ms step_avg:60.01ms -step:384/2285 train_time:23044ms step_avg:60.01ms -step:385/2285 train_time:23105ms step_avg:60.01ms -step:386/2285 train_time:23164ms step_avg:60.01ms -step:387/2285 train_time:23225ms step_avg:60.01ms -step:388/2285 train_time:23284ms step_avg:60.01ms -step:389/2285 train_time:23345ms step_avg:60.01ms -step:390/2285 train_time:23404ms step_avg:60.01ms -step:391/2285 train_time:23466ms step_avg:60.01ms -step:392/2285 train_time:23525ms step_avg:60.01ms -step:393/2285 train_time:23587ms step_avg:60.02ms -step:394/2285 train_time:23648ms step_avg:60.02ms -step:395/2285 train_time:23709ms step_avg:60.02ms -step:396/2285 train_time:23769ms step_avg:60.02ms -step:397/2285 train_time:23830ms step_avg:60.03ms -step:398/2285 train_time:23889ms step_avg:60.02ms -step:399/2285 train_time:23951ms step_avg:60.03ms -step:400/2285 train_time:24010ms step_avg:60.02ms -step:401/2285 train_time:24071ms step_avg:60.03ms -step:402/2285 train_time:24130ms step_avg:60.03ms -step:403/2285 train_time:24192ms step_avg:60.03ms -step:404/2285 train_time:24251ms step_avg:60.03ms -step:405/2285 train_time:24312ms step_avg:60.03ms -step:406/2285 train_time:24371ms step_avg:60.03ms -step:407/2285 train_time:24433ms step_avg:60.03ms -step:408/2285 train_time:24492ms step_avg:60.03ms -step:409/2285 train_time:24554ms step_avg:60.03ms -step:410/2285 train_time:24614ms step_avg:60.03ms -step:411/2285 train_time:24675ms step_avg:60.04ms -step:412/2285 train_time:24735ms step_avg:60.04ms -step:413/2285 train_time:24796ms step_avg:60.04ms -step:414/2285 train_time:24856ms step_avg:60.04ms -step:415/2285 train_time:24917ms step_avg:60.04ms -step:416/2285 train_time:24976ms step_avg:60.04ms -step:417/2285 train_time:25037ms step_avg:60.04ms -step:418/2285 train_time:25095ms step_avg:60.04ms -step:419/2285 train_time:25157ms step_avg:60.04ms -step:420/2285 train_time:25216ms step_avg:60.04ms -step:421/2285 train_time:25277ms step_avg:60.04ms -step:422/2285 train_time:25335ms step_avg:60.04ms -step:423/2285 train_time:25397ms step_avg:60.04ms -step:424/2285 train_time:25456ms step_avg:60.04ms -step:425/2285 train_time:25518ms step_avg:60.04ms -step:426/2285 train_time:25576ms step_avg:60.04ms -step:427/2285 train_time:25638ms step_avg:60.04ms -step:428/2285 train_time:25697ms step_avg:60.04ms -step:429/2285 train_time:25758ms step_avg:60.04ms -step:430/2285 train_time:25817ms step_avg:60.04ms -step:431/2285 train_time:25879ms step_avg:60.04ms -step:432/2285 train_time:25937ms step_avg:60.04ms -step:433/2285 train_time:25999ms step_avg:60.04ms -step:434/2285 train_time:26058ms step_avg:60.04ms -step:435/2285 train_time:26120ms step_avg:60.05ms -step:436/2285 train_time:26179ms step_avg:60.04ms -step:437/2285 train_time:26239ms step_avg:60.04ms 
-step:438/2285 train_time:26299ms step_avg:60.04ms -step:439/2285 train_time:26360ms step_avg:60.04ms -step:440/2285 train_time:26419ms step_avg:60.04ms -step:441/2285 train_time:26479ms step_avg:60.04ms -step:442/2285 train_time:26538ms step_avg:60.04ms -step:443/2285 train_time:26599ms step_avg:60.04ms -step:444/2285 train_time:26658ms step_avg:60.04ms -step:445/2285 train_time:26719ms step_avg:60.04ms -step:446/2285 train_time:26778ms step_avg:60.04ms -step:447/2285 train_time:26840ms step_avg:60.04ms -step:448/2285 train_time:26899ms step_avg:60.04ms -step:449/2285 train_time:26960ms step_avg:60.05ms -step:450/2285 train_time:27019ms step_avg:60.04ms -step:451/2285 train_time:27081ms step_avg:60.05ms -step:452/2285 train_time:27140ms step_avg:60.04ms -step:453/2285 train_time:27201ms step_avg:60.05ms -step:454/2285 train_time:27260ms step_avg:60.04ms -step:455/2285 train_time:27321ms step_avg:60.05ms -step:456/2285 train_time:27380ms step_avg:60.04ms -step:457/2285 train_time:27441ms step_avg:60.05ms -step:458/2285 train_time:27500ms step_avg:60.04ms -step:459/2285 train_time:27561ms step_avg:60.05ms -step:460/2285 train_time:27620ms step_avg:60.04ms -step:461/2285 train_time:27681ms step_avg:60.05ms -step:462/2285 train_time:27740ms step_avg:60.04ms -step:463/2285 train_time:27801ms step_avg:60.05ms -step:464/2285 train_time:27860ms step_avg:60.04ms -step:465/2285 train_time:27921ms step_avg:60.04ms -step:466/2285 train_time:27980ms step_avg:60.04ms -step:467/2285 train_time:28041ms step_avg:60.05ms -step:468/2285 train_time:28100ms step_avg:60.04ms -step:469/2285 train_time:28162ms step_avg:60.05ms -step:470/2285 train_time:28220ms step_avg:60.04ms -step:471/2285 train_time:28281ms step_avg:60.05ms -step:472/2285 train_time:28340ms step_avg:60.04ms -step:473/2285 train_time:28401ms step_avg:60.04ms -step:474/2285 train_time:28461ms step_avg:60.04ms -step:475/2285 train_time:28522ms step_avg:60.05ms -step:476/2285 train_time:28581ms step_avg:60.04ms -step:477/2285 train_time:28642ms step_avg:60.05ms -step:478/2285 train_time:28702ms step_avg:60.05ms -step:479/2285 train_time:28763ms step_avg:60.05ms -step:480/2285 train_time:28822ms step_avg:60.05ms -step:481/2285 train_time:28883ms step_avg:60.05ms -step:482/2285 train_time:28942ms step_avg:60.05ms -step:483/2285 train_time:29003ms step_avg:60.05ms -step:484/2285 train_time:29062ms step_avg:60.05ms -step:485/2285 train_time:29124ms step_avg:60.05ms -step:486/2285 train_time:29182ms step_avg:60.05ms -step:487/2285 train_time:29244ms step_avg:60.05ms -step:488/2285 train_time:29302ms step_avg:60.05ms -step:489/2285 train_time:29364ms step_avg:60.05ms -step:490/2285 train_time:29423ms step_avg:60.05ms -step:491/2285 train_time:29484ms step_avg:60.05ms -step:492/2285 train_time:29543ms step_avg:60.05ms -step:493/2285 train_time:29604ms step_avg:60.05ms -step:494/2285 train_time:29663ms step_avg:60.05ms -step:495/2285 train_time:29724ms step_avg:60.05ms -step:496/2285 train_time:29782ms step_avg:60.04ms -step:497/2285 train_time:29844ms step_avg:60.05ms -step:498/2285 train_time:29903ms step_avg:60.05ms -step:499/2285 train_time:29964ms step_avg:60.05ms -step:500/2285 train_time:30023ms step_avg:60.05ms -step:500/2285 val_loss:3.7848 train_time:30086ms step_avg:60.17ms -step:501/2285 train_time:30105ms step_avg:60.09ms -step:502/2285 train_time:30146ms step_avg:60.05ms -step:503/2285 train_time:30206ms step_avg:60.05ms -step:504/2285 train_time:30266ms step_avg:60.05ms -step:505/2285 train_time:30330ms step_avg:60.06ms -step:506/2285 
train_time:30389ms step_avg:60.06ms -step:507/2285 train_time:30449ms step_avg:60.06ms -step:508/2285 train_time:30507ms step_avg:60.05ms -step:509/2285 train_time:30568ms step_avg:60.06ms -step:510/2285 train_time:30627ms step_avg:60.05ms -step:511/2285 train_time:30687ms step_avg:60.05ms -step:512/2285 train_time:30745ms step_avg:60.05ms -step:513/2285 train_time:30807ms step_avg:60.05ms -step:514/2285 train_time:30867ms step_avg:60.05ms -step:515/2285 train_time:30928ms step_avg:60.05ms -step:516/2285 train_time:30988ms step_avg:60.05ms -step:517/2285 train_time:31055ms step_avg:60.07ms -step:518/2285 train_time:31115ms step_avg:60.07ms -step:519/2285 train_time:31178ms step_avg:60.07ms -step:520/2285 train_time:31236ms step_avg:60.07ms -step:521/2285 train_time:31298ms step_avg:60.07ms -step:522/2285 train_time:31357ms step_avg:60.07ms -step:523/2285 train_time:31417ms step_avg:60.07ms -step:524/2285 train_time:31476ms step_avg:60.07ms -step:525/2285 train_time:31537ms step_avg:60.07ms -step:526/2285 train_time:31596ms step_avg:60.07ms -step:527/2285 train_time:31657ms step_avg:60.07ms -step:528/2285 train_time:31716ms step_avg:60.07ms -step:529/2285 train_time:31777ms step_avg:60.07ms -step:530/2285 train_time:31837ms step_avg:60.07ms -step:531/2285 train_time:31899ms step_avg:60.07ms -step:532/2285 train_time:31958ms step_avg:60.07ms -step:533/2285 train_time:32019ms step_avg:60.07ms -step:534/2285 train_time:32079ms step_avg:60.07ms -step:535/2285 train_time:32140ms step_avg:60.08ms -step:536/2285 train_time:32199ms step_avg:60.07ms -step:537/2285 train_time:32261ms step_avg:60.08ms -step:538/2285 train_time:32320ms step_avg:60.07ms -step:539/2285 train_time:32381ms step_avg:60.08ms -step:540/2285 train_time:32440ms step_avg:60.07ms -step:541/2285 train_time:32501ms step_avg:60.08ms -step:542/2285 train_time:32560ms step_avg:60.07ms -step:543/2285 train_time:32622ms step_avg:60.08ms -step:544/2285 train_time:32680ms step_avg:60.07ms -step:545/2285 train_time:32741ms step_avg:60.08ms -step:546/2285 train_time:32800ms step_avg:60.07ms -step:547/2285 train_time:32862ms step_avg:60.08ms -step:548/2285 train_time:32921ms step_avg:60.07ms -step:549/2285 train_time:32982ms step_avg:60.08ms -step:550/2285 train_time:33041ms step_avg:60.07ms -step:551/2285 train_time:33102ms step_avg:60.08ms -step:552/2285 train_time:33161ms step_avg:60.07ms -step:553/2285 train_time:33223ms step_avg:60.08ms -step:554/2285 train_time:33282ms step_avg:60.08ms -step:555/2285 train_time:33344ms step_avg:60.08ms -step:556/2285 train_time:33403ms step_avg:60.08ms -step:557/2285 train_time:33465ms step_avg:60.08ms -step:558/2285 train_time:33525ms step_avg:60.08ms -step:559/2285 train_time:33586ms step_avg:60.08ms -step:560/2285 train_time:33645ms step_avg:60.08ms -step:561/2285 train_time:33707ms step_avg:60.08ms -step:562/2285 train_time:33766ms step_avg:60.08ms -step:563/2285 train_time:33828ms step_avg:60.08ms -step:564/2285 train_time:33887ms step_avg:60.08ms -step:565/2285 train_time:33948ms step_avg:60.09ms -step:566/2285 train_time:34008ms step_avg:60.08ms -step:567/2285 train_time:34069ms step_avg:60.09ms -step:568/2285 train_time:34129ms step_avg:60.09ms -step:569/2285 train_time:34190ms step_avg:60.09ms -step:570/2285 train_time:34249ms step_avg:60.09ms -step:571/2285 train_time:34311ms step_avg:60.09ms -step:572/2285 train_time:34369ms step_avg:60.09ms -step:573/2285 train_time:34431ms step_avg:60.09ms -step:574/2285 train_time:34490ms step_avg:60.09ms -step:575/2285 train_time:34551ms step_avg:60.09ms 
-step:576/2285 train_time:34610ms step_avg:60.09ms -step:577/2285 train_time:34671ms step_avg:60.09ms -step:578/2285 train_time:34730ms step_avg:60.09ms -step:579/2285 train_time:34792ms step_avg:60.09ms -step:580/2285 train_time:34851ms step_avg:60.09ms -step:581/2285 train_time:34913ms step_avg:60.09ms -step:582/2285 train_time:34972ms step_avg:60.09ms -step:583/2285 train_time:35033ms step_avg:60.09ms -step:584/2285 train_time:35092ms step_avg:60.09ms -step:585/2285 train_time:35154ms step_avg:60.09ms -step:586/2285 train_time:35213ms step_avg:60.09ms -step:587/2285 train_time:35275ms step_avg:60.09ms -step:588/2285 train_time:35334ms step_avg:60.09ms -step:589/2285 train_time:35395ms step_avg:60.09ms -step:590/2285 train_time:35454ms step_avg:60.09ms -step:591/2285 train_time:35515ms step_avg:60.09ms -step:592/2285 train_time:35574ms step_avg:60.09ms -step:593/2285 train_time:35636ms step_avg:60.09ms -step:594/2285 train_time:35695ms step_avg:60.09ms -step:595/2285 train_time:35756ms step_avg:60.09ms -step:596/2285 train_time:35815ms step_avg:60.09ms -step:597/2285 train_time:35876ms step_avg:60.09ms -step:598/2285 train_time:35936ms step_avg:60.09ms -step:599/2285 train_time:35997ms step_avg:60.10ms -step:600/2285 train_time:36056ms step_avg:60.09ms -step:601/2285 train_time:36117ms step_avg:60.10ms -step:602/2285 train_time:36177ms step_avg:60.09ms -step:603/2285 train_time:36240ms step_avg:60.10ms -step:604/2285 train_time:36298ms step_avg:60.10ms -step:605/2285 train_time:36360ms step_avg:60.10ms -step:606/2285 train_time:36418ms step_avg:60.10ms -step:607/2285 train_time:36480ms step_avg:60.10ms -step:608/2285 train_time:36539ms step_avg:60.10ms -step:609/2285 train_time:36600ms step_avg:60.10ms -step:610/2285 train_time:36659ms step_avg:60.10ms -step:611/2285 train_time:36721ms step_avg:60.10ms -step:612/2285 train_time:36781ms step_avg:60.10ms -step:613/2285 train_time:36842ms step_avg:60.10ms -step:614/2285 train_time:36900ms step_avg:60.10ms -step:615/2285 train_time:36963ms step_avg:60.10ms -step:616/2285 train_time:37022ms step_avg:60.10ms -step:617/2285 train_time:37084ms step_avg:60.10ms -step:618/2285 train_time:37143ms step_avg:60.10ms -step:619/2285 train_time:37205ms step_avg:60.10ms -step:620/2285 train_time:37263ms step_avg:60.10ms -step:621/2285 train_time:37324ms step_avg:60.10ms -step:622/2285 train_time:37384ms step_avg:60.10ms -step:623/2285 train_time:37445ms step_avg:60.10ms -step:624/2285 train_time:37504ms step_avg:60.10ms -step:625/2285 train_time:37565ms step_avg:60.10ms -step:626/2285 train_time:37624ms step_avg:60.10ms -step:627/2285 train_time:37686ms step_avg:60.11ms -step:628/2285 train_time:37746ms step_avg:60.10ms -step:629/2285 train_time:37807ms step_avg:60.11ms -step:630/2285 train_time:37866ms step_avg:60.11ms -step:631/2285 train_time:37929ms step_avg:60.11ms -step:632/2285 train_time:37988ms step_avg:60.11ms -step:633/2285 train_time:38050ms step_avg:60.11ms -step:634/2285 train_time:38109ms step_avg:60.11ms -step:635/2285 train_time:38170ms step_avg:60.11ms -step:636/2285 train_time:38229ms step_avg:60.11ms -step:637/2285 train_time:38291ms step_avg:60.11ms -step:638/2285 train_time:38350ms step_avg:60.11ms -step:639/2285 train_time:38412ms step_avg:60.11ms -step:640/2285 train_time:38471ms step_avg:60.11ms -step:641/2285 train_time:38532ms step_avg:60.11ms -step:642/2285 train_time:38591ms step_avg:60.11ms -step:643/2285 train_time:38653ms step_avg:60.11ms -step:644/2285 train_time:38712ms step_avg:60.11ms -step:645/2285 train_time:38774ms 
step_avg:60.11ms -step:646/2285 train_time:38833ms step_avg:60.11ms -step:647/2285 train_time:38896ms step_avg:60.12ms -step:648/2285 train_time:38956ms step_avg:60.12ms -step:649/2285 train_time:39017ms step_avg:60.12ms -step:650/2285 train_time:39075ms step_avg:60.12ms -step:651/2285 train_time:39136ms step_avg:60.12ms -step:652/2285 train_time:39196ms step_avg:60.12ms -step:653/2285 train_time:39256ms step_avg:60.12ms -step:654/2285 train_time:39316ms step_avg:60.12ms -step:655/2285 train_time:39377ms step_avg:60.12ms -step:656/2285 train_time:39436ms step_avg:60.12ms -step:657/2285 train_time:39497ms step_avg:60.12ms -step:658/2285 train_time:39556ms step_avg:60.12ms -step:659/2285 train_time:39617ms step_avg:60.12ms -step:660/2285 train_time:39676ms step_avg:60.12ms -step:661/2285 train_time:39737ms step_avg:60.12ms -step:662/2285 train_time:39796ms step_avg:60.12ms -step:663/2285 train_time:39858ms step_avg:60.12ms -step:664/2285 train_time:39918ms step_avg:60.12ms -step:665/2285 train_time:39979ms step_avg:60.12ms -step:666/2285 train_time:40038ms step_avg:60.12ms -step:667/2285 train_time:40100ms step_avg:60.12ms -step:668/2285 train_time:40159ms step_avg:60.12ms -step:669/2285 train_time:40220ms step_avg:60.12ms -step:670/2285 train_time:40279ms step_avg:60.12ms -step:671/2285 train_time:40340ms step_avg:60.12ms -step:672/2285 train_time:40400ms step_avg:60.12ms -step:673/2285 train_time:40461ms step_avg:60.12ms -step:674/2285 train_time:40520ms step_avg:60.12ms -step:675/2285 train_time:40582ms step_avg:60.12ms -step:676/2285 train_time:40641ms step_avg:60.12ms -step:677/2285 train_time:40704ms step_avg:60.12ms -step:678/2285 train_time:40763ms step_avg:60.12ms -step:679/2285 train_time:40825ms step_avg:60.12ms -step:680/2285 train_time:40884ms step_avg:60.12ms -step:681/2285 train_time:40946ms step_avg:60.13ms -step:682/2285 train_time:41005ms step_avg:60.12ms -step:683/2285 train_time:41066ms step_avg:60.13ms -step:684/2285 train_time:41126ms step_avg:60.13ms -step:685/2285 train_time:41187ms step_avg:60.13ms -step:686/2285 train_time:41247ms step_avg:60.13ms -step:687/2285 train_time:41308ms step_avg:60.13ms -step:688/2285 train_time:41367ms step_avg:60.13ms -step:689/2285 train_time:41429ms step_avg:60.13ms -step:690/2285 train_time:41488ms step_avg:60.13ms -step:691/2285 train_time:41550ms step_avg:60.13ms -step:692/2285 train_time:41609ms step_avg:60.13ms -step:693/2285 train_time:41671ms step_avg:60.13ms -step:694/2285 train_time:41730ms step_avg:60.13ms -step:695/2285 train_time:41791ms step_avg:60.13ms -step:696/2285 train_time:41850ms step_avg:60.13ms -step:697/2285 train_time:41912ms step_avg:60.13ms -step:698/2285 train_time:41971ms step_avg:60.13ms -step:699/2285 train_time:42032ms step_avg:60.13ms -step:700/2285 train_time:42091ms step_avg:60.13ms -step:701/2285 train_time:42153ms step_avg:60.13ms -step:702/2285 train_time:42212ms step_avg:60.13ms -step:703/2285 train_time:42273ms step_avg:60.13ms -step:704/2285 train_time:42333ms step_avg:60.13ms -step:705/2285 train_time:42394ms step_avg:60.13ms -step:706/2285 train_time:42454ms step_avg:60.13ms -step:707/2285 train_time:42515ms step_avg:60.14ms -step:708/2285 train_time:42575ms step_avg:60.13ms -step:709/2285 train_time:42636ms step_avg:60.14ms -step:710/2285 train_time:42696ms step_avg:60.14ms -step:711/2285 train_time:42757ms step_avg:60.14ms -step:712/2285 train_time:42816ms step_avg:60.13ms -step:713/2285 train_time:42877ms step_avg:60.14ms -step:714/2285 train_time:42936ms step_avg:60.14ms -step:715/2285 
train_time:42998ms step_avg:60.14ms
[steps 716-2069/2285: per-step train_time/step_avg lines elided for brevity; step_avg drifted from 60.14ms to 60.72ms over this span. Validation checkpoints retained below:]
step:750/2285 val_loss:3.6533 train_time:45170ms step_avg:60.23ms
step:1000/2285 val_loss:3.5663 train_time:60349ms step_avg:60.35ms
step:1250/2285 val_loss:3.4929 train_time:75583ms step_avg:60.47ms
step:1500/2285 val_loss:3.4260 train_time:90854ms step_avg:60.57ms
step:1750/2285 val_loss:3.3655 train_time:106157ms step_avg:60.66ms
step:2000/2285 val_loss:3.3166 train_time:121465ms step_avg:60.73ms
step:2070/2285 train_time:125693ms
step_avg:60.72ms -step:2071/2285 train_time:125755ms step_avg:60.72ms -step:2072/2285 train_time:125815ms step_avg:60.72ms -step:2073/2285 train_time:125878ms step_avg:60.72ms -step:2074/2285 train_time:125938ms step_avg:60.72ms -step:2075/2285 train_time:126001ms step_avg:60.72ms -step:2076/2285 train_time:126061ms step_avg:60.72ms -step:2077/2285 train_time:126123ms step_avg:60.72ms -step:2078/2285 train_time:126183ms step_avg:60.72ms -step:2079/2285 train_time:126245ms step_avg:60.72ms -step:2080/2285 train_time:126304ms step_avg:60.72ms -step:2081/2285 train_time:126366ms step_avg:60.72ms -step:2082/2285 train_time:126427ms step_avg:60.72ms -step:2083/2285 train_time:126489ms step_avg:60.72ms -step:2084/2285 train_time:126549ms step_avg:60.72ms -step:2085/2285 train_time:126612ms step_avg:60.73ms -step:2086/2285 train_time:126673ms step_avg:60.73ms -step:2087/2285 train_time:126735ms step_avg:60.73ms -step:2088/2285 train_time:126796ms step_avg:60.73ms -step:2089/2285 train_time:126857ms step_avg:60.73ms -step:2090/2285 train_time:126918ms step_avg:60.73ms -step:2091/2285 train_time:126980ms step_avg:60.73ms -step:2092/2285 train_time:127040ms step_avg:60.73ms -step:2093/2285 train_time:127103ms step_avg:60.73ms -step:2094/2285 train_time:127163ms step_avg:60.73ms -step:2095/2285 train_time:127224ms step_avg:60.73ms -step:2096/2285 train_time:127284ms step_avg:60.73ms -step:2097/2285 train_time:127346ms step_avg:60.73ms -step:2098/2285 train_time:127406ms step_avg:60.73ms -step:2099/2285 train_time:127468ms step_avg:60.73ms -step:2100/2285 train_time:127529ms step_avg:60.73ms -step:2101/2285 train_time:127591ms step_avg:60.73ms -step:2102/2285 train_time:127652ms step_avg:60.73ms -step:2103/2285 train_time:127715ms step_avg:60.73ms -step:2104/2285 train_time:127775ms step_avg:60.73ms -step:2105/2285 train_time:127837ms step_avg:60.73ms -step:2106/2285 train_time:127898ms step_avg:60.73ms -step:2107/2285 train_time:127960ms step_avg:60.73ms -step:2108/2285 train_time:128020ms step_avg:60.73ms -step:2109/2285 train_time:128082ms step_avg:60.73ms -step:2110/2285 train_time:128142ms step_avg:60.73ms -step:2111/2285 train_time:128205ms step_avg:60.73ms -step:2112/2285 train_time:128265ms step_avg:60.73ms -step:2113/2285 train_time:128327ms step_avg:60.73ms -step:2114/2285 train_time:128387ms step_avg:60.73ms -step:2115/2285 train_time:128450ms step_avg:60.73ms -step:2116/2285 train_time:128510ms step_avg:60.73ms -step:2117/2285 train_time:128572ms step_avg:60.73ms -step:2118/2285 train_time:128632ms step_avg:60.73ms -step:2119/2285 train_time:128695ms step_avg:60.73ms -step:2120/2285 train_time:128755ms step_avg:60.73ms -step:2121/2285 train_time:128818ms step_avg:60.73ms -step:2122/2285 train_time:128878ms step_avg:60.73ms -step:2123/2285 train_time:128940ms step_avg:60.73ms -step:2124/2285 train_time:129000ms step_avg:60.73ms -step:2125/2285 train_time:129063ms step_avg:60.74ms -step:2126/2285 train_time:129123ms step_avg:60.74ms -step:2127/2285 train_time:129185ms step_avg:60.74ms -step:2128/2285 train_time:129245ms step_avg:60.74ms -step:2129/2285 train_time:129307ms step_avg:60.74ms -step:2130/2285 train_time:129367ms step_avg:60.74ms -step:2131/2285 train_time:129429ms step_avg:60.74ms -step:2132/2285 train_time:129490ms step_avg:60.74ms -step:2133/2285 train_time:129552ms step_avg:60.74ms -step:2134/2285 train_time:129612ms step_avg:60.74ms -step:2135/2285 train_time:129675ms step_avg:60.74ms -step:2136/2285 train_time:129736ms step_avg:60.74ms -step:2137/2285 train_time:129799ms 
step_avg:60.74ms -step:2138/2285 train_time:129859ms step_avg:60.74ms -step:2139/2285 train_time:129921ms step_avg:60.74ms -step:2140/2285 train_time:129981ms step_avg:60.74ms -step:2141/2285 train_time:130044ms step_avg:60.74ms -step:2142/2285 train_time:130104ms step_avg:60.74ms -step:2143/2285 train_time:130166ms step_avg:60.74ms -step:2144/2285 train_time:130226ms step_avg:60.74ms -step:2145/2285 train_time:130288ms step_avg:60.74ms -step:2146/2285 train_time:130348ms step_avg:60.74ms -step:2147/2285 train_time:130411ms step_avg:60.74ms -step:2148/2285 train_time:130471ms step_avg:60.74ms -step:2149/2285 train_time:130533ms step_avg:60.74ms -step:2150/2285 train_time:130593ms step_avg:60.74ms -step:2151/2285 train_time:130656ms step_avg:60.74ms -step:2152/2285 train_time:130716ms step_avg:60.74ms -step:2153/2285 train_time:130778ms step_avg:60.74ms -step:2154/2285 train_time:130838ms step_avg:60.74ms -step:2155/2285 train_time:130901ms step_avg:60.74ms -step:2156/2285 train_time:130961ms step_avg:60.74ms -step:2157/2285 train_time:131024ms step_avg:60.74ms -step:2158/2285 train_time:131084ms step_avg:60.74ms -step:2159/2285 train_time:131147ms step_avg:60.74ms -step:2160/2285 train_time:131207ms step_avg:60.74ms -step:2161/2285 train_time:131269ms step_avg:60.74ms -step:2162/2285 train_time:131329ms step_avg:60.74ms -step:2163/2285 train_time:131392ms step_avg:60.75ms -step:2164/2285 train_time:131452ms step_avg:60.74ms -step:2165/2285 train_time:131514ms step_avg:60.75ms -step:2166/2285 train_time:131574ms step_avg:60.75ms -step:2167/2285 train_time:131636ms step_avg:60.75ms -step:2168/2285 train_time:131697ms step_avg:60.75ms -step:2169/2285 train_time:131759ms step_avg:60.75ms -step:2170/2285 train_time:131819ms step_avg:60.75ms -step:2171/2285 train_time:131882ms step_avg:60.75ms -step:2172/2285 train_time:131942ms step_avg:60.75ms -step:2173/2285 train_time:132004ms step_avg:60.75ms -step:2174/2285 train_time:132064ms step_avg:60.75ms -step:2175/2285 train_time:132127ms step_avg:60.75ms -step:2176/2285 train_time:132187ms step_avg:60.75ms -step:2177/2285 train_time:132250ms step_avg:60.75ms -step:2178/2285 train_time:132310ms step_avg:60.75ms -step:2179/2285 train_time:132372ms step_avg:60.75ms -step:2180/2285 train_time:132432ms step_avg:60.75ms -step:2181/2285 train_time:132495ms step_avg:60.75ms -step:2182/2285 train_time:132554ms step_avg:60.75ms -step:2183/2285 train_time:132617ms step_avg:60.75ms -step:2184/2285 train_time:132677ms step_avg:60.75ms -step:2185/2285 train_time:132739ms step_avg:60.75ms -step:2186/2285 train_time:132799ms step_avg:60.75ms -step:2187/2285 train_time:132861ms step_avg:60.75ms -step:2188/2285 train_time:132922ms step_avg:60.75ms -step:2189/2285 train_time:132984ms step_avg:60.75ms -step:2190/2285 train_time:133044ms step_avg:60.75ms -step:2191/2285 train_time:133106ms step_avg:60.75ms -step:2192/2285 train_time:133166ms step_avg:60.75ms -step:2193/2285 train_time:133229ms step_avg:60.75ms -step:2194/2285 train_time:133289ms step_avg:60.75ms -step:2195/2285 train_time:133351ms step_avg:60.75ms -step:2196/2285 train_time:133411ms step_avg:60.75ms -step:2197/2285 train_time:133473ms step_avg:60.75ms -step:2198/2285 train_time:133533ms step_avg:60.75ms -step:2199/2285 train_time:133597ms step_avg:60.75ms -step:2200/2285 train_time:133657ms step_avg:60.75ms -step:2201/2285 train_time:133719ms step_avg:60.75ms -step:2202/2285 train_time:133779ms step_avg:60.75ms -step:2203/2285 train_time:133842ms step_avg:60.75ms -step:2204/2285 train_time:133902ms 
step_avg:60.75ms -step:2205/2285 train_time:133965ms step_avg:60.75ms -step:2206/2285 train_time:134025ms step_avg:60.75ms -step:2207/2285 train_time:134087ms step_avg:60.76ms -step:2208/2285 train_time:134148ms step_avg:60.76ms -step:2209/2285 train_time:134210ms step_avg:60.76ms -step:2210/2285 train_time:134270ms step_avg:60.76ms -step:2211/2285 train_time:134333ms step_avg:60.76ms -step:2212/2285 train_time:134393ms step_avg:60.76ms -step:2213/2285 train_time:134455ms step_avg:60.76ms -step:2214/2285 train_time:134515ms step_avg:60.76ms -step:2215/2285 train_time:134578ms step_avg:60.76ms -step:2216/2285 train_time:134638ms step_avg:60.76ms -step:2217/2285 train_time:134701ms step_avg:60.76ms -step:2218/2285 train_time:134760ms step_avg:60.76ms -step:2219/2285 train_time:134823ms step_avg:60.76ms -step:2220/2285 train_time:134883ms step_avg:60.76ms -step:2221/2285 train_time:134945ms step_avg:60.76ms -step:2222/2285 train_time:135005ms step_avg:60.76ms -step:2223/2285 train_time:135068ms step_avg:60.76ms -step:2224/2285 train_time:135128ms step_avg:60.76ms -step:2225/2285 train_time:135190ms step_avg:60.76ms -step:2226/2285 train_time:135250ms step_avg:60.76ms -step:2227/2285 train_time:135313ms step_avg:60.76ms -step:2228/2285 train_time:135373ms step_avg:60.76ms -step:2229/2285 train_time:135435ms step_avg:60.76ms -step:2230/2285 train_time:135495ms step_avg:60.76ms -step:2231/2285 train_time:135557ms step_avg:60.76ms -step:2232/2285 train_time:135617ms step_avg:60.76ms -step:2233/2285 train_time:135679ms step_avg:60.76ms -step:2234/2285 train_time:135739ms step_avg:60.76ms -step:2235/2285 train_time:135801ms step_avg:60.76ms -step:2236/2285 train_time:135861ms step_avg:60.76ms -step:2237/2285 train_time:135924ms step_avg:60.76ms -step:2238/2285 train_time:135984ms step_avg:60.76ms -step:2239/2285 train_time:136046ms step_avg:60.76ms -step:2240/2285 train_time:136106ms step_avg:60.76ms -step:2241/2285 train_time:136168ms step_avg:60.76ms -step:2242/2285 train_time:136229ms step_avg:60.76ms -step:2243/2285 train_time:136293ms step_avg:60.76ms -step:2244/2285 train_time:136353ms step_avg:60.76ms -step:2245/2285 train_time:136416ms step_avg:60.76ms -step:2246/2285 train_time:136476ms step_avg:60.76ms -step:2247/2285 train_time:136538ms step_avg:60.76ms -step:2248/2285 train_time:136598ms step_avg:60.76ms -step:2249/2285 train_time:136660ms step_avg:60.76ms -step:2250/2285 train_time:136720ms step_avg:60.76ms -step:2250/2285 val_loss:3.2816 train_time:136784ms step_avg:60.79ms -step:2251/2285 train_time:136802ms step_avg:60.77ms -step:2252/2285 train_time:136847ms step_avg:60.77ms -step:2253/2285 train_time:136910ms step_avg:60.77ms -step:2254/2285 train_time:136971ms step_avg:60.77ms -step:2255/2285 train_time:137034ms step_avg:60.77ms -step:2256/2285 train_time:137094ms step_avg:60.77ms -step:2257/2285 train_time:137156ms step_avg:60.77ms -step:2258/2285 train_time:137217ms step_avg:60.77ms -step:2259/2285 train_time:137279ms step_avg:60.77ms -step:2260/2285 train_time:137339ms step_avg:60.77ms -step:2261/2285 train_time:137402ms step_avg:60.77ms -step:2262/2285 train_time:137461ms step_avg:60.77ms -step:2263/2285 train_time:137523ms step_avg:60.77ms -step:2264/2285 train_time:137584ms step_avg:60.77ms -step:2265/2285 train_time:137646ms step_avg:60.77ms -step:2266/2285 train_time:137706ms step_avg:60.77ms -step:2267/2285 train_time:137771ms step_avg:60.77ms -step:2268/2285 train_time:137832ms step_avg:60.77ms -step:2269/2285 train_time:137895ms step_avg:60.77ms -step:2270/2285 
train_time:137956ms step_avg:60.77ms -step:2271/2285 train_time:138018ms step_avg:60.77ms -step:2272/2285 train_time:138078ms step_avg:60.77ms -step:2273/2285 train_time:138141ms step_avg:60.77ms -step:2274/2285 train_time:138202ms step_avg:60.77ms -step:2275/2285 train_time:138264ms step_avg:60.78ms -step:2276/2285 train_time:138323ms step_avg:60.77ms -step:2277/2285 train_time:138385ms step_avg:60.78ms -step:2278/2285 train_time:138445ms step_avg:60.77ms -step:2279/2285 train_time:138507ms step_avg:60.78ms -step:2280/2285 train_time:138567ms step_avg:60.77ms -step:2281/2285 train_time:138629ms step_avg:60.78ms -step:2282/2285 train_time:138690ms step_avg:60.78ms -step:2283/2285 train_time:138753ms step_avg:60.78ms -step:2284/2285 train_time:138813ms step_avg:60.78ms -step:2285/2285 train_time:138876ms step_avg:60.78ms -step:2285/2285 val_loss:3.2757 train_time:138937ms step_avg:60.80ms -peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/1aac0132-a891-4ed9-b358-0fd2abd1b019.txt b/records/track_1_short/2025-11-10_CautiousWD/1aac0132-a891-4ed9-b358-0fd2abd1b019.txt new file mode 100644 index 000000000..72e3b5871 --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/1aac0132-a891-4ed9-b358-0fd2abd1b019.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, 
dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + 
tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + 
at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. 
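+
+ Concretely, each pass of the loop below applies an odd quintic polynomial to the
+ singular values of X while leaving its singular vectors unchanged:
+ A = X @ X.mT
+ B = b * A + c * A @ A
+ X <- a*X + B @ X = a*X + b*(X @ X.mT) @ X + c*(X @ X.mT)^2 @ X
+ so each singular value sigma maps to a*sigma + b*sigma**3 + c*sigma**5. The five
+ coefficient triples above are tuned so that every sigma in (0, 1] is driven toward 1,
+ i.e. X converges to the orthogonal polar factor of G.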
+ """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + + Differences from standard Muon: + - Newton-Shulz is replaced with Polar Express for the orthogonalization step + - NorMuon adds a low-rank variance estimator similar to Adafactor. + - small 1D parameters handled here instead of in Adam + - Cautious weight decay, a gated version of decoupled weight decay + - Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. 
+ """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. + rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. 
+ for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn grads from [hdim, dim*4] to [4, hdim, dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine effective LR and WD + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call.
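+ # Because the chunk is a 3D stack of shape (num_params, rows, cols), polar_express
+ # runs its batched matmul kernels over every matrix this rank owns at once; e.g. a
+ # rank holding two mlp params orthogonalizes one [2, dim, 4*dim] tensor rather than
+ # making two separate 2D calls.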
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
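+ # e.g. next_multiple_of_n(50257, n=128) == 50304 == 393 * 128; token ids above
+ # 50256 never occur in the data, so the padded rows exist purely to keep the
+ # embedding and lm_head matmuls tensor-core friendly.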
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
 int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a beginning-of-sequence token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0 # for unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted; load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1 # last document is one token too long: trim to account for the _targets offset
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens): # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local)
+            _targets = buf[1:].view(num_tokens_local)
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # makes it possible for the generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens // new_grad_accum_steps # divide like the initial num_tokens above
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
+
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
+    val_tokens: int = 10485760 # how many tokens of validation data? 
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +# learning rate schedule: flat, then linear decay, then flat +def get_lr(step: int): + x = min(0.9999, step / args.num_scheduled_iterations) + assert 0 <= x < 1 + lr = 1.0 + if x >= 1 - args.cooldown_frac: + w = (1 - x) / args.cooldown_frac + lr = w * 1.0 + (1 - w) * 0.1 + return lr + +def get_ws(step: int): + # set short window size to half of long window size + # Higher ws on "extension" steps + if step >= args.num_scheduled_iterations: + return args.ws_final // 2, args.ws_final + x = step / args.num_scheduled_iterations + assert 0 <= x < 1 + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +ws_schedule = list(args.ws_schedule) + [args.ws_final] +ws_long = ws_schedule[0] +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = ws_schedule[0] + else: + new_ws_long = ws_schedule[ws_idx] + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt 
in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # muon momentum buffers not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Mon Nov 10 22:01:45 2025 
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 42C P0 130W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 34C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 41C P0 130W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:115ms step_avg:115.15ms +step:2/2245 train_time:136ms step_avg:68.22ms +step:3/2245 train_time:175ms step_avg:58.21ms +step:4/2245 train_time:231ms step_avg:57.75ms +step:5/2245 train_time:291ms step_avg:58.15ms +step:6/2245 train_time:350ms step_avg:58.32ms +step:7/2245 train_time:411ms step_avg:58.72ms +step:8/2245 train_time:469ms step_avg:58.67ms +step:9/2245 train_time:530ms step_avg:58.91ms +step:10/2245 train_time:589ms step_avg:58.89ms +step:11/2245 train_time:650ms step_avg:59.12ms +step:12/2245 train_time:709ms step_avg:59.06ms +step:13/2245 train_time:770ms step_avg:59.22ms +step:14/2245 train_time:828ms step_avg:59.16ms 
+step:15/2245 train_time:889ms step_avg:59.29ms +step:16/2245 train_time:949ms step_avg:59.30ms +step:17/2245 train_time:1013ms step_avg:59.62ms +step:18/2245 train_time:1077ms step_avg:59.84ms +step:19/2245 train_time:1142ms step_avg:60.13ms +step:20/2245 train_time:1203ms step_avg:60.13ms +step:21/2245 train_time:1265ms step_avg:60.24ms +step:22/2245 train_time:1324ms step_avg:60.18ms +step:23/2245 train_time:1386ms step_avg:60.25ms +step:24/2245 train_time:1445ms step_avg:60.20ms +step:25/2245 train_time:1506ms step_avg:60.24ms +step:26/2245 train_time:1565ms step_avg:60.20ms +step:27/2245 train_time:1627ms step_avg:60.24ms +step:28/2245 train_time:1686ms step_avg:60.20ms +step:29/2245 train_time:1747ms step_avg:60.24ms +step:30/2245 train_time:1806ms step_avg:60.19ms +step:31/2245 train_time:1867ms step_avg:60.23ms +step:32/2245 train_time:1926ms step_avg:60.19ms +step:33/2245 train_time:1989ms step_avg:60.29ms +step:34/2245 train_time:2050ms step_avg:60.31ms +step:35/2245 train_time:2115ms step_avg:60.41ms +step:36/2245 train_time:2175ms step_avg:60.40ms +step:37/2245 train_time:2237ms step_avg:60.45ms +step:38/2245 train_time:2297ms step_avg:60.44ms +step:39/2245 train_time:2360ms step_avg:60.50ms +step:40/2245 train_time:2419ms step_avg:60.47ms +step:41/2245 train_time:2481ms step_avg:60.51ms +step:42/2245 train_time:2540ms step_avg:60.48ms +step:43/2245 train_time:2602ms step_avg:60.51ms +step:44/2245 train_time:2661ms step_avg:60.48ms +step:45/2245 train_time:2723ms step_avg:60.51ms +step:46/2245 train_time:2781ms step_avg:60.47ms +step:47/2245 train_time:2843ms step_avg:60.49ms +step:48/2245 train_time:2902ms step_avg:60.46ms +step:49/2245 train_time:2963ms step_avg:60.47ms +step:50/2245 train_time:3022ms step_avg:60.44ms +step:51/2245 train_time:3085ms step_avg:60.49ms +step:52/2245 train_time:3145ms step_avg:60.49ms +step:53/2245 train_time:3209ms step_avg:60.54ms +step:54/2245 train_time:3268ms step_avg:60.52ms +step:55/2245 train_time:3330ms step_avg:60.55ms +step:56/2245 train_time:3390ms step_avg:60.53ms +step:57/2245 train_time:3452ms step_avg:60.55ms +step:58/2245 train_time:3511ms step_avg:60.54ms +step:59/2245 train_time:3573ms step_avg:60.57ms +step:60/2245 train_time:3632ms step_avg:60.54ms +step:61/2245 train_time:3695ms step_avg:60.57ms +step:62/2245 train_time:3755ms step_avg:60.57ms +step:63/2245 train_time:3818ms step_avg:60.60ms +step:64/2245 train_time:3877ms step_avg:60.58ms +step:65/2245 train_time:3939ms step_avg:60.60ms +step:66/2245 train_time:3998ms step_avg:60.58ms +step:67/2245 train_time:4062ms step_avg:60.62ms +step:68/2245 train_time:4120ms step_avg:60.59ms +step:69/2245 train_time:4182ms step_avg:60.61ms +step:70/2245 train_time:4242ms step_avg:60.60ms +step:71/2245 train_time:4303ms step_avg:60.61ms +step:72/2245 train_time:4363ms step_avg:60.60ms +step:73/2245 train_time:4424ms step_avg:60.60ms +step:74/2245 train_time:4483ms step_avg:60.58ms +step:75/2245 train_time:4544ms step_avg:60.59ms +step:76/2245 train_time:4604ms step_avg:60.57ms +step:77/2245 train_time:4665ms step_avg:60.59ms +step:78/2245 train_time:4724ms step_avg:60.57ms +step:79/2245 train_time:4787ms step_avg:60.59ms +step:80/2245 train_time:4846ms step_avg:60.58ms +step:81/2245 train_time:4909ms step_avg:60.60ms +step:82/2245 train_time:4969ms step_avg:60.59ms +step:83/2245 train_time:5031ms step_avg:60.61ms +step:84/2245 train_time:5090ms step_avg:60.60ms +step:85/2245 train_time:5152ms step_avg:60.62ms +step:86/2245 train_time:5212ms step_avg:60.61ms +step:87/2245 
train_time:5275ms step_avg:60.63ms +step:88/2245 train_time:5334ms step_avg:60.62ms +step:89/2245 train_time:5396ms step_avg:60.63ms +step:90/2245 train_time:5456ms step_avg:60.62ms +step:91/2245 train_time:5518ms step_avg:60.63ms +step:92/2245 train_time:5577ms step_avg:60.62ms +step:93/2245 train_time:5638ms step_avg:60.63ms +step:94/2245 train_time:5698ms step_avg:60.62ms +step:95/2245 train_time:5761ms step_avg:60.64ms +step:96/2245 train_time:5820ms step_avg:60.62ms +step:97/2245 train_time:5882ms step_avg:60.63ms +step:98/2245 train_time:5940ms step_avg:60.61ms +step:99/2245 train_time:6001ms step_avg:60.62ms +step:100/2245 train_time:6060ms step_avg:60.60ms +step:101/2245 train_time:6122ms step_avg:60.61ms +step:102/2245 train_time:6181ms step_avg:60.59ms +step:103/2245 train_time:6242ms step_avg:60.61ms +step:104/2245 train_time:6301ms step_avg:60.59ms +step:105/2245 train_time:6363ms step_avg:60.60ms +step:106/2245 train_time:6422ms step_avg:60.58ms +step:107/2245 train_time:6483ms step_avg:60.59ms +step:108/2245 train_time:6542ms step_avg:60.58ms +step:109/2245 train_time:6605ms step_avg:60.59ms +step:110/2245 train_time:6664ms step_avg:60.58ms +step:111/2245 train_time:6726ms step_avg:60.59ms +step:112/2245 train_time:6785ms step_avg:60.58ms +step:113/2245 train_time:6847ms step_avg:60.59ms +step:114/2245 train_time:6906ms step_avg:60.58ms +step:115/2245 train_time:6967ms step_avg:60.58ms +step:116/2245 train_time:7026ms step_avg:60.57ms +step:117/2245 train_time:7088ms step_avg:60.58ms +step:118/2245 train_time:7147ms step_avg:60.57ms +step:119/2245 train_time:7209ms step_avg:60.58ms +step:120/2245 train_time:7268ms step_avg:60.56ms +step:121/2245 train_time:7330ms step_avg:60.58ms +step:122/2245 train_time:7389ms step_avg:60.57ms +step:123/2245 train_time:7451ms step_avg:60.58ms +step:124/2245 train_time:7510ms step_avg:60.57ms +step:125/2245 train_time:7572ms step_avg:60.58ms +step:126/2245 train_time:7631ms step_avg:60.57ms +step:127/2245 train_time:7693ms step_avg:60.58ms +step:128/2245 train_time:7753ms step_avg:60.57ms +step:129/2245 train_time:7815ms step_avg:60.58ms +step:130/2245 train_time:7874ms step_avg:60.57ms +step:131/2245 train_time:7936ms step_avg:60.58ms +step:132/2245 train_time:7995ms step_avg:60.57ms +step:133/2245 train_time:8057ms step_avg:60.58ms +step:134/2245 train_time:8117ms step_avg:60.57ms +step:135/2245 train_time:8179ms step_avg:60.59ms +step:136/2245 train_time:8238ms step_avg:60.57ms +step:137/2245 train_time:8300ms step_avg:60.58ms +step:138/2245 train_time:8359ms step_avg:60.57ms +step:139/2245 train_time:8420ms step_avg:60.58ms +step:140/2245 train_time:8479ms step_avg:60.56ms +step:141/2245 train_time:8540ms step_avg:60.57ms +step:142/2245 train_time:8599ms step_avg:60.56ms +step:143/2245 train_time:8661ms step_avg:60.57ms +step:144/2245 train_time:8720ms step_avg:60.56ms +step:145/2245 train_time:8782ms step_avg:60.56ms +step:146/2245 train_time:8840ms step_avg:60.55ms +step:147/2245 train_time:8902ms step_avg:60.56ms +step:148/2245 train_time:8961ms step_avg:60.54ms +step:149/2245 train_time:9022ms step_avg:60.55ms +step:150/2245 train_time:9081ms step_avg:60.54ms +step:151/2245 train_time:9142ms step_avg:60.54ms +step:152/2245 train_time:9201ms step_avg:60.53ms +step:153/2245 train_time:9262ms step_avg:60.54ms +step:154/2245 train_time:9321ms step_avg:60.52ms +step:155/2245 train_time:9382ms step_avg:60.53ms +step:156/2245 train_time:9441ms step_avg:60.52ms +step:157/2245 train_time:9502ms step_avg:60.52ms +step:158/2245 
train_time:9561ms step_avg:60.51ms +step:159/2245 train_time:9622ms step_avg:60.52ms +step:160/2245 train_time:9681ms step_avg:60.50ms +step:161/2245 train_time:9742ms step_avg:60.51ms +step:162/2245 train_time:9800ms step_avg:60.50ms +step:163/2245 train_time:9862ms step_avg:60.50ms +step:164/2245 train_time:9920ms step_avg:60.49ms +step:165/2245 train_time:9981ms step_avg:60.49ms +step:166/2245 train_time:10040ms step_avg:60.48ms +step:167/2245 train_time:10101ms step_avg:60.49ms +step:168/2245 train_time:10161ms step_avg:60.48ms +step:169/2245 train_time:10222ms step_avg:60.49ms +step:170/2245 train_time:10281ms step_avg:60.47ms +step:171/2245 train_time:10342ms step_avg:60.48ms +step:172/2245 train_time:10401ms step_avg:60.47ms +step:173/2245 train_time:10462ms step_avg:60.47ms +step:174/2245 train_time:10520ms step_avg:60.46ms +step:175/2245 train_time:10582ms step_avg:60.47ms +step:176/2245 train_time:10641ms step_avg:60.46ms +step:177/2245 train_time:10702ms step_avg:60.46ms +step:178/2245 train_time:10761ms step_avg:60.46ms +step:179/2245 train_time:10822ms step_avg:60.46ms +step:180/2245 train_time:10881ms step_avg:60.45ms +step:181/2245 train_time:10942ms step_avg:60.45ms +step:182/2245 train_time:11001ms step_avg:60.44ms +step:183/2245 train_time:11062ms step_avg:60.45ms +step:184/2245 train_time:11121ms step_avg:60.44ms +step:185/2245 train_time:11182ms step_avg:60.44ms +step:186/2245 train_time:11240ms step_avg:60.43ms +step:187/2245 train_time:11301ms step_avg:60.43ms +step:188/2245 train_time:11360ms step_avg:60.42ms +step:189/2245 train_time:11421ms step_avg:60.43ms +step:190/2245 train_time:11480ms step_avg:60.42ms +step:191/2245 train_time:11541ms step_avg:60.43ms +step:192/2245 train_time:11600ms step_avg:60.42ms +step:193/2245 train_time:11661ms step_avg:60.42ms +step:194/2245 train_time:11719ms step_avg:60.41ms +step:195/2245 train_time:11781ms step_avg:60.41ms +step:196/2245 train_time:11839ms step_avg:60.40ms +step:197/2245 train_time:11901ms step_avg:60.41ms +step:198/2245 train_time:11959ms step_avg:60.40ms +step:199/2245 train_time:12021ms step_avg:60.41ms +step:200/2245 train_time:12080ms step_avg:60.40ms +step:201/2245 train_time:12141ms step_avg:60.40ms +step:202/2245 train_time:12200ms step_avg:60.39ms +step:203/2245 train_time:12261ms step_avg:60.40ms +step:204/2245 train_time:12319ms step_avg:60.39ms +step:205/2245 train_time:12381ms step_avg:60.39ms +step:206/2245 train_time:12440ms step_avg:60.39ms +step:207/2245 train_time:12501ms step_avg:60.39ms +step:208/2245 train_time:12560ms step_avg:60.38ms +step:209/2245 train_time:12622ms step_avg:60.39ms +step:210/2245 train_time:12680ms step_avg:60.38ms +step:211/2245 train_time:12742ms step_avg:60.39ms +step:212/2245 train_time:12800ms step_avg:60.38ms +step:213/2245 train_time:12861ms step_avg:60.38ms +step:214/2245 train_time:12919ms step_avg:60.37ms +step:215/2245 train_time:12981ms step_avg:60.38ms +step:216/2245 train_time:13039ms step_avg:60.37ms +step:217/2245 train_time:13100ms step_avg:60.37ms +step:218/2245 train_time:13159ms step_avg:60.36ms +step:219/2245 train_time:13221ms step_avg:60.37ms +step:220/2245 train_time:13279ms step_avg:60.36ms +step:221/2245 train_time:13340ms step_avg:60.36ms +step:222/2245 train_time:13399ms step_avg:60.36ms +step:223/2245 train_time:13461ms step_avg:60.36ms +step:224/2245 train_time:13520ms step_avg:60.36ms +step:225/2245 train_time:13581ms step_avg:60.36ms +step:226/2245 train_time:13640ms step_avg:60.35ms +step:227/2245 train_time:13702ms step_avg:60.36ms 
+step:228/2245 train_time:13760ms step_avg:60.35ms +step:229/2245 train_time:13821ms step_avg:60.36ms +step:230/2245 train_time:13880ms step_avg:60.35ms +step:231/2245 train_time:13941ms step_avg:60.35ms +step:232/2245 train_time:13999ms step_avg:60.34ms +step:233/2245 train_time:14061ms step_avg:60.35ms +step:234/2245 train_time:14120ms step_avg:60.34ms +step:235/2245 train_time:14181ms step_avg:60.35ms +step:236/2245 train_time:14240ms step_avg:60.34ms +step:237/2245 train_time:14301ms step_avg:60.34ms +step:238/2245 train_time:14360ms step_avg:60.34ms +step:239/2245 train_time:14421ms step_avg:60.34ms +step:240/2245 train_time:14480ms step_avg:60.33ms +step:241/2245 train_time:14542ms step_avg:60.34ms +step:242/2245 train_time:14600ms step_avg:60.33ms +step:243/2245 train_time:14662ms step_avg:60.34ms +step:244/2245 train_time:14720ms step_avg:60.33ms +step:245/2245 train_time:14781ms step_avg:60.33ms +step:246/2245 train_time:14840ms step_avg:60.33ms +step:247/2245 train_time:14901ms step_avg:60.33ms +step:248/2245 train_time:14959ms step_avg:60.32ms +step:249/2245 train_time:15021ms step_avg:60.33ms +step:250/2245 train_time:15080ms step_avg:60.32ms +step:250/2245 val_loss:4.0956 train_time:15142ms step_avg:60.57ms +step:251/2245 train_time:15161ms step_avg:60.40ms +step:252/2245 train_time:15203ms step_avg:60.33ms +step:253/2245 train_time:15269ms step_avg:60.35ms +step:254/2245 train_time:15331ms step_avg:60.36ms +step:255/2245 train_time:15394ms step_avg:60.37ms +step:256/2245 train_time:15454ms step_avg:60.37ms +step:257/2245 train_time:15514ms step_avg:60.37ms +step:258/2245 train_time:15573ms step_avg:60.36ms +step:259/2245 train_time:15634ms step_avg:60.36ms +step:260/2245 train_time:15692ms step_avg:60.35ms +step:261/2245 train_time:15752ms step_avg:60.35ms +step:262/2245 train_time:15811ms step_avg:60.35ms +step:263/2245 train_time:15871ms step_avg:60.35ms +step:264/2245 train_time:15929ms step_avg:60.34ms +step:265/2245 train_time:15990ms step_avg:60.34ms +step:266/2245 train_time:16048ms step_avg:60.33ms +step:267/2245 train_time:16109ms step_avg:60.33ms +step:268/2245 train_time:16169ms step_avg:60.33ms +step:269/2245 train_time:16233ms step_avg:60.35ms +step:270/2245 train_time:16294ms step_avg:60.35ms +step:271/2245 train_time:16357ms step_avg:60.36ms +step:272/2245 train_time:16416ms step_avg:60.35ms +step:273/2245 train_time:16478ms step_avg:60.36ms +step:274/2245 train_time:16536ms step_avg:60.35ms +step:275/2245 train_time:16598ms step_avg:60.35ms +step:276/2245 train_time:16656ms step_avg:60.35ms +step:277/2245 train_time:16717ms step_avg:60.35ms +step:278/2245 train_time:16776ms step_avg:60.34ms +step:279/2245 train_time:16837ms step_avg:60.35ms +step:280/2245 train_time:16896ms step_avg:60.34ms +step:281/2245 train_time:16957ms step_avg:60.35ms +step:282/2245 train_time:17017ms step_avg:60.34ms +step:283/2245 train_time:17078ms step_avg:60.35ms +step:284/2245 train_time:17138ms step_avg:60.34ms +step:285/2245 train_time:17200ms step_avg:60.35ms +step:286/2245 train_time:17260ms step_avg:60.35ms +step:287/2245 train_time:17322ms step_avg:60.35ms +step:288/2245 train_time:17380ms step_avg:60.35ms +step:289/2245 train_time:17442ms step_avg:60.35ms +step:290/2245 train_time:17501ms step_avg:60.35ms +step:291/2245 train_time:17562ms step_avg:60.35ms +step:292/2245 train_time:17621ms step_avg:60.34ms +step:293/2245 train_time:17682ms step_avg:60.35ms +step:294/2245 train_time:17741ms step_avg:60.34ms +step:295/2245 train_time:17802ms step_avg:60.34ms +step:296/2245 
train_time:17860ms step_avg:60.34ms +step:297/2245 train_time:17922ms step_avg:60.34ms +step:298/2245 train_time:17981ms step_avg:60.34ms +step:299/2245 train_time:18042ms step_avg:60.34ms +step:300/2245 train_time:18102ms step_avg:60.34ms +step:301/2245 train_time:18163ms step_avg:60.34ms +step:302/2245 train_time:18223ms step_avg:60.34ms +step:303/2245 train_time:18285ms step_avg:60.35ms +step:304/2245 train_time:18343ms step_avg:60.34ms +step:305/2245 train_time:18405ms step_avg:60.34ms +step:306/2245 train_time:18463ms step_avg:60.34ms +step:307/2245 train_time:18524ms step_avg:60.34ms +step:308/2245 train_time:18583ms step_avg:60.33ms +step:309/2245 train_time:18644ms step_avg:60.34ms +step:310/2245 train_time:18703ms step_avg:60.33ms +step:311/2245 train_time:18763ms step_avg:60.33ms +step:312/2245 train_time:18822ms step_avg:60.33ms +step:313/2245 train_time:18884ms step_avg:60.33ms +step:314/2245 train_time:18942ms step_avg:60.32ms +step:315/2245 train_time:19004ms step_avg:60.33ms +step:316/2245 train_time:19063ms step_avg:60.33ms +step:317/2245 train_time:19125ms step_avg:60.33ms +step:318/2245 train_time:19184ms step_avg:60.33ms +step:319/2245 train_time:19246ms step_avg:60.33ms +step:320/2245 train_time:19305ms step_avg:60.33ms +step:321/2245 train_time:19366ms step_avg:60.33ms +step:322/2245 train_time:19424ms step_avg:60.32ms +step:323/2245 train_time:19485ms step_avg:60.33ms +step:324/2245 train_time:19544ms step_avg:60.32ms +step:325/2245 train_time:19605ms step_avg:60.32ms +step:326/2245 train_time:19663ms step_avg:60.32ms +step:327/2245 train_time:19724ms step_avg:60.32ms +step:328/2245 train_time:19783ms step_avg:60.31ms +step:329/2245 train_time:19844ms step_avg:60.32ms +step:330/2245 train_time:19903ms step_avg:60.31ms +step:331/2245 train_time:19964ms step_avg:60.32ms +step:332/2245 train_time:20023ms step_avg:60.31ms +step:333/2245 train_time:20084ms step_avg:60.31ms +step:334/2245 train_time:20143ms step_avg:60.31ms +step:335/2245 train_time:20205ms step_avg:60.31ms +step:336/2245 train_time:20263ms step_avg:60.31ms +step:337/2245 train_time:20325ms step_avg:60.31ms +step:338/2245 train_time:20384ms step_avg:60.31ms +step:339/2245 train_time:20445ms step_avg:60.31ms +step:340/2245 train_time:20503ms step_avg:60.30ms +step:341/2245 train_time:20565ms step_avg:60.31ms +step:342/2245 train_time:20623ms step_avg:60.30ms +step:343/2245 train_time:20684ms step_avg:60.30ms +step:344/2245 train_time:20742ms step_avg:60.30ms +step:345/2245 train_time:20804ms step_avg:60.30ms +step:346/2245 train_time:20863ms step_avg:60.30ms +step:347/2245 train_time:20924ms step_avg:60.30ms +step:348/2245 train_time:20982ms step_avg:60.29ms +step:349/2245 train_time:21044ms step_avg:60.30ms +step:350/2245 train_time:21103ms step_avg:60.29ms +step:351/2245 train_time:21164ms step_avg:60.30ms +step:352/2245 train_time:21223ms step_avg:60.29ms +step:353/2245 train_time:21285ms step_avg:60.30ms +step:354/2245 train_time:21343ms step_avg:60.29ms +step:355/2245 train_time:21404ms step_avg:60.29ms +step:356/2245 train_time:21463ms step_avg:60.29ms +step:357/2245 train_time:21524ms step_avg:60.29ms +step:358/2245 train_time:21583ms step_avg:60.29ms +step:359/2245 train_time:21644ms step_avg:60.29ms +step:360/2245 train_time:21702ms step_avg:60.28ms +step:361/2245 train_time:21764ms step_avg:60.29ms +step:362/2245 train_time:21823ms step_avg:60.28ms +step:363/2245 train_time:21884ms step_avg:60.29ms +step:364/2245 train_time:21942ms step_avg:60.28ms +step:365/2245 train_time:22004ms step_avg:60.29ms 
+step:366/2245 train_time:22063ms step_avg:60.28ms +step:367/2245 train_time:22124ms step_avg:60.28ms +step:368/2245 train_time:22183ms step_avg:60.28ms +step:369/2245 train_time:22244ms step_avg:60.28ms +step:370/2245 train_time:22303ms step_avg:60.28ms +step:371/2245 train_time:22364ms step_avg:60.28ms +step:372/2245 train_time:22423ms step_avg:60.28ms +step:373/2245 train_time:22484ms step_avg:60.28ms +step:374/2245 train_time:22543ms step_avg:60.28ms +step:375/2245 train_time:22604ms step_avg:60.28ms +step:376/2245 train_time:22663ms step_avg:60.27ms +step:377/2245 train_time:22724ms step_avg:60.28ms +step:378/2245 train_time:22783ms step_avg:60.27ms +step:379/2245 train_time:22844ms step_avg:60.28ms +step:380/2245 train_time:22903ms step_avg:60.27ms +step:381/2245 train_time:22964ms step_avg:60.27ms +step:382/2245 train_time:23023ms step_avg:60.27ms +step:383/2245 train_time:23084ms step_avg:60.27ms +step:384/2245 train_time:23143ms step_avg:60.27ms +step:385/2245 train_time:23205ms step_avg:60.27ms +step:386/2245 train_time:23264ms step_avg:60.27ms +step:387/2245 train_time:23325ms step_avg:60.27ms +step:388/2245 train_time:23384ms step_avg:60.27ms +step:389/2245 train_time:23445ms step_avg:60.27ms +step:390/2245 train_time:23504ms step_avg:60.27ms +step:391/2245 train_time:23565ms step_avg:60.27ms +step:392/2245 train_time:23623ms step_avg:60.26ms +step:393/2245 train_time:23684ms step_avg:60.27ms +step:394/2245 train_time:23743ms step_avg:60.26ms +step:395/2245 train_time:23804ms step_avg:60.26ms +step:396/2245 train_time:23863ms step_avg:60.26ms +step:397/2245 train_time:23924ms step_avg:60.26ms +step:398/2245 train_time:23983ms step_avg:60.26ms +step:399/2245 train_time:24045ms step_avg:60.26ms +step:400/2245 train_time:24104ms step_avg:60.26ms +step:401/2245 train_time:24165ms step_avg:60.26ms +step:402/2245 train_time:24223ms step_avg:60.26ms +step:403/2245 train_time:24285ms step_avg:60.26ms +step:404/2245 train_time:24344ms step_avg:60.26ms +step:405/2245 train_time:24405ms step_avg:60.26ms +step:406/2245 train_time:24463ms step_avg:60.25ms +step:407/2245 train_time:24525ms step_avg:60.26ms +step:408/2245 train_time:24584ms step_avg:60.25ms +step:409/2245 train_time:24645ms step_avg:60.26ms +step:410/2245 train_time:24704ms step_avg:60.25ms +step:411/2245 train_time:24765ms step_avg:60.26ms +step:412/2245 train_time:24823ms step_avg:60.25ms +step:413/2245 train_time:24885ms step_avg:60.25ms +step:414/2245 train_time:24943ms step_avg:60.25ms +step:415/2245 train_time:25005ms step_avg:60.25ms +step:416/2245 train_time:25063ms step_avg:60.25ms +step:417/2245 train_time:25125ms step_avg:60.25ms +step:418/2245 train_time:25184ms step_avg:60.25ms +step:419/2245 train_time:25246ms step_avg:60.25ms +step:420/2245 train_time:25304ms step_avg:60.25ms +step:421/2245 train_time:25365ms step_avg:60.25ms +step:422/2245 train_time:25424ms step_avg:60.25ms +step:423/2245 train_time:25485ms step_avg:60.25ms +step:424/2245 train_time:25543ms step_avg:60.24ms +step:425/2245 train_time:25604ms step_avg:60.25ms +step:426/2245 train_time:25663ms step_avg:60.24ms +step:427/2245 train_time:25724ms step_avg:60.24ms +step:428/2245 train_time:25783ms step_avg:60.24ms +step:429/2245 train_time:25844ms step_avg:60.24ms +step:430/2245 train_time:25903ms step_avg:60.24ms +step:431/2245 train_time:25964ms step_avg:60.24ms +step:432/2245 train_time:26023ms step_avg:60.24ms +step:433/2245 train_time:26084ms step_avg:60.24ms +step:434/2245 train_time:26144ms step_avg:60.24ms +step:435/2245 train_time:26205ms 
step_avg:60.24ms +step:436/2245 train_time:26263ms step_avg:60.24ms +step:437/2245 train_time:26325ms step_avg:60.24ms +step:438/2245 train_time:26383ms step_avg:60.24ms +step:439/2245 train_time:26444ms step_avg:60.24ms +step:440/2245 train_time:26503ms step_avg:60.23ms +step:441/2245 train_time:26564ms step_avg:60.24ms +step:442/2245 train_time:26623ms step_avg:60.23ms +step:443/2245 train_time:26684ms step_avg:60.23ms +step:444/2245 train_time:26743ms step_avg:60.23ms +step:445/2245 train_time:26805ms step_avg:60.24ms +step:446/2245 train_time:26863ms step_avg:60.23ms +step:447/2245 train_time:26924ms step_avg:60.23ms +step:448/2245 train_time:26983ms step_avg:60.23ms +step:449/2245 train_time:27044ms step_avg:60.23ms +step:450/2245 train_time:27103ms step_avg:60.23ms +step:451/2245 train_time:27165ms step_avg:60.23ms +step:452/2245 train_time:27223ms step_avg:60.23ms +step:453/2245 train_time:27285ms step_avg:60.23ms +step:454/2245 train_time:27343ms step_avg:60.23ms +step:455/2245 train_time:27405ms step_avg:60.23ms +step:456/2245 train_time:27463ms step_avg:60.23ms +step:457/2245 train_time:27524ms step_avg:60.23ms +step:458/2245 train_time:27583ms step_avg:60.22ms +step:459/2245 train_time:27644ms step_avg:60.23ms +step:460/2245 train_time:27703ms step_avg:60.22ms +step:461/2245 train_time:27765ms step_avg:60.23ms +step:462/2245 train_time:27823ms step_avg:60.22ms +step:463/2245 train_time:27884ms step_avg:60.23ms +step:464/2245 train_time:27943ms step_avg:60.22ms +step:465/2245 train_time:28004ms step_avg:60.22ms +step:466/2245 train_time:28063ms step_avg:60.22ms +step:467/2245 train_time:28124ms step_avg:60.22ms +step:468/2245 train_time:28183ms step_avg:60.22ms +step:469/2245 train_time:28245ms step_avg:60.22ms +step:470/2245 train_time:28304ms step_avg:60.22ms +step:471/2245 train_time:28365ms step_avg:60.22ms +step:472/2245 train_time:28424ms step_avg:60.22ms +step:473/2245 train_time:28485ms step_avg:60.22ms +step:474/2245 train_time:28543ms step_avg:60.22ms +step:475/2245 train_time:28605ms step_avg:60.22ms +step:476/2245 train_time:28663ms step_avg:60.22ms +step:477/2245 train_time:28725ms step_avg:60.22ms +step:478/2245 train_time:28784ms step_avg:60.22ms +step:479/2245 train_time:28845ms step_avg:60.22ms +step:480/2245 train_time:28904ms step_avg:60.22ms +step:481/2245 train_time:28966ms step_avg:60.22ms +step:482/2245 train_time:29024ms step_avg:60.22ms +step:483/2245 train_time:29085ms step_avg:60.22ms +step:484/2245 train_time:29144ms step_avg:60.21ms +step:485/2245 train_time:29205ms step_avg:60.22ms +step:486/2245 train_time:29264ms step_avg:60.21ms +step:487/2245 train_time:29325ms step_avg:60.22ms +step:488/2245 train_time:29384ms step_avg:60.21ms +step:489/2245 train_time:29445ms step_avg:60.21ms +step:490/2245 train_time:29504ms step_avg:60.21ms +step:491/2245 train_time:29565ms step_avg:60.21ms +step:492/2245 train_time:29623ms step_avg:60.21ms +step:493/2245 train_time:29684ms step_avg:60.21ms +step:494/2245 train_time:29743ms step_avg:60.21ms +step:495/2245 train_time:29805ms step_avg:60.21ms +step:496/2245 train_time:29864ms step_avg:60.21ms +step:497/2245 train_time:29925ms step_avg:60.21ms +step:498/2245 train_time:29984ms step_avg:60.21ms +step:499/2245 train_time:30045ms step_avg:60.21ms +step:500/2245 train_time:30104ms step_avg:60.21ms +step:500/2245 val_loss:3.8197 train_time:30166ms step_avg:60.33ms +step:501/2245 train_time:30184ms step_avg:60.25ms +step:502/2245 train_time:30227ms step_avg:60.21ms +step:503/2245 train_time:30293ms step_avg:60.22ms 
+step:504/2245 train_time:30353ms step_avg:60.22ms +step:505/2245 train_time:30414ms step_avg:60.23ms +step:506/2245 train_time:30473ms step_avg:60.22ms +step:507/2245 train_time:30534ms step_avg:60.22ms +step:508/2245 train_time:30592ms step_avg:60.22ms +step:509/2245 train_time:30653ms step_avg:60.22ms +step:510/2245 train_time:30711ms step_avg:60.22ms +step:511/2245 train_time:30771ms step_avg:60.22ms +step:512/2245 train_time:30829ms step_avg:60.21ms +step:513/2245 train_time:30890ms step_avg:60.21ms +step:514/2245 train_time:30949ms step_avg:60.21ms +step:515/2245 train_time:31009ms step_avg:60.21ms +step:516/2245 train_time:31068ms step_avg:60.21ms +step:517/2245 train_time:31130ms step_avg:60.21ms +step:518/2245 train_time:31190ms step_avg:60.21ms +step:519/2245 train_time:31253ms step_avg:60.22ms +step:520/2245 train_time:31313ms step_avg:60.22ms +step:521/2245 train_time:31374ms step_avg:60.22ms +step:522/2245 train_time:31433ms step_avg:60.22ms +step:523/2245 train_time:31495ms step_avg:60.22ms +step:524/2245 train_time:31554ms step_avg:60.22ms +step:525/2245 train_time:31615ms step_avg:60.22ms +step:526/2245 train_time:31673ms step_avg:60.22ms +step:527/2245 train_time:31734ms step_avg:60.22ms +step:528/2245 train_time:31792ms step_avg:60.21ms +step:529/2245 train_time:31854ms step_avg:60.21ms +step:530/2245 train_time:31912ms step_avg:60.21ms +step:531/2245 train_time:31973ms step_avg:60.21ms +step:532/2245 train_time:32032ms step_avg:60.21ms +step:533/2245 train_time:32094ms step_avg:60.21ms +step:534/2245 train_time:32154ms step_avg:60.21ms +step:535/2245 train_time:32216ms step_avg:60.22ms +step:536/2245 train_time:32275ms step_avg:60.22ms +step:537/2245 train_time:32338ms step_avg:60.22ms +step:538/2245 train_time:32397ms step_avg:60.22ms +step:539/2245 train_time:32459ms step_avg:60.22ms +step:540/2245 train_time:32518ms step_avg:60.22ms +step:541/2245 train_time:32579ms step_avg:60.22ms +step:542/2245 train_time:32638ms step_avg:60.22ms +step:543/2245 train_time:32699ms step_avg:60.22ms +step:544/2245 train_time:32758ms step_avg:60.22ms +step:545/2245 train_time:32819ms step_avg:60.22ms +step:546/2245 train_time:32877ms step_avg:60.21ms +step:547/2245 train_time:32939ms step_avg:60.22ms +step:548/2245 train_time:32998ms step_avg:60.21ms +step:549/2245 train_time:33060ms step_avg:60.22ms +step:550/2245 train_time:33119ms step_avg:60.22ms +step:551/2245 train_time:33181ms step_avg:60.22ms +step:552/2245 train_time:33241ms step_avg:60.22ms +step:553/2245 train_time:33303ms step_avg:60.22ms +step:554/2245 train_time:33361ms step_avg:60.22ms +step:555/2245 train_time:33423ms step_avg:60.22ms +step:556/2245 train_time:33482ms step_avg:60.22ms +step:557/2245 train_time:33544ms step_avg:60.22ms +step:558/2245 train_time:33604ms step_avg:60.22ms +step:559/2245 train_time:33666ms step_avg:60.23ms +step:560/2245 train_time:33725ms step_avg:60.22ms +step:561/2245 train_time:33787ms step_avg:60.23ms +step:562/2245 train_time:33846ms step_avg:60.22ms +step:563/2245 train_time:33908ms step_avg:60.23ms +step:564/2245 train_time:33967ms step_avg:60.23ms +step:565/2245 train_time:34030ms step_avg:60.23ms +step:566/2245 train_time:34089ms step_avg:60.23ms +step:567/2245 train_time:34150ms step_avg:60.23ms +step:568/2245 train_time:34209ms step_avg:60.23ms +step:569/2245 train_time:34270ms step_avg:60.23ms +step:570/2245 train_time:34329ms step_avg:60.23ms +step:571/2245 train_time:34391ms step_avg:60.23ms +step:572/2245 train_time:34450ms step_avg:60.23ms +step:573/2245 train_time:34512ms 
step_avg:60.23ms +step:574/2245 train_time:34571ms step_avg:60.23ms +step:575/2245 train_time:34632ms step_avg:60.23ms +step:576/2245 train_time:34691ms step_avg:60.23ms +step:577/2245 train_time:34752ms step_avg:60.23ms +step:578/2245 train_time:34810ms step_avg:60.23ms +step:579/2245 train_time:34872ms step_avg:60.23ms +step:580/2245 train_time:34930ms step_avg:60.22ms +step:581/2245 train_time:34992ms step_avg:60.23ms +step:582/2245 train_time:35051ms step_avg:60.22ms +step:583/2245 train_time:35112ms step_avg:60.23ms +step:584/2245 train_time:35171ms step_avg:60.22ms +step:585/2245 train_time:35232ms step_avg:60.23ms +step:586/2245 train_time:35291ms step_avg:60.22ms +step:587/2245 train_time:35352ms step_avg:60.23ms +step:588/2245 train_time:35411ms step_avg:60.22ms +step:589/2245 train_time:35472ms step_avg:60.22ms +step:590/2245 train_time:35531ms step_avg:60.22ms +step:591/2245 train_time:35593ms step_avg:60.22ms +step:592/2245 train_time:35651ms step_avg:60.22ms +step:593/2245 train_time:35712ms step_avg:60.22ms +step:594/2245 train_time:35771ms step_avg:60.22ms +step:595/2245 train_time:35832ms step_avg:60.22ms +step:596/2245 train_time:35891ms step_avg:60.22ms +step:597/2245 train_time:35952ms step_avg:60.22ms +step:598/2245 train_time:36011ms step_avg:60.22ms +step:599/2245 train_time:36072ms step_avg:60.22ms +step:600/2245 train_time:36131ms step_avg:60.22ms +step:601/2245 train_time:36193ms step_avg:60.22ms +step:602/2245 train_time:36252ms step_avg:60.22ms +step:603/2245 train_time:36313ms step_avg:60.22ms +step:604/2245 train_time:36372ms step_avg:60.22ms +step:605/2245 train_time:36433ms step_avg:60.22ms +step:606/2245 train_time:36492ms step_avg:60.22ms +step:607/2245 train_time:36554ms step_avg:60.22ms +step:608/2245 train_time:36613ms step_avg:60.22ms +step:609/2245 train_time:36674ms step_avg:60.22ms +step:610/2245 train_time:36733ms step_avg:60.22ms +step:611/2245 train_time:36795ms step_avg:60.22ms +step:612/2245 train_time:36855ms step_avg:60.22ms +step:613/2245 train_time:36916ms step_avg:60.22ms +step:614/2245 train_time:36976ms step_avg:60.22ms +step:615/2245 train_time:37038ms step_avg:60.22ms +step:616/2245 train_time:37097ms step_avg:60.22ms +step:617/2245 train_time:37159ms step_avg:60.22ms +step:618/2245 train_time:37218ms step_avg:60.22ms +step:619/2245 train_time:37280ms step_avg:60.23ms +step:620/2245 train_time:37339ms step_avg:60.22ms +step:621/2245 train_time:37401ms step_avg:60.23ms +step:622/2245 train_time:37460ms step_avg:60.23ms +step:623/2245 train_time:37522ms step_avg:60.23ms +step:624/2245 train_time:37581ms step_avg:60.23ms +step:625/2245 train_time:37642ms step_avg:60.23ms +step:626/2245 train_time:37701ms step_avg:60.23ms +step:627/2245 train_time:37763ms step_avg:60.23ms +step:628/2245 train_time:37823ms step_avg:60.23ms +step:629/2245 train_time:37885ms step_avg:60.23ms +step:630/2245 train_time:37945ms step_avg:60.23ms +step:631/2245 train_time:38008ms step_avg:60.23ms +step:632/2245 train_time:38067ms step_avg:60.23ms +step:633/2245 train_time:38129ms step_avg:60.23ms +step:634/2245 train_time:38189ms step_avg:60.23ms +step:635/2245 train_time:38251ms step_avg:60.24ms +step:636/2245 train_time:38309ms step_avg:60.23ms +step:637/2245 train_time:38371ms step_avg:60.24ms +step:638/2245 train_time:38430ms step_avg:60.24ms +step:639/2245 train_time:38492ms step_avg:60.24ms +step:640/2245 train_time:38550ms step_avg:60.23ms +step:641/2245 train_time:38611ms step_avg:60.24ms +step:642/2245 train_time:38669ms step_avg:60.23ms +step:643/2245 
train_time:38730ms step_avg:60.23ms +step:644/2245 train_time:38789ms step_avg:60.23ms +step:645/2245 train_time:38851ms step_avg:60.23ms +step:646/2245 train_time:38910ms step_avg:60.23ms +step:647/2245 train_time:38971ms step_avg:60.23ms +step:648/2245 train_time:39030ms step_avg:60.23ms +step:649/2245 train_time:39091ms step_avg:60.23ms +step:650/2245 train_time:39150ms step_avg:60.23ms +step:651/2245 train_time:39211ms step_avg:60.23ms +step:652/2245 train_time:39270ms step_avg:60.23ms +step:653/2245 train_time:39331ms step_avg:60.23ms +step:654/2245 train_time:39390ms step_avg:60.23ms +step:655/2245 train_time:39451ms step_avg:60.23ms +step:656/2245 train_time:39510ms step_avg:60.23ms +step:657/2245 train_time:39571ms step_avg:60.23ms +step:658/2245 train_time:39630ms step_avg:60.23ms +step:659/2245 train_time:39691ms step_avg:60.23ms +step:660/2245 train_time:39750ms step_avg:60.23ms +step:661/2245 train_time:39812ms step_avg:60.23ms +step:662/2245 train_time:39870ms step_avg:60.23ms +step:663/2245 train_time:39932ms step_avg:60.23ms +step:664/2245 train_time:39990ms step_avg:60.23ms +step:665/2245 train_time:40052ms step_avg:60.23ms +step:666/2245 train_time:40111ms step_avg:60.23ms +step:667/2245 train_time:40172ms step_avg:60.23ms +step:668/2245 train_time:40231ms step_avg:60.23ms +step:669/2245 train_time:40292ms step_avg:60.23ms +step:670/2245 train_time:40351ms step_avg:60.22ms +step:671/2245 train_time:40412ms step_avg:60.23ms +step:672/2245 train_time:40471ms step_avg:60.22ms +step:673/2245 train_time:40532ms step_avg:60.23ms +step:674/2245 train_time:40590ms step_avg:60.22ms +step:675/2245 train_time:40652ms step_avg:60.23ms +step:676/2245 train_time:40710ms step_avg:60.22ms +step:677/2245 train_time:40772ms step_avg:60.22ms +step:678/2245 train_time:40830ms step_avg:60.22ms +step:679/2245 train_time:40892ms step_avg:60.22ms +step:680/2245 train_time:40950ms step_avg:60.22ms +step:681/2245 train_time:41012ms step_avg:60.22ms +step:682/2245 train_time:41071ms step_avg:60.22ms +step:683/2245 train_time:41132ms step_avg:60.22ms +step:684/2245 train_time:41191ms step_avg:60.22ms +step:685/2245 train_time:41253ms step_avg:60.22ms +step:686/2245 train_time:41312ms step_avg:60.22ms +step:687/2245 train_time:41374ms step_avg:60.22ms +step:688/2245 train_time:41433ms step_avg:60.22ms +step:689/2245 train_time:41494ms step_avg:60.22ms +step:690/2245 train_time:41553ms step_avg:60.22ms +step:691/2245 train_time:41615ms step_avg:60.22ms +step:692/2245 train_time:41673ms step_avg:60.22ms +step:693/2245 train_time:41735ms step_avg:60.22ms +step:694/2245 train_time:41794ms step_avg:60.22ms +step:695/2245 train_time:41856ms step_avg:60.22ms +step:696/2245 train_time:41914ms step_avg:60.22ms +step:697/2245 train_time:41976ms step_avg:60.22ms +step:698/2245 train_time:42035ms step_avg:60.22ms +step:699/2245 train_time:42097ms step_avg:60.22ms +step:700/2245 train_time:42155ms step_avg:60.22ms +step:701/2245 train_time:42217ms step_avg:60.22ms +step:702/2245 train_time:42276ms step_avg:60.22ms +step:703/2245 train_time:42338ms step_avg:60.22ms +step:704/2245 train_time:42397ms step_avg:60.22ms +step:705/2245 train_time:42459ms step_avg:60.23ms +step:706/2245 train_time:42518ms step_avg:60.22ms +step:707/2245 train_time:42579ms step_avg:60.23ms +step:708/2245 train_time:42638ms step_avg:60.22ms +step:709/2245 train_time:42700ms step_avg:60.23ms +step:710/2245 train_time:42759ms step_avg:60.22ms +step:711/2245 train_time:42822ms step_avg:60.23ms +step:712/2245 train_time:42881ms step_avg:60.23ms 
+step:713/2245 train_time:42943ms step_avg:60.23ms +step:714/2245 train_time:43003ms step_avg:60.23ms +step:715/2245 train_time:43064ms step_avg:60.23ms +step:716/2245 train_time:43123ms step_avg:60.23ms +step:717/2245 train_time:43186ms step_avg:60.23ms +step:718/2245 train_time:43246ms step_avg:60.23ms +step:719/2245 train_time:43308ms step_avg:60.23ms +step:720/2245 train_time:43367ms step_avg:60.23ms +step:721/2245 train_time:43429ms step_avg:60.23ms +step:722/2245 train_time:43896ms step_avg:60.80ms +step:723/2245 train_time:43955ms step_avg:60.80ms +step:724/2245 train_time:44013ms step_avg:60.79ms +step:725/2245 train_time:44074ms step_avg:60.79ms +step:726/2245 train_time:44132ms step_avg:60.79ms +step:727/2245 train_time:44192ms step_avg:60.79ms +step:728/2245 train_time:44250ms step_avg:60.78ms +step:729/2245 train_time:44311ms step_avg:60.78ms +step:730/2245 train_time:44369ms step_avg:60.78ms +step:731/2245 train_time:44430ms step_avg:60.78ms +step:732/2245 train_time:44488ms step_avg:60.78ms +step:733/2245 train_time:44549ms step_avg:60.78ms +step:734/2245 train_time:44608ms step_avg:60.77ms +step:735/2245 train_time:44668ms step_avg:60.77ms +step:736/2245 train_time:44727ms step_avg:60.77ms +step:737/2245 train_time:44793ms step_avg:60.78ms +step:738/2245 train_time:44858ms step_avg:60.78ms +step:739/2245 train_time:44923ms step_avg:60.79ms +step:740/2245 train_time:44983ms step_avg:60.79ms +step:741/2245 train_time:45047ms step_avg:60.79ms +step:742/2245 train_time:45107ms step_avg:60.79ms +step:743/2245 train_time:45170ms step_avg:60.79ms +step:744/2245 train_time:45229ms step_avg:60.79ms +step:745/2245 train_time:45290ms step_avg:60.79ms +step:746/2245 train_time:45349ms step_avg:60.79ms +step:747/2245 train_time:45411ms step_avg:60.79ms +step:748/2245 train_time:45470ms step_avg:60.79ms +step:749/2245 train_time:45531ms step_avg:60.79ms +step:750/2245 train_time:45590ms step_avg:60.79ms +step:750/2245 val_loss:3.6701 train_time:45653ms step_avg:60.87ms +step:751/2245 train_time:45672ms step_avg:60.82ms +step:752/2245 train_time:45714ms step_avg:60.79ms +step:753/2245 train_time:45775ms step_avg:60.79ms +step:754/2245 train_time:45835ms step_avg:60.79ms +step:755/2245 train_time:45898ms step_avg:60.79ms +step:756/2245 train_time:45958ms step_avg:60.79ms +step:757/2245 train_time:46019ms step_avg:60.79ms +step:758/2245 train_time:46077ms step_avg:60.79ms +step:759/2245 train_time:46138ms step_avg:60.79ms +step:760/2245 train_time:46197ms step_avg:60.79ms +step:761/2245 train_time:46258ms step_avg:60.79ms +step:762/2245 train_time:46317ms step_avg:60.78ms +step:763/2245 train_time:46378ms step_avg:60.78ms +step:764/2245 train_time:46437ms step_avg:60.78ms +step:765/2245 train_time:46498ms step_avg:60.78ms +step:766/2245 train_time:46561ms step_avg:60.79ms +step:767/2245 train_time:46628ms step_avg:60.79ms +step:768/2245 train_time:46689ms step_avg:60.79ms +step:769/2245 train_time:46751ms step_avg:60.79ms +step:770/2245 train_time:46812ms step_avg:60.79ms +step:771/2245 train_time:46873ms step_avg:60.80ms +step:772/2245 train_time:46933ms step_avg:60.79ms +step:773/2245 train_time:46995ms step_avg:60.80ms +step:774/2245 train_time:47054ms step_avg:60.79ms +step:775/2245 train_time:47115ms step_avg:60.79ms +step:776/2245 train_time:47175ms step_avg:60.79ms +step:777/2245 train_time:47237ms step_avg:60.79ms +step:778/2245 train_time:47296ms step_avg:60.79ms +step:779/2245 train_time:47357ms step_avg:60.79ms +step:780/2245 train_time:47416ms step_avg:60.79ms +step:781/2245 
+step:782/2245 train_time:47540ms step_avg:60.79ms
+step:783/2245 train_time:47603ms step_avg:60.80ms
+step:784/2245 train_time:47663ms step_avg:60.79ms
+step:785/2245 train_time:47726ms step_avg:60.80ms
+step:786/2245 train_time:47786ms step_avg:60.80ms
+step:787/2245 train_time:47849ms step_avg:60.80ms
+step:788/2245 train_time:47909ms step_avg:60.80ms
+step:789/2245 train_time:47971ms step_avg:60.80ms
+step:790/2245 train_time:48031ms step_avg:60.80ms
+step:791/2245 train_time:48093ms step_avg:60.80ms
+step:792/2245 train_time:48153ms step_avg:60.80ms
+step:793/2245 train_time:48215ms step_avg:60.80ms
+step:794/2245 train_time:48275ms step_avg:60.80ms
+step:795/2245 train_time:48336ms step_avg:60.80ms
+step:796/2245 train_time:48396ms step_avg:60.80ms
+step:797/2245 train_time:48458ms step_avg:60.80ms
+step:798/2245 train_time:48517ms step_avg:60.80ms
+step:799/2245 train_time:48580ms step_avg:60.80ms
+step:800/2245 train_time:48640ms step_avg:60.80ms
+step:801/2245 train_time:48703ms step_avg:60.80ms
+step:802/2245 train_time:48762ms step_avg:60.80ms
+step:803/2245 train_time:48825ms step_avg:60.80ms
+step:804/2245 train_time:48885ms step_avg:60.80ms
+step:805/2245 train_time:48948ms step_avg:60.80ms
+step:806/2245 train_time:49008ms step_avg:60.80ms
+step:807/2245 train_time:49070ms step_avg:60.81ms
+step:808/2245 train_time:49130ms step_avg:60.80ms
+step:809/2245 train_time:49192ms step_avg:60.81ms
+step:810/2245 train_time:49252ms step_avg:60.80ms
+step:811/2245 train_time:49314ms step_avg:60.81ms
+step:812/2245 train_time:49373ms step_avg:60.80ms
+step:813/2245 train_time:49436ms step_avg:60.81ms
+step:814/2245 train_time:49497ms step_avg:60.81ms
+step:815/2245 train_time:49558ms step_avg:60.81ms
+step:816/2245 train_time:49618ms step_avg:60.81ms
+step:817/2245 train_time:49680ms step_avg:60.81ms
+step:818/2245 train_time:49740ms step_avg:60.81ms
+step:819/2245 train_time:49802ms step_avg:60.81ms
+step:820/2245 train_time:49862ms step_avg:60.81ms
+step:821/2245 train_time:49924ms step_avg:60.81ms
+step:822/2245 train_time:49984ms step_avg:60.81ms
+step:823/2245 train_time:50048ms step_avg:60.81ms
+step:824/2245 train_time:50108ms step_avg:60.81ms
+step:825/2245 train_time:50170ms step_avg:60.81ms
+step:826/2245 train_time:50230ms step_avg:60.81ms
+step:827/2245 train_time:50292ms step_avg:60.81ms
+step:828/2245 train_time:50351ms step_avg:60.81ms
+step:829/2245 train_time:50414ms step_avg:60.81ms
+step:830/2245 train_time:50474ms step_avg:60.81ms
+step:831/2245 train_time:50537ms step_avg:60.81ms
+step:832/2245 train_time:50597ms step_avg:60.81ms
+step:833/2245 train_time:50659ms step_avg:60.82ms
+step:834/2245 train_time:50719ms step_avg:60.81ms
+step:835/2245 train_time:50781ms step_avg:60.82ms
+step:836/2245 train_time:50841ms step_avg:60.81ms
+step:837/2245 train_time:50903ms step_avg:60.82ms
+step:838/2245 train_time:50963ms step_avg:60.81ms
+step:839/2245 train_time:51025ms step_avg:60.82ms
+step:840/2245 train_time:51086ms step_avg:60.82ms
+step:841/2245 train_time:51148ms step_avg:60.82ms
+step:842/2245 train_time:51208ms step_avg:60.82ms
+step:843/2245 train_time:51271ms step_avg:60.82ms
+step:844/2245 train_time:51331ms step_avg:60.82ms
+step:845/2245 train_time:51393ms step_avg:60.82ms
+step:846/2245 train_time:51453ms step_avg:60.82ms
+step:847/2245 train_time:51516ms step_avg:60.82ms
+step:848/2245 train_time:51576ms step_avg:60.82ms
+step:849/2245 train_time:51639ms step_avg:60.82ms
+step:850/2245 train_time:51698ms step_avg:60.82ms
+step:851/2245 train_time:51760ms step_avg:60.82ms
+step:852/2245 train_time:51821ms step_avg:60.82ms
+step:853/2245 train_time:51882ms step_avg:60.82ms
+step:854/2245 train_time:51942ms step_avg:60.82ms
+step:855/2245 train_time:52004ms step_avg:60.82ms
+step:856/2245 train_time:52063ms step_avg:60.82ms
+step:857/2245 train_time:52126ms step_avg:60.82ms
+step:858/2245 train_time:52185ms step_avg:60.82ms
+step:859/2245 train_time:52248ms step_avg:60.82ms
+step:860/2245 train_time:52308ms step_avg:60.82ms
+step:861/2245 train_time:52371ms step_avg:60.83ms
+step:862/2245 train_time:52431ms step_avg:60.83ms
+step:863/2245 train_time:52494ms step_avg:60.83ms
+step:864/2245 train_time:52554ms step_avg:60.83ms
+step:865/2245 train_time:52617ms step_avg:60.83ms
+step:866/2245 train_time:52676ms step_avg:60.83ms
+step:867/2245 train_time:52739ms step_avg:60.83ms
+step:868/2245 train_time:52799ms step_avg:60.83ms
+step:869/2245 train_time:52861ms step_avg:60.83ms
+step:870/2245 train_time:52921ms step_avg:60.83ms
+step:871/2245 train_time:52983ms step_avg:60.83ms
+step:872/2245 train_time:53043ms step_avg:60.83ms
+step:873/2245 train_time:53105ms step_avg:60.83ms
+step:874/2245 train_time:53165ms step_avg:60.83ms
+step:875/2245 train_time:53227ms step_avg:60.83ms
+step:876/2245 train_time:53287ms step_avg:60.83ms
+step:877/2245 train_time:53351ms step_avg:60.83ms
+step:878/2245 train_time:53411ms step_avg:60.83ms
+step:879/2245 train_time:53473ms step_avg:60.83ms
+step:880/2245 train_time:53533ms step_avg:60.83ms
+step:881/2245 train_time:53596ms step_avg:60.84ms
+step:882/2245 train_time:53656ms step_avg:60.83ms
+step:883/2245 train_time:53718ms step_avg:60.84ms
+step:884/2245 train_time:53778ms step_avg:60.83ms
+step:885/2245 train_time:53840ms step_avg:60.84ms
+step:886/2245 train_time:53900ms step_avg:60.84ms
+step:887/2245 train_time:53962ms step_avg:60.84ms
+step:888/2245 train_time:54022ms step_avg:60.84ms
+step:889/2245 train_time:54083ms step_avg:60.84ms
+step:890/2245 train_time:54143ms step_avg:60.83ms
+step:891/2245 train_time:54206ms step_avg:60.84ms
+step:892/2245 train_time:54265ms step_avg:60.84ms
+step:893/2245 train_time:54328ms step_avg:60.84ms
+step:894/2245 train_time:54387ms step_avg:60.84ms
+step:895/2245 train_time:54450ms step_avg:60.84ms
+step:896/2245 train_time:54510ms step_avg:60.84ms
+step:897/2245 train_time:54572ms step_avg:60.84ms
+step:898/2245 train_time:54633ms step_avg:60.84ms
+step:899/2245 train_time:54696ms step_avg:60.84ms
+step:900/2245 train_time:54756ms step_avg:60.84ms
+step:901/2245 train_time:54818ms step_avg:60.84ms
+step:902/2245 train_time:54878ms step_avg:60.84ms
+step:903/2245 train_time:54940ms step_avg:60.84ms
+step:904/2245 train_time:55000ms step_avg:60.84ms
+step:905/2245 train_time:55062ms step_avg:60.84ms
+step:906/2245 train_time:55122ms step_avg:60.84ms
+step:907/2245 train_time:55183ms step_avg:60.84ms
+step:908/2245 train_time:55243ms step_avg:60.84ms
+step:909/2245 train_time:55306ms step_avg:60.84ms
+step:910/2245 train_time:55366ms step_avg:60.84ms
+step:911/2245 train_time:55429ms step_avg:60.84ms
+step:912/2245 train_time:55488ms step_avg:60.84ms
+step:913/2245 train_time:55551ms step_avg:60.84ms
+step:914/2245 train_time:55611ms step_avg:60.84ms
+step:915/2245 train_time:55673ms step_avg:60.85ms
+step:916/2245 train_time:55734ms step_avg:60.84ms
+step:917/2245 train_time:55796ms step_avg:60.85ms
+step:918/2245 train_time:55856ms step_avg:60.85ms
+step:919/2245 train_time:55918ms step_avg:60.85ms
+step:920/2245 train_time:55978ms step_avg:60.85ms
+step:921/2245 train_time:56040ms step_avg:60.85ms
+step:922/2245 train_time:56100ms step_avg:60.85ms
+step:923/2245 train_time:56162ms step_avg:60.85ms
+step:924/2245 train_time:56222ms step_avg:60.85ms
+step:925/2245 train_time:56284ms step_avg:60.85ms
+step:926/2245 train_time:56344ms step_avg:60.85ms
+step:927/2245 train_time:56406ms step_avg:60.85ms
+step:928/2245 train_time:56467ms step_avg:60.85ms
+step:929/2245 train_time:56530ms step_avg:60.85ms
+step:930/2245 train_time:56590ms step_avg:60.85ms
+step:931/2245 train_time:56652ms step_avg:60.85ms
+step:932/2245 train_time:56713ms step_avg:60.85ms
+step:933/2245 train_time:56775ms step_avg:60.85ms
+step:934/2245 train_time:56835ms step_avg:60.85ms
+step:935/2245 train_time:56898ms step_avg:60.85ms
+step:936/2245 train_time:56958ms step_avg:60.85ms
+step:937/2245 train_time:57021ms step_avg:60.85ms
+step:938/2245 train_time:57080ms step_avg:60.85ms
+step:939/2245 train_time:57142ms step_avg:60.85ms
+step:940/2245 train_time:57203ms step_avg:60.85ms
+step:941/2245 train_time:57264ms step_avg:60.85ms
+step:942/2245 train_time:57324ms step_avg:60.85ms
+step:943/2245 train_time:57387ms step_avg:60.86ms
+step:944/2245 train_time:57447ms step_avg:60.85ms
+step:945/2245 train_time:57510ms step_avg:60.86ms
+step:946/2245 train_time:57569ms step_avg:60.86ms
+step:947/2245 train_time:57631ms step_avg:60.86ms
+step:948/2245 train_time:57692ms step_avg:60.86ms
+step:949/2245 train_time:57754ms step_avg:60.86ms
+step:950/2245 train_time:57815ms step_avg:60.86ms
+step:951/2245 train_time:57877ms step_avg:60.86ms
+step:952/2245 train_time:57937ms step_avg:60.86ms
+step:953/2245 train_time:58000ms step_avg:60.86ms
+step:954/2245 train_time:58059ms step_avg:60.86ms
+step:955/2245 train_time:58121ms step_avg:60.86ms
+step:956/2245 train_time:58181ms step_avg:60.86ms
+step:957/2245 train_time:58243ms step_avg:60.86ms
+step:958/2245 train_time:58303ms step_avg:60.86ms
+step:959/2245 train_time:58365ms step_avg:60.86ms
+step:960/2245 train_time:58425ms step_avg:60.86ms
+step:961/2245 train_time:58488ms step_avg:60.86ms
+step:962/2245 train_time:58548ms step_avg:60.86ms
+step:963/2245 train_time:58611ms step_avg:60.86ms
+step:964/2245 train_time:58670ms step_avg:60.86ms
+step:965/2245 train_time:58733ms step_avg:60.86ms
+step:966/2245 train_time:58795ms step_avg:60.86ms
+step:967/2245 train_time:58857ms step_avg:60.87ms
+step:968/2245 train_time:58917ms step_avg:60.86ms
+step:969/2245 train_time:58980ms step_avg:60.87ms
+step:970/2245 train_time:59040ms step_avg:60.87ms
+step:971/2245 train_time:59102ms step_avg:60.87ms
+step:972/2245 train_time:59162ms step_avg:60.87ms
+step:973/2245 train_time:59224ms step_avg:60.87ms
+step:974/2245 train_time:59284ms step_avg:60.87ms
+step:975/2245 train_time:59346ms step_avg:60.87ms
+step:976/2245 train_time:59406ms step_avg:60.87ms
+step:977/2245 train_time:59468ms step_avg:60.87ms
+step:978/2245 train_time:59528ms step_avg:60.87ms
+step:979/2245 train_time:59590ms step_avg:60.87ms
+step:980/2245 train_time:59650ms step_avg:60.87ms
+step:981/2245 train_time:59712ms step_avg:60.87ms
+step:982/2245 train_time:59772ms step_avg:60.87ms
+step:983/2245 train_time:59835ms step_avg:60.87ms
+step:984/2245 train_time:59896ms step_avg:60.87ms
+step:985/2245 train_time:59958ms step_avg:60.87ms
+step:986/2245 train_time:60017ms step_avg:60.87ms
+step:987/2245 train_time:60079ms step_avg:60.87ms
+step:988/2245 train_time:60139ms step_avg:60.87ms
+step:989/2245 train_time:60201ms step_avg:60.87ms
+step:990/2245 train_time:60261ms step_avg:60.87ms
+step:991/2245 train_time:60323ms step_avg:60.87ms
+step:992/2245 train_time:60382ms step_avg:60.87ms
+step:993/2245 train_time:60445ms step_avg:60.87ms
+step:994/2245 train_time:60505ms step_avg:60.87ms
+step:995/2245 train_time:60568ms step_avg:60.87ms
+step:996/2245 train_time:60627ms step_avg:60.87ms
+step:997/2245 train_time:60690ms step_avg:60.87ms
+step:998/2245 train_time:60750ms step_avg:60.87ms
+step:999/2245 train_time:60813ms step_avg:60.87ms
+step:1000/2245 train_time:60873ms step_avg:60.87ms
+step:1000/2245 val_loss:3.5925 train_time:60937ms step_avg:60.94ms
+step:1001/2245 train_time:60956ms step_avg:60.90ms
+step:1002/2245 train_time:60999ms step_avg:60.88ms
+step:1003/2245 train_time:61065ms step_avg:60.88ms
+step:1004/2245 train_time:61128ms step_avg:60.88ms
+step:1005/2245 train_time:61192ms step_avg:60.89ms
+step:1006/2245 train_time:61253ms step_avg:60.89ms
+step:1007/2245 train_time:61314ms step_avg:60.89ms
+step:1008/2245 train_time:61373ms step_avg:60.89ms
+step:1009/2245 train_time:61434ms step_avg:60.89ms
+step:1010/2245 train_time:61493ms step_avg:60.88ms
+step:1011/2245 train_time:61555ms step_avg:60.89ms
+step:1012/2245 train_time:61614ms step_avg:60.88ms
+step:1013/2245 train_time:61675ms step_avg:60.88ms
+step:1014/2245 train_time:61734ms step_avg:60.88ms
+step:1015/2245 train_time:61796ms step_avg:60.88ms
+step:1016/2245 train_time:61855ms step_avg:60.88ms
+step:1017/2245 train_time:61918ms step_avg:60.88ms
+step:1018/2245 train_time:61980ms step_avg:60.88ms
+step:1019/2245 train_time:62046ms step_avg:60.89ms
+step:1020/2245 train_time:62106ms step_avg:60.89ms
+step:1021/2245 train_time:62170ms step_avg:60.89ms
+step:1022/2245 train_time:62231ms step_avg:60.89ms
+step:1023/2245 train_time:62293ms step_avg:60.89ms
+step:1024/2245 train_time:62352ms step_avg:60.89ms
+step:1025/2245 train_time:62415ms step_avg:60.89ms
+step:1026/2245 train_time:62474ms step_avg:60.89ms
+step:1027/2245 train_time:62537ms step_avg:60.89ms
+step:1028/2245 train_time:62596ms step_avg:60.89ms
+step:1029/2245 train_time:62657ms step_avg:60.89ms
+step:1030/2245 train_time:62716ms step_avg:60.89ms
+step:1031/2245 train_time:62778ms step_avg:60.89ms
+step:1032/2245 train_time:62838ms step_avg:60.89ms
+step:1033/2245 train_time:62900ms step_avg:60.89ms
+step:1034/2245 train_time:62961ms step_avg:60.89ms
+step:1035/2245 train_time:63025ms step_avg:60.89ms
+step:1036/2245 train_time:63085ms step_avg:60.89ms
+step:1037/2245 train_time:63149ms step_avg:60.90ms
+step:1038/2245 train_time:63209ms step_avg:60.90ms
+step:1039/2245 train_time:63272ms step_avg:60.90ms
+step:1040/2245 train_time:63332ms step_avg:60.90ms
+step:1041/2245 train_time:63394ms step_avg:60.90ms
+step:1042/2245 train_time:63454ms step_avg:60.90ms
+step:1043/2245 train_time:63516ms step_avg:60.90ms
+step:1044/2245 train_time:63575ms step_avg:60.90ms
+step:1045/2245 train_time:63637ms step_avg:60.90ms
+step:1046/2245 train_time:63696ms step_avg:60.89ms
+step:1047/2245 train_time:63758ms step_avg:60.90ms
+step:1048/2245 train_time:63817ms step_avg:60.89ms
+step:1049/2245 train_time:63880ms step_avg:60.90ms
+step:1050/2245 train_time:63940ms step_avg:60.90ms
+step:1051/2245 train_time:64003ms step_avg:60.90ms
+step:1052/2245 train_time:64063ms step_avg:60.90ms
+step:1053/2245 train_time:64126ms step_avg:60.90ms
+step:1054/2245 train_time:64186ms step_avg:60.90ms
+step:1055/2245 train_time:64249ms step_avg:60.90ms
+step:1056/2245 train_time:64309ms step_avg:60.90ms
+step:1057/2245 train_time:64372ms step_avg:60.90ms
+step:1058/2245 train_time:64432ms step_avg:60.90ms
+step:1059/2245 train_time:64493ms step_avg:60.90ms
+step:1060/2245 train_time:64553ms step_avg:60.90ms
+step:1061/2245 train_time:64615ms step_avg:60.90ms
+step:1062/2245 train_time:64675ms step_avg:60.90ms
+step:1063/2245 train_time:64737ms step_avg:60.90ms
+step:1064/2245 train_time:64796ms step_avg:60.90ms
+step:1065/2245 train_time:64860ms step_avg:60.90ms
+step:1066/2245 train_time:64920ms step_avg:60.90ms
+step:1067/2245 train_time:64982ms step_avg:60.90ms
+step:1068/2245 train_time:65043ms step_avg:60.90ms
+step:1069/2245 train_time:65106ms step_avg:60.90ms
+step:1070/2245 train_time:65167ms step_avg:60.90ms
+step:1071/2245 train_time:65230ms step_avg:60.91ms
+step:1072/2245 train_time:65289ms step_avg:60.90ms
+step:1073/2245 train_time:65352ms step_avg:60.91ms
+step:1074/2245 train_time:65412ms step_avg:60.90ms
+step:1075/2245 train_time:65474ms step_avg:60.91ms
+step:1076/2245 train_time:65534ms step_avg:60.91ms
+step:1077/2245 train_time:65596ms step_avg:60.91ms
+step:1078/2245 train_time:65655ms step_avg:60.90ms
+step:1079/2245 train_time:65717ms step_avg:60.91ms
+step:1080/2245 train_time:65776ms step_avg:60.90ms
+step:1081/2245 train_time:65838ms step_avg:60.91ms
+step:1082/2245 train_time:65898ms step_avg:60.90ms
+step:1083/2245 train_time:65961ms step_avg:60.91ms
+step:1084/2245 train_time:66021ms step_avg:60.91ms
+step:1085/2245 train_time:66085ms step_avg:60.91ms
+step:1086/2245 train_time:66145ms step_avg:60.91ms
+step:1087/2245 train_time:66207ms step_avg:60.91ms
+step:1088/2245 train_time:66268ms step_avg:60.91ms
+step:1089/2245 train_time:66332ms step_avg:60.91ms
+step:1090/2245 train_time:66391ms step_avg:60.91ms
+step:1091/2245 train_time:66454ms step_avg:60.91ms
+step:1092/2245 train_time:66513ms step_avg:60.91ms
+step:1093/2245 train_time:66576ms step_avg:60.91ms
+step:1094/2245 train_time:66636ms step_avg:60.91ms
+step:1095/2245 train_time:66698ms step_avg:60.91ms
+step:1096/2245 train_time:66757ms step_avg:60.91ms
+step:1097/2245 train_time:66819ms step_avg:60.91ms
+step:1098/2245 train_time:66879ms step_avg:60.91ms
+step:1099/2245 train_time:66942ms step_avg:60.91ms
+step:1100/2245 train_time:67001ms step_avg:60.91ms
+step:1101/2245 train_time:67064ms step_avg:60.91ms
+step:1102/2245 train_time:67125ms step_avg:60.91ms
+step:1103/2245 train_time:67187ms step_avg:60.91ms
+step:1104/2245 train_time:67247ms step_avg:60.91ms
+step:1105/2245 train_time:67310ms step_avg:60.91ms
+step:1106/2245 train_time:67370ms step_avg:60.91ms
+step:1107/2245 train_time:67433ms step_avg:60.91ms
+step:1108/2245 train_time:67492ms step_avg:60.91ms
+step:1109/2245 train_time:67555ms step_avg:60.92ms
+step:1110/2245 train_time:67615ms step_avg:60.91ms
+step:1111/2245 train_time:67678ms step_avg:60.92ms
+step:1112/2245 train_time:67738ms step_avg:60.92ms
+step:1113/2245 train_time:67800ms step_avg:60.92ms
+step:1114/2245 train_time:67859ms step_avg:60.91ms
+step:1115/2245 train_time:67921ms step_avg:60.92ms
+step:1116/2245 train_time:67981ms step_avg:60.91ms
+step:1117/2245 train_time:68044ms step_avg:60.92ms
+step:1118/2245 train_time:68104ms step_avg:60.92ms
+step:1119/2245 train_time:68167ms step_avg:60.92ms
+step:1120/2245 train_time:68228ms step_avg:60.92ms
+step:1121/2245 train_time:68291ms step_avg:60.92ms
+step:1122/2245 train_time:68351ms step_avg:60.92ms
+step:1123/2245 train_time:68413ms step_avg:60.92ms
+step:1124/2245 train_time:68474ms step_avg:60.92ms
+step:1125/2245 train_time:68536ms step_avg:60.92ms
+step:1126/2245 train_time:68596ms step_avg:60.92ms
+step:1127/2245 train_time:68658ms step_avg:60.92ms
+step:1128/2245 train_time:68718ms step_avg:60.92ms
+step:1129/2245 train_time:68781ms step_avg:60.92ms
+step:1130/2245 train_time:68840ms step_avg:60.92ms
+step:1131/2245 train_time:68902ms step_avg:60.92ms
+step:1132/2245 train_time:68961ms step_avg:60.92ms
+step:1133/2245 train_time:69023ms step_avg:60.92ms
+step:1134/2245 train_time:69083ms step_avg:60.92ms
+step:1135/2245 train_time:69147ms step_avg:60.92ms
+step:1136/2245 train_time:69207ms step_avg:60.92ms
+step:1137/2245 train_time:69270ms step_avg:60.92ms
+step:1138/2245 train_time:69330ms step_avg:60.92ms
+step:1139/2245 train_time:69392ms step_avg:60.92ms
+step:1140/2245 train_time:69453ms step_avg:60.92ms
+step:1141/2245 train_time:69515ms step_avg:60.92ms
+step:1142/2245 train_time:69575ms step_avg:60.92ms
+step:1143/2245 train_time:69637ms step_avg:60.92ms
+step:1144/2245 train_time:69697ms step_avg:60.92ms
+step:1145/2245 train_time:69759ms step_avg:60.92ms
+step:1146/2245 train_time:69818ms step_avg:60.92ms
+step:1147/2245 train_time:69880ms step_avg:60.92ms
+step:1148/2245 train_time:69940ms step_avg:60.92ms
+step:1149/2245 train_time:70003ms step_avg:60.92ms
+step:1150/2245 train_time:70063ms step_avg:60.92ms
+step:1151/2245 train_time:70125ms step_avg:60.93ms
+step:1152/2245 train_time:70186ms step_avg:60.93ms
+step:1153/2245 train_time:70249ms step_avg:60.93ms
+step:1154/2245 train_time:70309ms step_avg:60.93ms
+step:1155/2245 train_time:70372ms step_avg:60.93ms
+step:1156/2245 train_time:70433ms step_avg:60.93ms
+step:1157/2245 train_time:70496ms step_avg:60.93ms
+step:1158/2245 train_time:70556ms step_avg:60.93ms
+step:1159/2245 train_time:70618ms step_avg:60.93ms
+step:1160/2245 train_time:70677ms step_avg:60.93ms
+step:1161/2245 train_time:70739ms step_avg:60.93ms
+step:1162/2245 train_time:70799ms step_avg:60.93ms
+step:1163/2245 train_time:70861ms step_avg:60.93ms
+step:1164/2245 train_time:70920ms step_avg:60.93ms
+step:1165/2245 train_time:70982ms step_avg:60.93ms
+step:1166/2245 train_time:71043ms step_avg:60.93ms
+step:1167/2245 train_time:71105ms step_avg:60.93ms
+step:1168/2245 train_time:71165ms step_avg:60.93ms
+step:1169/2245 train_time:71228ms step_avg:60.93ms
+step:1170/2245 train_time:71289ms step_avg:60.93ms
+step:1171/2245 train_time:71352ms step_avg:60.93ms
+step:1172/2245 train_time:71412ms step_avg:60.93ms
+step:1173/2245 train_time:71476ms step_avg:60.93ms
+step:1174/2245 train_time:71536ms step_avg:60.93ms
+step:1175/2245 train_time:71598ms step_avg:60.93ms
+step:1176/2245 train_time:71658ms step_avg:60.93ms
+step:1177/2245 train_time:71721ms step_avg:60.94ms
+step:1178/2245 train_time:71780ms step_avg:60.93ms
+step:1179/2245 train_time:71843ms step_avg:60.94ms
+step:1180/2245 train_time:71902ms step_avg:60.93ms
+step:1181/2245 train_time:71964ms step_avg:60.93ms
+step:1182/2245 train_time:72024ms step_avg:60.93ms
+step:1183/2245 train_time:72086ms step_avg:60.94ms
+step:1184/2245 train_time:72146ms step_avg:60.93ms
+step:1185/2245 train_time:72209ms step_avg:60.94ms
+step:1186/2245 train_time:72270ms step_avg:60.94ms
+step:1187/2245 train_time:72332ms step_avg:60.94ms
+step:1188/2245 train_time:72392ms step_avg:60.94ms
+step:1189/2245 train_time:72454ms step_avg:60.94ms
+step:1190/2245 train_time:72514ms step_avg:60.94ms
+step:1191/2245 train_time:72576ms step_avg:60.94ms
+step:1192/2245 train_time:72637ms step_avg:60.94ms
+step:1193/2245 train_time:72699ms step_avg:60.94ms
+step:1194/2245 train_time:72758ms step_avg:60.94ms
+step:1195/2245 train_time:72820ms step_avg:60.94ms
+step:1196/2245 train_time:72880ms step_avg:60.94ms
+step:1197/2245 train_time:72942ms step_avg:60.94ms
+step:1198/2245 train_time:73002ms step_avg:60.94ms
+step:1199/2245 train_time:73065ms step_avg:60.94ms
+step:1200/2245 train_time:73125ms step_avg:60.94ms
+step:1201/2245 train_time:73188ms step_avg:60.94ms
+step:1202/2245 train_time:73249ms step_avg:60.94ms
+step:1203/2245 train_time:73312ms step_avg:60.94ms
+step:1204/2245 train_time:73371ms step_avg:60.94ms
+step:1205/2245 train_time:73434ms step_avg:60.94ms
+step:1206/2245 train_time:73494ms step_avg:60.94ms
+step:1207/2245 train_time:73557ms step_avg:60.94ms
+step:1208/2245 train_time:73616ms step_avg:60.94ms
+step:1209/2245 train_time:73679ms step_avg:60.94ms
+step:1210/2245 train_time:73739ms step_avg:60.94ms
+step:1211/2245 train_time:73801ms step_avg:60.94ms
+step:1212/2245 train_time:73860ms step_avg:60.94ms
+step:1213/2245 train_time:73922ms step_avg:60.94ms
+step:1214/2245 train_time:73982ms step_avg:60.94ms
+step:1215/2245 train_time:74045ms step_avg:60.94ms
+step:1216/2245 train_time:74105ms step_avg:60.94ms
+step:1217/2245 train_time:74168ms step_avg:60.94ms
+step:1218/2245 train_time:74229ms step_avg:60.94ms
+step:1219/2245 train_time:74292ms step_avg:60.94ms
+step:1220/2245 train_time:74352ms step_avg:60.94ms
+step:1221/2245 train_time:74415ms step_avg:60.95ms
+step:1222/2245 train_time:74475ms step_avg:60.94ms
+step:1223/2245 train_time:74536ms step_avg:60.95ms
+step:1224/2245 train_time:74596ms step_avg:60.94ms
+step:1225/2245 train_time:74659ms step_avg:60.95ms
+step:1226/2245 train_time:74718ms step_avg:60.94ms
+step:1227/2245 train_time:74780ms step_avg:60.95ms
+step:1228/2245 train_time:74839ms step_avg:60.94ms
+step:1229/2245 train_time:74901ms step_avg:60.94ms
+step:1230/2245 train_time:74961ms step_avg:60.94ms
+step:1231/2245 train_time:75024ms step_avg:60.95ms
+step:1232/2245 train_time:75083ms step_avg:60.94ms
+step:1233/2245 train_time:75146ms step_avg:60.95ms
+step:1234/2245 train_time:75206ms step_avg:60.94ms
+step:1235/2245 train_time:75269ms step_avg:60.95ms
+step:1236/2245 train_time:75330ms step_avg:60.95ms
+step:1237/2245 train_time:75393ms step_avg:60.95ms
+step:1238/2245 train_time:75453ms step_avg:60.95ms
+step:1239/2245 train_time:75516ms step_avg:60.95ms
+step:1240/2245 train_time:75576ms step_avg:60.95ms
+step:1241/2245 train_time:75638ms step_avg:60.95ms
+step:1242/2245 train_time:75698ms step_avg:60.95ms
+step:1243/2245 train_time:75760ms step_avg:60.95ms
+step:1244/2245 train_time:75820ms step_avg:60.95ms
+step:1245/2245 train_time:75882ms step_avg:60.95ms
+step:1246/2245 train_time:75942ms step_avg:60.95ms
+step:1247/2245 train_time:76004ms step_avg:60.95ms
+step:1248/2245 train_time:76064ms step_avg:60.95ms
+step:1249/2245 train_time:76127ms step_avg:60.95ms
+step:1250/2245 train_time:76187ms step_avg:60.95ms
+step:1250/2245 val_loss:3.5232 train_time:76252ms step_avg:61.00ms
+step:1251/2245 train_time:76271ms step_avg:60.97ms
+step:1252/2245 train_time:76312ms step_avg:60.95ms
+step:1253/2245 train_time:76379ms step_avg:60.96ms
+step:1254/2245 train_time:76440ms step_avg:60.96ms
+step:1255/2245 train_time:76502ms step_avg:60.96ms
+step:1256/2245 train_time:76562ms step_avg:60.96ms
+step:1257/2245 train_time:76625ms step_avg:60.96ms
+step:1258/2245 train_time:76685ms step_avg:60.96ms
+step:1259/2245 train_time:76747ms step_avg:60.96ms
+step:1260/2245 train_time:76807ms step_avg:60.96ms
+step:1261/2245 train_time:76869ms step_avg:60.96ms
+step:1262/2245 train_time:76929ms step_avg:60.96ms
+step:1263/2245 train_time:76991ms step_avg:60.96ms
+step:1264/2245 train_time:77051ms step_avg:60.96ms
+step:1265/2245 train_time:77112ms step_avg:60.96ms
+step:1266/2245 train_time:77172ms step_avg:60.96ms
+step:1267/2245 train_time:77235ms step_avg:60.96ms
+step:1268/2245 train_time:77295ms step_avg:60.96ms
+step:1269/2245 train_time:77358ms step_avg:60.96ms
+step:1270/2245 train_time:77419ms step_avg:60.96ms
+step:1271/2245 train_time:77482ms step_avg:60.96ms
+step:1272/2245 train_time:77542ms step_avg:60.96ms
+step:1273/2245 train_time:77604ms step_avg:60.96ms
+step:1274/2245 train_time:77664ms step_avg:60.96ms
+step:1275/2245 train_time:77726ms step_avg:60.96ms
+step:1276/2245 train_time:77786ms step_avg:60.96ms
+step:1277/2245 train_time:77848ms step_avg:60.96ms
+step:1278/2245 train_time:77908ms step_avg:60.96ms
+step:1279/2245 train_time:77970ms step_avg:60.96ms
+step:1280/2245 train_time:78029ms step_avg:60.96ms
+step:1281/2245 train_time:78092ms step_avg:60.96ms
+step:1282/2245 train_time:78152ms step_avg:60.96ms
+step:1283/2245 train_time:78215ms step_avg:60.96ms
+step:1284/2245 train_time:78275ms step_avg:60.96ms
+step:1285/2245 train_time:78337ms step_avg:60.96ms
+step:1286/2245 train_time:78397ms step_avg:60.96ms
+step:1287/2245 train_time:78459ms step_avg:60.96ms
+step:1288/2245 train_time:78519ms step_avg:60.96ms
+step:1289/2245 train_time:78581ms step_avg:60.96ms
+step:1290/2245 train_time:78641ms step_avg:60.96ms
+step:1291/2245 train_time:78703ms step_avg:60.96ms
+step:1292/2245 train_time:78764ms step_avg:60.96ms
+step:1293/2245 train_time:78826ms step_avg:60.96ms
+step:1294/2245 train_time:78886ms step_avg:60.96ms
+step:1295/2245 train_time:78949ms step_avg:60.96ms
+step:1296/2245 train_time:79009ms step_avg:60.96ms
+step:1297/2245 train_time:79072ms step_avg:60.97ms
+step:1298/2245 train_time:79131ms step_avg:60.96ms
+step:1299/2245 train_time:79193ms step_avg:60.96ms
+step:1300/2245 train_time:79254ms step_avg:60.96ms
+step:1301/2245 train_time:79316ms step_avg:60.97ms
+step:1302/2245 train_time:79376ms step_avg:60.96ms
+step:1303/2245 train_time:79438ms step_avg:60.97ms
+step:1304/2245 train_time:79498ms step_avg:60.96ms
+step:1305/2245 train_time:79562ms step_avg:60.97ms
+step:1306/2245 train_time:79622ms step_avg:60.97ms
+step:1307/2245 train_time:79685ms step_avg:60.97ms
+step:1308/2245 train_time:79745ms step_avg:60.97ms
+step:1309/2245 train_time:79807ms step_avg:60.97ms
+step:1310/2245 train_time:79868ms step_avg:60.97ms
+step:1311/2245 train_time:79930ms step_avg:60.97ms
+step:1312/2245 train_time:79990ms step_avg:60.97ms
+step:1313/2245 train_time:80053ms step_avg:60.97ms
+step:1314/2245 train_time:80114ms step_avg:60.97ms
+step:1315/2245 train_time:80176ms step_avg:60.97ms
+step:1316/2245 train_time:80236ms step_avg:60.97ms
+step:1317/2245 train_time:80298ms step_avg:60.97ms
+step:1318/2245 train_time:80359ms step_avg:60.97ms
+step:1319/2245 train_time:80421ms step_avg:60.97ms
+step:1320/2245 train_time:80481ms step_avg:60.97ms
+step:1321/2245 train_time:80543ms step_avg:60.97ms
+step:1322/2245 train_time:80603ms step_avg:60.97ms
+step:1323/2245 train_time:80666ms step_avg:60.97ms
+step:1324/2245 train_time:80726ms step_avg:60.97ms
+step:1325/2245 train_time:80788ms step_avg:60.97ms
+step:1326/2245 train_time:80849ms step_avg:60.97ms
+step:1327/2245 train_time:80911ms step_avg:60.97ms
+step:1328/2245 train_time:80971ms step_avg:60.97ms
+step:1329/2245 train_time:81034ms step_avg:60.97ms
+step:1330/2245 train_time:81093ms step_avg:60.97ms
+step:1331/2245 train_time:81155ms step_avg:60.97ms
+step:1332/2245 train_time:81214ms step_avg:60.97ms
+step:1333/2245 train_time:81276ms step_avg:60.97ms
+step:1334/2245 train_time:81336ms step_avg:60.97ms
+step:1335/2245 train_time:81398ms step_avg:60.97ms
+step:1336/2245 train_time:81458ms step_avg:60.97ms
+step:1337/2245 train_time:81520ms step_avg:60.97ms
+step:1338/2245 train_time:81581ms step_avg:60.97ms
+step:1339/2245 train_time:81644ms step_avg:60.97ms
+step:1340/2245 train_time:81703ms step_avg:60.97ms
+step:1341/2245 train_time:81766ms step_avg:60.97ms
+step:1342/2245 train_time:81826ms step_avg:60.97ms
+step:1343/2245 train_time:81888ms step_avg:60.97ms
+step:1344/2245 train_time:81948ms step_avg:60.97ms
+step:1345/2245 train_time:82011ms step_avg:60.97ms
+step:1346/2245 train_time:82071ms step_avg:60.97ms
+step:1347/2245 train_time:82133ms step_avg:60.97ms
+step:1348/2245 train_time:82192ms step_avg:60.97ms
+step:1349/2245 train_time:82255ms step_avg:60.97ms
+step:1350/2245 train_time:82315ms step_avg:60.97ms
+step:1351/2245 train_time:82377ms step_avg:60.97ms
+step:1352/2245 train_time:82437ms step_avg:60.97ms
+step:1353/2245 train_time:82499ms step_avg:60.97ms
+step:1354/2245 train_time:82560ms step_avg:60.97ms
+step:1355/2245 train_time:82623ms step_avg:60.98ms
+step:1356/2245 train_time:82683ms step_avg:60.98ms
+step:1357/2245 train_time:82746ms step_avg:60.98ms
+step:1358/2245 train_time:82806ms step_avg:60.98ms
+step:1359/2245 train_time:82869ms step_avg:60.98ms
+step:1360/2245 train_time:82928ms step_avg:60.98ms
+step:1361/2245 train_time:82991ms step_avg:60.98ms
+step:1362/2245 train_time:83051ms step_avg:60.98ms
+step:1363/2245 train_time:83113ms step_avg:60.98ms
+step:1364/2245 train_time:83173ms step_avg:60.98ms
+step:1365/2245 train_time:83235ms step_avg:60.98ms
+step:1366/2245 train_time:83294ms step_avg:60.98ms
+step:1367/2245 train_time:83356ms step_avg:60.98ms
+step:1368/2245 train_time:83416ms step_avg:60.98ms
+step:1369/2245 train_time:83478ms step_avg:60.98ms
+step:1370/2245 train_time:83538ms step_avg:60.98ms
+step:1371/2245 train_time:83602ms step_avg:60.98ms
+step:1372/2245 train_time:83662ms step_avg:60.98ms
+step:1373/2245 train_time:83725ms step_avg:60.98ms
+step:1374/2245 train_time:83784ms step_avg:60.98ms
+step:1375/2245 train_time:83847ms step_avg:60.98ms
+step:1376/2245 train_time:83907ms step_avg:60.98ms
+step:1377/2245 train_time:83970ms step_avg:60.98ms
+step:1378/2245 train_time:84030ms step_avg:60.98ms
+step:1379/2245 train_time:84092ms step_avg:60.98ms
+step:1380/2245 train_time:84152ms step_avg:60.98ms
+step:1381/2245 train_time:84214ms step_avg:60.98ms
+step:1382/2245 train_time:84275ms step_avg:60.98ms
+step:1383/2245 train_time:84337ms step_avg:60.98ms
+step:1384/2245 train_time:84397ms step_avg:60.98ms
+step:1385/2245 train_time:84460ms step_avg:60.98ms
+step:1386/2245 train_time:84520ms step_avg:60.98ms
+step:1387/2245 train_time:84583ms step_avg:60.98ms
+step:1388/2245 train_time:84643ms step_avg:60.98ms
+step:1389/2245 train_time:84705ms step_avg:60.98ms
+step:1390/2245 train_time:84766ms step_avg:60.98ms
+step:1391/2245 train_time:84828ms step_avg:60.98ms
+step:1392/2245 train_time:84888ms step_avg:60.98ms
+step:1393/2245 train_time:84950ms step_avg:60.98ms
+step:1394/2245 train_time:85010ms step_avg:60.98ms
+step:1395/2245 train_time:85072ms step_avg:60.98ms
+step:1396/2245 train_time:85132ms step_avg:60.98ms
+step:1397/2245 train_time:85194ms step_avg:60.98ms
+step:1398/2245 train_time:85254ms step_avg:60.98ms
+step:1399/2245 train_time:85316ms step_avg:60.98ms
+step:1400/2245 train_time:85376ms step_avg:60.98ms
+step:1401/2245 train_time:85438ms step_avg:60.98ms
+step:1402/2245 train_time:85498ms step_avg:60.98ms
+step:1403/2245 train_time:85562ms step_avg:60.99ms
+step:1404/2245 train_time:85622ms step_avg:60.98ms
+step:1405/2245 train_time:85685ms step_avg:60.99ms
+step:1406/2245 train_time:85744ms step_avg:60.98ms
+step:1407/2245 train_time:85807ms step_avg:60.99ms
+step:1408/2245 train_time:85867ms step_avg:60.98ms
+step:1409/2245 train_time:85929ms step_avg:60.99ms
+step:1410/2245 train_time:85990ms step_avg:60.99ms
+step:1411/2245 train_time:86052ms step_avg:60.99ms
+step:1412/2245 train_time:86112ms step_avg:60.99ms
+step:1413/2245 train_time:86174ms step_avg:60.99ms
+step:1414/2245 train_time:86234ms step_avg:60.99ms
+step:1415/2245 train_time:86297ms step_avg:60.99ms
+step:1416/2245 train_time:86357ms step_avg:60.99ms
+step:1417/2245 train_time:86419ms step_avg:60.99ms
+step:1418/2245 train_time:86479ms step_avg:60.99ms
+step:1419/2245 train_time:86541ms step_avg:60.99ms
+step:1420/2245 train_time:86601ms step_avg:60.99ms
+step:1421/2245 train_time:86664ms step_avg:60.99ms
+step:1422/2245 train_time:86724ms step_avg:60.99ms
+step:1423/2245 train_time:86786ms step_avg:60.99ms
+step:1424/2245 train_time:86846ms step_avg:60.99ms
+step:1425/2245 train_time:86908ms step_avg:60.99ms
+step:1426/2245 train_time:86969ms step_avg:60.99ms
+step:1427/2245 train_time:87031ms step_avg:60.99ms
+step:1428/2245 train_time:87091ms step_avg:60.99ms
+step:1429/2245 train_time:87153ms step_avg:60.99ms
+step:1430/2245 train_time:87214ms step_avg:60.99ms
+step:1431/2245 train_time:87277ms step_avg:60.99ms
+step:1432/2245 train_time:87337ms step_avg:60.99ms
+step:1433/2245 train_time:87399ms step_avg:60.99ms
+step:1434/2245 train_time:87459ms step_avg:60.99ms
+step:1435/2245 train_time:87521ms step_avg:60.99ms
+step:1436/2245 train_time:87580ms step_avg:60.99ms
+step:1437/2245 train_time:87643ms step_avg:60.99ms
+step:1438/2245 train_time:87703ms step_avg:60.99ms
+step:1439/2245 train_time:87766ms step_avg:60.99ms
+step:1440/2245 train_time:87827ms step_avg:60.99ms
+step:1441/2245 train_time:87889ms step_avg:60.99ms
+step:1442/2245 train_time:87949ms step_avg:60.99ms
+step:1443/2245 train_time:88011ms step_avg:60.99ms
+step:1444/2245 train_time:88071ms step_avg:60.99ms
+step:1445/2245 train_time:88133ms step_avg:60.99ms
+step:1446/2245 train_time:88194ms step_avg:60.99ms
+step:1447/2245 train_time:88256ms step_avg:60.99ms
+step:1448/2245 train_time:88316ms step_avg:60.99ms
+step:1449/2245 train_time:88378ms step_avg:60.99ms
+step:1450/2245 train_time:88438ms step_avg:60.99ms
+step:1451/2245 train_time:88500ms step_avg:60.99ms
+step:1452/2245 train_time:88560ms step_avg:60.99ms
+step:1453/2245 train_time:88623ms step_avg:60.99ms
+step:1454/2245 train_time:88683ms step_avg:60.99ms
+step:1455/2245 train_time:88747ms step_avg:60.99ms
+step:1456/2245 train_time:88806ms step_avg:60.99ms
+step:1457/2245 train_time:88869ms step_avg:60.99ms
+step:1458/2245 train_time:88929ms step_avg:60.99ms
+step:1459/2245 train_time:88992ms step_avg:61.00ms
+step:1460/2245 train_time:89052ms step_avg:60.99ms
+step:1461/2245 train_time:89114ms step_avg:61.00ms
+step:1462/2245 train_time:89175ms step_avg:60.99ms
+step:1463/2245 train_time:89237ms step_avg:61.00ms
+step:1464/2245 train_time:89296ms step_avg:60.99ms
+step:1465/2245 train_time:89359ms step_avg:61.00ms
+step:1466/2245 train_time:89418ms step_avg:60.99ms
+step:1467/2245 train_time:89481ms step_avg:61.00ms
+step:1468/2245 train_time:89541ms step_avg:61.00ms
+step:1469/2245 train_time:89603ms step_avg:61.00ms
+step:1470/2245 train_time:89663ms step_avg:61.00ms
+step:1471/2245 train_time:89726ms step_avg:61.00ms
+step:1472/2245 train_time:89786ms step_avg:61.00ms
+step:1473/2245 train_time:89850ms step_avg:61.00ms
+step:1474/2245 train_time:89910ms step_avg:61.00ms
+step:1475/2245 train_time:89974ms step_avg:61.00ms
+step:1476/2245 train_time:90035ms step_avg:61.00ms
+step:1477/2245 train_time:90097ms step_avg:61.00ms
+step:1478/2245 train_time:90158ms step_avg:61.00ms
+step:1479/2245 train_time:90220ms step_avg:61.00ms
+step:1480/2245 train_time:90280ms step_avg:61.00ms
+step:1481/2245 train_time:90342ms step_avg:61.00ms
+step:1482/2245 train_time:90403ms step_avg:61.00ms
+step:1483/2245 train_time:90465ms step_avg:61.00ms
+step:1484/2245 train_time:90526ms step_avg:61.00ms
+step:1485/2245 train_time:90589ms step_avg:61.00ms
+step:1486/2245 train_time:90649ms step_avg:61.00ms
+step:1487/2245 train_time:90712ms step_avg:61.00ms
+step:1488/2245 train_time:90773ms step_avg:61.00ms
+step:1489/2245 train_time:90835ms step_avg:61.00ms
+step:1490/2245 train_time:90896ms step_avg:61.00ms
+step:1491/2245 train_time:90959ms step_avg:61.01ms
+step:1492/2245 train_time:91020ms step_avg:61.01ms
+step:1493/2245 train_time:91083ms step_avg:61.01ms
+step:1494/2245 train_time:91144ms step_avg:61.01ms
+step:1495/2245 train_time:91208ms step_avg:61.01ms
+step:1496/2245 train_time:91268ms step_avg:61.01ms
+step:1497/2245 train_time:91331ms step_avg:61.01ms
+step:1498/2245 train_time:91391ms step_avg:61.01ms
+step:1499/2245 train_time:91453ms step_avg:61.01ms
+step:1500/2245 train_time:91513ms step_avg:61.01ms
+step:1500/2245 val_loss:3.4421 train_time:91576ms step_avg:61.05ms
+step:1501/2245 train_time:91595ms step_avg:61.02ms
+step:1502/2245 train_time:91637ms step_avg:61.01ms
+step:1503/2245 train_time:91699ms step_avg:61.01ms
+step:1504/2245 train_time:91758ms step_avg:61.01ms
+step:1505/2245 train_time:91821ms step_avg:61.01ms
+step:1506/2245 train_time:91881ms step_avg:61.01ms
+step:1507/2245 train_time:91943ms step_avg:61.01ms
+step:1508/2245 train_time:92003ms step_avg:61.01ms
+step:1509/2245 train_time:92065ms step_avg:61.01ms
+step:1510/2245 train_time:92125ms step_avg:61.01ms
+step:1511/2245 train_time:92187ms step_avg:61.01ms
+step:1512/2245 train_time:92247ms step_avg:61.01ms
+step:1513/2245 train_time:92309ms step_avg:61.01ms
+step:1514/2245 train_time:92369ms step_avg:61.01ms
+step:1515/2245 train_time:92433ms step_avg:61.01ms
+step:1516/2245 train_time:92500ms step_avg:61.02ms
+step:1517/2245 train_time:92568ms step_avg:61.02ms
+step:1518/2245 train_time:92631ms step_avg:61.02ms
+step:1519/2245 train_time:92694ms step_avg:61.02ms
+step:1520/2245 train_time:92754ms step_avg:61.02ms
+step:1521/2245 train_time:92816ms step_avg:61.02ms
+step:1522/2245 train_time:92876ms step_avg:61.02ms
+step:1523/2245 train_time:92938ms step_avg:61.02ms
+step:1524/2245 train_time:92998ms step_avg:61.02ms
+step:1525/2245 train_time:93061ms step_avg:61.02ms
+step:1526/2245 train_time:93121ms step_avg:61.02ms
+step:1527/2245 train_time:93184ms step_avg:61.02ms
+step:1528/2245 train_time:93243ms step_avg:61.02ms
+step:1529/2245 train_time:93307ms step_avg:61.02ms
+step:1530/2245 train_time:93367ms step_avg:61.02ms
+step:1531/2245 train_time:93431ms step_avg:61.03ms
+step:1532/2245 train_time:93492ms step_avg:61.03ms
+step:1533/2245 train_time:93556ms step_avg:61.03ms
+step:1534/2245 train_time:93618ms step_avg:61.03ms
+step:1535/2245 train_time:93681ms step_avg:61.03ms
+step:1536/2245 train_time:93741ms step_avg:61.03ms
+step:1537/2245 train_time:93805ms step_avg:61.03ms
+step:1538/2245 train_time:93866ms step_avg:61.03ms
+step:1539/2245 train_time:93929ms step_avg:61.03ms
+step:1540/2245 train_time:93989ms step_avg:61.03ms
+step:1541/2245 train_time:94052ms step_avg:61.03ms
+step:1542/2245 train_time:94112ms step_avg:61.03ms
+step:1543/2245 train_time:94174ms step_avg:61.03ms
+step:1544/2245 train_time:94234ms step_avg:61.03ms
+step:1545/2245 train_time:94297ms step_avg:61.03ms
+step:1546/2245 train_time:94358ms step_avg:61.03ms
+step:1547/2245 train_time:94422ms step_avg:61.04ms
+step:1548/2245 train_time:94483ms step_avg:61.04ms
+step:1549/2245 train_time:94547ms step_avg:61.04ms
+step:1550/2245 train_time:94609ms step_avg:61.04ms
+step:1551/2245 train_time:94673ms step_avg:61.04ms
+step:1552/2245 train_time:94733ms step_avg:61.04ms
+step:1553/2245 train_time:94796ms step_avg:61.04ms
+step:1554/2245 train_time:94856ms step_avg:61.04ms
+step:1555/2245 train_time:94919ms step_avg:61.04ms
+step:1556/2245 train_time:94978ms step_avg:61.04ms
+step:1557/2245 train_time:95042ms step_avg:61.04ms
+step:1558/2245 train_time:95102ms step_avg:61.04ms
+step:1559/2245 train_time:95165ms step_avg:61.04ms
+step:1560/2245 train_time:95225ms step_avg:61.04ms
+step:1561/2245 train_time:95288ms step_avg:61.04ms
+step:1562/2245 train_time:95350ms step_avg:61.04ms
+step:1563/2245 train_time:95413ms step_avg:61.04ms
+step:1564/2245 train_time:95472ms step_avg:61.04ms
+step:1565/2245 train_time:95536ms step_avg:61.05ms
+step:1566/2245 train_time:95596ms step_avg:61.05ms
+step:1567/2245 train_time:95659ms step_avg:61.05ms
+step:1568/2245 train_time:95720ms step_avg:61.05ms
+step:1569/2245 train_time:95782ms step_avg:61.05ms
+step:1570/2245 train_time:95843ms step_avg:61.05ms
+step:1571/2245 train_time:95906ms step_avg:61.05ms
+step:1572/2245 train_time:95966ms step_avg:61.05ms
+step:1573/2245 train_time:96030ms step_avg:61.05ms
+step:1574/2245 train_time:96090ms step_avg:61.05ms
+step:1575/2245 train_time:96153ms step_avg:61.05ms
+step:1576/2245 train_time:96213ms step_avg:61.05ms
+step:1577/2245 train_time:96276ms step_avg:61.05ms
+step:1578/2245 train_time:96336ms step_avg:61.05ms
+step:1579/2245 train_time:96399ms step_avg:61.05ms
+step:1580/2245 train_time:96460ms step_avg:61.05ms
+step:1581/2245 train_time:96523ms step_avg:61.05ms
+step:1582/2245 train_time:96583ms step_avg:61.05ms
+step:1583/2245 train_time:96647ms step_avg:61.05ms
+step:1584/2245 train_time:96710ms step_avg:61.05ms
+step:1585/2245 train_time:96773ms step_avg:61.06ms
+step:1586/2245 train_time:96833ms step_avg:61.05ms
+step:1587/2245 train_time:96895ms step_avg:61.06ms
+step:1588/2245 train_time:96955ms step_avg:61.05ms
+step:1589/2245 train_time:97018ms step_avg:61.06ms
+step:1590/2245 train_time:97078ms step_avg:61.06ms
+step:1591/2245 train_time:97141ms step_avg:61.06ms
+step:1592/2245 train_time:97201ms step_avg:61.06ms
+step:1593/2245 train_time:97265ms step_avg:61.06ms
+step:1594/2245 train_time:97326ms step_avg:61.06ms
+step:1595/2245 train_time:97389ms step_avg:61.06ms
+step:1596/2245 train_time:97451ms step_avg:61.06ms
+step:1597/2245 train_time:97514ms step_avg:61.06ms
+step:1598/2245 train_time:97573ms step_avg:61.06ms
+step:1599/2245 train_time:97636ms step_avg:61.06ms
+step:1600/2245 train_time:97697ms step_avg:61.06ms
+step:1601/2245 train_time:97760ms step_avg:61.06ms
+step:1602/2245 train_time:97820ms step_avg:61.06ms
+step:1603/2245 train_time:97883ms step_avg:61.06ms
+step:1604/2245 train_time:97943ms step_avg:61.06ms
+step:1605/2245 train_time:98006ms step_avg:61.06ms
+step:1606/2245 train_time:98066ms step_avg:61.06ms
+step:1607/2245 train_time:98129ms step_avg:61.06ms
+step:1608/2245 train_time:98190ms step_avg:61.06ms
+step:1609/2245 train_time:98253ms step_avg:61.06ms
+step:1610/2245 train_time:98313ms step_avg:61.06ms
+step:1611/2245 train_time:98376ms step_avg:61.06ms
+step:1612/2245 train_time:98436ms step_avg:61.06ms
+step:1613/2245 train_time:98498ms step_avg:61.07ms
+step:1614/2245 train_time:98558ms step_avg:61.06ms
+step:1615/2245 train_time:98621ms step_avg:61.07ms
+step:1616/2245 train_time:98681ms step_avg:61.07ms
+step:1617/2245 train_time:98745ms step_avg:61.07ms
+step:1618/2245 train_time:98806ms step_avg:61.07ms
+step:1619/2245 train_time:98869ms step_avg:61.07ms
+step:1620/2245 train_time:98929ms step_avg:61.07ms
+step:1621/2245 train_time:98992ms step_avg:61.07ms
+step:1622/2245 train_time:99052ms step_avg:61.07ms
+step:1623/2245 train_time:99115ms step_avg:61.07ms
+step:1624/2245 train_time:99174ms step_avg:61.07ms
+step:1625/2245 train_time:99237ms step_avg:61.07ms
+step:1626/2245 train_time:99298ms step_avg:61.07ms
+step:1627/2245 train_time:99361ms step_avg:61.07ms
+step:1628/2245 train_time:99421ms step_avg:61.07ms
+step:1629/2245 train_time:99484ms step_avg:61.07ms
+step:1630/2245 train_time:99545ms step_avg:61.07ms
+step:1631/2245 train_time:99608ms step_avg:61.07ms
+step:1632/2245 train_time:99668ms step_avg:61.07ms
+step:1633/2245 train_time:99732ms step_avg:61.07ms
+step:1634/2245 train_time:99792ms step_avg:61.07ms
+step:1635/2245 train_time:99854ms step_avg:61.07ms
+step:1636/2245 train_time:99914ms step_avg:61.07ms
+step:1637/2245 train_time:99976ms step_avg:61.07ms
+step:1638/2245 train_time:100036ms step_avg:61.07ms
+step:1639/2245 train_time:100099ms step_avg:61.07ms
+step:1640/2245 train_time:100159ms step_avg:61.07ms
+step:1641/2245 train_time:100222ms step_avg:61.07ms
+step:1642/2245 train_time:100282ms step_avg:61.07ms
+step:1643/2245 train_time:100345ms step_avg:61.07ms
+step:1644/2245 train_time:100406ms step_avg:61.07ms
+step:1645/2245 train_time:100469ms step_avg:61.08ms
+step:1646/2245 train_time:100530ms step_avg:61.08ms
+step:1647/2245 train_time:100593ms step_avg:61.08ms
+step:1648/2245 train_time:100653ms step_avg:61.08ms
+step:1649/2245 train_time:100717ms step_avg:61.08ms
+step:1650/2245 train_time:100777ms step_avg:61.08ms
+step:1651/2245 train_time:100840ms step_avg:61.08ms
+step:1652/2245 train_time:100900ms step_avg:61.08ms
+step:1653/2245 train_time:100963ms step_avg:61.08ms
+step:1654/2245 train_time:101024ms step_avg:61.08ms
+step:1655/2245 train_time:101088ms step_avg:61.08ms
+step:1656/2245 train_time:101149ms step_avg:61.08ms
+step:1657/2245 train_time:101212ms step_avg:61.08ms
+step:1658/2245 train_time:101272ms step_avg:61.08ms
+step:1659/2245 train_time:101334ms step_avg:61.08ms
+step:1660/2245 train_time:101394ms step_avg:61.08ms
+step:1661/2245 train_time:101457ms step_avg:61.08ms
+step:1662/2245 train_time:101518ms step_avg:61.08ms
+step:1663/2245 train_time:101580ms step_avg:61.08ms
+step:1664/2245 train_time:101640ms step_avg:61.08ms
+step:1665/2245 train_time:101704ms step_avg:61.08ms
+step:1666/2245 train_time:101765ms step_avg:61.08ms
+step:1667/2245 train_time:101829ms step_avg:61.08ms
+step:1668/2245 train_time:101889ms step_avg:61.08ms
+step:1669/2245 train_time:101952ms step_avg:61.09ms
+step:1670/2245 train_time:102013ms step_avg:61.09ms
+step:1671/2245 train_time:102075ms step_avg:61.09ms
+step:1672/2245 train_time:102136ms step_avg:61.09ms
+step:1673/2245 train_time:102198ms step_avg:61.09ms
+step:1674/2245 train_time:102259ms step_avg:61.09ms
+step:1675/2245 train_time:102322ms step_avg:61.09ms
+step:1676/2245 train_time:102382ms step_avg:61.09ms
+step:1677/2245 train_time:102446ms step_avg:61.09ms
+step:1678/2245 train_time:102507ms step_avg:61.09ms
+step:1679/2245 train_time:102570ms step_avg:61.09ms
+step:1680/2245 train_time:102630ms step_avg:61.09ms
+step:1681/2245 train_time:102694ms step_avg:61.09ms
+step:1682/2245 train_time:102753ms step_avg:61.09ms
+step:1683/2245 train_time:102816ms step_avg:61.09ms
+step:1684/2245 train_time:102876ms step_avg:61.09ms
+step:1685/2245 train_time:102939ms step_avg:61.09ms
+step:1686/2245 train_time:102999ms step_avg:61.09ms
+step:1687/2245 train_time:103062ms step_avg:61.09ms
+step:1688/2245 train_time:103123ms step_avg:61.09ms
+step:1689/2245 train_time:103186ms step_avg:61.09ms
+step:1690/2245 train_time:103247ms step_avg:61.09ms
+step:1691/2245 train_time:103310ms step_avg:61.09ms
+step:1692/2245 train_time:103371ms step_avg:61.09ms
+step:1693/2245 train_time:103434ms step_avg:61.09ms
+step:1694/2245 train_time:103494ms step_avg:61.09ms
+step:1695/2245 train_time:103557ms step_avg:61.10ms
+step:1696/2245 train_time:103617ms step_avg:61.10ms
+step:1697/2245 train_time:103680ms step_avg:61.10ms
+step:1698/2245 train_time:103740ms step_avg:61.10ms
+step:1699/2245 train_time:103803ms step_avg:61.10ms
+step:1700/2245 train_time:103863ms step_avg:61.10ms
+step:1701/2245 train_time:103926ms step_avg:61.10ms
+step:1702/2245 train_time:103987ms step_avg:61.10ms
+step:1703/2245 train_time:104050ms step_avg:61.10ms
+step:1704/2245 train_time:104110ms step_avg:61.10ms
+step:1705/2245 train_time:104173ms step_avg:61.10ms
+step:1706/2245 train_time:104233ms step_avg:61.10ms
+step:1707/2245 train_time:104296ms step_avg:61.10ms
+step:1708/2245 train_time:104356ms step_avg:61.10ms
+step:1709/2245 train_time:104419ms step_avg:61.10ms
+step:1710/2245 train_time:104479ms step_avg:61.10ms
+step:1711/2245 train_time:104542ms step_avg:61.10ms
+step:1712/2245 train_time:104602ms step_avg:61.10ms
+step:1713/2245 train_time:104665ms step_avg:61.10ms
+step:1714/2245 train_time:104725ms step_avg:61.10ms
+step:1715/2245 train_time:104788ms step_avg:61.10ms
+step:1716/2245 train_time:104849ms step_avg:61.10ms
+step:1717/2245 train_time:104912ms step_avg:61.10ms
+step:1718/2245 train_time:104972ms step_avg:61.10ms
+step:1719/2245 train_time:105035ms step_avg:61.10ms
+step:1720/2245 train_time:105095ms step_avg:61.10ms
+step:1721/2245 train_time:105158ms step_avg:61.10ms
+step:1722/2245 train_time:105218ms step_avg:61.10ms
+step:1723/2245 train_time:105281ms step_avg:61.10ms
+step:1724/2245 train_time:105341ms step_avg:61.10ms
+step:1725/2245 train_time:105405ms step_avg:61.10ms
+step:1726/2245 train_time:105466ms step_avg:61.10ms
+step:1727/2245 train_time:105530ms step_avg:61.11ms
+step:1728/2245 train_time:105590ms step_avg:61.11ms
+step:1729/2245 train_time:105653ms step_avg:61.11ms
+step:1730/2245 train_time:105712ms step_avg:61.11ms
+step:1731/2245 train_time:105775ms step_avg:61.11ms
+step:1732/2245 train_time:105836ms step_avg:61.11ms
+step:1733/2245 train_time:105899ms step_avg:61.11ms
+step:1734/2245 train_time:105959ms step_avg:61.11ms
+step:1735/2245 train_time:106022ms step_avg:61.11ms
+step:1736/2245 train_time:106082ms step_avg:61.11ms
+step:1737/2245 train_time:106146ms step_avg:61.11ms
+step:1738/2245 train_time:106207ms step_avg:61.11ms
+step:1739/2245 train_time:106270ms step_avg:61.11ms
+step:1740/2245 train_time:106331ms step_avg:61.11ms
+step:1741/2245 train_time:106393ms step_avg:61.11ms
+step:1742/2245 train_time:106453ms step_avg:61.11ms
+step:1743/2245 train_time:106517ms step_avg:61.11ms
+step:1744/2245 train_time:106577ms step_avg:61.11ms
+step:1745/2245 train_time:106639ms step_avg:61.11ms
+step:1746/2245 train_time:106699ms step_avg:61.11ms
+step:1747/2245 train_time:106762ms step_avg:61.11ms
+step:1748/2245 train_time:106823ms step_avg:61.11ms
+step:1749/2245 train_time:106886ms step_avg:61.11ms
+step:1750/2245 train_time:106947ms step_avg:61.11ms
+step:1750/2245 val_loss:3.3784 train_time:107012ms step_avg:61.15ms
+step:1751/2245 train_time:107030ms step_avg:61.13ms
+step:1752/2245 train_time:107075ms step_avg:61.12ms
+step:1753/2245 train_time:107141ms step_avg:61.12ms
+step:1754/2245 train_time:107202ms step_avg:61.12ms
+step:1755/2245 train_time:107266ms step_avg:61.12ms
+step:1756/2245 train_time:107326ms step_avg:61.12ms
+step:1757/2245 train_time:107388ms step_avg:61.12ms
+step:1758/2245 train_time:107448ms step_avg:61.12ms
+step:1759/2245 train_time:107510ms step_avg:61.12ms
+step:1760/2245 train_time:107570ms step_avg:61.12ms
+step:1761/2245 train_time:107633ms step_avg:61.12ms
+step:1762/2245 train_time:107693ms step_avg:61.12ms
+step:1763/2245 train_time:107755ms step_avg:61.12ms
+step:1764/2245 train_time:107815ms step_avg:61.12ms
+step:1765/2245 train_time:107877ms step_avg:61.12ms
+step:1766/2245 train_time:107938ms step_avg:61.12ms
+step:1767/2245 train_time:108002ms step_avg:61.12ms
+step:1768/2245 train_time:108064ms step_avg:61.12ms
+step:1769/2245 train_time:108129ms step_avg:61.12ms
+step:1770/2245 train_time:108191ms step_avg:61.12ms
+step:1771/2245 train_time:108255ms step_avg:61.13ms
+step:1772/2245 train_time:108315ms step_avg:61.13ms
+step:1773/2245 train_time:108377ms step_avg:61.13ms
+step:1774/2245 train_time:108437ms step_avg:61.13ms
+step:1775/2245 train_time:108499ms step_avg:61.13ms
+step:1776/2245 train_time:108559ms step_avg:61.13ms
+step:1777/2245 train_time:108622ms step_avg:61.13ms
+step:1778/2245 train_time:108682ms step_avg:61.13ms
+step:1779/2245 train_time:108745ms step_avg:61.13ms
+step:1780/2245 train_time:108805ms step_avg:61.13ms
+step:1781/2245 train_time:108868ms step_avg:61.13ms
+step:1782/2245 train_time:108931ms step_avg:61.13ms
+step:1783/2245 train_time:108995ms step_avg:61.13ms
+step:1784/2245 train_time:109055ms step_avg:61.13ms
+step:1785/2245 train_time:109119ms step_avg:61.13ms
+step:1786/2245 train_time:109179ms step_avg:61.13ms
+step:1787/2245 train_time:109242ms step_avg:61.13ms
+step:1788/2245 train_time:109304ms step_avg:61.13ms
+step:1789/2245 train_time:109367ms step_avg:61.13ms
+step:1790/2245 train_time:109427ms step_avg:61.13ms
+step:1791/2245 train_time:109490ms step_avg:61.13ms
+step:1792/2245 train_time:109550ms step_avg:61.13ms
+step:1793/2245 train_time:109614ms step_avg:61.13ms
+step:1794/2245 train_time:109673ms step_avg:61.13ms
+step:1795/2245 train_time:109735ms step_avg:61.13ms
+step:1796/2245 train_time:109794ms step_avg:61.13ms
+step:1797/2245 train_time:109857ms step_avg:61.13ms
+step:1798/2245 train_time:109917ms step_avg:61.13ms
+step:1799/2245 train_time:109980ms step_avg:61.13ms
+step:1800/2245 train_time:110042ms step_avg:61.13ms
+step:1801/2245 train_time:110105ms step_avg:61.14ms
+step:1802/2245 train_time:110167ms step_avg:61.14ms
+step:1803/2245 train_time:110230ms step_avg:61.14ms
+step:1804/2245 train_time:110291ms step_avg:61.14ms
+step:1805/2245 train_time:110354ms step_avg:61.14ms
+step:1806/2245 train_time:110415ms step_avg:61.14ms
+step:1807/2245 train_time:110477ms step_avg:61.14ms
+step:1808/2245 train_time:110537ms step_avg:61.14ms
+step:1809/2245 train_time:110600ms step_avg:61.14ms
+step:1810/2245 train_time:110660ms step_avg:61.14ms
+step:1811/2245 train_time:110723ms step_avg:61.14ms
+step:1812/2245 train_time:110783ms step_avg:61.14ms
+step:1813/2245 train_time:110846ms step_avg:61.14ms
+step:1814/2245 train_time:110906ms step_avg:61.14ms
+step:1815/2245 train_time:110970ms step_avg:61.14ms
+step:1816/2245 train_time:111031ms step_avg:61.14ms
+step:1817/2245 train_time:111095ms step_avg:61.14ms
+step:1818/2245 train_time:111156ms step_avg:61.14ms
+step:1819/2245 train_time:111218ms step_avg:61.14ms
+step:1820/2245 train_time:111278ms step_avg:61.14ms
+step:1821/2245 train_time:111342ms step_avg:61.14ms
+step:1822/2245 train_time:111402ms step_avg:61.14ms
+step:1823/2245 train_time:111464ms step_avg:61.14ms
+step:1824/2245 train_time:111525ms step_avg:61.14ms
+step:1825/2245 train_time:111588ms step_avg:61.14ms
+step:1826/2245 train_time:111648ms step_avg:61.14ms
+step:1827/2245 train_time:111711ms step_avg:61.14ms
+step:1828/2245 train_time:111772ms step_avg:61.14ms
+step:1829/2245 train_time:111834ms step_avg:61.15ms
+step:1830/2245 train_time:111894ms step_avg:61.14ms
+step:1831/2245 train_time:111957ms step_avg:61.15ms
+step:1832/2245 train_time:112017ms step_avg:61.14ms
+step:1833/2245 train_time:112080ms step_avg:61.15ms
+step:1834/2245 train_time:112141ms step_avg:61.15ms
+step:1835/2245 train_time:112204ms step_avg:61.15ms
+step:1836/2245 train_time:112264ms step_avg:61.15ms
+step:1837/2245 train_time:112327ms step_avg:61.15ms
+step:1838/2245 train_time:112387ms step_avg:61.15ms
+step:1839/2245 train_time:112450ms step_avg:61.15ms
+step:1840/2245 train_time:112511ms step_avg:61.15ms
+step:1841/2245 train_time:112575ms step_avg:61.15ms
+step:1842/2245 train_time:112635ms step_avg:61.15ms
+step:1843/2245 train_time:112698ms step_avg:61.15ms
+step:1844/2245 train_time:112758ms step_avg:61.15ms
+step:1845/2245 train_time:112820ms step_avg:61.15ms
+step:1846/2245 train_time:112881ms step_avg:61.15ms
+step:1847/2245 train_time:112944ms step_avg:61.15ms
+step:1848/2245 train_time:113004ms step_avg:61.15ms
+step:1849/2245 train_time:113067ms step_avg:61.15ms
+step:1850/2245 train_time:113128ms step_avg:61.15ms
+step:1851/2245 train_time:113191ms step_avg:61.15ms
+step:1852/2245 train_time:113252ms step_avg:61.15ms
+step:1853/2245 train_time:113314ms step_avg:61.15ms
+step:1854/2245 train_time:113374ms step_avg:61.15ms
+step:1855/2245 train_time:113437ms step_avg:61.15ms
+step:1856/2245 train_time:113497ms step_avg:61.15ms
+step:1857/2245 train_time:113560ms step_avg:61.15ms
+step:1858/2245 train_time:113620ms step_avg:61.15ms
+step:1859/2245 train_time:113683ms step_avg:61.15ms
+step:1860/2245 train_time:113743ms step_avg:61.15ms
+step:1861/2245 train_time:113807ms step_avg:61.15ms
+step:1862/2245 train_time:113867ms step_avg:61.15ms
+step:1863/2245 train_time:113931ms step_avg:61.15ms
+step:1864/2245 train_time:113992ms step_avg:61.15ms
+step:1865/2245 train_time:114055ms step_avg:61.16ms
+step:1866/2245 train_time:114116ms step_avg:61.16ms
+step:1867/2245 train_time:114179ms step_avg:61.16ms
+step:1868/2245 train_time:114240ms step_avg:61.16ms
+step:1869/2245 train_time:114302ms step_avg:61.16ms
+step:1870/2245 train_time:114362ms step_avg:61.16ms
+step:1871/2245 train_time:114426ms step_avg:61.16ms
+step:1872/2245 train_time:114486ms step_avg:61.16ms
+step:1873/2245 train_time:114549ms step_avg:61.16ms
+step:1874/2245 train_time:114609ms step_avg:61.16ms
+step:1875/2245 train_time:114672ms step_avg:61.16ms
+step:1876/2245 train_time:114733ms step_avg:61.16ms
+step:1877/2245 train_time:114795ms step_avg:61.16ms
+step:1878/2245 train_time:114856ms step_avg:61.16ms
+step:1879/2245 train_time:114918ms step_avg:61.16ms
+step:1880/2245 train_time:114978ms step_avg:61.16ms
+step:1881/2245 train_time:115041ms step_avg:61.16ms
+step:1882/2245 train_time:115101ms step_avg:61.16ms
+step:1883/2245 train_time:115165ms step_avg:61.16ms
+step:1884/2245 train_time:115225ms step_avg:61.16ms
+step:1885/2245 train_time:115288ms step_avg:61.16ms
+step:1886/2245 train_time:115349ms step_avg:61.16ms
+step:1887/2245 train_time:115411ms step_avg:61.16ms
+step:1888/2245 train_time:115471ms step_avg:61.16ms
+step:1889/2245 train_time:115534ms step_avg:61.16ms
+step:1890/2245 train_time:115595ms step_avg:61.16ms
+step:1891/2245 train_time:115657ms step_avg:61.16ms
+step:1892/2245 train_time:115718ms step_avg:61.16ms
+step:1893/2245 train_time:115781ms step_avg:61.16ms
+step:1894/2245 train_time:115841ms step_avg:61.16ms
+step:1895/2245 train_time:115903ms step_avg:61.16ms
+step:1896/2245 train_time:115964ms step_avg:61.16ms
+step:1897/2245 train_time:116028ms step_avg:61.16ms
+step:1898/2245 train_time:116088ms step_avg:61.16ms
+step:1899/2245 train_time:116152ms step_avg:61.16ms
+step:1900/2245 train_time:116212ms step_avg:61.16ms
+step:1901/2245 train_time:116276ms step_avg:61.17ms
+step:1902/2245 train_time:116336ms step_avg:61.16ms
+step:1903/2245 train_time:116398ms step_avg:61.17ms
+step:1904/2245 train_time:116458ms step_avg:61.16ms
+step:1905/2245 train_time:116521ms step_avg:61.17ms
+step:1906/2245 train_time:116581ms step_avg:61.17ms
+step:1907/2245 train_time:116644ms step_avg:61.17ms
+step:1908/2245 train_time:116704ms step_avg:61.17ms
+step:1909/2245 train_time:116767ms step_avg:61.17ms
+step:1910/2245 train_time:116828ms step_avg:61.17ms
+step:1911/2245 train_time:116891ms step_avg:61.17ms
+step:1912/2245 train_time:116951ms step_avg:61.17ms
+step:1913/2245 train_time:117015ms step_avg:61.17ms
+step:1914/2245 train_time:117075ms step_avg:61.17ms
+step:1915/2245 train_time:117137ms step_avg:61.17ms
+step:1916/2245 train_time:117198ms step_avg:61.17ms
+step:1917/2245 train_time:117260ms step_avg:61.17ms
+step:1918/2245 train_time:117321ms step_avg:61.17ms
+step:1919/2245 train_time:117384ms step_avg:61.17ms
+step:1920/2245 train_time:117444ms step_avg:61.17ms
+step:1921/2245 train_time:117507ms step_avg:61.17ms
+step:1922/2245 train_time:117568ms step_avg:61.17ms
+step:1923/2245 train_time:117632ms step_avg:61.17ms
+step:1924/2245 train_time:117693ms step_avg:61.17ms
+step:1925/2245 train_time:117755ms step_avg:61.17ms
+step:1926/2245 train_time:117815ms step_avg:61.17ms
+step:1927/2245 train_time:117878ms step_avg:61.17ms
+step:1928/2245 train_time:117939ms step_avg:61.17ms
+step:1929/2245 train_time:118001ms step_avg:61.17ms
+step:1930/2245 train_time:118061ms step_avg:61.17ms
+step:1931/2245 train_time:118124ms step_avg:61.17ms
+step:1932/2245 train_time:118183ms step_avg:61.17ms
+step:1933/2245 train_time:118247ms step_avg:61.17ms
+step:1934/2245 train_time:118307ms step_avg:61.17ms
+step:1935/2245 train_time:118370ms step_avg:61.17ms
step_avg:61.17ms +step:1936/2245 train_time:118431ms step_avg:61.17ms +step:1937/2245 train_time:118494ms step_avg:61.17ms +step:1938/2245 train_time:118555ms step_avg:61.17ms +step:1939/2245 train_time:118618ms step_avg:61.17ms +step:1940/2245 train_time:118678ms step_avg:61.17ms +step:1941/2245 train_time:118741ms step_avg:61.18ms +step:1942/2245 train_time:118801ms step_avg:61.17ms +step:1943/2245 train_time:118863ms step_avg:61.18ms +step:1944/2245 train_time:118923ms step_avg:61.17ms +step:1945/2245 train_time:118986ms step_avg:61.18ms +step:1946/2245 train_time:119047ms step_avg:61.18ms +step:1947/2245 train_time:119110ms step_avg:61.18ms +step:1948/2245 train_time:119170ms step_avg:61.18ms +step:1949/2245 train_time:119233ms step_avg:61.18ms +step:1950/2245 train_time:119293ms step_avg:61.18ms +step:1951/2245 train_time:119356ms step_avg:61.18ms +step:1952/2245 train_time:119416ms step_avg:61.18ms +step:1953/2245 train_time:119478ms step_avg:61.18ms +step:1954/2245 train_time:119539ms step_avg:61.18ms +step:1955/2245 train_time:119601ms step_avg:61.18ms +step:1956/2245 train_time:119662ms step_avg:61.18ms +step:1957/2245 train_time:119725ms step_avg:61.18ms +step:1958/2245 train_time:119786ms step_avg:61.18ms +step:1959/2245 train_time:119849ms step_avg:61.18ms +step:1960/2245 train_time:119909ms step_avg:61.18ms +step:1961/2245 train_time:119971ms step_avg:61.18ms +step:1962/2245 train_time:120032ms step_avg:61.18ms +step:1963/2245 train_time:120095ms step_avg:61.18ms +step:1964/2245 train_time:120155ms step_avg:61.18ms +step:1965/2245 train_time:120218ms step_avg:61.18ms +step:1966/2245 train_time:120278ms step_avg:61.18ms +step:1967/2245 train_time:120341ms step_avg:61.18ms +step:1968/2245 train_time:120401ms step_avg:61.18ms +step:1969/2245 train_time:120465ms step_avg:61.18ms +step:1970/2245 train_time:120525ms step_avg:61.18ms +step:1971/2245 train_time:120587ms step_avg:61.18ms +step:1972/2245 train_time:120648ms step_avg:61.18ms +step:1973/2245 train_time:120711ms step_avg:61.18ms +step:1974/2245 train_time:120772ms step_avg:61.18ms +step:1975/2245 train_time:120836ms step_avg:61.18ms +step:1976/2245 train_time:120896ms step_avg:61.18ms +step:1977/2245 train_time:120958ms step_avg:61.18ms +step:1978/2245 train_time:121018ms step_avg:61.18ms +step:1979/2245 train_time:121081ms step_avg:61.18ms +step:1980/2245 train_time:121143ms step_avg:61.18ms +step:1981/2245 train_time:121206ms step_avg:61.18ms +step:1982/2245 train_time:121267ms step_avg:61.18ms +step:1983/2245 train_time:121330ms step_avg:61.19ms +step:1984/2245 train_time:121391ms step_avg:61.18ms +step:1985/2245 train_time:121454ms step_avg:61.19ms +step:1986/2245 train_time:121514ms step_avg:61.19ms +step:1987/2245 train_time:121576ms step_avg:61.19ms +step:1988/2245 train_time:121636ms step_avg:61.19ms +step:1989/2245 train_time:121699ms step_avg:61.19ms +step:1990/2245 train_time:121759ms step_avg:61.19ms +step:1991/2245 train_time:121821ms step_avg:61.19ms +step:1992/2245 train_time:121881ms step_avg:61.19ms +step:1993/2245 train_time:121944ms step_avg:61.19ms +step:1994/2245 train_time:122004ms step_avg:61.19ms +step:1995/2245 train_time:122067ms step_avg:61.19ms +step:1996/2245 train_time:122129ms step_avg:61.19ms +step:1997/2245 train_time:122191ms step_avg:61.19ms +step:1998/2245 train_time:122252ms step_avg:61.19ms +step:1999/2245 train_time:122314ms step_avg:61.19ms +step:2000/2245 train_time:122375ms step_avg:61.19ms +step:2000/2245 val_loss:3.3237 train_time:122438ms step_avg:61.22ms +step:2001/2245 
train_time:122457ms step_avg:61.20ms +step:2002/2245 train_time:122501ms step_avg:61.19ms +step:2003/2245 train_time:122568ms step_avg:61.19ms +step:2004/2245 train_time:122630ms step_avg:61.19ms +step:2005/2245 train_time:122694ms step_avg:61.19ms +step:2006/2245 train_time:122754ms step_avg:61.19ms +step:2007/2245 train_time:122816ms step_avg:61.19ms +step:2008/2245 train_time:122876ms step_avg:61.19ms +step:2009/2245 train_time:122938ms step_avg:61.19ms +step:2010/2245 train_time:122997ms step_avg:61.19ms +step:2011/2245 train_time:123059ms step_avg:61.19ms +step:2012/2245 train_time:123119ms step_avg:61.19ms +step:2013/2245 train_time:123181ms step_avg:61.19ms +step:2014/2245 train_time:123240ms step_avg:61.19ms +step:2015/2245 train_time:123302ms step_avg:61.19ms +step:2016/2245 train_time:123362ms step_avg:61.19ms +step:2017/2245 train_time:123426ms step_avg:61.19ms +step:2018/2245 train_time:123488ms step_avg:61.19ms +step:2019/2245 train_time:123553ms step_avg:61.19ms +step:2020/2245 train_time:123614ms step_avg:61.20ms +step:2021/2245 train_time:123678ms step_avg:61.20ms +step:2022/2245 train_time:123739ms step_avg:61.20ms +step:2023/2245 train_time:123801ms step_avg:61.20ms +step:2024/2245 train_time:123861ms step_avg:61.20ms +step:2025/2245 train_time:123925ms step_avg:61.20ms +step:2026/2245 train_time:123985ms step_avg:61.20ms +step:2027/2245 train_time:124048ms step_avg:61.20ms +step:2028/2245 train_time:124108ms step_avg:61.20ms +step:2029/2245 train_time:124170ms step_avg:61.20ms +step:2030/2245 train_time:124231ms step_avg:61.20ms +step:2031/2245 train_time:124294ms step_avg:61.20ms +step:2032/2245 train_time:124355ms step_avg:61.20ms +step:2033/2245 train_time:124419ms step_avg:61.20ms +step:2034/2245 train_time:124479ms step_avg:61.20ms +step:2035/2245 train_time:124542ms step_avg:61.20ms +step:2036/2245 train_time:124603ms step_avg:61.20ms +step:2037/2245 train_time:124667ms step_avg:61.20ms +step:2038/2245 train_time:124728ms step_avg:61.20ms +step:2039/2245 train_time:124792ms step_avg:61.20ms +step:2040/2245 train_time:124854ms step_avg:61.20ms +step:2041/2245 train_time:124916ms step_avg:61.20ms +step:2042/2245 train_time:124976ms step_avg:61.20ms +step:2043/2245 train_time:125039ms step_avg:61.20ms +step:2044/2245 train_time:125099ms step_avg:61.20ms +step:2045/2245 train_time:125161ms step_avg:61.20ms +step:2046/2245 train_time:125221ms step_avg:61.20ms +step:2047/2245 train_time:125284ms step_avg:61.20ms +step:2048/2245 train_time:125344ms step_avg:61.20ms +step:2049/2245 train_time:125407ms step_avg:61.20ms +step:2050/2245 train_time:125469ms step_avg:61.20ms +step:2051/2245 train_time:125532ms step_avg:61.21ms +step:2052/2245 train_time:125594ms step_avg:61.21ms +step:2053/2245 train_time:125657ms step_avg:61.21ms +step:2054/2245 train_time:125718ms step_avg:61.21ms +step:2055/2245 train_time:125781ms step_avg:61.21ms +step:2056/2245 train_time:125842ms step_avg:61.21ms +step:2057/2245 train_time:125905ms step_avg:61.21ms +step:2058/2245 train_time:125965ms step_avg:61.21ms +step:2059/2245 train_time:126028ms step_avg:61.21ms +step:2060/2245 train_time:126087ms step_avg:61.21ms +step:2061/2245 train_time:126151ms step_avg:61.21ms +step:2062/2245 train_time:126212ms step_avg:61.21ms +step:2063/2245 train_time:126275ms step_avg:61.21ms +step:2064/2245 train_time:126336ms step_avg:61.21ms +step:2065/2245 train_time:126399ms step_avg:61.21ms +step:2066/2245 train_time:126460ms step_avg:61.21ms +step:2067/2245 train_time:126523ms step_avg:61.21ms +step:2068/2245 
train_time:126583ms step_avg:61.21ms +step:2069/2245 train_time:126646ms step_avg:61.21ms +step:2070/2245 train_time:126707ms step_avg:61.21ms +step:2071/2245 train_time:126771ms step_avg:61.21ms +step:2072/2245 train_time:126832ms step_avg:61.21ms +step:2073/2245 train_time:126897ms step_avg:61.21ms +step:2074/2245 train_time:126957ms step_avg:61.21ms +step:2075/2245 train_time:127019ms step_avg:61.21ms +step:2076/2245 train_time:127079ms step_avg:61.21ms +step:2077/2245 train_time:127142ms step_avg:61.21ms +step:2078/2245 train_time:127202ms step_avg:61.21ms +step:2079/2245 train_time:127265ms step_avg:61.21ms +step:2080/2245 train_time:127325ms step_avg:61.21ms +step:2081/2245 train_time:127387ms step_avg:61.21ms +step:2082/2245 train_time:127448ms step_avg:61.21ms +step:2083/2245 train_time:127512ms step_avg:61.22ms +step:2084/2245 train_time:127572ms step_avg:61.22ms +step:2085/2245 train_time:127635ms step_avg:61.22ms +step:2086/2245 train_time:127697ms step_avg:61.22ms +step:2087/2245 train_time:127759ms step_avg:61.22ms +step:2088/2245 train_time:127820ms step_avg:61.22ms +step:2089/2245 train_time:127883ms step_avg:61.22ms +step:2090/2245 train_time:127943ms step_avg:61.22ms +step:2091/2245 train_time:128006ms step_avg:61.22ms +step:2092/2245 train_time:128067ms step_avg:61.22ms +step:2093/2245 train_time:128130ms step_avg:61.22ms +step:2094/2245 train_time:128191ms step_avg:61.22ms +step:2095/2245 train_time:128255ms step_avg:61.22ms +step:2096/2245 train_time:128315ms step_avg:61.22ms +step:2097/2245 train_time:128377ms step_avg:61.22ms +step:2098/2245 train_time:128437ms step_avg:61.22ms +step:2099/2245 train_time:128500ms step_avg:61.22ms +step:2100/2245 train_time:128560ms step_avg:61.22ms +step:2101/2245 train_time:128623ms step_avg:61.22ms +step:2102/2245 train_time:128683ms step_avg:61.22ms +step:2103/2245 train_time:128746ms step_avg:61.22ms +step:2104/2245 train_time:128806ms step_avg:61.22ms +step:2105/2245 train_time:128871ms step_avg:61.22ms +step:2106/2245 train_time:128932ms step_avg:61.22ms +step:2107/2245 train_time:128995ms step_avg:61.22ms +step:2108/2245 train_time:129056ms step_avg:61.22ms +step:2109/2245 train_time:129118ms step_avg:61.22ms +step:2110/2245 train_time:129178ms step_avg:61.22ms +step:2111/2245 train_time:129241ms step_avg:61.22ms +step:2112/2245 train_time:129301ms step_avg:61.22ms +step:2113/2245 train_time:129364ms step_avg:61.22ms +step:2114/2245 train_time:129423ms step_avg:61.22ms +step:2115/2245 train_time:129486ms step_avg:61.22ms +step:2116/2245 train_time:129546ms step_avg:61.22ms +step:2117/2245 train_time:129609ms step_avg:61.22ms +step:2118/2245 train_time:129670ms step_avg:61.22ms +step:2119/2245 train_time:129733ms step_avg:61.22ms +step:2120/2245 train_time:129794ms step_avg:61.22ms +step:2121/2245 train_time:129857ms step_avg:61.22ms +step:2122/2245 train_time:129917ms step_avg:61.22ms +step:2123/2245 train_time:129980ms step_avg:61.22ms +step:2124/2245 train_time:130041ms step_avg:61.22ms +step:2125/2245 train_time:130103ms step_avg:61.23ms +step:2126/2245 train_time:130163ms step_avg:61.22ms +step:2127/2245 train_time:130226ms step_avg:61.23ms +step:2128/2245 train_time:130286ms step_avg:61.22ms +step:2129/2245 train_time:130349ms step_avg:61.23ms +step:2130/2245 train_time:130410ms step_avg:61.23ms +step:2131/2245 train_time:130473ms step_avg:61.23ms +step:2132/2245 train_time:130534ms step_avg:61.23ms +step:2133/2245 train_time:130598ms step_avg:61.23ms +step:2134/2245 train_time:130658ms step_avg:61.23ms +step:2135/2245 
train_time:130720ms step_avg:61.23ms +step:2136/2245 train_time:130780ms step_avg:61.23ms +step:2137/2245 train_time:130844ms step_avg:61.23ms +step:2138/2245 train_time:130905ms step_avg:61.23ms +step:2139/2245 train_time:130967ms step_avg:61.23ms +step:2140/2245 train_time:131028ms step_avg:61.23ms +step:2141/2245 train_time:131091ms step_avg:61.23ms +step:2142/2245 train_time:131152ms step_avg:61.23ms +step:2143/2245 train_time:131216ms step_avg:61.23ms +step:2144/2245 train_time:131276ms step_avg:61.23ms +step:2145/2245 train_time:131338ms step_avg:61.23ms +step:2146/2245 train_time:131398ms step_avg:61.23ms +step:2147/2245 train_time:131461ms step_avg:61.23ms +step:2148/2245 train_time:131521ms step_avg:61.23ms +step:2149/2245 train_time:131584ms step_avg:61.23ms +step:2150/2245 train_time:131645ms step_avg:61.23ms +step:2151/2245 train_time:131708ms step_avg:61.23ms +step:2152/2245 train_time:131769ms step_avg:61.23ms +step:2153/2245 train_time:131833ms step_avg:61.23ms +step:2154/2245 train_time:131894ms step_avg:61.23ms +step:2155/2245 train_time:131957ms step_avg:61.23ms +step:2156/2245 train_time:132016ms step_avg:61.23ms +step:2157/2245 train_time:132080ms step_avg:61.23ms +step:2158/2245 train_time:132140ms step_avg:61.23ms +step:2159/2245 train_time:132203ms step_avg:61.23ms +step:2160/2245 train_time:132263ms step_avg:61.23ms +step:2161/2245 train_time:132326ms step_avg:61.23ms +step:2162/2245 train_time:132386ms step_avg:61.23ms +step:2163/2245 train_time:132449ms step_avg:61.23ms +step:2164/2245 train_time:132510ms step_avg:61.23ms +step:2165/2245 train_time:132573ms step_avg:61.23ms +step:2166/2245 train_time:132635ms step_avg:61.23ms +step:2167/2245 train_time:132697ms step_avg:61.24ms +step:2168/2245 train_time:132757ms step_avg:61.23ms +step:2169/2245 train_time:132820ms step_avg:61.24ms +step:2170/2245 train_time:132880ms step_avg:61.24ms +step:2171/2245 train_time:132944ms step_avg:61.24ms +step:2172/2245 train_time:133005ms step_avg:61.24ms +step:2173/2245 train_time:133068ms step_avg:61.24ms +step:2174/2245 train_time:133128ms step_avg:61.24ms +step:2175/2245 train_time:133191ms step_avg:61.24ms +step:2176/2245 train_time:133252ms step_avg:61.24ms +step:2177/2245 train_time:133315ms step_avg:61.24ms +step:2178/2245 train_time:133375ms step_avg:61.24ms +step:2179/2245 train_time:133438ms step_avg:61.24ms +step:2180/2245 train_time:133498ms step_avg:61.24ms +step:2181/2245 train_time:133560ms step_avg:61.24ms +step:2182/2245 train_time:133620ms step_avg:61.24ms +step:2183/2245 train_time:133683ms step_avg:61.24ms +step:2184/2245 train_time:133744ms step_avg:61.24ms +step:2185/2245 train_time:133807ms step_avg:61.24ms +step:2186/2245 train_time:133868ms step_avg:61.24ms +step:2187/2245 train_time:133932ms step_avg:61.24ms +step:2188/2245 train_time:133993ms step_avg:61.24ms +step:2189/2245 train_time:134057ms step_avg:61.24ms +step:2190/2245 train_time:134117ms step_avg:61.24ms +step:2191/2245 train_time:134179ms step_avg:61.24ms +step:2192/2245 train_time:134239ms step_avg:61.24ms +step:2193/2245 train_time:134302ms step_avg:61.24ms +step:2194/2245 train_time:134363ms step_avg:61.24ms +step:2195/2245 train_time:134425ms step_avg:61.24ms +step:2196/2245 train_time:134486ms step_avg:61.24ms +step:2197/2245 train_time:134549ms step_avg:61.24ms +step:2198/2245 train_time:134610ms step_avg:61.24ms +step:2199/2245 train_time:134674ms step_avg:61.24ms +step:2200/2245 train_time:134734ms step_avg:61.24ms +step:2201/2245 train_time:134798ms step_avg:61.24ms +step:2202/2245 
train_time:134858ms step_avg:61.24ms +step:2203/2245 train_time:134920ms step_avg:61.24ms +step:2204/2245 train_time:134981ms step_avg:61.24ms +step:2205/2245 train_time:135043ms step_avg:61.24ms +step:2206/2245 train_time:135104ms step_avg:61.24ms +step:2207/2245 train_time:135167ms step_avg:61.24ms +step:2208/2245 train_time:135227ms step_avg:61.24ms +step:2209/2245 train_time:135290ms step_avg:61.25ms +step:2210/2245 train_time:135352ms step_avg:61.25ms +step:2211/2245 train_time:135415ms step_avg:61.25ms +step:2212/2245 train_time:135475ms step_avg:61.25ms +step:2213/2245 train_time:135538ms step_avg:61.25ms +step:2214/2245 train_time:135599ms step_avg:61.25ms +step:2215/2245 train_time:135662ms step_avg:61.25ms +step:2216/2245 train_time:135722ms step_avg:61.25ms +step:2217/2245 train_time:135785ms step_avg:61.25ms +step:2218/2245 train_time:135845ms step_avg:61.25ms +step:2219/2245 train_time:135909ms step_avg:61.25ms +step:2220/2245 train_time:135970ms step_avg:61.25ms +step:2221/2245 train_time:136033ms step_avg:61.25ms +step:2222/2245 train_time:136094ms step_avg:61.25ms +step:2223/2245 train_time:136158ms step_avg:61.25ms +step:2224/2245 train_time:136217ms step_avg:61.25ms +step:2225/2245 train_time:136280ms step_avg:61.25ms +step:2226/2245 train_time:136341ms step_avg:61.25ms +step:2227/2245 train_time:136405ms step_avg:61.25ms +step:2228/2245 train_time:136466ms step_avg:61.25ms +step:2229/2245 train_time:136529ms step_avg:61.25ms +step:2230/2245 train_time:136589ms step_avg:61.25ms +step:2231/2245 train_time:136653ms step_avg:61.25ms +step:2232/2245 train_time:136713ms step_avg:61.25ms +step:2233/2245 train_time:136777ms step_avg:61.25ms +step:2234/2245 train_time:136837ms step_avg:61.25ms +step:2235/2245 train_time:136899ms step_avg:61.25ms +step:2236/2245 train_time:136960ms step_avg:61.25ms +step:2237/2245 train_time:137024ms step_avg:61.25ms +step:2238/2245 train_time:137085ms step_avg:61.25ms +step:2239/2245 train_time:137148ms step_avg:61.25ms +step:2240/2245 train_time:137210ms step_avg:61.25ms +step:2241/2245 train_time:137274ms step_avg:61.26ms +step:2242/2245 train_time:137335ms step_avg:61.26ms +step:2243/2245 train_time:137398ms step_avg:61.26ms +step:2244/2245 train_time:137459ms step_avg:61.26ms +step:2245/2245 train_time:137521ms step_avg:61.26ms +step:2245/2245 val_loss:3.2784 train_time:137582ms step_avg:61.28ms +peak memory allocated: 29248 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/1f62a34f-fb60-4228-bd77-639ac781809f.txt b/records/track_1_short/2025-11-10_CautiousWD/1f62a34f-fb60-4228-bd77-639ac781809f.txt new file mode 100644 index 000000000..01c4c9858 --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/1f62a34f-fb60-4228-bd77-639ac781809f.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 
30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of 
size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), 
+ c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if 
out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + + Differences from standard Muon: + - Newton-Schulz is replaced with Polar Express for the orthogonalization step + - NorMuon adds a low-rank variance estimator similar to Adafactor. + - small 1D parameters handled here instead of in Adam + - Cautious weight decay, a gated version of decoupled weight decay + - Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param, 7 padding params) + 2. reduce scatter attn_gate (10 params, 6 padding params) + 3. 
reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. + """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. 
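+ # Worked example of the sharding arithmetic below (derived from the group_sizes above, nothing new is computed): + # with world_size=8 and the custom group_sizes [1, 10, 16, 16], chunk_size = ceil(size / 8) gives per-rank + # chunks of [1, 2, 2, 2]; e.g. the 10 attn_gate params are zero-padded to 16 slots so reduce_scatter_tensor + # hands every rank a uniform (2, *shape) gradient chunk, rank r then updates its local slice + # params[r*chunk_size : r*chunk_size + num_params], and all_gather_into_tensor reassembles the full stack.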
+ rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. + for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine LR and WD + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call. 
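+ # Each polar_express pass applies the quintic map X <- a*X + (b*A + c*A@A) @ X with A = X @ X.T, + # pushing the singular values of X toward 1; after the five coefficient triples above, the result + # approximates the semi-orthogonal polar factor U @ V.T (the "zeropower") of the momentum-averaged gradient.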
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
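+ # Worked example: next_multiple_of_n(50257, n=128) == 50304 (= 128 * 393), so the embeddings and + # lm_head carry 47 padding rows that never appear as targets but keep every matmul dimension 128-aligned.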
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a beginning-of-sequence (BOS) token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0  # for the unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted; load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1  # the last document carried one extra token for the _targets offset; trim its recorded length
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local)
+            _targets = buf[1:].view(num_tokens_local)
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # lets the caller pass new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
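+
+# [editor's sketch, not part of the original run] the generator is driven with next(); callers can
+# also resize batches mid-run via .send(), which the training loop below does not need, e.g.:
+#   loader = distributed_data_generator("data/fineweb10B/fineweb_train_*.bin", 8 * 64 * 1024, 2048)
+#   inputs, targets, cum_seqlens = next(loader)   # one micro-batch of (inputs, targets, cum_seqlens)
+#   loader.send((16 * 64 * 1024, 4096, 1))        # new (num_tokens, max_seq_len, grad_accum_steps)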
+
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
+    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
+    train_batch_size: int = 2048 * 16 * 8
+    train_max_seq_len: int = 128 * 16
+    val_batch_size: int = 4 * 64 * 1024 * 8
+    # optimization
+    num_scheduled_iterations: int = 2205  # number of steps to complete the lr and ws schedules
+    num_extension_iterations: int = 40  # number of steps to continue training at the final lr and ws
+    num_iterations: int = num_scheduled_iterations + num_extension_iterations
+    cooldown_frac: float = 0.50  # fraction of num_scheduled_iterations spent cooling down the learning rate
+    # evaluation and logging
+    run_id: str = f"{uuid.uuid4()}"
+    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
+    save_checkpoint: bool = False
+    # attention masking
+    block_size: int = 128
+    ws_schedule: tuple = (3, 7, 11)
+    ws_final: int = 13  # increase final validation ws, used for YaRN extension and short window size @classiclarryd
+    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN
+
+args = Hyperparameters()
+
+data_path = os.environ.get("DATA_PATH", ".")
+args.train_files = os.path.join(data_path, args.train_files)
+args.val_files = os.path.join(data_path, args.val_files)
+
+# torchrun sets these env variables
+rank = int(os.environ["RANK"])
+world_size = int(os.environ["WORLD_SIZE"])
+assert 8 % world_size == 0, "world_size must be a divisor of 8"
+grad_accum_steps = 8 // world_size
+assert torch.cuda.is_available()
+device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
+torch.cuda.set_device(device)
+dist.init_process_group(backend="nccl", device_id=device)
+dist.barrier()
+master_process = (rank == 0)  # this process will do logging, checkpointing etc.
+
+# begin logging
+logfile = None
+if master_process:
+    run_id = args.run_id
+    os.makedirs("logs", exist_ok=True)
+    logfile = f"logs/{run_id}.txt"
+    print(logfile)
+def print0(s, console=False):
+    if master_process:
+        with open(logfile, "a") as f:
+            if console:
+                print(s)
+            print(s, file=f)
+
+# begin by printing this file (the Python code)
+print0(code)
+print0("="*100)
+# log information about the hardware/software environment this is running on
+print0(f"Running Python {sys.version}")
+print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
+print0(f"Running Triton version {triton.__version__}")
+
+def nvidia_smi():
+    import subprocess  # avoid top level import
+    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
+print0(nvidia_smi())
+print0("="*100)
+
+model: nn.Module = GPT(
+    vocab_size=50257,
+    num_layers=12,
+    num_heads=6,
+    head_dim=128,
+    model_dim=768,
+    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
+).cuda()
+for m in model.modules():
+    if isinstance(m, (nn.Embedding, nn.Linear)):
+        m.bfloat16()
+for param in model.parameters():
+    dist.broadcast(param.detach(), 0)
+
+# collect the parameters to optimize
+hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
+embed_params = [p for n, p in model.named_parameters() if "embed" in n]
+scalar_params = [p for p in model.parameters() if p.ndim < 2]
+head_params = [model.lm_head.weight]
+gate_params = [p for n, p in model.named_parameters() if "gate" in n]
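+
+# [editor's sketch, not part of the original run] one way to sanity-check that the five groups above
+# partition the model's parameters, i.e. each tensor is trained by exactly one optimizer below:
+def _check_param_partition():
+    grouped = hidden_matrix_params + embed_params + scalar_params + head_params + gate_params
+    assert len(grouped) == len({id(p) for p in grouped}) == sum(1 for _ in model.parameters())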
+
+# init the optimizer(s)
+# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+# learning rate schedule: flat, then linear decay to 0.1x, then flat
+def get_lr(step: int):
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
+    return lr
+
+def get_ws(step: int):
+    # set the short window size to half of the long window size
+    # higher ws on "extension" steps
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set Muon momentum based on the step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only step the Muon params
+    # on odd steps, step all params
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
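+
+# [editor's sketch, not part of the original run] spot-checking the schedules with the values
+# configured above (num_scheduled_iterations=2205, cooldown_frac=0.5, ws_schedule=(3, 7, 11), ws_final=13):
+def _schedule_demo():
+    assert abs(get_lr(0) - 1.0) < 1e-6      # flat phase at the full lr
+    assert abs(get_lr(1764) - 0.46) < 1e-6  # 80% through: w = 0.4, lr = 0.4*1.0 + 0.6*0.1
+    assert abs(get_lr(10**9) - 0.1) < 1e-3  # extension steps sit just above the 0.1 floor
+    assert get_ws(0) == (1, 3) and get_ws(2204) == (5, 11) and get_ws(2205) == (6, 13)
+    assert get_muon_momentum(0) == 0.85 and get_muon_momentum(1000) == 0.95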
+
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph; need to warm up each one with its Yarn.attn_scale
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # rotary buffer is not stored in state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # muon momentum buffers are not in the state dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#       Training and validation        #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 21:48:07 2025
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 41C P0 130W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 33C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 40C P0 130W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:120ms step_avg:119.72ms +step:2/2245 train_time:141ms step_avg:70.45ms +step:3/2245 train_time:179ms step_avg:59.78ms +step:4/2245 train_time:236ms step_avg:58.95ms +step:5/2245 train_time:295ms step_avg:59.06ms +step:6/2245 train_time:354ms step_avg:58.94ms +step:7/2245 train_time:415ms step_avg:59.28ms +step:8/2245 train_time:474ms step_avg:59.19ms +step:9/2245 train_time:534ms step_avg:59.39ms +step:10/2245 train_time:593ms step_avg:59.32ms +step:11/2245 train_time:654ms step_avg:59.49ms +step:12/2245 train_time:713ms step_avg:59.43ms +step:13/2245 train_time:775ms step_avg:59.60ms +step:14/2245 train_time:833ms step_avg:59.53ms 
+step:15/2245 train_time:895ms step_avg:59.65ms +step:16/2245 train_time:954ms step_avg:59.63ms +step:17/2245 train_time:1019ms step_avg:59.94ms +step:18/2245 train_time:1084ms step_avg:60.21ms +step:19/2245 train_time:1150ms step_avg:60.50ms +step:20/2245 train_time:1209ms step_avg:60.44ms +step:21/2245 train_time:1272ms step_avg:60.56ms +step:22/2245 train_time:1331ms step_avg:60.51ms +step:23/2245 train_time:1393ms step_avg:60.56ms +step:24/2245 train_time:1452ms step_avg:60.49ms +step:25/2245 train_time:1513ms step_avg:60.53ms +step:26/2245 train_time:1573ms step_avg:60.49ms +step:27/2245 train_time:1634ms step_avg:60.51ms +step:28/2245 train_time:1692ms step_avg:60.44ms +step:29/2245 train_time:1754ms step_avg:60.48ms +step:30/2245 train_time:1813ms step_avg:60.43ms +step:31/2245 train_time:1874ms step_avg:60.46ms +step:32/2245 train_time:1934ms step_avg:60.44ms +step:33/2245 train_time:1997ms step_avg:60.52ms +step:34/2245 train_time:2059ms step_avg:60.56ms +step:35/2245 train_time:2123ms step_avg:60.67ms +step:36/2245 train_time:2184ms step_avg:60.66ms +step:37/2245 train_time:2248ms step_avg:60.77ms +step:38/2245 train_time:2306ms step_avg:60.69ms +step:39/2245 train_time:2368ms step_avg:60.72ms +step:40/2245 train_time:2427ms step_avg:60.69ms +step:41/2245 train_time:2489ms step_avg:60.71ms +step:42/2245 train_time:2548ms step_avg:60.67ms +step:43/2245 train_time:2609ms step_avg:60.68ms +step:44/2245 train_time:2668ms step_avg:60.63ms +step:45/2245 train_time:2730ms step_avg:60.66ms +step:46/2245 train_time:2788ms step_avg:60.62ms +step:47/2245 train_time:2850ms step_avg:60.64ms +step:48/2245 train_time:2909ms step_avg:60.61ms +step:49/2245 train_time:2972ms step_avg:60.65ms +step:50/2245 train_time:3032ms step_avg:60.63ms +step:51/2245 train_time:3094ms step_avg:60.67ms +step:52/2245 train_time:3154ms step_avg:60.66ms +step:53/2245 train_time:3217ms step_avg:60.70ms +step:54/2245 train_time:3277ms step_avg:60.68ms +step:55/2245 train_time:3339ms step_avg:60.71ms +step:56/2245 train_time:3398ms step_avg:60.68ms +step:57/2245 train_time:3461ms step_avg:60.71ms +step:58/2245 train_time:3520ms step_avg:60.69ms +step:59/2245 train_time:3582ms step_avg:60.71ms +step:60/2245 train_time:3641ms step_avg:60.69ms +step:61/2245 train_time:3703ms step_avg:60.71ms +step:62/2245 train_time:3762ms step_avg:60.68ms +step:63/2245 train_time:3825ms step_avg:60.72ms +step:64/2245 train_time:3885ms step_avg:60.70ms +step:65/2245 train_time:3947ms step_avg:60.72ms +step:66/2245 train_time:4006ms step_avg:60.69ms +step:67/2245 train_time:4068ms step_avg:60.72ms +step:68/2245 train_time:4129ms step_avg:60.71ms +step:69/2245 train_time:4190ms step_avg:60.73ms +step:70/2245 train_time:4250ms step_avg:60.71ms +step:71/2245 train_time:4311ms step_avg:60.72ms +step:72/2245 train_time:4370ms step_avg:60.70ms +step:73/2245 train_time:4434ms step_avg:60.74ms +step:74/2245 train_time:4493ms step_avg:60.71ms +step:75/2245 train_time:4555ms step_avg:60.73ms +step:76/2245 train_time:4614ms step_avg:60.71ms +step:77/2245 train_time:4676ms step_avg:60.73ms +step:78/2245 train_time:4735ms step_avg:60.71ms +step:79/2245 train_time:4798ms step_avg:60.74ms +step:80/2245 train_time:4858ms step_avg:60.73ms +step:81/2245 train_time:4920ms step_avg:60.73ms +step:82/2245 train_time:4979ms step_avg:60.72ms +step:83/2245 train_time:5042ms step_avg:60.75ms +step:84/2245 train_time:5102ms step_avg:60.74ms +step:85/2245 train_time:5165ms step_avg:60.77ms +step:86/2245 train_time:5223ms step_avg:60.73ms +step:87/2245 
train_time:5285ms step_avg:60.75ms +step:88/2245 train_time:5345ms step_avg:60.74ms +step:89/2245 train_time:5407ms step_avg:60.75ms +step:90/2245 train_time:5466ms step_avg:60.73ms +step:91/2245 train_time:5528ms step_avg:60.74ms +step:92/2245 train_time:5586ms step_avg:60.72ms +step:93/2245 train_time:5648ms step_avg:60.73ms +step:94/2245 train_time:5706ms step_avg:60.71ms +step:95/2245 train_time:5768ms step_avg:60.72ms +step:96/2245 train_time:5827ms step_avg:60.70ms +step:97/2245 train_time:5889ms step_avg:60.71ms +step:98/2245 train_time:5948ms step_avg:60.70ms +step:99/2245 train_time:6010ms step_avg:60.71ms +step:100/2245 train_time:6070ms step_avg:60.70ms +step:101/2245 train_time:6132ms step_avg:60.72ms +step:102/2245 train_time:6192ms step_avg:60.71ms +step:103/2245 train_time:6254ms step_avg:60.72ms +step:104/2245 train_time:6313ms step_avg:60.71ms +step:105/2245 train_time:6375ms step_avg:60.72ms +step:106/2245 train_time:6435ms step_avg:60.71ms +step:107/2245 train_time:6498ms step_avg:60.72ms +step:108/2245 train_time:6557ms step_avg:60.71ms +step:109/2245 train_time:6619ms step_avg:60.72ms +step:110/2245 train_time:6678ms step_avg:60.71ms +step:111/2245 train_time:6740ms step_avg:60.72ms +step:112/2245 train_time:6799ms step_avg:60.70ms +step:113/2245 train_time:6860ms step_avg:60.71ms +step:114/2245 train_time:6919ms step_avg:60.69ms +step:115/2245 train_time:6982ms step_avg:60.71ms +step:116/2245 train_time:7042ms step_avg:60.71ms +step:117/2245 train_time:7104ms step_avg:60.72ms +step:118/2245 train_time:7164ms step_avg:60.71ms +step:119/2245 train_time:7225ms step_avg:60.72ms +step:120/2245 train_time:7284ms step_avg:60.70ms +step:121/2245 train_time:7346ms step_avg:60.71ms +step:122/2245 train_time:7406ms step_avg:60.70ms +step:123/2245 train_time:7467ms step_avg:60.71ms +step:124/2245 train_time:7527ms step_avg:60.70ms +step:125/2245 train_time:7588ms step_avg:60.71ms +step:126/2245 train_time:7647ms step_avg:60.69ms +step:127/2245 train_time:7708ms step_avg:60.70ms +step:128/2245 train_time:7767ms step_avg:60.68ms +step:129/2245 train_time:7828ms step_avg:60.69ms +step:130/2245 train_time:7887ms step_avg:60.67ms +step:131/2245 train_time:7949ms step_avg:60.68ms +step:132/2245 train_time:8009ms step_avg:60.67ms +step:133/2245 train_time:8071ms step_avg:60.68ms +step:134/2245 train_time:8131ms step_avg:60.68ms +step:135/2245 train_time:8193ms step_avg:60.69ms +step:136/2245 train_time:8252ms step_avg:60.68ms +step:137/2245 train_time:8315ms step_avg:60.69ms +step:138/2245 train_time:8374ms step_avg:60.68ms +step:139/2245 train_time:8436ms step_avg:60.69ms +step:140/2245 train_time:8496ms step_avg:60.68ms +step:141/2245 train_time:8558ms step_avg:60.69ms +step:142/2245 train_time:8617ms step_avg:60.69ms +step:143/2245 train_time:8680ms step_avg:60.70ms +step:144/2245 train_time:8740ms step_avg:60.69ms +step:145/2245 train_time:8802ms step_avg:60.70ms +step:146/2245 train_time:8861ms step_avg:60.69ms +step:147/2245 train_time:8922ms step_avg:60.69ms +step:148/2245 train_time:8982ms step_avg:60.69ms +step:149/2245 train_time:9044ms step_avg:60.70ms +step:150/2245 train_time:9104ms step_avg:60.69ms +step:151/2245 train_time:9165ms step_avg:60.70ms +step:152/2245 train_time:9225ms step_avg:60.69ms +step:153/2245 train_time:9286ms step_avg:60.70ms +step:154/2245 train_time:9346ms step_avg:60.69ms +step:155/2245 train_time:9408ms step_avg:60.70ms +step:156/2245 train_time:9467ms step_avg:60.68ms +step:157/2245 train_time:9528ms step_avg:60.69ms +step:158/2245 
train_time:9586ms step_avg:60.67ms +step:159/2245 train_time:9647ms step_avg:60.68ms +step:160/2245 train_time:9707ms step_avg:60.67ms +step:161/2245 train_time:9768ms step_avg:60.67ms +step:162/2245 train_time:9827ms step_avg:60.66ms +step:163/2245 train_time:9889ms step_avg:60.67ms +step:164/2245 train_time:9948ms step_avg:60.66ms +step:165/2245 train_time:10010ms step_avg:60.67ms +step:166/2245 train_time:10070ms step_avg:60.66ms +step:167/2245 train_time:10131ms step_avg:60.67ms +step:168/2245 train_time:10191ms step_avg:60.66ms +step:169/2245 train_time:10253ms step_avg:60.67ms +step:170/2245 train_time:10312ms step_avg:60.66ms +step:171/2245 train_time:10374ms step_avg:60.66ms +step:172/2245 train_time:10433ms step_avg:60.65ms +step:173/2245 train_time:10494ms step_avg:60.66ms +step:174/2245 train_time:10553ms step_avg:60.65ms +step:175/2245 train_time:10614ms step_avg:60.65ms +step:176/2245 train_time:10673ms step_avg:60.64ms +step:177/2245 train_time:10734ms step_avg:60.65ms +step:178/2245 train_time:10793ms step_avg:60.64ms +step:179/2245 train_time:10856ms step_avg:60.65ms +step:180/2245 train_time:10915ms step_avg:60.64ms +step:181/2245 train_time:10977ms step_avg:60.65ms +step:182/2245 train_time:11037ms step_avg:60.64ms +step:183/2245 train_time:11099ms step_avg:60.65ms +step:184/2245 train_time:11159ms step_avg:60.65ms +step:185/2245 train_time:11221ms step_avg:60.65ms +step:186/2245 train_time:11280ms step_avg:60.65ms +step:187/2245 train_time:11343ms step_avg:60.66ms +step:188/2245 train_time:11402ms step_avg:60.65ms +step:189/2245 train_time:11464ms step_avg:60.66ms +step:190/2245 train_time:11523ms step_avg:60.65ms +step:191/2245 train_time:11586ms step_avg:60.66ms +step:192/2245 train_time:11644ms step_avg:60.65ms +step:193/2245 train_time:11706ms step_avg:60.65ms +step:194/2245 train_time:11765ms step_avg:60.64ms +step:195/2245 train_time:11826ms step_avg:60.64ms +step:196/2245 train_time:11885ms step_avg:60.64ms +step:197/2245 train_time:11946ms step_avg:60.64ms +step:198/2245 train_time:12005ms step_avg:60.63ms +step:199/2245 train_time:12067ms step_avg:60.64ms +step:200/2245 train_time:12126ms step_avg:60.63ms +step:201/2245 train_time:12188ms step_avg:60.64ms +step:202/2245 train_time:12247ms step_avg:60.63ms +step:203/2245 train_time:12308ms step_avg:60.63ms +step:204/2245 train_time:12367ms step_avg:60.62ms +step:205/2245 train_time:12429ms step_avg:60.63ms +step:206/2245 train_time:12488ms step_avg:60.62ms +step:207/2245 train_time:12549ms step_avg:60.63ms +step:208/2245 train_time:12608ms step_avg:60.62ms +step:209/2245 train_time:12670ms step_avg:60.62ms +step:210/2245 train_time:12729ms step_avg:60.61ms +step:211/2245 train_time:12790ms step_avg:60.62ms +step:212/2245 train_time:12849ms step_avg:60.61ms +step:213/2245 train_time:12910ms step_avg:60.61ms +step:214/2245 train_time:12969ms step_avg:60.60ms +step:215/2245 train_time:13030ms step_avg:60.60ms +step:216/2245 train_time:13089ms step_avg:60.60ms +step:217/2245 train_time:13151ms step_avg:60.60ms +step:218/2245 train_time:13209ms step_avg:60.59ms +step:219/2245 train_time:13270ms step_avg:60.60ms +step:220/2245 train_time:13330ms step_avg:60.59ms +step:221/2245 train_time:13392ms step_avg:60.60ms +step:222/2245 train_time:13451ms step_avg:60.59ms +step:223/2245 train_time:13513ms step_avg:60.59ms +step:224/2245 train_time:13572ms step_avg:60.59ms +step:225/2245 train_time:13634ms step_avg:60.60ms +step:226/2245 train_time:13693ms step_avg:60.59ms +step:227/2245 train_time:13755ms step_avg:60.59ms 
+step:228/2245 train_time:13814ms step_avg:60.59ms +step:229/2245 train_time:13875ms step_avg:60.59ms +step:230/2245 train_time:13934ms step_avg:60.58ms +step:231/2245 train_time:13996ms step_avg:60.59ms +step:232/2245 train_time:14055ms step_avg:60.58ms +step:233/2245 train_time:14117ms step_avg:60.59ms +step:234/2245 train_time:14177ms step_avg:60.58ms +step:235/2245 train_time:14238ms step_avg:60.59ms +step:236/2245 train_time:14298ms step_avg:60.59ms +step:237/2245 train_time:14360ms step_avg:60.59ms +step:238/2245 train_time:14420ms step_avg:60.59ms +step:239/2245 train_time:14482ms step_avg:60.59ms +step:240/2245 train_time:14541ms step_avg:60.59ms +step:241/2245 train_time:14603ms step_avg:60.59ms +step:242/2245 train_time:14662ms step_avg:60.59ms +step:243/2245 train_time:14724ms step_avg:60.59ms +step:244/2245 train_time:14783ms step_avg:60.59ms +step:245/2245 train_time:14845ms step_avg:60.59ms +step:246/2245 train_time:14905ms step_avg:60.59ms +step:247/2245 train_time:14966ms step_avg:60.59ms +step:248/2245 train_time:15025ms step_avg:60.58ms +step:249/2245 train_time:15087ms step_avg:60.59ms +step:250/2245 train_time:15146ms step_avg:60.59ms +step:250/2245 val_loss:4.0677 train_time:15208ms step_avg:60.83ms +step:251/2245 train_time:15227ms step_avg:60.67ms +step:252/2245 train_time:15269ms step_avg:60.59ms +step:253/2245 train_time:15336ms step_avg:60.61ms +step:254/2245 train_time:15397ms step_avg:60.62ms +step:255/2245 train_time:15460ms step_avg:60.63ms +step:256/2245 train_time:15519ms step_avg:60.62ms +step:257/2245 train_time:15580ms step_avg:60.62ms +step:258/2245 train_time:15638ms step_avg:60.61ms +step:259/2245 train_time:15699ms step_avg:60.61ms +step:260/2245 train_time:15758ms step_avg:60.61ms +step:261/2245 train_time:15819ms step_avg:60.61ms +step:262/2245 train_time:15877ms step_avg:60.60ms +step:263/2245 train_time:15937ms step_avg:60.60ms +step:264/2245 train_time:15995ms step_avg:60.59ms +step:265/2245 train_time:16056ms step_avg:60.59ms +step:266/2245 train_time:16116ms step_avg:60.59ms +step:267/2245 train_time:16179ms step_avg:60.60ms +step:268/2245 train_time:16240ms step_avg:60.60ms +step:269/2245 train_time:16303ms step_avg:60.61ms +step:270/2245 train_time:16363ms step_avg:60.61ms +step:271/2245 train_time:16426ms step_avg:60.61ms +step:272/2245 train_time:16486ms step_avg:60.61ms +step:273/2245 train_time:16547ms step_avg:60.61ms +step:274/2245 train_time:16605ms step_avg:60.60ms +step:275/2245 train_time:16666ms step_avg:60.61ms +step:276/2245 train_time:16725ms step_avg:60.60ms +step:277/2245 train_time:16786ms step_avg:60.60ms +step:278/2245 train_time:16845ms step_avg:60.59ms +step:279/2245 train_time:16906ms step_avg:60.59ms +step:280/2245 train_time:16964ms step_avg:60.59ms +step:281/2245 train_time:17026ms step_avg:60.59ms +step:282/2245 train_time:17086ms step_avg:60.59ms +step:283/2245 train_time:17149ms step_avg:60.60ms +step:284/2245 train_time:17208ms step_avg:60.59ms +step:285/2245 train_time:17271ms step_avg:60.60ms +step:286/2245 train_time:17330ms step_avg:60.59ms +step:287/2245 train_time:17392ms step_avg:60.60ms +step:288/2245 train_time:17451ms step_avg:60.59ms +step:289/2245 train_time:17512ms step_avg:60.60ms +step:290/2245 train_time:17571ms step_avg:60.59ms +step:291/2245 train_time:17632ms step_avg:60.59ms +step:292/2245 train_time:17691ms step_avg:60.58ms +step:293/2245 train_time:17752ms step_avg:60.59ms +step:294/2245 train_time:17811ms step_avg:60.58ms +step:295/2245 train_time:17873ms step_avg:60.59ms +step:296/2245 
train_time:17932ms step_avg:60.58ms +step:297/2245 train_time:17993ms step_avg:60.58ms +step:298/2245 train_time:18053ms step_avg:60.58ms +step:299/2245 train_time:18114ms step_avg:60.58ms +step:300/2245 train_time:18173ms step_avg:60.58ms +step:301/2245 train_time:18235ms step_avg:60.58ms +step:302/2245 train_time:18294ms step_avg:60.58ms +step:303/2245 train_time:18356ms step_avg:60.58ms +step:304/2245 train_time:18415ms step_avg:60.58ms +step:305/2245 train_time:18476ms step_avg:60.58ms +step:306/2245 train_time:18534ms step_avg:60.57ms +step:307/2245 train_time:18596ms step_avg:60.57ms +step:308/2245 train_time:18655ms step_avg:60.57ms +step:309/2245 train_time:18717ms step_avg:60.57ms +step:310/2245 train_time:18776ms step_avg:60.57ms +step:311/2245 train_time:18837ms step_avg:60.57ms +step:312/2245 train_time:18895ms step_avg:60.56ms +step:313/2245 train_time:18957ms step_avg:60.56ms +step:314/2245 train_time:19016ms step_avg:60.56ms +step:315/2245 train_time:19077ms step_avg:60.56ms +step:316/2245 train_time:19136ms step_avg:60.56ms +step:317/2245 train_time:19198ms step_avg:60.56ms +step:318/2245 train_time:19257ms step_avg:60.56ms +step:319/2245 train_time:19319ms step_avg:60.56ms +step:320/2245 train_time:19378ms step_avg:60.55ms +step:321/2245 train_time:19439ms step_avg:60.56ms +step:322/2245 train_time:19498ms step_avg:60.55ms +step:323/2245 train_time:19560ms step_avg:60.56ms +step:324/2245 train_time:19620ms step_avg:60.56ms +step:325/2245 train_time:19681ms step_avg:60.56ms +step:326/2245 train_time:19740ms step_avg:60.55ms +step:327/2245 train_time:19801ms step_avg:60.55ms +step:328/2245 train_time:19860ms step_avg:60.55ms +step:329/2245 train_time:19922ms step_avg:60.55ms +step:330/2245 train_time:19981ms step_avg:60.55ms +step:331/2245 train_time:20043ms step_avg:60.55ms +step:332/2245 train_time:20102ms step_avg:60.55ms +step:333/2245 train_time:20164ms step_avg:60.55ms +step:334/2245 train_time:20223ms step_avg:60.55ms +step:335/2245 train_time:20284ms step_avg:60.55ms +step:336/2245 train_time:20343ms step_avg:60.54ms +step:337/2245 train_time:20405ms step_avg:60.55ms +step:338/2245 train_time:20465ms step_avg:60.55ms +step:339/2245 train_time:20526ms step_avg:60.55ms +step:340/2245 train_time:20585ms step_avg:60.55ms +step:341/2245 train_time:20647ms step_avg:60.55ms +step:342/2245 train_time:20707ms step_avg:60.55ms +step:343/2245 train_time:20768ms step_avg:60.55ms +step:344/2245 train_time:20828ms step_avg:60.55ms +step:345/2245 train_time:20889ms step_avg:60.55ms +step:346/2245 train_time:20949ms step_avg:60.55ms +step:347/2245 train_time:21010ms step_avg:60.55ms +step:348/2245 train_time:21069ms step_avg:60.54ms +step:349/2245 train_time:21130ms step_avg:60.54ms +step:350/2245 train_time:21188ms step_avg:60.54ms +step:351/2245 train_time:21250ms step_avg:60.54ms +step:352/2245 train_time:21308ms step_avg:60.54ms +step:353/2245 train_time:21370ms step_avg:60.54ms +step:354/2245 train_time:21429ms step_avg:60.53ms +step:355/2245 train_time:21490ms step_avg:60.54ms +step:356/2245 train_time:21549ms step_avg:60.53ms +step:357/2245 train_time:21610ms step_avg:60.53ms +step:358/2245 train_time:21669ms step_avg:60.53ms +step:359/2245 train_time:21730ms step_avg:60.53ms +step:360/2245 train_time:21789ms step_avg:60.52ms +step:361/2245 train_time:21851ms step_avg:60.53ms +step:362/2245 train_time:21909ms step_avg:60.52ms +step:363/2245 train_time:21970ms step_avg:60.52ms +step:364/2245 train_time:22029ms step_avg:60.52ms +step:365/2245 train_time:22091ms step_avg:60.52ms 
+step:366/2245 train_time:22150ms step_avg:60.52ms +step:367/2245 train_time:22211ms step_avg:60.52ms +step:368/2245 train_time:22270ms step_avg:60.52ms +step:369/2245 train_time:22332ms step_avg:60.52ms +step:370/2245 train_time:22391ms step_avg:60.52ms +step:371/2245 train_time:22452ms step_avg:60.52ms +step:372/2245 train_time:22511ms step_avg:60.51ms +step:373/2245 train_time:22572ms step_avg:60.51ms +step:374/2245 train_time:22630ms step_avg:60.51ms +step:375/2245 train_time:22692ms step_avg:60.51ms +step:376/2245 train_time:22750ms step_avg:60.51ms +step:377/2245 train_time:22812ms step_avg:60.51ms +step:378/2245 train_time:22871ms step_avg:60.51ms +step:379/2245 train_time:22932ms step_avg:60.51ms +step:380/2245 train_time:22991ms step_avg:60.50ms +step:381/2245 train_time:23053ms step_avg:60.51ms +step:382/2245 train_time:23112ms step_avg:60.50ms +step:383/2245 train_time:23173ms step_avg:60.50ms +step:384/2245 train_time:23232ms step_avg:60.50ms +step:385/2245 train_time:23293ms step_avg:60.50ms +step:386/2245 train_time:23352ms step_avg:60.50ms +step:387/2245 train_time:23413ms step_avg:60.50ms +step:388/2245 train_time:23472ms step_avg:60.50ms +step:389/2245 train_time:23533ms step_avg:60.50ms +step:390/2245 train_time:23593ms step_avg:60.49ms +step:391/2245 train_time:23654ms step_avg:60.50ms +step:392/2245 train_time:23712ms step_avg:60.49ms +step:393/2245 train_time:23774ms step_avg:60.49ms +step:394/2245 train_time:23832ms step_avg:60.49ms +step:395/2245 train_time:23894ms step_avg:60.49ms +step:396/2245 train_time:23953ms step_avg:60.49ms +step:397/2245 train_time:24015ms step_avg:60.49ms +step:398/2245 train_time:24074ms step_avg:60.49ms +step:399/2245 train_time:24136ms step_avg:60.49ms +step:400/2245 train_time:24194ms step_avg:60.49ms +step:401/2245 train_time:24256ms step_avg:60.49ms +step:402/2245 train_time:24315ms step_avg:60.48ms +step:403/2245 train_time:24376ms step_avg:60.49ms +step:404/2245 train_time:24434ms step_avg:60.48ms +step:405/2245 train_time:24495ms step_avg:60.48ms +step:406/2245 train_time:24554ms step_avg:60.48ms +step:407/2245 train_time:24615ms step_avg:60.48ms +step:408/2245 train_time:24674ms step_avg:60.48ms +step:409/2245 train_time:24735ms step_avg:60.48ms +step:410/2245 train_time:24794ms step_avg:60.47ms +step:411/2245 train_time:24856ms step_avg:60.48ms +step:412/2245 train_time:24914ms step_avg:60.47ms +step:413/2245 train_time:24975ms step_avg:60.47ms +step:414/2245 train_time:25035ms step_avg:60.47ms +step:415/2245 train_time:25096ms step_avg:60.47ms +step:416/2245 train_time:25155ms step_avg:60.47ms +step:417/2245 train_time:25217ms step_avg:60.47ms +step:418/2245 train_time:25276ms step_avg:60.47ms +step:419/2245 train_time:25338ms step_avg:60.47ms +step:420/2245 train_time:25396ms step_avg:60.47ms +step:421/2245 train_time:25457ms step_avg:60.47ms +step:422/2245 train_time:25516ms step_avg:60.47ms +step:423/2245 train_time:25578ms step_avg:60.47ms +step:424/2245 train_time:25637ms step_avg:60.46ms +step:425/2245 train_time:25698ms step_avg:60.47ms +step:426/2245 train_time:25757ms step_avg:60.46ms +step:427/2245 train_time:25819ms step_avg:60.46ms +step:428/2245 train_time:25877ms step_avg:60.46ms +step:429/2245 train_time:25939ms step_avg:60.46ms +step:430/2245 train_time:25997ms step_avg:60.46ms +step:431/2245 train_time:26059ms step_avg:60.46ms +step:432/2245 train_time:26118ms step_avg:60.46ms +step:433/2245 train_time:26180ms step_avg:60.46ms +step:434/2245 train_time:26238ms step_avg:60.46ms +step:435/2245 train_time:26300ms 
step_avg:60.46ms +step:436/2245 train_time:26359ms step_avg:60.46ms +step:437/2245 train_time:26421ms step_avg:60.46ms +step:438/2245 train_time:26480ms step_avg:60.46ms +step:439/2245 train_time:26542ms step_avg:60.46ms +step:440/2245 train_time:26601ms step_avg:60.46ms +step:441/2245 train_time:26663ms step_avg:60.46ms +step:442/2245 train_time:26723ms step_avg:60.46ms +step:443/2245 train_time:26784ms step_avg:60.46ms +step:444/2245 train_time:26842ms step_avg:60.46ms +step:445/2245 train_time:26905ms step_avg:60.46ms +step:446/2245 train_time:26965ms step_avg:60.46ms +step:447/2245 train_time:27027ms step_avg:60.46ms +step:448/2245 train_time:27087ms step_avg:60.46ms +step:449/2245 train_time:27148ms step_avg:60.46ms +step:450/2245 train_time:27208ms step_avg:60.46ms +step:451/2245 train_time:27271ms step_avg:60.47ms +step:452/2245 train_time:27330ms step_avg:60.46ms +step:453/2245 train_time:27391ms step_avg:60.47ms +step:454/2245 train_time:27450ms step_avg:60.46ms +step:455/2245 train_time:27511ms step_avg:60.46ms +step:456/2245 train_time:27570ms step_avg:60.46ms +step:457/2245 train_time:27631ms step_avg:60.46ms +step:458/2245 train_time:27690ms step_avg:60.46ms +step:459/2245 train_time:27751ms step_avg:60.46ms +step:460/2245 train_time:27810ms step_avg:60.46ms +step:461/2245 train_time:27871ms step_avg:60.46ms +step:462/2245 train_time:27931ms step_avg:60.46ms +step:463/2245 train_time:27992ms step_avg:60.46ms +step:464/2245 train_time:28051ms step_avg:60.45ms +step:465/2245 train_time:28112ms step_avg:60.46ms +step:466/2245 train_time:28171ms step_avg:60.45ms +step:467/2245 train_time:28233ms step_avg:60.46ms +step:468/2245 train_time:28292ms step_avg:60.45ms +step:469/2245 train_time:28353ms step_avg:60.45ms +step:470/2245 train_time:28412ms step_avg:60.45ms +step:471/2245 train_time:28473ms step_avg:60.45ms +step:472/2245 train_time:28532ms step_avg:60.45ms +step:473/2245 train_time:28593ms step_avg:60.45ms +step:474/2245 train_time:28652ms step_avg:60.45ms +step:475/2245 train_time:28714ms step_avg:60.45ms +step:476/2245 train_time:28773ms step_avg:60.45ms +step:477/2245 train_time:28834ms step_avg:60.45ms +step:478/2245 train_time:28893ms step_avg:60.45ms +step:479/2245 train_time:28955ms step_avg:60.45ms +step:480/2245 train_time:29014ms step_avg:60.45ms +step:481/2245 train_time:29076ms step_avg:60.45ms +step:482/2245 train_time:29135ms step_avg:60.45ms +step:483/2245 train_time:29197ms step_avg:60.45ms +step:484/2245 train_time:29256ms step_avg:60.45ms +step:485/2245 train_time:29318ms step_avg:60.45ms +step:486/2245 train_time:29377ms step_avg:60.45ms +step:487/2245 train_time:29438ms step_avg:60.45ms +step:488/2245 train_time:29500ms step_avg:60.45ms +step:489/2245 train_time:29559ms step_avg:60.45ms +step:490/2245 train_time:29618ms step_avg:60.45ms +step:491/2245 train_time:29680ms step_avg:60.45ms +step:492/2245 train_time:29740ms step_avg:60.45ms +step:493/2245 train_time:29801ms step_avg:60.45ms +step:494/2245 train_time:29860ms step_avg:60.45ms +step:495/2245 train_time:29922ms step_avg:60.45ms +step:496/2245 train_time:29981ms step_avg:60.45ms +step:497/2245 train_time:30043ms step_avg:60.45ms +step:498/2245 train_time:30103ms step_avg:60.45ms +step:499/2245 train_time:30165ms step_avg:60.45ms +step:500/2245 train_time:30224ms step_avg:60.45ms +step:500/2245 val_loss:3.8138 train_time:30287ms step_avg:60.57ms +step:501/2245 train_time:30306ms step_avg:60.49ms +step:502/2245 train_time:30347ms step_avg:60.45ms +step:503/2245 train_time:30411ms step_avg:60.46ms 
+step:504/2245 train_time:30473ms step_avg:60.46ms +step:505/2245 train_time:30535ms step_avg:60.47ms +step:506/2245 train_time:30593ms step_avg:60.46ms +step:507/2245 train_time:30654ms step_avg:60.46ms +step:508/2245 train_time:30712ms step_avg:60.46ms +step:509/2245 train_time:30773ms step_avg:60.46ms +step:510/2245 train_time:30831ms step_avg:60.45ms +step:511/2245 train_time:30892ms step_avg:60.45ms +step:512/2245 train_time:30951ms step_avg:60.45ms +step:513/2245 train_time:31011ms step_avg:60.45ms +step:514/2245 train_time:31070ms step_avg:60.45ms +step:515/2245 train_time:31131ms step_avg:60.45ms +step:516/2245 train_time:31190ms step_avg:60.45ms +step:517/2245 train_time:31252ms step_avg:60.45ms +step:518/2245 train_time:31313ms step_avg:60.45ms +step:519/2245 train_time:31376ms step_avg:60.46ms +step:520/2245 train_time:31438ms step_avg:60.46ms +step:521/2245 train_time:31497ms step_avg:60.46ms +step:522/2245 train_time:31556ms step_avg:60.45ms +step:523/2245 train_time:31618ms step_avg:60.45ms +step:524/2245 train_time:31676ms step_avg:60.45ms +step:525/2245 train_time:31737ms step_avg:60.45ms +step:526/2245 train_time:31796ms step_avg:60.45ms +step:527/2245 train_time:31858ms step_avg:60.45ms +step:528/2245 train_time:31917ms step_avg:60.45ms +step:529/2245 train_time:31978ms step_avg:60.45ms +step:530/2245 train_time:32037ms step_avg:60.45ms +step:531/2245 train_time:32098ms step_avg:60.45ms +step:532/2245 train_time:32158ms step_avg:60.45ms +step:533/2245 train_time:32219ms step_avg:60.45ms +step:534/2245 train_time:32279ms step_avg:60.45ms +step:535/2245 train_time:32342ms step_avg:60.45ms +step:536/2245 train_time:32401ms step_avg:60.45ms +step:537/2245 train_time:32462ms step_avg:60.45ms +step:538/2245 train_time:32522ms step_avg:60.45ms +step:539/2245 train_time:32583ms step_avg:60.45ms +step:540/2245 train_time:32643ms step_avg:60.45ms +step:541/2245 train_time:32704ms step_avg:60.45ms +step:542/2245 train_time:32763ms step_avg:60.45ms +step:543/2245 train_time:32825ms step_avg:60.45ms +step:544/2245 train_time:32883ms step_avg:60.45ms +step:545/2245 train_time:32946ms step_avg:60.45ms +step:546/2245 train_time:33005ms step_avg:60.45ms +step:547/2245 train_time:33067ms step_avg:60.45ms +step:548/2245 train_time:33127ms step_avg:60.45ms +step:549/2245 train_time:33188ms step_avg:60.45ms +step:550/2245 train_time:33248ms step_avg:60.45ms +step:551/2245 train_time:33310ms step_avg:60.45ms +step:552/2245 train_time:33369ms step_avg:60.45ms +step:553/2245 train_time:33431ms step_avg:60.45ms +step:554/2245 train_time:33490ms step_avg:60.45ms +step:555/2245 train_time:33552ms step_avg:60.45ms +step:556/2245 train_time:33611ms step_avg:60.45ms +step:557/2245 train_time:33674ms step_avg:60.46ms +step:558/2245 train_time:33732ms step_avg:60.45ms +step:559/2245 train_time:33794ms step_avg:60.45ms +step:560/2245 train_time:33853ms step_avg:60.45ms +step:561/2245 train_time:33914ms step_avg:60.45ms +step:562/2245 train_time:33973ms step_avg:60.45ms +step:563/2245 train_time:34034ms step_avg:60.45ms +step:564/2245 train_time:34093ms step_avg:60.45ms +step:565/2245 train_time:34154ms step_avg:60.45ms +step:566/2245 train_time:34213ms step_avg:60.45ms +step:567/2245 train_time:34274ms step_avg:60.45ms +step:568/2245 train_time:34333ms step_avg:60.45ms +step:569/2245 train_time:34395ms step_avg:60.45ms +step:570/2245 train_time:34454ms step_avg:60.45ms +step:571/2245 train_time:34516ms step_avg:60.45ms +step:572/2245 train_time:34574ms step_avg:60.44ms +step:573/2245 train_time:34636ms 
step_avg:60.45ms +step:574/2245 train_time:34695ms step_avg:60.44ms +step:575/2245 train_time:34757ms step_avg:60.45ms +step:576/2245 train_time:34816ms step_avg:60.44ms +step:577/2245 train_time:34877ms step_avg:60.45ms +step:578/2245 train_time:34936ms step_avg:60.44ms +step:579/2245 train_time:34997ms step_avg:60.44ms +step:580/2245 train_time:35056ms step_avg:60.44ms +step:581/2245 train_time:35117ms step_avg:60.44ms +step:582/2245 train_time:35176ms step_avg:60.44ms +step:583/2245 train_time:35238ms step_avg:60.44ms +step:584/2245 train_time:35297ms step_avg:60.44ms +step:585/2245 train_time:35358ms step_avg:60.44ms +step:586/2245 train_time:35418ms step_avg:60.44ms +step:587/2245 train_time:35479ms step_avg:60.44ms +step:588/2245 train_time:35539ms step_avg:60.44ms +step:589/2245 train_time:35600ms step_avg:60.44ms +step:590/2245 train_time:35659ms step_avg:60.44ms +step:591/2245 train_time:35721ms step_avg:60.44ms +step:592/2245 train_time:35780ms step_avg:60.44ms +step:593/2245 train_time:35841ms step_avg:60.44ms +step:594/2245 train_time:35901ms step_avg:60.44ms +step:595/2245 train_time:35963ms step_avg:60.44ms +step:596/2245 train_time:36021ms step_avg:60.44ms +step:597/2245 train_time:36083ms step_avg:60.44ms +step:598/2245 train_time:36142ms step_avg:60.44ms +step:599/2245 train_time:36204ms step_avg:60.44ms +step:600/2245 train_time:36263ms step_avg:60.44ms +step:601/2245 train_time:36325ms step_avg:60.44ms +step:602/2245 train_time:36384ms step_avg:60.44ms +step:603/2245 train_time:36446ms step_avg:60.44ms +step:604/2245 train_time:36506ms step_avg:60.44ms +step:605/2245 train_time:36569ms step_avg:60.44ms +step:606/2245 train_time:36629ms step_avg:60.44ms +step:607/2245 train_time:36690ms step_avg:60.45ms +step:608/2245 train_time:36750ms step_avg:60.44ms +step:609/2245 train_time:36812ms step_avg:60.45ms +step:610/2245 train_time:36872ms step_avg:60.45ms +step:611/2245 train_time:36933ms step_avg:60.45ms +step:612/2245 train_time:36992ms step_avg:60.45ms +step:613/2245 train_time:37054ms step_avg:60.45ms +step:614/2245 train_time:37113ms step_avg:60.45ms +step:615/2245 train_time:37175ms step_avg:60.45ms +step:616/2245 train_time:37233ms step_avg:60.44ms +step:617/2245 train_time:37295ms step_avg:60.45ms +step:618/2245 train_time:37354ms step_avg:60.44ms +step:619/2245 train_time:37416ms step_avg:60.45ms +step:620/2245 train_time:37475ms step_avg:60.44ms +step:621/2245 train_time:37537ms step_avg:60.45ms +step:622/2245 train_time:37596ms step_avg:60.44ms +step:623/2245 train_time:37658ms step_avg:60.45ms +step:624/2245 train_time:37717ms step_avg:60.44ms +step:625/2245 train_time:37779ms step_avg:60.45ms +step:626/2245 train_time:37838ms step_avg:60.44ms +step:627/2245 train_time:37900ms step_avg:60.45ms +step:628/2245 train_time:37959ms step_avg:60.44ms +step:629/2245 train_time:38022ms step_avg:60.45ms +step:630/2245 train_time:38080ms step_avg:60.44ms +step:631/2245 train_time:38142ms step_avg:60.45ms +step:632/2245 train_time:38201ms step_avg:60.44ms +step:633/2245 train_time:38262ms step_avg:60.45ms +step:634/2245 train_time:38322ms step_avg:60.44ms +step:635/2245 train_time:38384ms step_avg:60.45ms +step:636/2245 train_time:38443ms step_avg:60.45ms +step:637/2245 train_time:38505ms step_avg:60.45ms +step:638/2245 train_time:38564ms step_avg:60.45ms +step:639/2245 train_time:38626ms step_avg:60.45ms +step:640/2245 train_time:38686ms step_avg:60.45ms +step:641/2245 train_time:38748ms step_avg:60.45ms +step:642/2245 train_time:38808ms step_avg:60.45ms +step:643/2245 
train_time:38870ms step_avg:60.45ms +step:644/2245 train_time:38929ms step_avg:60.45ms +step:645/2245 train_time:38991ms step_avg:60.45ms +step:646/2245 train_time:39050ms step_avg:60.45ms +step:647/2245 train_time:39112ms step_avg:60.45ms +step:648/2245 train_time:39171ms step_avg:60.45ms +step:649/2245 train_time:39235ms step_avg:60.45ms +step:650/2245 train_time:39292ms step_avg:60.45ms +step:651/2245 train_time:39354ms step_avg:60.45ms +step:652/2245 train_time:39414ms step_avg:60.45ms +step:653/2245 train_time:39475ms step_avg:60.45ms +step:654/2245 train_time:39534ms step_avg:60.45ms +step:655/2245 train_time:39596ms step_avg:60.45ms +step:656/2245 train_time:39655ms step_avg:60.45ms +step:657/2245 train_time:39717ms step_avg:60.45ms +step:658/2245 train_time:39775ms step_avg:60.45ms +step:659/2245 train_time:39837ms step_avg:60.45ms +step:660/2245 train_time:39896ms step_avg:60.45ms +step:661/2245 train_time:39959ms step_avg:60.45ms +step:662/2245 train_time:40018ms step_avg:60.45ms +step:663/2245 train_time:40079ms step_avg:60.45ms +step:664/2245 train_time:40138ms step_avg:60.45ms +step:665/2245 train_time:40200ms step_avg:60.45ms +step:666/2245 train_time:40259ms step_avg:60.45ms +step:667/2245 train_time:40321ms step_avg:60.45ms +step:668/2245 train_time:40379ms step_avg:60.45ms +step:669/2245 train_time:40440ms step_avg:60.45ms +step:670/2245 train_time:40500ms step_avg:60.45ms +step:671/2245 train_time:40561ms step_avg:60.45ms +step:672/2245 train_time:40621ms step_avg:60.45ms +step:673/2245 train_time:40682ms step_avg:60.45ms +step:674/2245 train_time:40741ms step_avg:60.45ms +step:675/2245 train_time:40802ms step_avg:60.45ms +step:676/2245 train_time:40861ms step_avg:60.45ms +step:677/2245 train_time:40924ms step_avg:60.45ms +step:678/2245 train_time:40983ms step_avg:60.45ms +step:679/2245 train_time:41045ms step_avg:60.45ms +step:680/2245 train_time:41105ms step_avg:60.45ms +step:681/2245 train_time:41167ms step_avg:60.45ms +step:682/2245 train_time:41226ms step_avg:60.45ms +step:683/2245 train_time:41288ms step_avg:60.45ms +step:684/2245 train_time:41347ms step_avg:60.45ms +step:685/2245 train_time:41409ms step_avg:60.45ms +step:686/2245 train_time:41468ms step_avg:60.45ms +step:687/2245 train_time:41530ms step_avg:60.45ms +step:688/2245 train_time:41589ms step_avg:60.45ms +step:689/2245 train_time:41651ms step_avg:60.45ms +step:690/2245 train_time:41711ms step_avg:60.45ms +step:691/2245 train_time:41772ms step_avg:60.45ms +step:692/2245 train_time:41831ms step_avg:60.45ms +step:693/2245 train_time:41893ms step_avg:60.45ms +step:694/2245 train_time:41952ms step_avg:60.45ms +step:695/2245 train_time:42014ms step_avg:60.45ms +step:696/2245 train_time:42073ms step_avg:60.45ms +step:697/2245 train_time:42135ms step_avg:60.45ms +step:698/2245 train_time:42194ms step_avg:60.45ms +step:699/2245 train_time:42255ms step_avg:60.45ms +step:700/2245 train_time:42314ms step_avg:60.45ms +step:701/2245 train_time:42375ms step_avg:60.45ms +step:702/2245 train_time:42434ms step_avg:60.45ms +step:703/2245 train_time:42495ms step_avg:60.45ms +step:704/2245 train_time:42554ms step_avg:60.45ms +step:705/2245 train_time:42615ms step_avg:60.45ms +step:706/2245 train_time:42674ms step_avg:60.44ms +step:707/2245 train_time:42736ms step_avg:60.45ms +step:708/2245 train_time:42795ms step_avg:60.44ms +step:709/2245 train_time:42857ms step_avg:60.45ms +step:710/2245 train_time:42916ms step_avg:60.45ms +step:711/2245 train_time:42978ms step_avg:60.45ms +step:712/2245 train_time:43037ms step_avg:60.44ms 
+step:713/2245 train_time:43098ms step_avg:60.45ms +step:714/2245 train_time:43157ms step_avg:60.44ms +step:715/2245 train_time:43219ms step_avg:60.45ms +step:716/2245 train_time:43278ms step_avg:60.44ms +step:717/2245 train_time:43340ms step_avg:60.45ms +step:718/2245 train_time:43399ms step_avg:60.44ms +step:719/2245 train_time:43461ms step_avg:60.45ms +step:720/2245 train_time:43520ms step_avg:60.44ms +step:721/2245 train_time:43582ms step_avg:60.45ms +step:722/2245 train_time:44038ms step_avg:61.00ms +step:723/2245 train_time:44098ms step_avg:60.99ms +step:724/2245 train_time:44156ms step_avg:60.99ms +step:725/2245 train_time:44217ms step_avg:60.99ms +step:726/2245 train_time:44275ms step_avg:60.98ms +step:727/2245 train_time:44335ms step_avg:60.98ms +step:728/2245 train_time:44393ms step_avg:60.98ms +step:729/2245 train_time:44454ms step_avg:60.98ms +step:730/2245 train_time:44512ms step_avg:60.98ms +step:731/2245 train_time:44573ms step_avg:60.98ms +step:732/2245 train_time:44631ms step_avg:60.97ms +step:733/2245 train_time:44692ms step_avg:60.97ms +step:734/2245 train_time:44750ms step_avg:60.97ms +step:735/2245 train_time:44811ms step_avg:60.97ms +step:736/2245 train_time:44870ms step_avg:60.97ms +step:737/2245 train_time:44938ms step_avg:60.97ms +step:738/2245 train_time:45002ms step_avg:60.98ms +step:739/2245 train_time:45067ms step_avg:60.98ms +step:740/2245 train_time:45128ms step_avg:60.98ms +step:741/2245 train_time:45190ms step_avg:60.99ms +step:742/2245 train_time:45250ms step_avg:60.98ms +step:743/2245 train_time:45313ms step_avg:60.99ms +step:744/2245 train_time:45373ms step_avg:60.98ms +step:745/2245 train_time:45434ms step_avg:60.99ms +step:746/2245 train_time:45493ms step_avg:60.98ms +step:747/2245 train_time:45554ms step_avg:60.98ms +step:748/2245 train_time:45613ms step_avg:60.98ms +step:749/2245 train_time:45674ms step_avg:60.98ms +step:750/2245 train_time:45733ms step_avg:60.98ms +step:750/2245 val_loss:3.6648 train_time:45796ms step_avg:61.06ms +step:751/2245 train_time:45816ms step_avg:61.01ms +step:752/2245 train_time:45857ms step_avg:60.98ms +step:753/2245 train_time:45919ms step_avg:60.98ms +step:754/2245 train_time:45979ms step_avg:60.98ms +step:755/2245 train_time:46042ms step_avg:60.98ms +step:756/2245 train_time:46101ms step_avg:60.98ms +step:757/2245 train_time:46162ms step_avg:60.98ms +step:758/2245 train_time:46221ms step_avg:60.98ms +step:759/2245 train_time:46283ms step_avg:60.98ms +step:760/2245 train_time:46342ms step_avg:60.98ms +step:761/2245 train_time:46403ms step_avg:60.98ms +step:762/2245 train_time:46462ms step_avg:60.97ms +step:763/2245 train_time:46523ms step_avg:60.97ms +step:764/2245 train_time:46582ms step_avg:60.97ms +step:765/2245 train_time:46643ms step_avg:60.97ms +step:766/2245 train_time:46708ms step_avg:60.98ms +step:767/2245 train_time:46775ms step_avg:60.98ms +step:768/2245 train_time:46836ms step_avg:60.98ms +step:769/2245 train_time:46899ms step_avg:60.99ms +step:770/2245 train_time:46959ms step_avg:60.99ms +step:771/2245 train_time:47021ms step_avg:60.99ms +step:772/2245 train_time:47081ms step_avg:60.99ms +step:773/2245 train_time:47142ms step_avg:60.99ms +step:774/2245 train_time:47201ms step_avg:60.98ms +step:775/2245 train_time:47263ms step_avg:60.98ms +step:776/2245 train_time:47322ms step_avg:60.98ms +step:777/2245 train_time:47384ms step_avg:60.98ms +step:778/2245 train_time:47443ms step_avg:60.98ms +step:779/2245 train_time:47504ms step_avg:60.98ms +step:780/2245 train_time:47564ms step_avg:60.98ms +step:781/2245 
train_time:47626ms step_avg:60.98ms
+[steps 782-2134: uniform per-step timing entries elided; train_time climbs 47689ms -> 130829ms with step_avg steady between 60.98ms and 61.31ms; validation checkpoints retained below]
+step:1000/2245 val_loss:3.5973 train_time:61089ms step_avg:61.09ms
+step:1250/2245 val_loss:3.5218 train_time:76402ms step_avg:61.12ms
+step:1500/2245 val_loss:3.4406 train_time:91728ms step_avg:61.15ms
+step:1750/2245 val_loss:3.3766 train_time:107173ms step_avg:61.24ms
+step:2000/2245 val_loss:3.3224 train_time:122605ms step_avg:61.30ms
+step:2135/2245
train_time:130892ms step_avg:61.31ms +step:2136/2245 train_time:130953ms step_avg:61.31ms +step:2137/2245 train_time:131016ms step_avg:61.31ms +step:2138/2245 train_time:131077ms step_avg:61.31ms +step:2139/2245 train_time:131140ms step_avg:61.31ms +step:2140/2245 train_time:131201ms step_avg:61.31ms +step:2141/2245 train_time:131263ms step_avg:61.31ms +step:2142/2245 train_time:131323ms step_avg:61.31ms +step:2143/2245 train_time:131386ms step_avg:61.31ms +step:2144/2245 train_time:131447ms step_avg:61.31ms +step:2145/2245 train_time:131509ms step_avg:61.31ms +step:2146/2245 train_time:131569ms step_avg:61.31ms +step:2147/2245 train_time:131633ms step_avg:61.31ms +step:2148/2245 train_time:131694ms step_avg:61.31ms +step:2149/2245 train_time:131759ms step_avg:61.31ms +step:2150/2245 train_time:131819ms step_avg:61.31ms +step:2151/2245 train_time:131883ms step_avg:61.31ms +step:2152/2245 train_time:131944ms step_avg:61.31ms +step:2153/2245 train_time:132006ms step_avg:61.31ms +step:2154/2245 train_time:132067ms step_avg:61.31ms +step:2155/2245 train_time:132129ms step_avg:61.31ms +step:2156/2245 train_time:132189ms step_avg:61.31ms +step:2157/2245 train_time:132253ms step_avg:61.31ms +step:2158/2245 train_time:132313ms step_avg:61.31ms +step:2159/2245 train_time:132376ms step_avg:61.31ms +step:2160/2245 train_time:132437ms step_avg:61.31ms +step:2161/2245 train_time:132501ms step_avg:61.31ms +step:2162/2245 train_time:132561ms step_avg:61.31ms +step:2163/2245 train_time:132624ms step_avg:61.31ms +step:2164/2245 train_time:132684ms step_avg:61.31ms +step:2165/2245 train_time:132747ms step_avg:61.31ms +step:2166/2245 train_time:132807ms step_avg:61.31ms +step:2167/2245 train_time:132870ms step_avg:61.32ms +step:2168/2245 train_time:132930ms step_avg:61.31ms +step:2169/2245 train_time:132993ms step_avg:61.32ms +step:2170/2245 train_time:133053ms step_avg:61.31ms +step:2171/2245 train_time:133117ms step_avg:61.32ms +step:2172/2245 train_time:133178ms step_avg:61.32ms +step:2173/2245 train_time:133242ms step_avg:61.32ms +step:2174/2245 train_time:133302ms step_avg:61.32ms +step:2175/2245 train_time:133365ms step_avg:61.32ms +step:2176/2245 train_time:133426ms step_avg:61.32ms +step:2177/2245 train_time:133488ms step_avg:61.32ms +step:2178/2245 train_time:133548ms step_avg:61.32ms +step:2179/2245 train_time:133611ms step_avg:61.32ms +step:2180/2245 train_time:133672ms step_avg:61.32ms +step:2181/2245 train_time:133735ms step_avg:61.32ms +step:2182/2245 train_time:133796ms step_avg:61.32ms +step:2183/2245 train_time:133860ms step_avg:61.32ms +step:2184/2245 train_time:133921ms step_avg:61.32ms +step:2185/2245 train_time:133984ms step_avg:61.32ms +step:2186/2245 train_time:134044ms step_avg:61.32ms +step:2187/2245 train_time:134107ms step_avg:61.32ms +step:2188/2245 train_time:134168ms step_avg:61.32ms +step:2189/2245 train_time:134230ms step_avg:61.32ms +step:2190/2245 train_time:134290ms step_avg:61.32ms +step:2191/2245 train_time:134353ms step_avg:61.32ms +step:2192/2245 train_time:134413ms step_avg:61.32ms +step:2193/2245 train_time:134477ms step_avg:61.32ms +step:2194/2245 train_time:134538ms step_avg:61.32ms +step:2195/2245 train_time:134601ms step_avg:61.32ms +step:2196/2245 train_time:134661ms step_avg:61.32ms +step:2197/2245 train_time:134724ms step_avg:61.32ms +step:2198/2245 train_time:134784ms step_avg:61.32ms +step:2199/2245 train_time:134848ms step_avg:61.32ms +step:2200/2245 train_time:134908ms step_avg:61.32ms +step:2201/2245 train_time:134970ms step_avg:61.32ms +step:2202/2245 
train_time:135031ms step_avg:61.32ms +step:2203/2245 train_time:135094ms step_avg:61.32ms +step:2204/2245 train_time:135154ms step_avg:61.32ms +step:2205/2245 train_time:135217ms step_avg:61.32ms +step:2206/2245 train_time:135278ms step_avg:61.32ms +step:2207/2245 train_time:135341ms step_avg:61.32ms +step:2208/2245 train_time:135402ms step_avg:61.32ms +step:2209/2245 train_time:135465ms step_avg:61.32ms +step:2210/2245 train_time:135525ms step_avg:61.32ms +step:2211/2245 train_time:135588ms step_avg:61.32ms +step:2212/2245 train_time:135649ms step_avg:61.32ms +step:2213/2245 train_time:135712ms step_avg:61.32ms +step:2214/2245 train_time:135772ms step_avg:61.32ms +step:2215/2245 train_time:135836ms step_avg:61.33ms +step:2216/2245 train_time:135897ms step_avg:61.33ms +step:2217/2245 train_time:135960ms step_avg:61.33ms +step:2218/2245 train_time:136021ms step_avg:61.33ms +step:2219/2245 train_time:136084ms step_avg:61.33ms +step:2220/2245 train_time:136144ms step_avg:61.33ms +step:2221/2245 train_time:136207ms step_avg:61.33ms +step:2222/2245 train_time:136268ms step_avg:61.33ms +step:2223/2245 train_time:136331ms step_avg:61.33ms +step:2224/2245 train_time:136392ms step_avg:61.33ms +step:2225/2245 train_time:136455ms step_avg:61.33ms +step:2226/2245 train_time:136515ms step_avg:61.33ms +step:2227/2245 train_time:136579ms step_avg:61.33ms +step:2228/2245 train_time:136640ms step_avg:61.33ms +step:2229/2245 train_time:136704ms step_avg:61.33ms +step:2230/2245 train_time:136764ms step_avg:61.33ms +step:2231/2245 train_time:136827ms step_avg:61.33ms +step:2232/2245 train_time:136887ms step_avg:61.33ms +step:2233/2245 train_time:136950ms step_avg:61.33ms +step:2234/2245 train_time:137011ms step_avg:61.33ms +step:2235/2245 train_time:137074ms step_avg:61.33ms +step:2236/2245 train_time:137134ms step_avg:61.33ms +step:2237/2245 train_time:137197ms step_avg:61.33ms +step:2238/2245 train_time:137258ms step_avg:61.33ms +step:2239/2245 train_time:137321ms step_avg:61.33ms +step:2240/2245 train_time:137382ms step_avg:61.33ms +step:2241/2245 train_time:137444ms step_avg:61.33ms +step:2242/2245 train_time:137504ms step_avg:61.33ms +step:2243/2245 train_time:137568ms step_avg:61.33ms +step:2244/2245 train_time:137629ms step_avg:61.33ms +step:2245/2245 train_time:137692ms step_avg:61.33ms +step:2245/2245 val_loss:3.2771 train_time:137753ms step_avg:61.36ms +peak memory allocated: 29626 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/4c0708bf-8091-46ef-9557-f945cdf287c7.txt b/records/track_1_short/2025-11-10_CautiousWD/4c0708bf-8091-46ef-9557-f945cdf287c7.txt new file mode 100644 index 000000000..0b9f27d4c --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/4c0708bf-8091-46ef-9557-f945cdf287c7.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 
30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of 
size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), 
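+        # Strides are passed explicitly so the kernel can index transposed or
+        # otherwise non-contiguous views; for 2D inputs batch_size is 1 and the
+        # batch strides are 0, so the per-batch pointer offset is a no-op.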
+ c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if 
out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-Schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + + Differences from standard Muon: + - Newton-Schulz is replaced with Polar Express for the orthogonalization step + - NorMuon adds a low-rank variance estimator similar to Adafactor. + - small 1D parameters are handled here instead of in Adam + - Cautious weight decay, a gated version of decoupled weight decay + - Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param 7 padding params) + 2. reduce scatter attn_gate (10 params 6 padding params) + 3. 
reduce scatter attn/mlp round 1 (10 attn params 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. + """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on less than 8 GPU or experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. 
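+        # The step runs as three pipelined passes so communication overlaps with
+        # compute: (1) stack each group's grads and launch an async reduce_scatter,
+        # (2) per group, wait for the averaged shard, compute the update for this
+        # rank's parameters, and launch an async all_gather, (3) wait on each
+        # all_gather and copy the updated weights back into the parameters.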
+ rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. + for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine LR and WD + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call. 
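+            # polar_express pushes every singular value of each stacked matrix toward 1,
+            # yielding an approximately semi-orthogonal update for the whole shard at once;
+            # ranks holding only padding (num_params == 0) skip the call but still
+            # all_gather zeros below so the collectives stay aligned across ranks.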
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
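+        # Both rotary tables are rebuilt from the rescaled frequencies and written
+        # in place with copy_, presumably so the registered (non-persistent) buffers
+        # keep the same storage that the compiled graphs already reference.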
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
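+        # e.g. next_multiple_of_n(50257, n=128) == 50304. The FP8 scales below are
+        # built around 448, the largest normal float8_e4m3fn value, presumably to
+        # keep the head's activations, weights, and grads inside representable range.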
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? 
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
+    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
+    train_batch_size: int = 2048 * 16 * 8
+    train_max_seq_len: int = 128 * 16
+    val_batch_size: int = 4 * 64 * 1024 * 8
+    # optimization
+    num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule
+    num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws
+    num_iterations: int = num_scheduled_iterations + num_extension_iterations
+    cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate
+    # evaluation and logging
+    run_id: str = f"{uuid.uuid4()}"
+    val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end
+    save_checkpoint: bool = False
+    # attention masking
+    block_size: int = 128
+    ws_schedule: tuple = (3, 7, 11)
+    ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd
+    ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN
+
+args = Hyperparameters()
+
+data_path = os.environ.get("DATA_PATH", ".")
+args.train_files = os.path.join(data_path, args.train_files)
+args.val_files = os.path.join(data_path, args.val_files)
+
+# torchrun sets these env variables
+rank = int(os.environ["RANK"])
+world_size = int(os.environ["WORLD_SIZE"])
+assert 8 % world_size == 0, "world_size must be a divisor of 8"
+grad_accum_steps = 8 // world_size
+assert torch.cuda.is_available()
+device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
+torch.cuda.set_device(device)
+dist.init_process_group(backend="nccl", device_id=device)
+dist.barrier()
+master_process = (rank == 0) # this process will do logging, checkpointing etc.
+
+# begin logging
+logfile = None
+if master_process:
+    run_id = args.run_id
+    os.makedirs("logs", exist_ok=True)
+    logfile = f"logs/{run_id}.txt"
+    print(logfile)
+def print0(s, console=False):
+    if master_process:
+        with open(logfile, "a") as f:
+            if console:
+                print(s)
+            print(s, file=f)
+
+# begin by printing this file (the Python code)
+print0(code)
+print0("="*100)
+# log information about the hardware/software environment this is running on
+print0(f"Running Python {sys.version}")
+print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
+print0(f"Running Triton version {triton.__version__}")
+
+def nvidia_smi():
+    import subprocess # avoid top level import
+    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
+print0(nvidia_smi())
+print0("="*100)
+
+model: nn.Module = GPT(
+    vocab_size=50257,
+    num_layers=12,
+    num_heads=6,
+    head_dim=128,
+    model_dim=768,
+    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
+).cuda()
+for m in model.modules():
+    if isinstance(m, (nn.Embedding, nn.Linear)):
+        m.bfloat16()
+for param in model.parameters():
+    dist.broadcast(param.detach(), 0)
+
+# collect the parameters to optimize
+hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
+embed_params = [p for n, p in model.named_parameters() if "embed" in n]
+scalar_params = [p for p in model.parameters() if p.ndim < 2]
+head_params = [model.lm_head.weight]
+gate_params = [p for n, p in model.named_parameters() if "gate" in n]
+
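+# Hedged sanity check (illustrative, not called during the run; the helper
+# name is hypothetical): the five lists above are meant to partition the
+# model's parameters exactly once across the two optimizers below, with Muon
+# taking the 2-D hidden matrices and gates, and Adam taking embeddings, the
+# lm_head, and scalars.
+def _check_param_partition():
+    groups = [hidden_matrix_params, embed_params, scalar_params, head_params, gate_params]
+    flat = [id(p) for group in groups for p in group]
+    assert len(flat) == len(set(flat)), "a parameter is assigned to two optimizers"
+    assert set(flat) == {id(p) for p in model.parameters()}, "a parameter is not optimized"
+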
+# init the optimizer(s)
+# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+# learning rate schedule: flat, then linear decay, then flat
+def get_lr(step: int):
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
+    return lr
+
+def get_ws(step: int):
+    # set short window size to half of long window size
+    # higher ws on "extension" steps
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set muon momentum based on step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only step Muon params
+    # on odd steps, step all params
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
+
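+# Worked example of the schedules above (illustrative, not called during the
+# run; the helper name is hypothetical). Values follow from the defaults:
+# num_scheduled_iterations=2205, cooldown_frac=0.5, ws_schedule=(3, 7, 11),
+# ws_final=13, num_iterations=2245.
+def _check_schedules():
+    assert get_lr(0) == 1.0                            # flat until x reaches 1 - cooldown_frac
+    assert abs(get_lr(1654) - 0.55) < 1e-2             # three quarters in: halfway down to the 0.1x floor
+    assert abs(get_lr(2245) - 0.1) < 1e-3              # extension steps sit at the floor (x clamps at 0.9999)
+    assert get_ws(0) == (1, 3)                         # first third of the schedule; short = long // 2
+    assert get_ws(736) == (3, 7) and get_ws(1471) == (5, 11)
+    assert get_ws(2205) == (6, 13)                     # extension steps jump to ws_final
+    assert abs(get_muon_momentum(150) - 0.90) < 1e-12  # halfway through the 300-step momentum warmup
+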
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph, so warm up each one with its YaRN attn_scale
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset() # rotary buffer is not stored in the state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset() # muon momentum buffers are not in the state_dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#       Training and validation        #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 21:57:43 2025
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 41C P0 130W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 33C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 41C P0 133W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.08ms +step:1/2245 train_time:118ms step_avg:118.18ms +step:2/2245 train_time:140ms step_avg:69.81ms +step:3/2245 train_time:178ms step_avg:59.20ms +step:4/2245 train_time:234ms step_avg:58.50ms +step:5/2245 train_time:294ms step_avg:58.73ms +step:6/2245 train_time:352ms step_avg:58.69ms +step:7/2245 train_time:413ms step_avg:59.01ms +step:8/2245 train_time:472ms step_avg:58.97ms +step:9/2245 train_time:533ms step_avg:59.24ms +step:10/2245 train_time:592ms step_avg:59.21ms +step:11/2245 train_time:653ms step_avg:59.37ms +step:12/2245 train_time:712ms step_avg:59.30ms +step:13/2245 train_time:772ms step_avg:59.42ms +step:14/2245 train_time:831ms step_avg:59.39ms 
+step:15/2245 train_time:893ms step_avg:59.52ms +step:16/2245 train_time:952ms step_avg:59.48ms +step:17/2245 train_time:1016ms step_avg:59.75ms +step:18/2245 train_time:1078ms step_avg:59.88ms +step:19/2245 train_time:1142ms step_avg:60.10ms +step:20/2245 train_time:1202ms step_avg:60.12ms +step:21/2245 train_time:1265ms step_avg:60.23ms +step:22/2245 train_time:1325ms step_avg:60.22ms +step:23/2245 train_time:1387ms step_avg:60.32ms +step:24/2245 train_time:1447ms step_avg:60.28ms +step:25/2245 train_time:1509ms step_avg:60.35ms +step:26/2245 train_time:1568ms step_avg:60.32ms +step:27/2245 train_time:1630ms step_avg:60.35ms +step:28/2245 train_time:1688ms step_avg:60.29ms +step:29/2245 train_time:1750ms step_avg:60.33ms +step:30/2245 train_time:1808ms step_avg:60.28ms +step:31/2245 train_time:1870ms step_avg:60.32ms +step:32/2245 train_time:1929ms step_avg:60.29ms +step:33/2245 train_time:1992ms step_avg:60.36ms +step:34/2245 train_time:2052ms step_avg:60.36ms +step:35/2245 train_time:2115ms step_avg:60.44ms +step:36/2245 train_time:2175ms step_avg:60.42ms +step:37/2245 train_time:2237ms step_avg:60.45ms +step:38/2245 train_time:2297ms step_avg:60.44ms +step:39/2245 train_time:2359ms step_avg:60.49ms +step:40/2245 train_time:2419ms step_avg:60.47ms +step:41/2245 train_time:2480ms step_avg:60.50ms +step:42/2245 train_time:2539ms step_avg:60.46ms +step:43/2245 train_time:2602ms step_avg:60.52ms +step:44/2245 train_time:2662ms step_avg:60.50ms +step:45/2245 train_time:2724ms step_avg:60.53ms +step:46/2245 train_time:2783ms step_avg:60.49ms +step:47/2245 train_time:2844ms step_avg:60.52ms +step:48/2245 train_time:2903ms step_avg:60.49ms +step:49/2245 train_time:2966ms step_avg:60.52ms +step:50/2245 train_time:3027ms step_avg:60.54ms +step:51/2245 train_time:3090ms step_avg:60.59ms +step:52/2245 train_time:3150ms step_avg:60.58ms +step:53/2245 train_time:3214ms step_avg:60.63ms +step:54/2245 train_time:3274ms step_avg:60.63ms +step:55/2245 train_time:3335ms step_avg:60.64ms +step:56/2245 train_time:3394ms step_avg:60.62ms +step:57/2245 train_time:3456ms step_avg:60.63ms +step:58/2245 train_time:3515ms step_avg:60.60ms +step:59/2245 train_time:3576ms step_avg:60.61ms +step:60/2245 train_time:3635ms step_avg:60.59ms +step:61/2245 train_time:3697ms step_avg:60.61ms +step:62/2245 train_time:3757ms step_avg:60.59ms +step:63/2245 train_time:3818ms step_avg:60.60ms +step:64/2245 train_time:3878ms step_avg:60.59ms +step:65/2245 train_time:3941ms step_avg:60.63ms +step:66/2245 train_time:4000ms step_avg:60.61ms +step:67/2245 train_time:4063ms step_avg:60.64ms +step:68/2245 train_time:4122ms step_avg:60.62ms +step:69/2245 train_time:4185ms step_avg:60.65ms +step:70/2245 train_time:4244ms step_avg:60.63ms +step:71/2245 train_time:4306ms step_avg:60.65ms +step:72/2245 train_time:4367ms step_avg:60.65ms +step:73/2245 train_time:4429ms step_avg:60.67ms +step:74/2245 train_time:4489ms step_avg:60.66ms +step:75/2245 train_time:4551ms step_avg:60.68ms +step:76/2245 train_time:4610ms step_avg:60.66ms +step:77/2245 train_time:4672ms step_avg:60.67ms +step:78/2245 train_time:4731ms step_avg:60.66ms +step:79/2245 train_time:4793ms step_avg:60.67ms +step:80/2245 train_time:4852ms step_avg:60.65ms +step:81/2245 train_time:4915ms step_avg:60.67ms +step:82/2245 train_time:4973ms step_avg:60.65ms +step:83/2245 train_time:5034ms step_avg:60.65ms +step:84/2245 train_time:5094ms step_avg:60.64ms +step:85/2245 train_time:5156ms step_avg:60.66ms +step:86/2245 train_time:5215ms step_avg:60.64ms +step:87/2245 
train_time:5277ms step_avg:60.65ms +step:88/2245 train_time:5336ms step_avg:60.63ms +step:89/2245 train_time:5398ms step_avg:60.65ms +step:90/2245 train_time:5458ms step_avg:60.64ms +step:91/2245 train_time:5520ms step_avg:60.65ms +step:92/2245 train_time:5579ms step_avg:60.64ms +step:93/2245 train_time:5641ms step_avg:60.65ms +step:94/2245 train_time:5700ms step_avg:60.63ms +step:95/2245 train_time:5762ms step_avg:60.65ms +step:96/2245 train_time:5821ms step_avg:60.63ms +step:97/2245 train_time:5883ms step_avg:60.65ms +step:98/2245 train_time:5941ms step_avg:60.63ms +step:99/2245 train_time:6003ms step_avg:60.63ms +step:100/2245 train_time:6062ms step_avg:60.62ms +step:101/2245 train_time:6124ms step_avg:60.63ms +step:102/2245 train_time:6183ms step_avg:60.62ms +step:103/2245 train_time:6245ms step_avg:60.63ms +step:104/2245 train_time:6304ms step_avg:60.62ms +step:105/2245 train_time:6367ms step_avg:60.64ms +step:106/2245 train_time:6427ms step_avg:60.64ms +step:107/2245 train_time:6490ms step_avg:60.65ms +step:108/2245 train_time:6549ms step_avg:60.64ms +step:109/2245 train_time:6611ms step_avg:60.65ms +step:110/2245 train_time:6671ms step_avg:60.64ms +step:111/2245 train_time:6732ms step_avg:60.65ms +step:112/2245 train_time:6791ms step_avg:60.64ms +step:113/2245 train_time:6852ms step_avg:60.64ms +step:114/2245 train_time:6911ms step_avg:60.63ms +step:115/2245 train_time:6973ms step_avg:60.64ms +step:116/2245 train_time:7032ms step_avg:60.62ms +step:117/2245 train_time:7094ms step_avg:60.63ms +step:118/2245 train_time:7153ms step_avg:60.62ms +step:119/2245 train_time:7215ms step_avg:60.63ms +step:120/2245 train_time:7274ms step_avg:60.62ms +step:121/2245 train_time:7336ms step_avg:60.63ms +step:122/2245 train_time:7395ms step_avg:60.62ms +step:123/2245 train_time:7457ms step_avg:60.62ms +step:124/2245 train_time:7516ms step_avg:60.61ms +step:125/2245 train_time:7577ms step_avg:60.62ms +step:126/2245 train_time:7637ms step_avg:60.61ms +step:127/2245 train_time:7699ms step_avg:60.62ms +step:128/2245 train_time:7757ms step_avg:60.61ms +step:129/2245 train_time:7820ms step_avg:60.62ms +step:130/2245 train_time:7879ms step_avg:60.60ms +step:131/2245 train_time:7940ms step_avg:60.61ms +step:132/2245 train_time:7999ms step_avg:60.60ms +step:133/2245 train_time:8061ms step_avg:60.61ms +step:134/2245 train_time:8120ms step_avg:60.60ms +step:135/2245 train_time:8182ms step_avg:60.61ms +step:136/2245 train_time:8241ms step_avg:60.59ms +step:137/2245 train_time:8302ms step_avg:60.60ms +step:138/2245 train_time:8361ms step_avg:60.59ms +step:139/2245 train_time:8423ms step_avg:60.60ms +step:140/2245 train_time:8482ms step_avg:60.59ms +step:141/2245 train_time:8544ms step_avg:60.59ms +step:142/2245 train_time:8603ms step_avg:60.58ms +step:143/2245 train_time:8665ms step_avg:60.59ms +step:144/2245 train_time:8724ms step_avg:60.59ms +step:145/2245 train_time:8786ms step_avg:60.59ms +step:146/2245 train_time:8846ms step_avg:60.59ms +step:147/2245 train_time:8907ms step_avg:60.59ms +step:148/2245 train_time:8967ms step_avg:60.59ms +step:149/2245 train_time:9029ms step_avg:60.59ms +step:150/2245 train_time:9088ms step_avg:60.59ms +step:151/2245 train_time:9150ms step_avg:60.59ms +step:152/2245 train_time:9210ms step_avg:60.59ms +step:153/2245 train_time:9272ms step_avg:60.60ms +step:154/2245 train_time:9331ms step_avg:60.59ms +step:155/2245 train_time:9392ms step_avg:60.59ms +step:156/2245 train_time:9451ms step_avg:60.58ms +step:157/2245 train_time:9513ms step_avg:60.59ms +step:158/2245 
train_time:9572ms step_avg:60.58ms +step:159/2245 train_time:9633ms step_avg:60.58ms +step:160/2245 train_time:9692ms step_avg:60.57ms +step:161/2245 train_time:9753ms step_avg:60.58ms +step:162/2245 train_time:9812ms step_avg:60.57ms +step:163/2245 train_time:9873ms step_avg:60.57ms +step:164/2245 train_time:9932ms step_avg:60.56ms +step:165/2245 train_time:9993ms step_avg:60.56ms +step:166/2245 train_time:10052ms step_avg:60.55ms +step:167/2245 train_time:10114ms step_avg:60.56ms +step:168/2245 train_time:10173ms step_avg:60.55ms +step:169/2245 train_time:10235ms step_avg:60.56ms +step:170/2245 train_time:10293ms step_avg:60.55ms +step:171/2245 train_time:10354ms step_avg:60.55ms +step:172/2245 train_time:10413ms step_avg:60.54ms +step:173/2245 train_time:10475ms step_avg:60.55ms +step:174/2245 train_time:10533ms step_avg:60.54ms +step:175/2245 train_time:10594ms step_avg:60.54ms +step:176/2245 train_time:10653ms step_avg:60.53ms +step:177/2245 train_time:10714ms step_avg:60.53ms +step:178/2245 train_time:10773ms step_avg:60.52ms +step:179/2245 train_time:10834ms step_avg:60.53ms +step:180/2245 train_time:10893ms step_avg:60.52ms +step:181/2245 train_time:10955ms step_avg:60.52ms +step:182/2245 train_time:11013ms step_avg:60.51ms +step:183/2245 train_time:11074ms step_avg:60.52ms +step:184/2245 train_time:11133ms step_avg:60.51ms +step:185/2245 train_time:11194ms step_avg:60.51ms +step:186/2245 train_time:11253ms step_avg:60.50ms +step:187/2245 train_time:11314ms step_avg:60.50ms +step:188/2245 train_time:11373ms step_avg:60.50ms +step:189/2245 train_time:11434ms step_avg:60.50ms +step:190/2245 train_time:11493ms step_avg:60.49ms +step:191/2245 train_time:11554ms step_avg:60.49ms +step:192/2245 train_time:11613ms step_avg:60.48ms +step:193/2245 train_time:11674ms step_avg:60.49ms +step:194/2245 train_time:11733ms step_avg:60.48ms +step:195/2245 train_time:11795ms step_avg:60.49ms +step:196/2245 train_time:11853ms step_avg:60.48ms +step:197/2245 train_time:11915ms step_avg:60.48ms +step:198/2245 train_time:11974ms step_avg:60.47ms +step:199/2245 train_time:12035ms step_avg:60.48ms +step:200/2245 train_time:12093ms step_avg:60.47ms +step:201/2245 train_time:12155ms step_avg:60.47ms +step:202/2245 train_time:12213ms step_avg:60.46ms +step:203/2245 train_time:12275ms step_avg:60.47ms +step:204/2245 train_time:12333ms step_avg:60.46ms +step:205/2245 train_time:12395ms step_avg:60.46ms +step:206/2245 train_time:12454ms step_avg:60.46ms +step:207/2245 train_time:12515ms step_avg:60.46ms +step:208/2245 train_time:12574ms step_avg:60.45ms +step:209/2245 train_time:12635ms step_avg:60.46ms +step:210/2245 train_time:12694ms step_avg:60.45ms +step:211/2245 train_time:12756ms step_avg:60.46ms +step:212/2245 train_time:12815ms step_avg:60.45ms +step:213/2245 train_time:12877ms step_avg:60.46ms +step:214/2245 train_time:12936ms step_avg:60.45ms +step:215/2245 train_time:12997ms step_avg:60.45ms +step:216/2245 train_time:13057ms step_avg:60.45ms +step:217/2245 train_time:13118ms step_avg:60.45ms +step:218/2245 train_time:13176ms step_avg:60.44ms +step:219/2245 train_time:13238ms step_avg:60.45ms +step:220/2245 train_time:13297ms step_avg:60.44ms +step:221/2245 train_time:13358ms step_avg:60.44ms +step:222/2245 train_time:13418ms step_avg:60.44ms +step:223/2245 train_time:13479ms step_avg:60.45ms +step:224/2245 train_time:13538ms step_avg:60.44ms +step:225/2245 train_time:13599ms step_avg:60.44ms +step:226/2245 train_time:13658ms step_avg:60.43ms +step:227/2245 train_time:13720ms step_avg:60.44ms 
+step:228/2245 train_time:13779ms step_avg:60.43ms +step:229/2245 train_time:13841ms step_avg:60.44ms +step:230/2245 train_time:13900ms step_avg:60.43ms +step:231/2245 train_time:13962ms step_avg:60.44ms +step:232/2245 train_time:14021ms step_avg:60.43ms +step:233/2245 train_time:14083ms step_avg:60.44ms +step:234/2245 train_time:14141ms step_avg:60.43ms +step:235/2245 train_time:14203ms step_avg:60.44ms +step:236/2245 train_time:14262ms step_avg:60.43ms +step:237/2245 train_time:14324ms step_avg:60.44ms +step:238/2245 train_time:14383ms step_avg:60.43ms +step:239/2245 train_time:14445ms step_avg:60.44ms +step:240/2245 train_time:14504ms step_avg:60.43ms +step:241/2245 train_time:14566ms step_avg:60.44ms +step:242/2245 train_time:14626ms step_avg:60.44ms +step:243/2245 train_time:14688ms step_avg:60.44ms +step:244/2245 train_time:14747ms step_avg:60.44ms +step:245/2245 train_time:14809ms step_avg:60.44ms +step:246/2245 train_time:14869ms step_avg:60.44ms +step:247/2245 train_time:14931ms step_avg:60.45ms +step:248/2245 train_time:14990ms step_avg:60.44ms +step:249/2245 train_time:15052ms step_avg:60.45ms +step:250/2245 train_time:15112ms step_avg:60.45ms +step:250/2245 val_loss:4.0877 train_time:15173ms step_avg:60.69ms +step:251/2245 train_time:15192ms step_avg:60.53ms +step:252/2245 train_time:15233ms step_avg:60.45ms +step:253/2245 train_time:15298ms step_avg:60.47ms +step:254/2245 train_time:15361ms step_avg:60.47ms +step:255/2245 train_time:15423ms step_avg:60.48ms +step:256/2245 train_time:15482ms step_avg:60.48ms +step:257/2245 train_time:15542ms step_avg:60.48ms +step:258/2245 train_time:15601ms step_avg:60.47ms +step:259/2245 train_time:15663ms step_avg:60.47ms +step:260/2245 train_time:15721ms step_avg:60.47ms +step:261/2245 train_time:15781ms step_avg:60.47ms +step:262/2245 train_time:15839ms step_avg:60.46ms +step:263/2245 train_time:15900ms step_avg:60.46ms +step:264/2245 train_time:15958ms step_avg:60.45ms +step:265/2245 train_time:16018ms step_avg:60.45ms +step:266/2245 train_time:16077ms step_avg:60.44ms +step:267/2245 train_time:16138ms step_avg:60.44ms +step:268/2245 train_time:16198ms step_avg:60.44ms +step:269/2245 train_time:16262ms step_avg:60.45ms +step:270/2245 train_time:16322ms step_avg:60.45ms +step:271/2245 train_time:16385ms step_avg:60.46ms +step:272/2245 train_time:16445ms step_avg:60.46ms +step:273/2245 train_time:16506ms step_avg:60.46ms +step:274/2245 train_time:16565ms step_avg:60.46ms +step:275/2245 train_time:16627ms step_avg:60.46ms +step:276/2245 train_time:16686ms step_avg:60.46ms +step:277/2245 train_time:16747ms step_avg:60.46ms +step:278/2245 train_time:16806ms step_avg:60.45ms +step:279/2245 train_time:16867ms step_avg:60.46ms +step:280/2245 train_time:16926ms step_avg:60.45ms +step:281/2245 train_time:16987ms step_avg:60.45ms +step:282/2245 train_time:17047ms step_avg:60.45ms +step:283/2245 train_time:17109ms step_avg:60.46ms +step:284/2245 train_time:17170ms step_avg:60.46ms +step:285/2245 train_time:17232ms step_avg:60.46ms +step:286/2245 train_time:17292ms step_avg:60.46ms +step:287/2245 train_time:17353ms step_avg:60.46ms +step:288/2245 train_time:17413ms step_avg:60.46ms +step:289/2245 train_time:17474ms step_avg:60.46ms +step:290/2245 train_time:17532ms step_avg:60.46ms +step:291/2245 train_time:17594ms step_avg:60.46ms +step:292/2245 train_time:17652ms step_avg:60.45ms +step:293/2245 train_time:17713ms step_avg:60.45ms +step:294/2245 train_time:17772ms step_avg:60.45ms +step:295/2245 train_time:17833ms step_avg:60.45ms +step:296/2245 
train_time:17891ms step_avg:60.44ms +step:297/2245 train_time:17952ms step_avg:60.45ms +step:298/2245 train_time:18011ms step_avg:60.44ms +step:299/2245 train_time:18073ms step_avg:60.44ms +step:300/2245 train_time:18132ms step_avg:60.44ms +step:301/2245 train_time:18194ms step_avg:60.44ms +step:302/2245 train_time:18253ms step_avg:60.44ms +step:303/2245 train_time:18314ms step_avg:60.44ms +step:304/2245 train_time:18373ms step_avg:60.44ms +step:305/2245 train_time:18434ms step_avg:60.44ms +step:306/2245 train_time:18493ms step_avg:60.43ms +step:307/2245 train_time:18554ms step_avg:60.44ms +step:308/2245 train_time:18613ms step_avg:60.43ms +step:309/2245 train_time:18674ms step_avg:60.43ms +step:310/2245 train_time:18732ms step_avg:60.43ms +step:311/2245 train_time:18793ms step_avg:60.43ms +step:312/2245 train_time:18852ms step_avg:60.42ms +step:313/2245 train_time:18913ms step_avg:60.43ms +step:314/2245 train_time:18972ms step_avg:60.42ms +step:315/2245 train_time:19033ms step_avg:60.42ms +step:316/2245 train_time:19092ms step_avg:60.42ms +step:317/2245 train_time:19154ms step_avg:60.42ms +step:318/2245 train_time:19213ms step_avg:60.42ms +step:319/2245 train_time:19275ms step_avg:60.42ms +step:320/2245 train_time:19333ms step_avg:60.42ms +step:321/2245 train_time:19394ms step_avg:60.42ms +step:322/2245 train_time:19453ms step_avg:60.41ms +step:323/2245 train_time:19514ms step_avg:60.42ms +step:324/2245 train_time:19573ms step_avg:60.41ms +step:325/2245 train_time:19634ms step_avg:60.41ms +step:326/2245 train_time:19693ms step_avg:60.41ms +step:327/2245 train_time:19754ms step_avg:60.41ms +step:328/2245 train_time:19813ms step_avg:60.40ms +step:329/2245 train_time:19873ms step_avg:60.41ms +step:330/2245 train_time:19932ms step_avg:60.40ms +step:331/2245 train_time:19993ms step_avg:60.40ms +step:332/2245 train_time:20051ms step_avg:60.40ms +step:333/2245 train_time:20113ms step_avg:60.40ms +step:334/2245 train_time:20172ms step_avg:60.40ms +step:335/2245 train_time:20234ms step_avg:60.40ms +step:336/2245 train_time:20293ms step_avg:60.39ms +step:337/2245 train_time:20354ms step_avg:60.40ms +step:338/2245 train_time:20413ms step_avg:60.39ms +step:339/2245 train_time:20475ms step_avg:60.40ms +step:340/2245 train_time:20533ms step_avg:60.39ms +step:341/2245 train_time:20594ms step_avg:60.39ms +step:342/2245 train_time:20653ms step_avg:60.39ms +step:343/2245 train_time:20714ms step_avg:60.39ms +step:344/2245 train_time:20772ms step_avg:60.38ms +step:345/2245 train_time:20833ms step_avg:60.39ms +step:346/2245 train_time:20892ms step_avg:60.38ms +step:347/2245 train_time:20953ms step_avg:60.38ms +step:348/2245 train_time:21012ms step_avg:60.38ms +step:349/2245 train_time:21073ms step_avg:60.38ms +step:350/2245 train_time:21131ms step_avg:60.38ms +step:351/2245 train_time:21193ms step_avg:60.38ms +step:352/2245 train_time:21252ms step_avg:60.38ms +step:353/2245 train_time:21314ms step_avg:60.38ms +step:354/2245 train_time:21373ms step_avg:60.38ms +step:355/2245 train_time:21435ms step_avg:60.38ms +step:356/2245 train_time:21493ms step_avg:60.37ms +step:357/2245 train_time:21554ms step_avg:60.38ms +step:358/2245 train_time:21613ms step_avg:60.37ms +step:359/2245 train_time:21675ms step_avg:60.38ms +step:360/2245 train_time:21733ms step_avg:60.37ms +step:361/2245 train_time:21794ms step_avg:60.37ms +step:362/2245 train_time:21853ms step_avg:60.37ms +step:363/2245 train_time:21914ms step_avg:60.37ms +step:364/2245 train_time:21972ms step_avg:60.36ms +step:365/2245 train_time:22033ms step_avg:60.37ms 
+step:366/2245 train_time:22092ms step_avg:60.36ms +step:367/2245 train_time:22153ms step_avg:60.36ms +step:368/2245 train_time:22212ms step_avg:60.36ms +step:369/2245 train_time:22274ms step_avg:60.36ms +step:370/2245 train_time:22333ms step_avg:60.36ms +step:371/2245 train_time:22395ms step_avg:60.36ms +step:372/2245 train_time:22453ms step_avg:60.36ms +step:373/2245 train_time:22515ms step_avg:60.36ms +step:374/2245 train_time:22573ms step_avg:60.36ms +step:375/2245 train_time:22635ms step_avg:60.36ms +step:376/2245 train_time:22693ms step_avg:60.35ms +step:377/2245 train_time:22754ms step_avg:60.36ms +step:378/2245 train_time:22813ms step_avg:60.35ms +step:379/2245 train_time:22874ms step_avg:60.35ms +step:380/2245 train_time:22932ms step_avg:60.35ms +step:381/2245 train_time:22993ms step_avg:60.35ms +step:382/2245 train_time:23052ms step_avg:60.35ms +step:383/2245 train_time:23113ms step_avg:60.35ms +step:384/2245 train_time:23172ms step_avg:60.34ms +step:385/2245 train_time:23233ms step_avg:60.35ms +step:386/2245 train_time:23292ms step_avg:60.34ms +step:387/2245 train_time:23354ms step_avg:60.35ms +step:388/2245 train_time:23413ms step_avg:60.34ms +step:389/2245 train_time:23474ms step_avg:60.35ms +step:390/2245 train_time:23533ms step_avg:60.34ms +step:391/2245 train_time:23594ms step_avg:60.34ms +step:392/2245 train_time:23653ms step_avg:60.34ms +step:393/2245 train_time:23714ms step_avg:60.34ms +step:394/2245 train_time:23772ms step_avg:60.34ms +step:395/2245 train_time:23833ms step_avg:60.34ms +step:396/2245 train_time:23892ms step_avg:60.33ms +step:397/2245 train_time:23953ms step_avg:60.34ms +step:398/2245 train_time:24012ms step_avg:60.33ms +step:399/2245 train_time:24073ms step_avg:60.33ms +step:400/2245 train_time:24131ms step_avg:60.33ms +step:401/2245 train_time:24193ms step_avg:60.33ms +step:402/2245 train_time:24252ms step_avg:60.33ms +step:403/2245 train_time:24314ms step_avg:60.33ms +step:404/2245 train_time:24372ms step_avg:60.33ms +step:405/2245 train_time:24434ms step_avg:60.33ms +step:406/2245 train_time:24493ms step_avg:60.33ms +step:407/2245 train_time:24554ms step_avg:60.33ms +step:408/2245 train_time:24613ms step_avg:60.33ms +step:409/2245 train_time:24675ms step_avg:60.33ms +step:410/2245 train_time:24733ms step_avg:60.32ms +step:411/2245 train_time:24794ms step_avg:60.33ms +step:412/2245 train_time:24853ms step_avg:60.32ms +step:413/2245 train_time:24914ms step_avg:60.32ms +step:414/2245 train_time:24972ms step_avg:60.32ms +step:415/2245 train_time:25034ms step_avg:60.32ms +step:416/2245 train_time:25092ms step_avg:60.32ms +step:417/2245 train_time:25154ms step_avg:60.32ms +step:418/2245 train_time:25213ms step_avg:60.32ms +step:419/2245 train_time:25274ms step_avg:60.32ms +step:420/2245 train_time:25332ms step_avg:60.32ms +step:421/2245 train_time:25394ms step_avg:60.32ms +step:422/2245 train_time:25452ms step_avg:60.31ms +step:423/2245 train_time:25515ms step_avg:60.32ms +step:424/2245 train_time:25573ms step_avg:60.31ms +step:425/2245 train_time:25634ms step_avg:60.32ms +step:426/2245 train_time:25693ms step_avg:60.31ms +step:427/2245 train_time:25754ms step_avg:60.31ms +step:428/2245 train_time:25812ms step_avg:60.31ms +step:429/2245 train_time:25873ms step_avg:60.31ms +step:430/2245 train_time:25932ms step_avg:60.31ms +step:431/2245 train_time:25993ms step_avg:60.31ms +step:432/2245 train_time:26052ms step_avg:60.31ms +step:433/2245 train_time:26113ms step_avg:60.31ms +step:434/2245 train_time:26172ms step_avg:60.30ms +step:435/2245 train_time:26233ms 
step_avg:60.31ms +step:436/2245 train_time:26292ms step_avg:60.30ms +step:437/2245 train_time:26354ms step_avg:60.31ms +step:438/2245 train_time:26413ms step_avg:60.30ms +step:439/2245 train_time:26474ms step_avg:60.31ms +step:440/2245 train_time:26533ms step_avg:60.30ms +step:441/2245 train_time:26594ms step_avg:60.30ms +step:442/2245 train_time:26652ms step_avg:60.30ms +step:443/2245 train_time:26714ms step_avg:60.30ms +step:444/2245 train_time:26772ms step_avg:60.30ms +step:445/2245 train_time:26833ms step_avg:60.30ms +step:446/2245 train_time:26892ms step_avg:60.30ms +step:447/2245 train_time:26953ms step_avg:60.30ms +step:448/2245 train_time:27012ms step_avg:60.29ms +step:449/2245 train_time:27073ms step_avg:60.30ms +step:450/2245 train_time:27131ms step_avg:60.29ms +step:451/2245 train_time:27193ms step_avg:60.30ms +step:452/2245 train_time:27252ms step_avg:60.29ms +step:453/2245 train_time:27313ms step_avg:60.29ms +step:454/2245 train_time:27372ms step_avg:60.29ms +step:455/2245 train_time:27434ms step_avg:60.29ms +step:456/2245 train_time:27493ms step_avg:60.29ms +step:457/2245 train_time:27554ms step_avg:60.29ms +step:458/2245 train_time:27613ms step_avg:60.29ms +step:459/2245 train_time:27674ms step_avg:60.29ms +step:460/2245 train_time:27733ms step_avg:60.29ms +step:461/2245 train_time:27794ms step_avg:60.29ms +step:462/2245 train_time:27852ms step_avg:60.29ms +step:463/2245 train_time:27914ms step_avg:60.29ms +step:464/2245 train_time:27973ms step_avg:60.29ms +step:465/2245 train_time:28034ms step_avg:60.29ms +step:466/2245 train_time:28093ms step_avg:60.28ms +step:467/2245 train_time:28154ms step_avg:60.29ms +step:468/2245 train_time:28213ms step_avg:60.28ms +step:469/2245 train_time:28274ms step_avg:60.29ms +step:470/2245 train_time:28333ms step_avg:60.28ms +step:471/2245 train_time:28394ms step_avg:60.28ms +step:472/2245 train_time:28453ms step_avg:60.28ms +step:473/2245 train_time:28514ms step_avg:60.28ms +step:474/2245 train_time:28573ms step_avg:60.28ms +step:475/2245 train_time:28634ms step_avg:60.28ms +step:476/2245 train_time:28693ms step_avg:60.28ms +step:477/2245 train_time:28754ms step_avg:60.28ms +step:478/2245 train_time:28813ms step_avg:60.28ms +step:479/2245 train_time:28874ms step_avg:60.28ms +step:480/2245 train_time:28932ms step_avg:60.28ms +step:481/2245 train_time:28994ms step_avg:60.28ms +step:482/2245 train_time:29052ms step_avg:60.27ms +step:483/2245 train_time:29114ms step_avg:60.28ms +step:484/2245 train_time:29173ms step_avg:60.27ms +step:485/2245 train_time:29234ms step_avg:60.28ms +step:486/2245 train_time:29292ms step_avg:60.27ms +step:487/2245 train_time:29354ms step_avg:60.27ms +step:488/2245 train_time:29413ms step_avg:60.27ms +step:489/2245 train_time:29474ms step_avg:60.27ms +step:490/2245 train_time:29533ms step_avg:60.27ms +step:491/2245 train_time:29594ms step_avg:60.27ms +step:492/2245 train_time:29653ms step_avg:60.27ms +step:493/2245 train_time:29715ms step_avg:60.27ms +step:494/2245 train_time:29774ms step_avg:60.27ms +step:495/2245 train_time:29835ms step_avg:60.27ms +step:496/2245 train_time:29893ms step_avg:60.27ms +step:497/2245 train_time:29954ms step_avg:60.27ms +step:498/2245 train_time:30013ms step_avg:60.27ms +step:499/2245 train_time:30074ms step_avg:60.27ms +step:500/2245 train_time:30133ms step_avg:60.27ms +step:500/2245 val_loss:3.8285 train_time:30195ms step_avg:60.39ms +step:501/2245 train_time:30213ms step_avg:60.31ms +step:502/2245 train_time:30256ms step_avg:60.27ms +step:503/2245 train_time:30319ms step_avg:60.28ms 
+step:504/2245 train_time:30379ms step_avg:60.27ms +step:505/2245 train_time:30441ms step_avg:60.28ms +step:506/2245 train_time:30500ms step_avg:60.28ms +step:507/2245 train_time:30561ms step_avg:60.28ms +step:508/2245 train_time:30620ms step_avg:60.28ms +step:509/2245 train_time:30681ms step_avg:60.28ms +step:510/2245 train_time:30740ms step_avg:60.27ms +step:511/2245 train_time:30802ms step_avg:60.28ms +step:512/2245 train_time:30861ms step_avg:60.28ms +step:513/2245 train_time:30923ms step_avg:60.28ms +step:514/2245 train_time:30982ms step_avg:60.28ms +step:515/2245 train_time:31042ms step_avg:60.28ms +step:516/2245 train_time:31101ms step_avg:60.27ms +step:517/2245 train_time:31165ms step_avg:60.28ms +step:518/2245 train_time:31225ms step_avg:60.28ms +step:519/2245 train_time:31286ms step_avg:60.28ms +step:520/2245 train_time:31345ms step_avg:60.28ms +step:521/2245 train_time:31407ms step_avg:60.28ms +step:522/2245 train_time:31466ms step_avg:60.28ms +step:523/2245 train_time:31527ms step_avg:60.28ms +step:524/2245 train_time:31586ms step_avg:60.28ms +step:525/2245 train_time:31647ms step_avg:60.28ms +step:526/2245 train_time:31706ms step_avg:60.28ms +step:527/2245 train_time:31767ms step_avg:60.28ms +step:528/2245 train_time:31826ms step_avg:60.28ms +step:529/2245 train_time:31887ms step_avg:60.28ms +step:530/2245 train_time:31946ms step_avg:60.28ms +step:531/2245 train_time:32007ms step_avg:60.28ms +step:532/2245 train_time:32066ms step_avg:60.27ms +step:533/2245 train_time:32128ms step_avg:60.28ms +step:534/2245 train_time:32187ms step_avg:60.28ms +step:535/2245 train_time:32249ms step_avg:60.28ms +step:536/2245 train_time:32308ms step_avg:60.28ms +step:537/2245 train_time:32369ms step_avg:60.28ms +step:538/2245 train_time:32428ms step_avg:60.28ms +step:539/2245 train_time:32490ms step_avg:60.28ms +step:540/2245 train_time:32549ms step_avg:60.28ms +step:541/2245 train_time:32611ms step_avg:60.28ms +step:542/2245 train_time:32669ms step_avg:60.28ms +step:543/2245 train_time:32730ms step_avg:60.28ms +step:544/2245 train_time:32789ms step_avg:60.27ms +step:545/2245 train_time:32850ms step_avg:60.28ms +step:546/2245 train_time:32909ms step_avg:60.27ms +step:547/2245 train_time:32971ms step_avg:60.28ms +step:548/2245 train_time:33029ms step_avg:60.27ms +step:549/2245 train_time:33091ms step_avg:60.28ms +step:550/2245 train_time:33150ms step_avg:60.27ms +step:551/2245 train_time:33212ms step_avg:60.28ms +step:552/2245 train_time:33271ms step_avg:60.27ms +step:553/2245 train_time:33332ms step_avg:60.28ms +step:554/2245 train_time:33391ms step_avg:60.27ms +step:555/2245 train_time:33452ms step_avg:60.27ms +step:556/2245 train_time:33511ms step_avg:60.27ms +step:557/2245 train_time:33573ms step_avg:60.28ms +step:558/2245 train_time:33632ms step_avg:60.27ms +step:559/2245 train_time:33694ms step_avg:60.28ms +step:560/2245 train_time:33754ms step_avg:60.27ms +step:561/2245 train_time:33816ms step_avg:60.28ms +step:562/2245 train_time:33875ms step_avg:60.28ms +step:563/2245 train_time:33937ms step_avg:60.28ms +step:564/2245 train_time:33996ms step_avg:60.28ms +step:565/2245 train_time:34058ms step_avg:60.28ms +step:566/2245 train_time:34118ms step_avg:60.28ms +step:567/2245 train_time:34180ms step_avg:60.28ms +step:568/2245 train_time:34239ms step_avg:60.28ms +step:569/2245 train_time:34301ms step_avg:60.28ms +step:570/2245 train_time:34360ms step_avg:60.28ms +step:571/2245 train_time:34423ms step_avg:60.28ms +step:572/2245 train_time:34482ms step_avg:60.28ms +step:573/2245 train_time:34543ms 
step_avg:60.28ms +step:574/2245 train_time:34602ms step_avg:60.28ms +step:575/2245 train_time:34664ms step_avg:60.28ms +step:576/2245 train_time:34723ms step_avg:60.28ms +step:577/2245 train_time:34784ms step_avg:60.28ms +step:578/2245 train_time:34843ms step_avg:60.28ms +step:579/2245 train_time:34904ms step_avg:60.28ms +step:580/2245 train_time:34963ms step_avg:60.28ms +step:581/2245 train_time:35025ms step_avg:60.28ms +step:582/2245 train_time:35083ms step_avg:60.28ms +step:583/2245 train_time:35145ms step_avg:60.28ms +step:584/2245 train_time:35204ms step_avg:60.28ms +step:585/2245 train_time:35265ms step_avg:60.28ms +step:586/2245 train_time:35324ms step_avg:60.28ms +step:587/2245 train_time:35385ms step_avg:60.28ms +step:588/2245 train_time:35444ms step_avg:60.28ms +step:589/2245 train_time:35506ms step_avg:60.28ms +step:590/2245 train_time:35566ms step_avg:60.28ms +step:591/2245 train_time:35627ms step_avg:60.28ms +step:592/2245 train_time:35686ms step_avg:60.28ms +step:593/2245 train_time:35747ms step_avg:60.28ms +step:594/2245 train_time:35805ms step_avg:60.28ms +step:595/2245 train_time:35866ms step_avg:60.28ms +step:596/2245 train_time:35925ms step_avg:60.28ms +step:597/2245 train_time:35986ms step_avg:60.28ms +step:598/2245 train_time:36045ms step_avg:60.28ms +step:599/2245 train_time:36106ms step_avg:60.28ms +step:600/2245 train_time:36166ms step_avg:60.28ms +step:601/2245 train_time:36227ms step_avg:60.28ms +step:602/2245 train_time:36285ms step_avg:60.27ms +step:603/2245 train_time:36347ms step_avg:60.28ms +step:604/2245 train_time:36405ms step_avg:60.27ms +step:605/2245 train_time:36467ms step_avg:60.28ms +step:606/2245 train_time:36526ms step_avg:60.27ms +step:607/2245 train_time:36587ms step_avg:60.28ms +step:608/2245 train_time:36646ms step_avg:60.27ms +step:609/2245 train_time:36708ms step_avg:60.28ms +step:610/2245 train_time:36766ms step_avg:60.27ms +step:611/2245 train_time:36828ms step_avg:60.27ms +step:612/2245 train_time:36887ms step_avg:60.27ms +step:613/2245 train_time:36948ms step_avg:60.27ms +step:614/2245 train_time:37007ms step_avg:60.27ms +step:615/2245 train_time:37068ms step_avg:60.27ms +step:616/2245 train_time:37127ms step_avg:60.27ms +step:617/2245 train_time:37189ms step_avg:60.27ms +step:618/2245 train_time:37248ms step_avg:60.27ms +step:619/2245 train_time:37309ms step_avg:60.27ms +step:620/2245 train_time:37368ms step_avg:60.27ms +step:621/2245 train_time:37430ms step_avg:60.27ms +step:622/2245 train_time:37489ms step_avg:60.27ms +step:623/2245 train_time:37550ms step_avg:60.27ms +step:624/2245 train_time:37610ms step_avg:60.27ms +step:625/2245 train_time:37671ms step_avg:60.27ms +step:626/2245 train_time:37730ms step_avg:60.27ms +step:627/2245 train_time:37791ms step_avg:60.27ms +step:628/2245 train_time:37850ms step_avg:60.27ms +step:629/2245 train_time:37912ms step_avg:60.27ms +step:630/2245 train_time:37971ms step_avg:60.27ms +step:631/2245 train_time:38032ms step_avg:60.27ms +step:632/2245 train_time:38091ms step_avg:60.27ms +step:633/2245 train_time:38152ms step_avg:60.27ms +step:634/2245 train_time:38211ms step_avg:60.27ms +step:635/2245 train_time:38273ms step_avg:60.27ms +step:636/2245 train_time:38332ms step_avg:60.27ms +step:637/2245 train_time:38393ms step_avg:60.27ms +step:638/2245 train_time:38452ms step_avg:60.27ms +step:639/2245 train_time:38514ms step_avg:60.27ms +step:640/2245 train_time:38573ms step_avg:60.27ms +step:641/2245 train_time:38634ms step_avg:60.27ms +step:642/2245 train_time:38693ms step_avg:60.27ms +step:643/2245 
train_time:38755ms step_avg:60.27ms +step:644/2245 train_time:38815ms step_avg:60.27ms +step:645/2245 train_time:38876ms step_avg:60.27ms +step:646/2245 train_time:38936ms step_avg:60.27ms +step:647/2245 train_time:38999ms step_avg:60.28ms +step:648/2245 train_time:39059ms step_avg:60.28ms +step:649/2245 train_time:39120ms step_avg:60.28ms +step:650/2245 train_time:39180ms step_avg:60.28ms +step:651/2245 train_time:39242ms step_avg:60.28ms +step:652/2245 train_time:39301ms step_avg:60.28ms +step:653/2245 train_time:39363ms step_avg:60.28ms +step:654/2245 train_time:39423ms step_avg:60.28ms +step:655/2245 train_time:39484ms step_avg:60.28ms +step:656/2245 train_time:39542ms step_avg:60.28ms +step:657/2245 train_time:39604ms step_avg:60.28ms +step:658/2245 train_time:39663ms step_avg:60.28ms +step:659/2245 train_time:39724ms step_avg:60.28ms +step:660/2245 train_time:39783ms step_avg:60.28ms +step:661/2245 train_time:39844ms step_avg:60.28ms +step:662/2245 train_time:39903ms step_avg:60.28ms +step:663/2245 train_time:39965ms step_avg:60.28ms +step:664/2245 train_time:40023ms step_avg:60.28ms +step:665/2245 train_time:40085ms step_avg:60.28ms +step:666/2245 train_time:40144ms step_avg:60.28ms +step:667/2245 train_time:40205ms step_avg:60.28ms +step:668/2245 train_time:40264ms step_avg:60.28ms +step:669/2245 train_time:40326ms step_avg:60.28ms +step:670/2245 train_time:40385ms step_avg:60.28ms +step:671/2245 train_time:40446ms step_avg:60.28ms +step:672/2245 train_time:40505ms step_avg:60.27ms +step:673/2245 train_time:40566ms step_avg:60.28ms +step:674/2245 train_time:40624ms step_avg:60.27ms +step:675/2245 train_time:40686ms step_avg:60.27ms +step:676/2245 train_time:40744ms step_avg:60.27ms +step:677/2245 train_time:40806ms step_avg:60.27ms +step:678/2245 train_time:40864ms step_avg:60.27ms +step:679/2245 train_time:40926ms step_avg:60.27ms +step:680/2245 train_time:40984ms step_avg:60.27ms +step:681/2245 train_time:41045ms step_avg:60.27ms +step:682/2245 train_time:41104ms step_avg:60.27ms +step:683/2245 train_time:41166ms step_avg:60.27ms +step:684/2245 train_time:41225ms step_avg:60.27ms +step:685/2245 train_time:41286ms step_avg:60.27ms +step:686/2245 train_time:41345ms step_avg:60.27ms +step:687/2245 train_time:41406ms step_avg:60.27ms +step:688/2245 train_time:41465ms step_avg:60.27ms +step:689/2245 train_time:41526ms step_avg:60.27ms +step:690/2245 train_time:41584ms step_avg:60.27ms +step:691/2245 train_time:41645ms step_avg:60.27ms +step:692/2245 train_time:41704ms step_avg:60.27ms +step:693/2245 train_time:41765ms step_avg:60.27ms +step:694/2245 train_time:41824ms step_avg:60.27ms +step:695/2245 train_time:41885ms step_avg:60.27ms +step:696/2245 train_time:41944ms step_avg:60.27ms +step:697/2245 train_time:42006ms step_avg:60.27ms +step:698/2245 train_time:42064ms step_avg:60.26ms +step:699/2245 train_time:42126ms step_avg:60.27ms +step:700/2245 train_time:42185ms step_avg:60.26ms +step:701/2245 train_time:42247ms step_avg:60.27ms +step:702/2245 train_time:42306ms step_avg:60.26ms +step:703/2245 train_time:42367ms step_avg:60.27ms +step:704/2245 train_time:42426ms step_avg:60.26ms +step:705/2245 train_time:42487ms step_avg:60.27ms +step:706/2245 train_time:42546ms step_avg:60.26ms +step:707/2245 train_time:42608ms step_avg:60.27ms +step:708/2245 train_time:42667ms step_avg:60.26ms +step:709/2245 train_time:42728ms step_avg:60.27ms +step:710/2245 train_time:42787ms step_avg:60.26ms +step:711/2245 train_time:42849ms step_avg:60.27ms +step:712/2245 train_time:42908ms step_avg:60.26ms 
+step:713/2245 train_time:42969ms step_avg:60.27ms +step:714/2245 train_time:43028ms step_avg:60.26ms +step:715/2245 train_time:43090ms step_avg:60.27ms +step:716/2245 train_time:43149ms step_avg:60.26ms +step:717/2245 train_time:43211ms step_avg:60.27ms +step:718/2245 train_time:43739ms step_avg:60.92ms +step:719/2245 train_time:43799ms step_avg:60.92ms +step:720/2245 train_time:43857ms step_avg:60.91ms +step:721/2245 train_time:43917ms step_avg:60.91ms +step:722/2245 train_time:43976ms step_avg:60.91ms +step:723/2245 train_time:44036ms step_avg:60.91ms +step:724/2245 train_time:44094ms step_avg:60.90ms +step:725/2245 train_time:44155ms step_avg:60.90ms +step:726/2245 train_time:44213ms step_avg:60.90ms +step:727/2245 train_time:44273ms step_avg:60.90ms +step:728/2245 train_time:44331ms step_avg:60.89ms +step:729/2245 train_time:44392ms step_avg:60.89ms +step:730/2245 train_time:44451ms step_avg:60.89ms +step:731/2245 train_time:44511ms step_avg:60.89ms +step:732/2245 train_time:44571ms step_avg:60.89ms +step:733/2245 train_time:44639ms step_avg:60.90ms +step:734/2245 train_time:44703ms step_avg:60.90ms +step:735/2245 train_time:44767ms step_avg:60.91ms +step:736/2245 train_time:44826ms step_avg:60.91ms +step:737/2245 train_time:44889ms step_avg:60.91ms +step:738/2245 train_time:44948ms step_avg:60.91ms +step:739/2245 train_time:45009ms step_avg:60.91ms +step:740/2245 train_time:45069ms step_avg:60.90ms +step:741/2245 train_time:45130ms step_avg:60.90ms +step:742/2245 train_time:45190ms step_avg:60.90ms +step:743/2245 train_time:45251ms step_avg:60.90ms +step:744/2245 train_time:45310ms step_avg:60.90ms +step:745/2245 train_time:45371ms step_avg:60.90ms +step:746/2245 train_time:45430ms step_avg:60.90ms +step:747/2245 train_time:45493ms step_avg:60.90ms +step:748/2245 train_time:45553ms step_avg:60.90ms +step:749/2245 train_time:45618ms step_avg:60.91ms +step:750/2245 train_time:45680ms step_avg:60.91ms +step:750/2245 val_loss:3.6681 train_time:45745ms step_avg:60.99ms +step:751/2245 train_time:45763ms step_avg:60.94ms +step:752/2245 train_time:45805ms step_avg:60.91ms +step:753/2245 train_time:45868ms step_avg:60.91ms +step:754/2245 train_time:45929ms step_avg:60.91ms +step:755/2245 train_time:45993ms step_avg:60.92ms +step:756/2245 train_time:46053ms step_avg:60.92ms +step:757/2245 train_time:46114ms step_avg:60.92ms +step:758/2245 train_time:46173ms step_avg:60.91ms +step:759/2245 train_time:46235ms step_avg:60.92ms +step:760/2245 train_time:46294ms step_avg:60.91ms +step:761/2245 train_time:46355ms step_avg:60.91ms +step:762/2245 train_time:46414ms step_avg:60.91ms +step:763/2245 train_time:46476ms step_avg:60.91ms +step:764/2245 train_time:46535ms step_avg:60.91ms +step:765/2245 train_time:46597ms step_avg:60.91ms +step:766/2245 train_time:46661ms step_avg:60.92ms +step:767/2245 train_time:46728ms step_avg:60.92ms +step:768/2245 train_time:46789ms step_avg:60.92ms +step:769/2245 train_time:46853ms step_avg:60.93ms +step:770/2245 train_time:46914ms step_avg:60.93ms +step:771/2245 train_time:46976ms step_avg:60.93ms +step:772/2245 train_time:47036ms step_avg:60.93ms +step:773/2245 train_time:47098ms step_avg:60.93ms +step:774/2245 train_time:47157ms step_avg:60.93ms +step:775/2245 train_time:47219ms step_avg:60.93ms +step:776/2245 train_time:47278ms step_avg:60.92ms +step:777/2245 train_time:47339ms step_avg:60.93ms +step:778/2245 train_time:47398ms step_avg:60.92ms +step:779/2245 train_time:47460ms step_avg:60.92ms +step:780/2245 train_time:47518ms step_avg:60.92ms +step:781/2245 
train_time:47581ms step_avg:60.92ms
+step:782/2245 train_time:47642ms step_avg:60.92ms
+step:783/2245 train_time:47705ms step_avg:60.93ms
+step:784/2245 train_time:47766ms step_avg:60.93ms
+step:785/2245 train_time:47829ms step_avg:60.93ms
+step:786/2245 train_time:47890ms step_avg:60.93ms
+step:787/2245 train_time:47953ms step_avg:60.93ms
+step:788/2245 train_time:48013ms step_avg:60.93ms
+step:789/2245 train_time:48075ms step_avg:60.93ms
+step:790/2245 train_time:48135ms step_avg:60.93ms
+step:791/2245 train_time:48197ms step_avg:60.93ms
+step:792/2245 train_time:48256ms step_avg:60.93ms
+step:793/2245 train_time:48318ms step_avg:60.93ms
+step:794/2245 train_time:48377ms step_avg:60.93ms
+step:795/2245 train_time:48439ms step_avg:60.93ms
+step:796/2245 train_time:48498ms step_avg:60.93ms
+step:797/2245 train_time:48560ms step_avg:60.93ms
+step:798/2245 train_time:48620ms step_avg:60.93ms
+step:799/2245 train_time:48683ms step_avg:60.93ms
+step:800/2245 train_time:48743ms step_avg:60.93ms
+step:801/2245 train_time:48806ms step_avg:60.93ms
+step:802/2245 train_time:48866ms step_avg:60.93ms
+step:803/2245 train_time:48928ms step_avg:60.93ms
+step:804/2245 train_time:48989ms step_avg:60.93ms
+step:805/2245 train_time:49051ms step_avg:60.93ms
+step:806/2245 train_time:49112ms step_avg:60.93ms
+step:807/2245 train_time:49174ms step_avg:60.93ms
+step:808/2245 train_time:49233ms step_avg:60.93ms
+step:809/2245 train_time:49295ms step_avg:60.93ms
+step:810/2245 train_time:49354ms step_avg:60.93ms
+step:811/2245 train_time:49416ms step_avg:60.93ms
+step:812/2245 train_time:49476ms step_avg:60.93ms
+step:813/2245 train_time:49539ms step_avg:60.93ms
+step:814/2245 train_time:49599ms step_avg:60.93ms
+step:815/2245 train_time:49661ms step_avg:60.93ms
+step:816/2245 train_time:49722ms step_avg:60.93ms
+step:817/2245 train_time:49784ms step_avg:60.94ms
+step:818/2245 train_time:49844ms step_avg:60.93ms
+step:819/2245 train_time:49906ms step_avg:60.93ms
+step:820/2245 train_time:49966ms step_avg:60.93ms
+step:821/2245 train_time:50028ms step_avg:60.94ms
+step:822/2245 train_time:50088ms step_avg:60.93ms
+step:823/2245 train_time:50151ms step_avg:60.94ms
+step:824/2245 train_time:50210ms step_avg:60.94ms
+step:825/2245 train_time:50273ms step_avg:60.94ms
+step:826/2245 train_time:50333ms step_avg:60.94ms
+step:827/2245 train_time:50395ms step_avg:60.94ms
+step:828/2245 train_time:50455ms step_avg:60.94ms
+step:829/2245 train_time:50518ms step_avg:60.94ms
+step:830/2245 train_time:50578ms step_avg:60.94ms
+step:831/2245 train_time:50640ms step_avg:60.94ms
+step:832/2245 train_time:50700ms step_avg:60.94ms
+step:833/2245 train_time:50763ms step_avg:60.94ms
+step:834/2245 train_time:50822ms step_avg:60.94ms
+step:835/2245 train_time:50884ms step_avg:60.94ms
+step:836/2245 train_time:50944ms step_avg:60.94ms
+step:837/2245 train_time:51006ms step_avg:60.94ms
+step:838/2245 train_time:51066ms step_avg:60.94ms
+step:839/2245 train_time:51128ms step_avg:60.94ms
+step:840/2245 train_time:51188ms step_avg:60.94ms
+step:841/2245 train_time:51251ms step_avg:60.94ms
+step:842/2245 train_time:51310ms step_avg:60.94ms
+step:843/2245 train_time:51373ms step_avg:60.94ms
+step:844/2245 train_time:51433ms step_avg:60.94ms
+step:845/2245 train_time:51495ms step_avg:60.94ms
+step:846/2245 train_time:51556ms step_avg:60.94ms
+step:847/2245 train_time:51620ms step_avg:60.94ms
+step:848/2245 train_time:51680ms step_avg:60.94ms
+step:849/2245 train_time:51742ms step_avg:60.95ms
+step:850/2245 train_time:51802ms step_avg:60.94ms
+step:851/2245 train_time:51864ms step_avg:60.94ms
+step:852/2245 train_time:51924ms step_avg:60.94ms
+step:853/2245 train_time:51985ms step_avg:60.94ms
+step:854/2245 train_time:52045ms step_avg:60.94ms
+step:855/2245 train_time:52107ms step_avg:60.94ms
+step:856/2245 train_time:52167ms step_avg:60.94ms
+step:857/2245 train_time:52230ms step_avg:60.94ms
+step:858/2245 train_time:52290ms step_avg:60.94ms
+step:859/2245 train_time:52353ms step_avg:60.95ms
+step:860/2245 train_time:52412ms step_avg:60.94ms
+step:861/2245 train_time:52475ms step_avg:60.95ms
+step:862/2245 train_time:52536ms step_avg:60.95ms
+step:863/2245 train_time:52599ms step_avg:60.95ms
+step:864/2245 train_time:52660ms step_avg:60.95ms
+step:865/2245 train_time:52722ms step_avg:60.95ms
+step:866/2245 train_time:52782ms step_avg:60.95ms
+step:867/2245 train_time:52845ms step_avg:60.95ms
+step:868/2245 train_time:52904ms step_avg:60.95ms
+step:869/2245 train_time:52966ms step_avg:60.95ms
+step:870/2245 train_time:53025ms step_avg:60.95ms
+step:871/2245 train_time:53087ms step_avg:60.95ms
+step:872/2245 train_time:53147ms step_avg:60.95ms
+step:873/2245 train_time:53209ms step_avg:60.95ms
+step:874/2245 train_time:53269ms step_avg:60.95ms
+step:875/2245 train_time:53332ms step_avg:60.95ms
+step:876/2245 train_time:53392ms step_avg:60.95ms
+step:877/2245 train_time:53454ms step_avg:60.95ms
+step:878/2245 train_time:53514ms step_avg:60.95ms
+step:879/2245 train_time:53577ms step_avg:60.95ms
+step:880/2245 train_time:53638ms step_avg:60.95ms
+step:881/2245 train_time:53701ms step_avg:60.95ms
+step:882/2245 train_time:53761ms step_avg:60.95ms
+step:883/2245 train_time:53824ms step_avg:60.96ms
+step:884/2245 train_time:53884ms step_avg:60.95ms
+step:885/2245 train_time:53947ms step_avg:60.96ms
+step:886/2245 train_time:54006ms step_avg:60.96ms
+step:887/2245 train_time:54068ms step_avg:60.96ms
+step:888/2245 train_time:54127ms step_avg:60.95ms
+step:889/2245 train_time:54189ms step_avg:60.96ms
+step:890/2245 train_time:54249ms step_avg:60.95ms
+step:891/2245 train_time:54312ms step_avg:60.96ms
+step:892/2245 train_time:54372ms step_avg:60.96ms
+step:893/2245 train_time:54434ms step_avg:60.96ms
+step:894/2245 train_time:54495ms step_avg:60.96ms
+step:895/2245 train_time:54558ms step_avg:60.96ms
+step:896/2245 train_time:54618ms step_avg:60.96ms
+step:897/2245 train_time:54681ms step_avg:60.96ms
+step:898/2245 train_time:54742ms step_avg:60.96ms
+step:899/2245 train_time:54803ms step_avg:60.96ms
+step:900/2245 train_time:54863ms step_avg:60.96ms
+step:901/2245 train_time:54925ms step_avg:60.96ms
+step:902/2245 train_time:54985ms step_avg:60.96ms
+step:903/2245 train_time:55048ms step_avg:60.96ms
+step:904/2245 train_time:55107ms step_avg:60.96ms
+step:905/2245 train_time:55169ms step_avg:60.96ms
+step:906/2245 train_time:55228ms step_avg:60.96ms
+step:907/2245 train_time:55291ms step_avg:60.96ms
+step:908/2245 train_time:55351ms step_avg:60.96ms
+step:909/2245 train_time:55414ms step_avg:60.96ms
+step:910/2245 train_time:55474ms step_avg:60.96ms
+step:911/2245 train_time:55538ms step_avg:60.96ms
+step:912/2245 train_time:55598ms step_avg:60.96ms
+step:913/2245 train_time:55661ms step_avg:60.96ms
+step:914/2245 train_time:55721ms step_avg:60.96ms
+step:915/2245 train_time:55784ms step_avg:60.97ms
+step:916/2245 train_time:55844ms step_avg:60.96ms
+step:917/2245 train_time:55906ms step_avg:60.97ms
+step:918/2245 train_time:55965ms step_avg:60.96ms
+step:919/2245 train_time:56027ms step_avg:60.97ms
+step:920/2245 train_time:56086ms step_avg:60.96ms
+step:921/2245 train_time:56148ms step_avg:60.96ms
+step:922/2245 train_time:56208ms step_avg:60.96ms
+step:923/2245 train_time:56270ms step_avg:60.96ms
+step:924/2245 train_time:56329ms step_avg:60.96ms
+step:925/2245 train_time:56392ms step_avg:60.96ms
+step:926/2245 train_time:56452ms step_avg:60.96ms
+step:927/2245 train_time:56514ms step_avg:60.96ms
+step:928/2245 train_time:56574ms step_avg:60.96ms
+step:929/2245 train_time:56637ms step_avg:60.97ms
+step:930/2245 train_time:56697ms step_avg:60.96ms
+step:931/2245 train_time:56760ms step_avg:60.97ms
+step:932/2245 train_time:56820ms step_avg:60.97ms
+step:933/2245 train_time:56882ms step_avg:60.97ms
+step:934/2245 train_time:56942ms step_avg:60.97ms
+step:935/2245 train_time:57003ms step_avg:60.97ms
+step:936/2245 train_time:57063ms step_avg:60.97ms
+step:937/2245 train_time:57125ms step_avg:60.97ms
+step:938/2245 train_time:57184ms step_avg:60.96ms
+step:939/2245 train_time:57246ms step_avg:60.97ms
+step:940/2245 train_time:57306ms step_avg:60.96ms
+step:941/2245 train_time:57368ms step_avg:60.97ms
+step:942/2245 train_time:57428ms step_avg:60.96ms
+step:943/2245 train_time:57491ms step_avg:60.97ms
+step:944/2245 train_time:57551ms step_avg:60.96ms
+step:945/2245 train_time:57613ms step_avg:60.97ms
+step:946/2245 train_time:57674ms step_avg:60.97ms
+step:947/2245 train_time:57737ms step_avg:60.97ms
+step:948/2245 train_time:57797ms step_avg:60.97ms
+step:949/2245 train_time:57860ms step_avg:60.97ms
+step:950/2245 train_time:57920ms step_avg:60.97ms
+step:951/2245 train_time:57982ms step_avg:60.97ms
+step:952/2245 train_time:58043ms step_avg:60.97ms
+step:953/2245 train_time:58105ms step_avg:60.97ms
+step:954/2245 train_time:58164ms step_avg:60.97ms
+step:955/2245 train_time:58226ms step_avg:60.97ms
+step:956/2245 train_time:58286ms step_avg:60.97ms
+step:957/2245 train_time:58348ms step_avg:60.97ms
+step:958/2245 train_time:58408ms step_avg:60.97ms
+step:959/2245 train_time:58471ms step_avg:60.97ms
+step:960/2245 train_time:58531ms step_avg:60.97ms
+step:961/2245 train_time:58594ms step_avg:60.97ms
+step:962/2245 train_time:58654ms step_avg:60.97ms
+step:963/2245 train_time:58717ms step_avg:60.97ms
+step:964/2245 train_time:58778ms step_avg:60.97ms
+step:965/2245 train_time:58841ms step_avg:60.98ms
+step:966/2245 train_time:58901ms step_avg:60.97ms
+step:967/2245 train_time:58964ms step_avg:60.98ms
+step:968/2245 train_time:59023ms step_avg:60.97ms
+step:969/2245 train_time:59086ms step_avg:60.98ms
+step:970/2245 train_time:59145ms step_avg:60.97ms
+step:971/2245 train_time:59207ms step_avg:60.98ms
+step:972/2245 train_time:59267ms step_avg:60.97ms
+step:973/2245 train_time:59329ms step_avg:60.97ms
+step:974/2245 train_time:59388ms step_avg:60.97ms
+step:975/2245 train_time:59451ms step_avg:60.97ms
+step:976/2245 train_time:59510ms step_avg:60.97ms
+step:977/2245 train_time:59573ms step_avg:60.98ms
+step:978/2245 train_time:59633ms step_avg:60.97ms
+step:979/2245 train_time:59695ms step_avg:60.98ms
+step:980/2245 train_time:59757ms step_avg:60.98ms
+step:981/2245 train_time:59820ms step_avg:60.98ms
+step:982/2245 train_time:59880ms step_avg:60.98ms
+step:983/2245 train_time:59942ms step_avg:60.98ms
+step:984/2245 train_time:60001ms step_avg:60.98ms
+step:985/2245 train_time:60063ms step_avg:60.98ms
+step:986/2245 train_time:60122ms step_avg:60.98ms
+step:987/2245 train_time:60185ms step_avg:60.98ms
+step:988/2245 train_time:60245ms step_avg:60.98ms
+step:989/2245 train_time:60306ms step_avg:60.98ms
+step:990/2245 train_time:60366ms step_avg:60.98ms
+step:991/2245 train_time:60428ms step_avg:60.98ms
+step:992/2245 train_time:60488ms step_avg:60.98ms
+step:993/2245 train_time:60551ms step_avg:60.98ms
+step:994/2245 train_time:60611ms step_avg:60.98ms
+step:995/2245 train_time:60674ms step_avg:60.98ms
+step:996/2245 train_time:60734ms step_avg:60.98ms
+step:997/2245 train_time:60797ms step_avg:60.98ms
+step:998/2245 train_time:60858ms step_avg:60.98ms
+step:999/2245 train_time:60920ms step_avg:60.98ms
+step:1000/2245 train_time:60980ms step_avg:60.98ms
+step:1000/2245 val_loss:3.5907 train_time:61043ms step_avg:61.04ms
+step:1001/2245 train_time:61061ms step_avg:61.00ms
+step:1002/2245 train_time:61106ms step_avg:60.98ms
+step:1003/2245 train_time:61171ms step_avg:60.99ms
+step:1004/2245 train_time:61233ms step_avg:60.99ms
+step:1005/2245 train_time:61297ms step_avg:60.99ms
+step:1006/2245 train_time:61358ms step_avg:60.99ms
+step:1007/2245 train_time:61419ms step_avg:60.99ms
+step:1008/2245 train_time:61479ms step_avg:60.99ms
+step:1009/2245 train_time:61539ms step_avg:60.99ms
+step:1010/2245 train_time:61599ms step_avg:60.99ms
+step:1011/2245 train_time:61660ms step_avg:60.99ms
+step:1012/2245 train_time:61719ms step_avg:60.99ms
+step:1013/2245 train_time:61780ms step_avg:60.99ms
+step:1014/2245 train_time:61839ms step_avg:60.98ms
+step:1015/2245 train_time:61900ms step_avg:60.99ms
+step:1016/2245 train_time:61960ms step_avg:60.98ms
+step:1017/2245 train_time:62023ms step_avg:60.99ms
+step:1018/2245 train_time:62085ms step_avg:60.99ms
+step:1019/2245 train_time:62149ms step_avg:60.99ms
+step:1020/2245 train_time:62210ms step_avg:60.99ms
+step:1021/2245 train_time:62274ms step_avg:60.99ms
+step:1022/2245 train_time:62335ms step_avg:60.99ms
+step:1023/2245 train_time:62397ms step_avg:60.99ms
+step:1024/2245 train_time:62456ms step_avg:60.99ms
+step:1025/2245 train_time:62518ms step_avg:60.99ms
+step:1026/2245 train_time:62578ms step_avg:60.99ms
+step:1027/2245 train_time:62639ms step_avg:60.99ms
+step:1028/2245 train_time:62698ms step_avg:60.99ms
+step:1029/2245 train_time:62759ms step_avg:60.99ms
+step:1030/2245 train_time:62818ms step_avg:60.99ms
+step:1031/2245 train_time:62880ms step_avg:60.99ms
+step:1032/2245 train_time:62939ms step_avg:60.99ms
+step:1033/2245 train_time:63002ms step_avg:60.99ms
+step:1034/2245 train_time:63063ms step_avg:60.99ms
+step:1035/2245 train_time:63126ms step_avg:60.99ms
+step:1036/2245 train_time:63187ms step_avg:60.99ms
+step:1037/2245 train_time:63250ms step_avg:60.99ms
+step:1038/2245 train_time:63311ms step_avg:60.99ms
+step:1039/2245 train_time:63374ms step_avg:61.00ms
+step:1040/2245 train_time:63435ms step_avg:60.99ms
+step:1041/2245 train_time:63498ms step_avg:61.00ms
+step:1042/2245 train_time:63557ms step_avg:61.00ms
+step:1043/2245 train_time:63619ms step_avg:61.00ms
+step:1044/2245 train_time:63678ms step_avg:60.99ms
+step:1045/2245 train_time:63740ms step_avg:60.99ms
+step:1046/2245 train_time:63799ms step_avg:60.99ms
+step:1047/2245 train_time:63861ms step_avg:60.99ms
+step:1048/2245 train_time:63921ms step_avg:60.99ms
+step:1049/2245 train_time:63983ms step_avg:60.99ms
+step:1050/2245 train_time:64043ms step_avg:60.99ms
+step:1051/2245 train_time:64105ms step_avg:60.99ms
+step:1052/2245 train_time:64165ms step_avg:60.99ms
+step:1053/2245 train_time:64229ms step_avg:61.00ms
+step:1054/2245 train_time:64290ms step_avg:61.00ms
+step:1055/2245 train_time:64352ms step_avg:61.00ms
+step:1056/2245 train_time:64413ms step_avg:61.00ms
+step:1057/2245 train_time:64476ms step_avg:61.00ms
+step:1058/2245 train_time:64536ms step_avg:61.00ms
+step:1059/2245 train_time:64598ms step_avg:61.00ms
+step:1060/2245 train_time:64657ms step_avg:61.00ms
+step:1061/2245 train_time:64719ms step_avg:61.00ms
+step:1062/2245 train_time:64778ms step_avg:61.00ms
+step:1063/2245 train_time:64840ms step_avg:61.00ms
+step:1064/2245 train_time:64900ms step_avg:61.00ms
+step:1065/2245 train_time:64962ms step_avg:61.00ms
+step:1066/2245 train_time:65022ms step_avg:61.00ms
+step:1067/2245 train_time:65085ms step_avg:61.00ms
+step:1068/2245 train_time:65145ms step_avg:61.00ms
+step:1069/2245 train_time:65207ms step_avg:61.00ms
+step:1070/2245 train_time:65267ms step_avg:61.00ms
+step:1071/2245 train_time:65331ms step_avg:61.00ms
+step:1072/2245 train_time:65391ms step_avg:61.00ms
+step:1073/2245 train_time:65454ms step_avg:61.00ms
+step:1074/2245 train_time:65514ms step_avg:61.00ms
+step:1075/2245 train_time:65577ms step_avg:61.00ms
+step:1076/2245 train_time:65636ms step_avg:61.00ms
+step:1077/2245 train_time:65698ms step_avg:61.00ms
+step:1078/2245 train_time:65757ms step_avg:61.00ms
+step:1079/2245 train_time:65819ms step_avg:61.00ms
+step:1080/2245 train_time:65879ms step_avg:61.00ms
+step:1081/2245 train_time:65940ms step_avg:61.00ms
+step:1082/2245 train_time:66000ms step_avg:61.00ms
+step:1083/2245 train_time:66062ms step_avg:61.00ms
+step:1084/2245 train_time:66122ms step_avg:61.00ms
+step:1085/2245 train_time:66184ms step_avg:61.00ms
+step:1086/2245 train_time:66244ms step_avg:61.00ms
+step:1087/2245 train_time:66307ms step_avg:61.00ms
+step:1088/2245 train_time:66367ms step_avg:61.00ms
+step:1089/2245 train_time:66430ms step_avg:61.00ms
+step:1090/2245 train_time:66491ms step_avg:61.00ms
+step:1091/2245 train_time:66555ms step_avg:61.00ms
+step:1092/2245 train_time:66614ms step_avg:61.00ms
+step:1093/2245 train_time:66677ms step_avg:61.00ms
+step:1094/2245 train_time:66736ms step_avg:61.00ms
+step:1095/2245 train_time:66798ms step_avg:61.00ms
+step:1096/2245 train_time:66857ms step_avg:61.00ms
+step:1097/2245 train_time:66920ms step_avg:61.00ms
+step:1098/2245 train_time:66979ms step_avg:61.00ms
+step:1099/2245 train_time:67041ms step_avg:61.00ms
+step:1100/2245 train_time:67101ms step_avg:61.00ms
+step:1101/2245 train_time:67163ms step_avg:61.00ms
+step:1102/2245 train_time:67223ms step_avg:61.00ms
+step:1103/2245 train_time:67285ms step_avg:61.00ms
+step:1104/2245 train_time:67345ms step_avg:61.00ms
+step:1105/2245 train_time:67409ms step_avg:61.00ms
+step:1106/2245 train_time:67469ms step_avg:61.00ms
+step:1107/2245 train_time:67532ms step_avg:61.00ms
+step:1108/2245 train_time:67593ms step_avg:61.00ms
+step:1109/2245 train_time:67655ms step_avg:61.01ms
+step:1110/2245 train_time:67716ms step_avg:61.01ms
+step:1111/2245 train_time:67777ms step_avg:61.01ms
+step:1112/2245 train_time:67837ms step_avg:61.00ms
+step:1113/2245 train_time:67899ms step_avg:61.01ms
+step:1114/2245 train_time:67959ms step_avg:61.00ms
+step:1115/2245 train_time:68021ms step_avg:61.01ms
+step:1116/2245 train_time:68081ms step_avg:61.00ms
+step:1117/2245 train_time:68143ms step_avg:61.01ms
+step:1118/2245 train_time:68203ms step_avg:61.00ms
+step:1119/2245 train_time:68266ms step_avg:61.01ms
+step:1120/2245 train_time:68325ms step_avg:61.00ms
+step:1121/2245 train_time:68389ms step_avg:61.01ms
+step:1122/2245 train_time:68449ms step_avg:61.01ms
+step:1123/2245 train_time:68511ms step_avg:61.01ms
+step:1124/2245 train_time:68572ms step_avg:61.01ms
+step:1125/2245 train_time:68634ms step_avg:61.01ms
+step:1126/2245 train_time:68694ms step_avg:61.01ms
+step:1127/2245 train_time:68757ms step_avg:61.01ms
+step:1128/2245 train_time:68817ms step_avg:61.01ms
+step:1129/2245 train_time:68879ms step_avg:61.01ms
+step:1130/2245 train_time:68938ms step_avg:61.01ms
+step:1131/2245 train_time:69001ms step_avg:61.01ms
+step:1132/2245 train_time:69060ms step_avg:61.01ms
+step:1133/2245 train_time:69123ms step_avg:61.01ms
+step:1134/2245 train_time:69184ms step_avg:61.01ms
+step:1135/2245 train_time:69246ms step_avg:61.01ms
+step:1136/2245 train_time:69306ms step_avg:61.01ms
+step:1137/2245 train_time:69368ms step_avg:61.01ms
+step:1138/2245 train_time:69428ms step_avg:61.01ms
+step:1139/2245 train_time:69490ms step_avg:61.01ms
+step:1140/2245 train_time:69551ms step_avg:61.01ms
+step:1141/2245 train_time:69613ms step_avg:61.01ms
+step:1142/2245 train_time:69674ms step_avg:61.01ms
+step:1143/2245 train_time:69737ms step_avg:61.01ms
+step:1144/2245 train_time:69797ms step_avg:61.01ms
+step:1145/2245 train_time:69859ms step_avg:61.01ms
+step:1146/2245 train_time:69918ms step_avg:61.01ms
+step:1147/2245 train_time:69980ms step_avg:61.01ms
+step:1148/2245 train_time:70040ms step_avg:61.01ms
+step:1149/2245 train_time:70103ms step_avg:61.01ms
+step:1150/2245 train_time:70162ms step_avg:61.01ms
+step:1151/2245 train_time:70224ms step_avg:61.01ms
+step:1152/2245 train_time:70284ms step_avg:61.01ms
+step:1153/2245 train_time:70346ms step_avg:61.01ms
+step:1154/2245 train_time:70406ms step_avg:61.01ms
+step:1155/2245 train_time:70469ms step_avg:61.01ms
+step:1156/2245 train_time:70529ms step_avg:61.01ms
+step:1157/2245 train_time:70591ms step_avg:61.01ms
+step:1158/2245 train_time:70651ms step_avg:61.01ms
+step:1159/2245 train_time:70714ms step_avg:61.01ms
+step:1160/2245 train_time:70775ms step_avg:61.01ms
+step:1161/2245 train_time:70838ms step_avg:61.01ms
+step:1162/2245 train_time:70898ms step_avg:61.01ms
+step:1163/2245 train_time:70960ms step_avg:61.01ms
+step:1164/2245 train_time:71020ms step_avg:61.01ms
+step:1165/2245 train_time:71082ms step_avg:61.01ms
+step:1166/2245 train_time:71142ms step_avg:61.01ms
+step:1167/2245 train_time:71204ms step_avg:61.01ms
+step:1168/2245 train_time:71264ms step_avg:61.01ms
+step:1169/2245 train_time:71326ms step_avg:61.01ms
+step:1170/2245 train_time:71387ms step_avg:61.01ms
+step:1171/2245 train_time:71449ms step_avg:61.02ms
+step:1172/2245 train_time:71509ms step_avg:61.01ms
+step:1173/2245 train_time:71572ms step_avg:61.02ms
+step:1174/2245 train_time:71632ms step_avg:61.02ms
+step:1175/2245 train_time:71695ms step_avg:61.02ms
+step:1176/2245 train_time:71755ms step_avg:61.02ms
+step:1177/2245 train_time:71817ms step_avg:61.02ms
+step:1178/2245 train_time:71878ms step_avg:61.02ms
+step:1179/2245 train_time:71940ms step_avg:61.02ms
+step:1180/2245 train_time:71999ms step_avg:61.02ms
+step:1181/2245 train_time:72062ms step_avg:61.02ms
+step:1182/2245 train_time:72122ms step_avg:61.02ms
+step:1183/2245 train_time:72184ms step_avg:61.02ms
+step:1184/2245 train_time:72244ms step_avg:61.02ms
+step:1185/2245 train_time:72306ms step_avg:61.02ms
+step:1186/2245 train_time:72365ms step_avg:61.02ms
+step:1187/2245 train_time:72428ms step_avg:61.02ms
+step:1188/2245 train_time:72489ms step_avg:61.02ms
+step:1189/2245 train_time:72551ms step_avg:61.02ms
+step:1190/2245 train_time:72612ms step_avg:61.02ms
+step:1191/2245 train_time:72675ms step_avg:61.02ms
+step:1192/2245 train_time:72736ms step_avg:61.02ms
+step:1193/2245 train_time:72798ms step_avg:61.02ms
+step:1194/2245 train_time:72857ms step_avg:61.02ms
+step:1195/2245 train_time:72919ms step_avg:61.02ms
+step:1196/2245 train_time:72979ms step_avg:61.02ms
+step:1197/2245 train_time:73042ms step_avg:61.02ms
+step:1198/2245 train_time:73101ms step_avg:61.02ms
+step:1199/2245 train_time:73163ms step_avg:61.02ms
+step:1200/2245 train_time:73223ms step_avg:61.02ms
+step:1201/2245 train_time:73285ms step_avg:61.02ms
+step:1202/2245 train_time:73344ms step_avg:61.02ms
+step:1203/2245 train_time:73406ms step_avg:61.02ms
+step:1204/2245 train_time:73466ms step_avg:61.02ms
+step:1205/2245 train_time:73529ms step_avg:61.02ms
+step:1206/2245 train_time:73589ms step_avg:61.02ms
+step:1207/2245 train_time:73652ms step_avg:61.02ms
+step:1208/2245 train_time:73712ms step_avg:61.02ms
+step:1209/2245 train_time:73776ms step_avg:61.02ms
+step:1210/2245 train_time:73836ms step_avg:61.02ms
+step:1211/2245 train_time:73898ms step_avg:61.02ms
+step:1212/2245 train_time:73957ms step_avg:61.02ms
+step:1213/2245 train_time:74020ms step_avg:61.02ms
+step:1214/2245 train_time:74081ms step_avg:61.02ms
+step:1215/2245 train_time:74143ms step_avg:61.02ms
+step:1216/2245 train_time:74202ms step_avg:61.02ms
+step:1217/2245 train_time:74265ms step_avg:61.02ms
+step:1218/2245 train_time:74324ms step_avg:61.02ms
+step:1219/2245 train_time:74386ms step_avg:61.02ms
+step:1220/2245 train_time:74446ms step_avg:61.02ms
+step:1221/2245 train_time:74508ms step_avg:61.02ms
+step:1222/2245 train_time:74568ms step_avg:61.02ms
+step:1223/2245 train_time:74631ms step_avg:61.02ms
+step:1224/2245 train_time:74692ms step_avg:61.02ms
+step:1225/2245 train_time:74755ms step_avg:61.02ms
+step:1226/2245 train_time:74815ms step_avg:61.02ms
+step:1227/2245 train_time:74877ms step_avg:61.02ms
+step:1228/2245 train_time:74937ms step_avg:61.02ms
+step:1229/2245 train_time:75000ms step_avg:61.02ms
+step:1230/2245 train_time:75060ms step_avg:61.02ms
+step:1231/2245 train_time:75123ms step_avg:61.03ms
+step:1232/2245 train_time:75183ms step_avg:61.03ms
+step:1233/2245 train_time:75245ms step_avg:61.03ms
+step:1234/2245 train_time:75305ms step_avg:61.02ms
+step:1235/2245 train_time:75366ms step_avg:61.03ms
+step:1236/2245 train_time:75426ms step_avg:61.02ms
+step:1237/2245 train_time:75488ms step_avg:61.03ms
+step:1238/2245 train_time:75548ms step_avg:61.02ms
+step:1239/2245 train_time:75610ms step_avg:61.03ms
+step:1240/2245 train_time:75671ms step_avg:61.02ms
+step:1241/2245 train_time:75733ms step_avg:61.03ms
+step:1242/2245 train_time:75794ms step_avg:61.03ms
+step:1243/2245 train_time:75857ms step_avg:61.03ms
+step:1244/2245 train_time:75917ms step_avg:61.03ms
+step:1245/2245 train_time:75980ms step_avg:61.03ms
+step:1246/2245 train_time:76040ms step_avg:61.03ms
+step:1247/2245 train_time:76103ms step_avg:61.03ms
+step:1248/2245 train_time:76163ms step_avg:61.03ms
+step:1249/2245 train_time:76225ms step_avg:61.03ms
+step:1250/2245 train_time:76285ms step_avg:61.03ms
+step:1250/2245 val_loss:3.5205 train_time:76348ms step_avg:61.08ms
+step:1251/2245 train_time:76366ms step_avg:61.04ms
+step:1252/2245 train_time:76412ms step_avg:61.03ms
+step:1253/2245 train_time:76478ms step_avg:61.04ms
+step:1254/2245 train_time:76538ms step_avg:61.04ms
+step:1255/2245 train_time:76601ms step_avg:61.04ms
+step:1256/2245 train_time:76661ms step_avg:61.04ms
+step:1257/2245 train_time:76722ms step_avg:61.04ms
+step:1258/2245 train_time:76782ms step_avg:61.03ms
+step:1259/2245 train_time:76843ms step_avg:61.04ms
+step:1260/2245 train_time:76903ms step_avg:61.03ms
+step:1261/2245 train_time:76965ms step_avg:61.04ms
+step:1262/2245 train_time:77024ms step_avg:61.03ms
+step:1263/2245 train_time:77087ms step_avg:61.03ms
+step:1264/2245 train_time:77147ms step_avg:61.03ms
+step:1265/2245 train_time:77209ms step_avg:61.04ms
+step:1266/2245 train_time:77269ms step_avg:61.03ms
+step:1267/2245 train_time:77333ms step_avg:61.04ms
+step:1268/2245 train_time:77395ms step_avg:61.04ms
+step:1269/2245 train_time:77458ms step_avg:61.04ms
+step:1270/2245 train_time:77519ms step_avg:61.04ms
+step:1271/2245 train_time:77581ms step_avg:61.04ms
+step:1272/2245 train_time:77641ms step_avg:61.04ms
+step:1273/2245 train_time:77703ms step_avg:61.04ms
+step:1274/2245 train_time:77762ms step_avg:61.04ms
+step:1275/2245 train_time:77824ms step_avg:61.04ms
+step:1276/2245 train_time:77884ms step_avg:61.04ms
+step:1277/2245 train_time:77946ms step_avg:61.04ms
+step:1278/2245 train_time:78006ms step_avg:61.04ms
+step:1279/2245 train_time:78067ms step_avg:61.04ms
+step:1280/2245 train_time:78127ms step_avg:61.04ms
+step:1281/2245 train_time:78189ms step_avg:61.04ms
+step:1282/2245 train_time:78251ms step_avg:61.04ms
+step:1283/2245 train_time:78315ms step_avg:61.04ms
+step:1284/2245 train_time:78375ms step_avg:61.04ms
+step:1285/2245 train_time:78438ms step_avg:61.04ms
+step:1286/2245 train_time:78497ms step_avg:61.04ms
+step:1287/2245 train_time:78559ms step_avg:61.04ms
+step:1288/2245 train_time:78619ms step_avg:61.04ms
+step:1289/2245 train_time:78680ms step_avg:61.04ms
+step:1290/2245 train_time:78740ms step_avg:61.04ms
+step:1291/2245 train_time:78802ms step_avg:61.04ms
+step:1292/2245 train_time:78861ms step_avg:61.04ms
+step:1293/2245 train_time:78924ms step_avg:61.04ms
+step:1294/2245 train_time:78983ms step_avg:61.04ms
+step:1295/2245 train_time:79045ms step_avg:61.04ms
+step:1296/2245 train_time:79105ms step_avg:61.04ms
+step:1297/2245 train_time:79167ms step_avg:61.04ms
+step:1298/2245 train_time:79228ms step_avg:61.04ms
+step:1299/2245 train_time:79291ms step_avg:61.04ms
+step:1300/2245 train_time:79353ms step_avg:61.04ms
+step:1301/2245 train_time:79416ms step_avg:61.04ms
+step:1302/2245 train_time:79476ms step_avg:61.04ms
+step:1303/2245 train_time:79538ms step_avg:61.04ms
+step:1304/2245 train_time:79597ms step_avg:61.04ms
+step:1305/2245 train_time:79660ms step_avg:61.04ms
+step:1306/2245 train_time:79720ms step_avg:61.04ms
+step:1307/2245 train_time:79782ms step_avg:61.04ms
+step:1308/2245 train_time:79841ms step_avg:61.04ms
+step:1309/2245 train_time:79904ms step_avg:61.04ms
+step:1310/2245 train_time:79963ms step_avg:61.04ms
+step:1311/2245 train_time:80025ms step_avg:61.04ms
+step:1312/2245 train_time:80084ms step_avg:61.04ms
+step:1313/2245 train_time:80147ms step_avg:61.04ms
+step:1314/2245 train_time:80207ms step_avg:61.04ms
+step:1315/2245 train_time:80271ms step_avg:61.04ms
+step:1316/2245 train_time:80331ms step_avg:61.04ms
+step:1317/2245 train_time:80395ms step_avg:61.04ms
+step:1318/2245 train_time:80455ms step_avg:61.04ms
+step:1319/2245 train_time:80517ms step_avg:61.04ms
+step:1320/2245 train_time:80577ms step_avg:61.04ms
+step:1321/2245 train_time:80639ms step_avg:61.04ms
+step:1322/2245 train_time:80699ms step_avg:61.04ms
+step:1323/2245 train_time:80761ms step_avg:61.04ms
+step:1324/2245 train_time:80820ms step_avg:61.04ms
+step:1325/2245 train_time:80882ms step_avg:61.04ms
+step:1326/2245 train_time:80942ms step_avg:61.04ms
+step:1327/2245 train_time:81004ms step_avg:61.04ms
+step:1328/2245 train_time:81064ms step_avg:61.04ms
+step:1329/2245 train_time:81128ms step_avg:61.04ms
+step:1330/2245 train_time:81188ms step_avg:61.04ms
+step:1331/2245 train_time:81251ms step_avg:61.05ms
+step:1332/2245 train_time:81312ms step_avg:61.04ms
+step:1333/2245 train_time:81374ms step_avg:61.05ms
+step:1334/2245 train_time:81434ms step_avg:61.05ms
+step:1335/2245 train_time:81497ms step_avg:61.05ms
+step:1336/2245 train_time:81557ms step_avg:61.05ms
+step:1337/2245 train_time:81619ms step_avg:61.05ms
+step:1338/2245 train_time:81678ms step_avg:61.05ms
+step:1339/2245 train_time:81741ms step_avg:61.05ms
+step:1340/2245 train_time:81801ms step_avg:61.05ms
+step:1341/2245 train_time:81863ms step_avg:61.05ms
+step:1342/2245 train_time:81922ms step_avg:61.05ms
+step:1343/2245 train_time:81984ms step_avg:61.05ms
+step:1344/2245 train_time:82044ms step_avg:61.04ms
+step:1345/2245 train_time:82107ms step_avg:61.05ms
+step:1346/2245 train_time:82167ms step_avg:61.05ms
+step:1347/2245 train_time:82230ms step_avg:61.05ms
+step:1348/2245 train_time:82290ms step_avg:61.05ms
+step:1349/2245 train_time:82353ms step_avg:61.05ms
+step:1350/2245 train_time:82414ms step_avg:61.05ms
+step:1351/2245 train_time:82476ms step_avg:61.05ms
+step:1352/2245 train_time:82536ms step_avg:61.05ms
+step:1353/2245 train_time:82598ms step_avg:61.05ms
+step:1354/2245 train_time:82658ms step_avg:61.05ms
+step:1355/2245 train_time:82721ms step_avg:61.05ms
+step:1356/2245 train_time:82781ms step_avg:61.05ms
+step:1357/2245 train_time:82842ms step_avg:61.05ms
+step:1358/2245 train_time:82902ms step_avg:61.05ms
+step:1359/2245 train_time:82964ms step_avg:61.05ms
+step:1360/2245 train_time:83024ms step_avg:61.05ms
+step:1361/2245 train_time:83087ms step_avg:61.05ms
+step:1362/2245 train_time:83146ms step_avg:61.05ms
+step:1363/2245 train_time:83209ms step_avg:61.05ms
+step:1364/2245 train_time:83269ms step_avg:61.05ms
+step:1365/2245 train_time:83331ms step_avg:61.05ms
+step:1366/2245 train_time:83391ms step_avg:61.05ms
+step:1367/2245 train_time:83455ms step_avg:61.05ms
+step:1368/2245 train_time:83515ms step_avg:61.05ms
+step:1369/2245 train_time:83577ms step_avg:61.05ms
+step:1370/2245 train_time:83638ms step_avg:61.05ms
+step:1371/2245 train_time:83700ms step_avg:61.05ms
+step:1372/2245 train_time:83760ms step_avg:61.05ms
+step:1373/2245 train_time:83822ms step_avg:61.05ms
+step:1374/2245 train_time:83882ms step_avg:61.05ms
+step:1375/2245 train_time:83943ms step_avg:61.05ms
+step:1376/2245 train_time:84003ms step_avg:61.05ms
+step:1377/2245 train_time:84065ms step_avg:61.05ms
+step:1378/2245 train_time:84125ms step_avg:61.05ms
+step:1379/2245 train_time:84188ms step_avg:61.05ms
+step:1380/2245 train_time:84248ms step_avg:61.05ms
+step:1381/2245 train_time:84311ms step_avg:61.05ms
+step:1382/2245 train_time:84372ms step_avg:61.05ms
+step:1383/2245 train_time:84435ms step_avg:61.05ms
+step:1384/2245 train_time:84495ms step_avg:61.05ms
+step:1385/2245 train_time:84557ms step_avg:61.05ms
+step:1386/2245 train_time:84618ms step_avg:61.05ms
+step:1387/2245 train_time:84679ms step_avg:61.05ms
+step:1388/2245 train_time:84740ms step_avg:61.05ms
+step:1389/2245 train_time:84801ms step_avg:61.05ms
+step:1390/2245 train_time:84861ms step_avg:61.05ms
+step:1391/2245 train_time:84924ms step_avg:61.05ms
+step:1392/2245 train_time:84983ms step_avg:61.05ms
+step:1393/2245 train_time:85045ms step_avg:61.05ms
+step:1394/2245 train_time:85105ms step_avg:61.05ms
+step:1395/2245 train_time:85168ms step_avg:61.05ms
+step:1396/2245 train_time:85228ms step_avg:61.05ms
+step:1397/2245 train_time:85291ms step_avg:61.05ms
+step:1398/2245 train_time:85351ms step_avg:61.05ms
+step:1399/2245 train_time:85415ms step_avg:61.05ms
+step:1400/2245 train_time:85475ms step_avg:61.05ms
+step:1401/2245 train_time:85537ms step_avg:61.05ms
+step:1402/2245 train_time:85597ms step_avg:61.05ms
+step:1403/2245 train_time:85660ms step_avg:61.05ms
+step:1404/2245 train_time:85720ms step_avg:61.05ms
+step:1405/2245 train_time:85782ms step_avg:61.05ms
+step:1406/2245 train_time:85842ms step_avg:61.05ms
+step:1407/2245 train_time:85904ms step_avg:61.05ms
+step:1408/2245 train_time:85963ms step_avg:61.05ms
+step:1409/2245 train_time:86025ms step_avg:61.05ms
+step:1410/2245 train_time:86085ms step_avg:61.05ms
+step:1411/2245 train_time:86147ms step_avg:61.05ms
+step:1412/2245 train_time:86208ms step_avg:61.05ms
+step:1413/2245 train_time:86271ms step_avg:61.06ms
+step:1414/2245 train_time:86331ms step_avg:61.05ms
+step:1415/2245 train_time:86393ms step_avg:61.06ms
+step:1416/2245 train_time:86454ms step_avg:61.05ms
+step:1417/2245 train_time:86516ms step_avg:61.06ms
+step:1418/2245 train_time:86576ms step_avg:61.06ms
+step:1419/2245 train_time:86639ms step_avg:61.06ms
+step:1420/2245 train_time:86699ms step_avg:61.06ms
+step:1421/2245 train_time:86761ms step_avg:61.06ms
+step:1422/2245 train_time:86821ms step_avg:61.06ms
+step:1423/2245 train_time:86883ms step_avg:61.06ms
+step:1424/2245 train_time:86943ms step_avg:61.06ms
+step:1425/2245 train_time:87005ms step_avg:61.06ms
+step:1426/2245 train_time:87064ms step_avg:61.05ms
+step:1427/2245 train_time:87127ms step_avg:61.06ms
+step:1428/2245 train_time:87187ms step_avg:61.06ms
+step:1429/2245 train_time:87250ms step_avg:61.06ms
+step:1430/2245 train_time:87311ms step_avg:61.06ms
+step:1431/2245 train_time:87373ms step_avg:61.06ms
+step:1432/2245 train_time:87433ms step_avg:61.06ms
+step:1433/2245 train_time:87496ms step_avg:61.06ms
+step:1434/2245 train_time:87556ms step_avg:61.06ms
+step:1435/2245 train_time:87619ms step_avg:61.06ms
+step:1436/2245 train_time:87679ms step_avg:61.06ms
+step:1437/2245 train_time:87741ms step_avg:61.06ms
+step:1438/2245 train_time:87801ms step_avg:61.06ms
+step:1439/2245 train_time:87863ms step_avg:61.06ms
+step:1440/2245 train_time:87923ms step_avg:61.06ms
+step:1441/2245 train_time:87985ms step_avg:61.06ms
+step:1442/2245 train_time:88045ms step_avg:61.06ms
+step:1443/2245 train_time:88107ms step_avg:61.06ms
+step:1444/2245 train_time:88167ms step_avg:61.06ms
+step:1445/2245 train_time:88229ms step_avg:61.06ms
+step:1446/2245 train_time:88289ms step_avg:61.06ms
+step:1447/2245 train_time:88353ms step_avg:61.06ms
+step:1448/2245 train_time:88413ms step_avg:61.06ms
+step:1449/2245 train_time:88476ms step_avg:61.06ms
+step:1450/2245 train_time:88536ms step_avg:61.06ms
+step:1451/2245 train_time:88598ms step_avg:61.06ms
+step:1452/2245 train_time:88658ms step_avg:61.06ms
+step:1453/2245 train_time:88720ms step_avg:61.06ms
+step:1454/2245 train_time:88780ms step_avg:61.06ms
+step:1455/2245 train_time:88842ms step_avg:61.06ms
+step:1456/2245 train_time:88901ms step_avg:61.06ms
+step:1457/2245 train_time:88963ms step_avg:61.06ms
+step:1458/2245 train_time:89023ms step_avg:61.06ms
+step:1459/2245 train_time:89085ms step_avg:61.06ms
+step:1460/2245 train_time:89145ms step_avg:61.06ms
+step:1461/2245 train_time:89208ms step_avg:61.06ms
+step:1462/2245 train_time:89268ms step_avg:61.06ms
+step:1463/2245 train_time:89331ms step_avg:61.06ms
+step:1464/2245 train_time:89391ms step_avg:61.06ms
+step:1465/2245 train_time:89454ms step_avg:61.06ms
+step:1466/2245 train_time:89515ms step_avg:61.06ms
+step:1467/2245 train_time:89577ms step_avg:61.06ms
+step:1468/2245 train_time:89637ms step_avg:61.06ms
+step:1469/2245 train_time:89700ms step_avg:61.06ms
+step:1470/2245 train_time:89760ms step_avg:61.06ms
+step:1471/2245 train_time:89822ms step_avg:61.06ms
+step:1472/2245 train_time:89882ms step_avg:61.06ms
+step:1473/2245 train_time:89945ms step_avg:61.06ms
+step:1474/2245 train_time:90005ms step_avg:61.06ms
+step:1475/2245 train_time:90068ms step_avg:61.06ms
+step:1476/2245 train_time:90128ms step_avg:61.06ms
+step:1477/2245 train_time:90191ms step_avg:61.06ms
+step:1478/2245 train_time:90251ms step_avg:61.06ms
+step:1479/2245 train_time:90314ms step_avg:61.06ms
+step:1480/2245 train_time:90375ms step_avg:61.06ms
+step:1481/2245 train_time:90437ms step_avg:61.06ms
+step:1482/2245 train_time:90498ms step_avg:61.06ms
+step:1483/2245 train_time:90561ms step_avg:61.07ms
+step:1484/2245 train_time:90622ms step_avg:61.07ms
+step:1485/2245 train_time:90685ms step_avg:61.07ms
+step:1486/2245 train_time:90745ms step_avg:61.07ms
+step:1487/2245 train_time:90807ms step_avg:61.07ms
+step:1488/2245 train_time:90868ms step_avg:61.07ms
+step:1489/2245 train_time:90932ms step_avg:61.07ms
+step:1490/2245 train_time:90992ms step_avg:61.07ms
+step:1491/2245 train_time:91055ms step_avg:61.07ms
+step:1492/2245 train_time:91115ms step_avg:61.07ms
+step:1493/2245 train_time:91177ms step_avg:61.07ms
+step:1494/2245 train_time:91237ms step_avg:61.07ms
+step:1495/2245 train_time:91300ms step_avg:61.07ms
+step:1496/2245 train_time:91360ms step_avg:61.07ms
+step:1497/2245 train_time:91424ms step_avg:61.07ms
+step:1498/2245 train_time:91484ms step_avg:61.07ms
+step:1499/2245 train_time:91547ms step_avg:61.07ms
+step:1500/2245 train_time:91608ms step_avg:61.07ms
+step:1500/2245 val_loss:3.4411 train_time:91673ms step_avg:61.12ms
+step:1501/2245 train_time:91692ms step_avg:61.09ms
+step:1502/2245 train_time:91736ms step_avg:61.08ms
+step:1503/2245 train_time:91799ms step_avg:61.08ms
+step:1504/2245 train_time:91860ms step_avg:61.08ms
+step:1505/2245 train_time:91924ms step_avg:61.08ms
+step:1506/2245 train_time:91984ms step_avg:61.08ms
+step:1507/2245 train_time:92047ms step_avg:61.08ms
+step:1508/2245 train_time:92106ms step_avg:61.08ms
+step:1509/2245 train_time:92167ms step_avg:61.08ms
+step:1510/2245 train_time:92227ms step_avg:61.08ms
+step:1511/2245 train_time:92288ms step_avg:61.08ms
+step:1512/2245 train_time:92348ms step_avg:61.08ms
+step:1513/2245 train_time:92410ms step_avg:61.08ms
+step:1514/2245 train_time:92471ms step_avg:61.08ms
+step:1515/2245 train_time:92533ms step_avg:61.08ms
+step:1516/2245 train_time:92596ms step_avg:61.08ms
+step:1517/2245 train_time:92660ms step_avg:61.08ms
+step:1518/2245 train_time:92721ms step_avg:61.08ms
+step:1519/2245 train_time:92786ms step_avg:61.08ms
+step:1520/2245 train_time:92847ms step_avg:61.08ms
+step:1521/2245 train_time:92910ms step_avg:61.08ms
+step:1522/2245 train_time:92969ms step_avg:61.08ms
+step:1523/2245 train_time:93032ms step_avg:61.08ms
+step:1524/2245 train_time:93092ms step_avg:61.08ms
+step:1525/2245 train_time:93155ms step_avg:61.09ms
+step:1526/2245 train_time:93214ms step_avg:61.08ms
+step:1527/2245 train_time:93277ms step_avg:61.09ms
+step:1528/2245 train_time:93338ms step_avg:61.08ms
+step:1529/2245 train_time:93401ms step_avg:61.09ms
+step:1530/2245 train_time:93462ms step_avg:61.09ms
+step:1531/2245 train_time:93525ms step_avg:61.09ms
+step:1532/2245 train_time:93586ms step_avg:61.09ms
+step:1533/2245 train_time:93649ms step_avg:61.09ms
+step:1534/2245 train_time:93710ms step_avg:61.09ms
+step:1535/2245 train_time:93773ms step_avg:61.09ms
+step:1536/2245 train_time:93833ms step_avg:61.09ms
+step:1537/2245 train_time:93897ms step_avg:61.09ms
+step:1538/2245 train_time:93957ms step_avg:61.09ms
+step:1539/2245 train_time:94020ms step_avg:61.09ms
+step:1540/2245 train_time:94081ms step_avg:61.09ms
+step:1541/2245 train_time:94144ms step_avg:61.09ms
+step:1542/2245 train_time:94204ms step_avg:61.09ms
+step:1543/2245 train_time:94266ms step_avg:61.09ms
+step:1544/2245 train_time:94326ms step_avg:61.09ms
+step:1545/2245 train_time:94388ms step_avg:61.09ms
+step:1546/2245 train_time:94448ms step_avg:61.09ms
+step:1547/2245 train_time:94510ms step_avg:61.09ms
+step:1548/2245 train_time:94571ms step_avg:61.09ms
+step:1549/2245 train_time:94634ms step_avg:61.09ms
+step:1550/2245 train_time:94694ms step_avg:61.09ms
+step:1551/2245 train_time:94758ms step_avg:61.09ms
+step:1552/2245 train_time:94819ms step_avg:61.09ms
+step:1553/2245 train_time:94883ms step_avg:61.10ms
+step:1554/2245 train_time:94944ms step_avg:61.10ms
+step:1555/2245 train_time:95007ms step_avg:61.10ms
+step:1556/2245 train_time:95067ms step_avg:61.10ms
+step:1557/2245 train_time:95129ms step_avg:61.10ms
+step:1558/2245 train_time:95189ms step_avg:61.10ms
+step:1559/2245 train_time:95252ms step_avg:61.10ms
+step:1560/2245 train_time:95312ms step_avg:61.10ms
+step:1561/2245 train_time:95375ms step_avg:61.10ms
+step:1562/2245 train_time:95435ms step_avg:61.10ms
+step:1563/2245 train_time:95498ms step_avg:61.10ms
+step:1564/2245 train_time:95558ms step_avg:61.10ms
+step:1565/2245 train_time:95621ms step_avg:61.10ms
+step:1566/2245 train_time:95683ms step_avg:61.10ms
+step:1567/2245 train_time:95747ms step_avg:61.10ms
+step:1568/2245 train_time:95807ms step_avg:61.10ms
+step:1569/2245 train_time:95870ms step_avg:61.10ms
+step:1570/2245 train_time:95930ms step_avg:61.10ms
+step:1571/2245 train_time:95992ms step_avg:61.10ms
+step:1572/2245 train_time:96053ms step_avg:61.10ms
+step:1573/2245 train_time:96115ms step_avg:61.10ms
+step:1574/2245 train_time:96175ms step_avg:61.10ms
+step:1575/2245 train_time:96238ms step_avg:61.10ms
+step:1576/2245 train_time:96299ms step_avg:61.10ms
+step:1577/2245 train_time:96362ms step_avg:61.10ms
+step:1578/2245 train_time:96422ms step_avg:61.10ms
+step:1579/2245 train_time:96485ms step_avg:61.11ms
+step:1580/2245 train_time:96546ms step_avg:61.10ms
+step:1581/2245 train_time:96608ms step_avg:61.11ms
+step:1582/2245 train_time:96668ms step_avg:61.10ms
+step:1583/2245 train_time:96730ms step_avg:61.11ms
+step:1584/2245 train_time:96792ms step_avg:61.11ms
+step:1585/2245 train_time:96855ms step_avg:61.11ms
+step:1586/2245 train_time:96915ms step_avg:61.11ms
+step:1587/2245 train_time:96978ms step_avg:61.11ms
+step:1588/2245 train_time:97038ms step_avg:61.11ms
+step:1589/2245 train_time:97101ms step_avg:61.11ms
+step:1590/2245 train_time:97162ms step_avg:61.11ms
+step:1591/2245 train_time:97224ms step_avg:61.11ms
+step:1592/2245 train_time:97285ms step_avg:61.11ms
+step:1593/2245 train_time:97348ms step_avg:61.11ms
+step:1594/2245 train_time:97408ms step_avg:61.11ms
+step:1595/2245 train_time:97470ms step_avg:61.11ms
+step:1596/2245 train_time:97530ms step_avg:61.11ms
+step:1597/2245 train_time:97592ms step_avg:61.11ms
+step:1598/2245 train_time:97652ms step_avg:61.11ms
+step:1599/2245 train_time:97716ms step_avg:61.11ms
+step:1600/2245 train_time:97776ms step_avg:61.11ms
+step:1601/2245 train_time:97840ms step_avg:61.11ms
+step:1602/2245 train_time:97900ms step_avg:61.11ms
+step:1603/2245 train_time:97964ms step_avg:61.11ms
+step:1604/2245 train_time:98024ms step_avg:61.11ms
+step:1605/2245 train_time:98087ms step_avg:61.11ms
+step:1606/2245 train_time:98148ms step_avg:61.11ms
+step:1607/2245 train_time:98210ms step_avg:61.11ms
+step:1608/2245 train_time:98270ms step_avg:61.11ms
+step:1609/2245 train_time:98332ms step_avg:61.11ms
+step:1610/2245 train_time:98392ms step_avg:61.11ms
+step:1611/2245 train_time:98455ms step_avg:61.11ms
+step:1612/2245 train_time:98515ms step_avg:61.11ms
+step:1613/2245 train_time:98578ms step_avg:61.11ms
+step:1614/2245 train_time:98639ms step_avg:61.11ms
+step:1615/2245 train_time:98703ms step_avg:61.12ms
+step:1616/2245 train_time:98763ms step_avg:61.12ms
+step:1617/2245 train_time:98827ms step_avg:61.12ms
+step:1618/2245 train_time:98886ms step_avg:61.12ms
+step:1619/2245 train_time:98949ms step_avg:61.12ms
+step:1620/2245 train_time:99009ms step_avg:61.12ms
+step:1621/2245 train_time:99072ms step_avg:61.12ms
+step:1622/2245 train_time:99132ms step_avg:61.12ms
+step:1623/2245 train_time:99195ms step_avg:61.12ms
+step:1624/2245 train_time:99256ms step_avg:61.12ms
+step:1625/2245 train_time:99319ms step_avg:61.12ms
+step:1626/2245 train_time:99380ms step_avg:61.12ms
+step:1627/2245 train_time:99443ms step_avg:61.12ms
+step:1628/2245 train_time:99503ms step_avg:61.12ms
+step:1629/2245 train_time:99567ms step_avg:61.12ms
+step:1630/2245 train_time:99627ms step_avg:61.12ms
+step:1631/2245 train_time:99689ms step_avg:61.12ms
+step:1632/2245 train_time:99750ms step_avg:61.12ms
+step:1633/2245 train_time:99812ms step_avg:61.12ms
+step:1634/2245 train_time:99872ms step_avg:61.12ms
+step:1635/2245 train_time:99935ms step_avg:61.12ms
+step:1636/2245 train_time:99995ms step_avg:61.12ms
+step:1637/2245 train_time:100058ms step_avg:61.12ms
+step:1638/2245 train_time:100118ms step_avg:61.12ms
+step:1639/2245 train_time:100181ms step_avg:61.12ms
+step:1640/2245 train_time:100242ms step_avg:61.12ms
+step:1641/2245 train_time:100305ms step_avg:61.12ms
+step:1642/2245 train_time:100365ms step_avg:61.12ms
+step:1643/2245 train_time:100427ms step_avg:61.12ms
+step:1644/2245 train_time:100487ms step_avg:61.12ms
+step:1645/2245 train_time:100550ms step_avg:61.12ms
+step:1646/2245 train_time:100610ms step_avg:61.12ms
+step:1647/2245 train_time:100674ms step_avg:61.13ms
+step:1648/2245 train_time:100734ms step_avg:61.12ms
+step:1649/2245 train_time:100797ms step_avg:61.13ms
+step:1650/2245 train_time:100858ms step_avg:61.13ms
+step:1651/2245 train_time:100921ms step_avg:61.13ms
+step:1652/2245 train_time:100981ms step_avg:61.13ms
+step:1653/2245 train_time:101044ms step_avg:61.13ms
+step:1654/2245 train_time:101104ms step_avg:61.13ms
+step:1655/2245 train_time:101167ms step_avg:61.13ms
+step:1656/2245 train_time:101227ms step_avg:61.13ms
+step:1657/2245 train_time:101289ms step_avg:61.13ms
+step:1658/2245 train_time:101350ms step_avg:61.13ms
+step:1659/2245 train_time:101412ms step_avg:61.13ms
+step:1660/2245 train_time:101472ms step_avg:61.13ms
+step:1661/2245 train_time:101535ms step_avg:61.13ms
+step:1662/2245 train_time:101595ms step_avg:61.13ms
+step:1663/2245 train_time:101658ms step_avg:61.13ms
+step:1664/2245 train_time:101718ms step_avg:61.13ms
+step:1665/2245 train_time:101782ms step_avg:61.13ms
+step:1666/2245 train_time:101843ms step_avg:61.13ms
+step:1667/2245 train_time:101907ms step_avg:61.13ms
+step:1668/2245 train_time:101967ms step_avg:61.13ms
+step:1669/2245 train_time:102029ms step_avg:61.13ms
+step:1670/2245 train_time:102089ms step_avg:61.13ms
+step:1671/2245 train_time:102152ms step_avg:61.13ms
+step:1672/2245 train_time:102212ms step_avg:61.13ms
+step:1673/2245 train_time:102274ms step_avg:61.13ms
+step:1674/2245 train_time:102334ms step_avg:61.13ms
+step:1675/2245 train_time:102398ms step_avg:61.13ms
+step:1676/2245 train_time:102458ms step_avg:61.13ms
+step:1677/2245 train_time:102520ms step_avg:61.13ms
+step:1678/2245 train_time:102581ms step_avg:61.13ms
+step:1679/2245 train_time:102644ms step_avg:61.13ms
+step:1680/2245 train_time:102705ms step_avg:61.13ms
+step:1681/2245 train_time:102768ms step_avg:61.14ms
+step:1682/2245 train_time:102828ms step_avg:61.13ms
+step:1683/2245 train_time:102891ms step_avg:61.14ms
+step:1684/2245 train_time:102952ms step_avg:61.14ms
+step:1685/2245 train_time:103014ms step_avg:61.14ms
+step:1686/2245 train_time:103074ms step_avg:61.14ms
+step:1687/2245 train_time:103137ms step_avg:61.14ms
+step:1688/2245 train_time:103198ms step_avg:61.14ms
+step:1689/2245 train_time:103261ms step_avg:61.14ms
+step:1690/2245 train_time:103322ms step_avg:61.14ms
+step:1691/2245 train_time:103385ms step_avg:61.14ms
+step:1692/2245 train_time:103445ms step_avg:61.14ms
+step:1693/2245 train_time:103508ms step_avg:61.14ms
+step:1694/2245 train_time:103568ms step_avg:61.14ms
+step:1695/2245 train_time:103631ms step_avg:61.14ms
+step:1696/2245 train_time:103691ms step_avg:61.14ms
+step:1697/2245 train_time:103754ms step_avg:61.14ms
+step:1698/2245 train_time:103814ms step_avg:61.14ms
+step:1699/2245 train_time:103878ms step_avg:61.14ms
+step:1700/2245 train_time:103938ms step_avg:61.14ms
+step:1701/2245 train_time:104002ms step_avg:61.14ms
+step:1702/2245 train_time:104062ms step_avg:61.14ms
+step:1703/2245 train_time:104125ms step_avg:61.14ms
+step:1704/2245 train_time:104185ms step_avg:61.14ms
+step:1705/2245 train_time:104248ms step_avg:61.14ms
+step:1706/2245 train_time:104307ms step_avg:61.14ms
+step:1707/2245 train_time:104370ms step_avg:61.14ms
+step:1708/2245 train_time:104431ms step_avg:61.14ms
+step:1709/2245 train_time:104493ms step_avg:61.14ms
+step:1710/2245 train_time:104555ms step_avg:61.14ms
+step:1711/2245 train_time:104617ms step_avg:61.14ms
+step:1712/2245 train_time:104677ms step_avg:61.14ms
+step:1713/2245 train_time:104740ms step_avg:61.14ms
+step:1714/2245 train_time:104801ms step_avg:61.14ms
+step:1715/2245 train_time:104864ms step_avg:61.15ms
+step:1716/2245 train_time:104925ms step_avg:61.15ms
+step:1717/2245 train_time:104988ms step_avg:61.15ms
+step:1718/2245 train_time:105048ms step_avg:61.15ms
+step:1719/2245 train_time:105110ms step_avg:61.15ms
+step:1720/2245 train_time:105170ms step_avg:61.15ms
+step:1721/2245 train_time:105233ms step_avg:61.15ms
+step:1722/2245 train_time:105293ms step_avg:61.15ms
+step:1723/2245 train_time:105356ms step_avg:61.15ms
+step:1724/2245 train_time:105416ms step_avg:61.15ms
+step:1725/2245 train_time:105479ms step_avg:61.15ms
+step:1726/2245 train_time:105541ms step_avg:61.15ms
+step:1727/2245 train_time:105604ms step_avg:61.15ms
+step:1728/2245 train_time:105664ms step_avg:61.15ms
+step:1729/2245 train_time:105727ms step_avg:61.15ms
+step:1730/2245 train_time:105786ms step_avg:61.15ms
+step:1731/2245 train_time:105849ms step_avg:61.15ms
+step:1732/2245 train_time:105909ms step_avg:61.15ms
+step:1733/2245 train_time:105972ms step_avg:61.15ms
+step:1734/2245 train_time:106032ms step_avg:61.15ms
+step:1735/2245 train_time:106095ms step_avg:61.15ms
+step:1736/2245 train_time:106155ms step_avg:61.15ms
+step:1737/2245 train_time:106218ms step_avg:61.15ms
+step:1738/2245 train_time:106279ms step_avg:61.15ms
+step:1739/2245 train_time:106341ms step_avg:61.15ms
+step:1740/2245 train_time:106402ms step_avg:61.15ms
+step:1741/2245 train_time:106464ms step_avg:61.15ms
+step:1742/2245 train_time:106525ms step_avg:61.15ms
+step:1743/2245 train_time:106587ms step_avg:61.15ms
+step:1744/2245 train_time:106647ms step_avg:61.15ms
+step:1745/2245 train_time:106709ms step_avg:61.15ms
+step:1746/2245 train_time:106769ms step_avg:61.15ms
+step:1747/2245 train_time:106832ms step_avg:61.15ms
+step:1748/2245 train_time:106893ms step_avg:61.15ms
+step:1749/2245 train_time:106955ms step_avg:61.15ms
+step:1750/2245 train_time:107015ms step_avg:61.15ms
+step:1750/2245 val_loss:3.3772 train_time:107080ms step_avg:61.19ms
+step:1751/2245 train_time:107098ms step_avg:61.16ms
+step:1752/2245 train_time:107144ms step_avg:61.16ms
+step:1753/2245 train_time:107209ms step_avg:61.16ms
+step:1754/2245 train_time:107270ms step_avg:61.16ms
+step:1755/2245 train_time:107333ms step_avg:61.16ms
+step:1756/2245 train_time:107395ms step_avg:61.16ms
+step:1757/2245 train_time:107457ms step_avg:61.16ms
+step:1758/2245 train_time:107517ms step_avg:61.16ms
+step:1759/2245 train_time:107580ms step_avg:61.16ms
+step:1760/2245 train_time:107639ms step_avg:61.16ms
+step:1761/2245 train_time:107702ms step_avg:61.16ms
+step:1762/2245 train_time:107762ms step_avg:61.16ms
+step:1763/2245 train_time:107824ms step_avg:61.16ms
+step:1764/2245 train_time:107884ms step_avg:61.16ms
+step:1765/2245 train_time:107946ms step_avg:61.16ms
+step:1766/2245 train_time:108007ms step_avg:61.16ms
+step:1767/2245 train_time:108072ms step_avg:61.16ms
+step:1768/2245 train_time:108133ms step_avg:61.16ms
+step:1769/2245 train_time:108198ms step_avg:61.16ms
+step:1770/2245 train_time:108259ms step_avg:61.16ms
+step:1771/2245 train_time:108322ms step_avg:61.16ms
+step:1772/2245 train_time:108382ms step_avg:61.16ms
+step:1773/2245 train_time:108445ms step_avg:61.16ms
+step:1774/2245 train_time:108505ms step_avg:61.16ms
+step:1775/2245 train_time:108567ms step_avg:61.16ms
+step:1776/2245 train_time:108627ms step_avg:61.16ms
+step:1777/2245 train_time:108690ms step_avg:61.16ms
+step:1778/2245 train_time:108750ms step_avg:61.16ms
+step:1779/2245 train_time:108813ms step_avg:61.17ms
+step:1780/2245 train_time:108874ms step_avg:61.17ms
+step:1781/2245 train_time:108937ms step_avg:61.17ms
+step:1782/2245 train_time:108997ms step_avg:61.17ms
+step:1783/2245 train_time:109060ms step_avg:61.17ms
+step:1784/2245 train_time:109120ms step_avg:61.17ms
+step:1785/2245 train_time:109183ms step_avg:61.17ms
+step:1786/2245 train_time:109243ms step_avg:61.17ms
+step:1787/2245 train_time:109306ms step_avg:61.17ms
+step:1788/2245 train_time:109367ms step_avg:61.17ms
+step:1789/2245 train_time:109429ms step_avg:61.17ms
+step:1790/2245 train_time:109489ms step_avg:61.17ms
+step:1791/2245 train_time:109552ms step_avg:61.17ms
+step:1792/2245 train_time:109613ms step_avg:61.17ms
+step:1793/2245 train_time:109677ms step_avg:61.17ms
+step:1794/2245 train_time:109737ms step_avg:61.17ms
+step:1795/2245 train_time:109799ms step_avg:61.17ms
+step:1796/2245 train_time:109859ms step_avg:61.17ms
+step:1797/2245 train_time:109922ms step_avg:61.17ms
+step:1798/2245 train_time:109982ms step_avg:61.17ms
+step:1799/2245 train_time:110045ms step_avg:61.17ms
+step:1800/2245 train_time:110106ms step_avg:61.17ms
+step:1801/2245 train_time:110169ms step_avg:61.17ms
+step:1802/2245 train_time:110230ms step_avg:61.17ms
+step:1803/2245 train_time:110293ms step_avg:61.17ms
+step:1804/2245 train_time:110353ms step_avg:61.17ms
+step:1805/2245 train_time:110416ms step_avg:61.17ms
+step:1806/2245 train_time:110477ms step_avg:61.17ms
+step:1807/2245 train_time:110540ms step_avg:61.17ms
+step:1808/2245 train_time:110600ms step_avg:61.17ms
+step:1809/2245 train_time:110662ms step_avg:61.17ms
+step:1810/2245 train_time:110722ms step_avg:61.17ms
+step:1811/2245 train_time:110784ms step_avg:61.17ms
+step:1812/2245 train_time:110844ms step_avg:61.17ms
+step:1813/2245 train_time:110907ms step_avg:61.17ms
+step:1814/2245 train_time:110968ms step_avg:61.17ms
+step:1815/2245 train_time:111031ms step_avg:61.17ms
+step:1816/2245 train_time:111092ms step_avg:61.17ms
+step:1817/2245 train_time:111155ms step_avg:61.18ms
+step:1818/2245 train_time:111217ms step_avg:61.18ms
+step:1819/2245 train_time:111280ms step_avg:61.18ms
+step:1820/2245 train_time:111340ms step_avg:61.18ms
+step:1821/2245 train_time:111403ms step_avg:61.18ms
+step:1822/2245 train_time:111464ms step_avg:61.18ms
+step:1823/2245 train_time:111526ms step_avg:61.18ms
+step:1824/2245 train_time:111587ms step_avg:61.18ms
+step:1825/2245 train_time:111650ms step_avg:61.18ms
+step:1826/2245 train_time:111711ms step_avg:61.18ms
+step:1827/2245 train_time:111774ms step_avg:61.18ms
+step:1828/2245 train_time:111835ms step_avg:61.18ms
+step:1829/2245 train_time:111898ms step_avg:61.18ms
+step:1830/2245 train_time:111958ms step_avg:61.18ms
+step:1831/2245 train_time:112020ms step_avg:61.18ms
+step:1832/2245 train_time:112081ms step_avg:61.18ms
+step:1833/2245 train_time:112144ms step_avg:61.18ms
+step:1834/2245 train_time:112204ms step_avg:61.18ms
+step:1835/2245 train_time:112267ms step_avg:61.18ms
+step:1836/2245 train_time:112328ms step_avg:61.18ms
+step:1837/2245 train_time:112391ms step_avg:61.18ms
+step:1838/2245 train_time:112451ms step_avg:61.18ms
+step:1839/2245 train_time:112514ms step_avg:61.18ms
+step:1840/2245 train_time:112576ms step_avg:61.18ms
+step:1841/2245 train_time:112639ms step_avg:61.18ms
+step:1842/2245 train_time:112699ms step_avg:61.18ms
+step:1843/2245 train_time:112762ms step_avg:61.18ms
+step:1844/2245 train_time:112822ms step_avg:61.18ms
+step:1845/2245 train_time:112884ms step_avg:61.18ms
+step:1846/2245 train_time:112944ms step_avg:61.18ms
+step:1847/2245 train_time:113007ms step_avg:61.18ms
+step:1848/2245 train_time:113067ms step_avg:61.18ms
+step:1849/2245 train_time:113130ms step_avg:61.18ms
+step:1850/2245 train_time:113191ms step_avg:61.18ms
+step:1851/2245 train_time:113254ms step_avg:61.19ms
+step:1852/2245 train_time:113315ms step_avg:61.19ms
+step:1853/2245 train_time:113378ms step_avg:61.19ms
+step:1854/2245 train_time:113438ms step_avg:61.19ms
+step:1855/2245 train_time:113501ms step_avg:61.19ms
+step:1856/2245 train_time:113561ms step_avg:61.19ms
+step:1857/2245 train_time:113624ms step_avg:61.19ms
+step:1858/2245 train_time:113685ms step_avg:61.19ms
+step:1859/2245 train_time:113747ms step_avg:61.19ms
+step:1860/2245 train_time:113808ms step_avg:61.19ms
+step:1861/2245 train_time:113872ms step_avg:61.19ms
+step:1862/2245 train_time:113933ms step_avg:61.19ms
+step:1863/2245 train_time:113996ms step_avg:61.19ms
+step:1864/2245 train_time:114057ms step_avg:61.19ms
+step:1865/2245 train_time:114119ms step_avg:61.19ms
+step:1866/2245 train_time:114180ms step_avg:61.19ms
+step:1867/2245 train_time:114242ms step_avg:61.19ms
+step:1868/2245 train_time:114302ms step_avg:61.19ms
+step:1869/2245 train_time:114365ms step_avg:61.19ms
+step:1870/2245 train_time:114425ms step_avg:61.19ms
+step:1871/2245 train_time:114489ms step_avg:61.19ms
+step:1872/2245 train_time:114549ms step_avg:61.19ms
+step:1873/2245 train_time:114612ms step_avg:61.19ms
+step:1874/2245 train_time:114673ms step_avg:61.19ms
+step:1875/2245 train_time:114737ms step_avg:61.19ms
+step:1876/2245 train_time:114797ms step_avg:61.19ms
+step:1877/2245 train_time:114860ms step_avg:61.19ms
+step:1878/2245 train_time:114920ms step_avg:61.19ms
+step:1879/2245 train_time:114983ms step_avg:61.19ms
+step:1880/2245 train_time:115043ms step_avg:61.19ms
+step:1881/2245 train_time:115106ms step_avg:61.19ms
+step:1882/2245 train_time:115166ms step_avg:61.19ms
+step:1883/2245 train_time:115228ms step_avg:61.19ms
+step:1884/2245 train_time:115289ms step_avg:61.19ms
+step:1885/2245 train_time:115352ms step_avg:61.19ms
+step:1886/2245 train_time:115413ms step_avg:61.19ms
+step:1887/2245 train_time:115476ms step_avg:61.20ms
+step:1888/2245 train_time:115536ms step_avg:61.20ms
+step:1889/2245 train_time:115599ms step_avg:61.20ms
+step:1890/2245 train_time:115659ms step_avg:61.20ms
+step:1891/2245 train_time:115721ms step_avg:61.20ms
+step:1892/2245 train_time:115783ms step_avg:61.20ms
+step:1893/2245 train_time:115845ms step_avg:61.20ms
+step:1894/2245 train_time:115905ms step_avg:61.20ms
+step:1895/2245 train_time:115967ms step_avg:61.20ms
+step:1896/2245 train_time:116027ms step_avg:61.20ms
+step:1897/2245 train_time:116090ms step_avg:61.20ms
+step:1898/2245 train_time:116150ms step_avg:61.20ms
+step:1899/2245 train_time:116213ms step_avg:61.20ms
+step:1900/2245 train_time:116273ms step_avg:61.20ms
+step:1901/2245 train_time:116338ms step_avg:61.20ms
+step:1902/2245 train_time:116399ms step_avg:61.20ms
+step:1903/2245 train_time:116461ms step_avg:61.20ms
+step:1904/2245 train_time:116521ms step_avg:61.20ms
+step:1905/2245 train_time:116583ms step_avg:61.20ms
+step:1906/2245 train_time:116643ms step_avg:61.20ms
+step:1907/2245 train_time:116706ms step_avg:61.20ms
+step:1908/2245 train_time:116766ms step_avg:61.20ms
+step:1909/2245 train_time:116829ms step_avg:61.20ms
+step:1910/2245 train_time:116890ms step_avg:61.20ms
+step:1911/2245 train_time:116953ms step_avg:61.20ms
+step:1912/2245 train_time:117013ms step_avg:61.20ms
+step:1913/2245 train_time:117077ms step_avg:61.20ms
+step:1914/2245 train_time:117137ms step_avg:61.20ms
+step:1915/2245 train_time:117200ms step_avg:61.20ms
+step:1916/2245 train_time:117261ms step_avg:61.20ms
+step:1917/2245 train_time:117323ms step_avg:61.20ms
+step:1918/2245 train_time:117384ms step_avg:61.20ms
+step:1919/2245 train_time:117446ms step_avg:61.20ms
+step:1920/2245 train_time:117507ms step_avg:61.20ms
+step:1921/2245 train_time:117570ms step_avg:61.20ms
+step:1922/2245 train_time:117630ms step_avg:61.20ms
+step:1923/2245 train_time:117693ms step_avg:61.20ms
+step:1924/2245 train_time:117754ms step_avg:61.20ms
+step:1925/2245 train_time:117817ms step_avg:61.20ms
+step:1926/2245 train_time:117878ms step_avg:61.20ms
+step:1927/2245 train_time:117941ms step_avg:61.20ms
+step:1928/2245 train_time:118001ms step_avg:61.20ms
+step:1929/2245 train_time:118063ms step_avg:61.20ms
+step:1930/2245 train_time:118123ms step_avg:61.20ms
+step:1931/2245 train_time:118186ms step_avg:61.20ms
+step:1932/2245 train_time:118245ms step_avg:61.20ms
+step:1933/2245 train_time:118309ms step_avg:61.20ms
+step:1934/2245 train_time:118369ms step_avg:61.20ms
+step:1935/2245 train_time:118432ms step_avg:61.20ms
+step:1936/2245 train_time:118493ms step_avg:61.21ms
+step:1937/2245 train_time:118556ms step_avg:61.21ms
+step:1938/2245 train_time:118616ms step_avg:61.21ms
+step:1939/2245 train_time:118679ms step_avg:61.21ms
+step:1940/2245 train_time:118739ms step_avg:61.21ms
+step:1941/2245 train_time:118803ms step_avg:61.21ms
+step:1942/2245 train_time:118863ms step_avg:61.21ms
+step:1943/2245 train_time:118926ms step_avg:61.21ms
+step:1944/2245 train_time:118986ms step_avg:61.21ms
+step:1945/2245 train_time:119048ms step_avg:61.21ms
+step:1946/2245 train_time:119109ms step_avg:61.21ms
+step:1947/2245 train_time:119172ms step_avg:61.21ms
+step:1948/2245 train_time:119233ms step_avg:61.21ms
+step:1949/2245 train_time:119296ms step_avg:61.21ms
+step:1950/2245 train_time:119356ms step_avg:61.21ms
+step:1951/2245 train_time:119419ms step_avg:61.21ms
+step:1952/2245 train_time:119480ms step_avg:61.21ms
+step:1953/2245 train_time:119541ms step_avg:61.21ms
+step:1954/2245 train_time:119602ms step_avg:61.21ms
+step:1955/2245 train_time:119664ms step_avg:61.21ms
+step:1956/2245 train_time:119725ms step_avg:61.21ms
+step:1957/2245 train_time:119789ms step_avg:61.21ms
+step:1958/2245 train_time:119849ms step_avg:61.21ms
+step:1959/2245 train_time:119912ms step_avg:61.21ms
+step:1960/2245 train_time:119974ms step_avg:61.21ms
+step:1961/2245 train_time:120037ms step_avg:61.21ms
+step:1962/2245 train_time:120097ms step_avg:61.21ms
+step:1963/2245 train_time:120160ms step_avg:61.21ms
+step:1964/2245 train_time:120220ms step_avg:61.21ms
+step:1965/2245 train_time:120282ms step_avg:61.21ms
+step:1966/2245 train_time:120342ms step_avg:61.21ms
+step:1967/2245 train_time:120405ms step_avg:61.21ms
+step:1968/2245 train_time:120465ms step_avg:61.21ms
+step:1969/2245 train_time:120527ms step_avg:61.21ms
+step:1970/2245 train_time:120588ms step_avg:61.21ms
+step:1971/2245 train_time:120651ms step_avg:61.21ms
+step:1972/2245 train_time:120712ms step_avg:61.21ms
+step:1973/2245 train_time:120775ms step_avg:61.21ms
+step:1974/2245 train_time:120836ms step_avg:61.21ms
+step:1975/2245 train_time:120898ms step_avg:61.21ms
+step:1976/2245 train_time:120959ms step_avg:61.21ms
+step:1977/2245 train_time:121022ms step_avg:61.21ms
+step:1978/2245 train_time:121082ms step_avg:61.21ms
+step:1979/2245 train_time:121145ms step_avg:61.22ms
+step:1980/2245 train_time:121205ms step_avg:61.21ms
+step:1981/2245 train_time:121268ms step_avg:61.22ms
+step:1982/2245 train_time:121328ms step_avg:61.22ms
+step:1983/2245 train_time:121391ms step_avg:61.22ms
+step:1984/2245 train_time:121452ms step_avg:61.22ms
+step:1985/2245 train_time:121514ms step_avg:61.22ms
+step:1986/2245 train_time:121575ms step_avg:61.22ms
+step:1987/2245 train_time:121638ms step_avg:61.22ms
+step:1988/2245 train_time:121698ms step_avg:61.22ms
+step:1989/2245 train_time:121761ms step_avg:61.22ms
+step:1990/2245 train_time:121821ms step_avg:61.22ms
+step:1991/2245 train_time:121884ms step_avg:61.22ms
+step:1992/2245 train_time:121944ms step_avg:61.22ms
+step:1993/2245 train_time:122007ms step_avg:61.22ms
+step:1994/2245 train_time:122067ms step_avg:61.22ms
+step:1995/2245 train_time:122130ms step_avg:61.22ms
+step:1996/2245 train_time:122191ms step_avg:61.22ms
+step:1997/2245 train_time:122254ms step_avg:61.22ms
+step:1998/2245 train_time:122315ms step_avg:61.22ms
+step:1999/2245 train_time:122378ms step_avg:61.22ms
+step:2000/2245 train_time:122438ms step_avg:61.22ms
+step:2000/2245 val_loss:3.3226 train_time:122501ms step_avg:61.25ms
+step:2001/2245 train_time:122519ms step_avg:61.23ms
+step:2002/2245 train_time:122564ms step_avg:61.22ms
+step:2003/2245 train_time:122630ms step_avg:61.22ms
+step:2004/2245 train_time:122690ms step_avg:61.22ms
+step:2005/2245 train_time:122753ms step_avg:61.22ms
+step:2006/2245 train_time:122814ms step_avg:61.22ms
+step:2007/2245 train_time:122876ms step_avg:61.22ms
+step:2008/2245 train_time:122935ms step_avg:61.22ms
+step:2009/2245 train_time:122997ms step_avg:61.22ms
+step:2010/2245 train_time:123057ms step_avg:61.22ms
+step:2011/2245 train_time:123119ms step_avg:61.22ms
+step:2012/2245 train_time:123179ms step_avg:61.22ms
+step:2013/2245 train_time:123242ms step_avg:61.22ms
+step:2014/2245 train_time:123304ms step_avg:61.22ms
+step:2015/2245 train_time:123367ms step_avg:61.22ms
+step:2016/2245 train_time:123426ms step_avg:61.22ms
+step:2017/2245 train_time:123491ms step_avg:61.22ms
+step:2018/2245 train_time:123552ms step_avg:61.23ms
+step:2019/2245 train_time:123616ms step_avg:61.23ms
+step:2020/2245 train_time:123677ms step_avg:61.23ms
+step:2021/2245 train_time:123742ms step_avg:61.23ms
+step:2022/2245 train_time:123803ms step_avg:61.23ms
+step:2023/2245 train_time:123865ms step_avg:61.23ms
+step:2024/2245 train_time:123925ms step_avg:61.23ms
+step:2025/2245 train_time:123987ms step_avg:61.23ms
+step:2026/2245 train_time:124047ms step_avg:61.23ms
+step:2027/2245 train_time:124109ms step_avg:61.23ms
+step:2028/2245 train_time:124168ms step_avg:61.23ms
+step:2029/2245 train_time:124231ms step_avg:61.23ms
+step:2030/2245 train_time:124291ms step_avg:61.23ms
+step:2031/2245 train_time:124354ms step_avg:61.23ms
+step:2032/2245 train_time:124415ms step_avg:61.23ms
+step:2033/2245 train_time:124479ms step_avg:61.23ms
+step:2034/2245 train_time:124541ms step_avg:61.23ms
+step:2035/2245 train_time:124605ms step_avg:61.23ms
+step:2036/2245 train_time:124665ms step_avg:61.23ms
+step:2037/2245 train_time:124729ms step_avg:61.23ms
+step:2038/2245 train_time:124789ms step_avg:61.23ms
+step:2039/2245 train_time:124852ms step_avg:61.23ms
+step:2040/2245 train_time:124913ms step_avg:61.23ms
+step:2041/2245 train_time:124975ms step_avg:61.23ms
+step:2042/2245 train_time:125035ms step_avg:61.23ms
+step:2043/2245 train_time:125097ms step_avg:61.23ms
+step:2044/2245 train_time:125158ms step_avg:61.23ms
+step:2045/2245 train_time:125221ms step_avg:61.23ms
+step:2046/2245 train_time:125282ms step_avg:61.23ms
+step:2047/2245 train_time:125345ms step_avg:61.23ms
+step:2048/2245 train_time:125406ms step_avg:61.23ms
+step:2049/2245 train_time:125469ms step_avg:61.23ms
+step:2050/2245 train_time:125530ms step_avg:61.23ms
+step:2051/2245 train_time:125593ms step_avg:61.23ms
+step:2052/2245 train_time:125653ms step_avg:61.23ms
+step:2053/2245 train_time:125716ms step_avg:61.24ms
+step:2054/2245 train_time:125776ms step_avg:61.23ms
+step:2055/2245 train_time:125839ms step_avg:61.24ms
+step:2056/2245 train_time:125900ms step_avg:61.24ms
+step:2057/2245 train_time:125963ms step_avg:61.24ms
+step:2058/2245 train_time:126024ms step_avg:61.24ms
+step:2059/2245 train_time:126087ms step_avg:61.24ms
+step:2060/2245 train_time:126146ms step_avg:61.24ms
+step:2061/2245 train_time:126209ms step_avg:61.24ms
+step:2062/2245 train_time:126269ms step_avg:61.24ms
+step:2063/2245 train_time:126332ms step_avg:61.24ms
+step:2064/2245 train_time:126392ms step_avg:61.24ms
+step:2065/2245 train_time:126455ms step_avg:61.24ms
+step:2066/2245 train_time:126516ms step_avg:61.24ms
+step:2067/2245 train_time:126579ms step_avg:61.24ms
+step:2068/2245 train_time:126641ms step_avg:61.24ms
+step:2069/2245 train_time:126705ms step_avg:61.24ms
+step:2070/2245 train_time:126765ms step_avg:61.24ms
+step:2071/2245 train_time:126828ms step_avg:61.24ms
+step:2072/2245 train_time:126888ms step_avg:61.24ms
+step:2073/2245 train_time:126951ms step_avg:61.24ms
+step:2074/2245 train_time:127012ms step_avg:61.24ms
+step:2075/2245 train_time:127074ms step_avg:61.24ms
+step:2076/2245 train_time:127134ms step_avg:61.24ms
+step:2077/2245 train_time:127197ms step_avg:61.24ms
+step:2078/2245 train_time:127257ms step_avg:61.24ms
+step:2079/2245 train_time:127320ms step_avg:61.24ms
+step:2080/2245 train_time:127381ms step_avg:61.24ms
+step:2081/2245 train_time:127444ms step_avg:61.24ms
+step:2082/2245 train_time:127505ms step_avg:61.24ms
+step:2083/2245 train_time:127568ms step_avg:61.24ms
+step:2084/2245 train_time:127629ms step_avg:61.24ms
+step:2085/2245 train_time:127693ms step_avg:61.24ms
+step:2086/2245 train_time:127753ms step_avg:61.24ms
+step:2087/2245 train_time:127816ms step_avg:61.24ms
+step:2088/2245 train_time:127876ms step_avg:61.24ms
+step:2089/2245 train_time:127940ms step_avg:61.24ms
+step:2090/2245 train_time:128001ms step_avg:61.24ms
+step:2091/2245 train_time:128064ms step_avg:61.25ms
+step:2092/2245 train_time:128124ms step_avg:61.24ms
+step:2093/2245 train_time:128187ms step_avg:61.25ms
+step:2094/2245 train_time:128247ms step_avg:61.25ms
+step:2095/2245 train_time:128310ms step_avg:61.25ms
+step:2096/2245 train_time:128370ms step_avg:61.25ms
+step:2097/2245 train_time:128433ms step_avg:61.25ms
+step:2098/2245 train_time:128493ms step_avg:61.25ms
+step:2099/2245 train_time:128556ms step_avg:61.25ms
+step:2100/2245 train_time:128617ms step_avg:61.25ms
+step:2101/2245 train_time:128680ms step_avg:61.25ms
+step:2102/2245 train_time:128742ms step_avg:61.25ms
+step:2103/2245 train_time:128805ms step_avg:61.25ms
+step:2104/2245 train_time:128866ms step_avg:61.25ms
+step:2105/2245 train_time:128929ms step_avg:61.25ms
+step:2106/2245 train_time:128989ms step_avg:61.25ms
+step:2107/2245 train_time:129052ms step_avg:61.25ms
+step:2108/2245 train_time:129112ms step_avg:61.25ms
+step:2109/2245 train_time:129175ms step_avg:61.25ms
+step:2110/2245 train_time:129235ms step_avg:61.25ms
+step:2111/2245 train_time:129298ms step_avg:61.25ms
+step:2112/2245 train_time:129358ms step_avg:61.25ms
+step:2113/2245 train_time:129421ms step_avg:61.25ms
+step:2114/2245 train_time:129482ms step_avg:61.25ms
+step:2115/2245 train_time:129545ms step_avg:61.25ms
+step:2116/2245 train_time:129606ms step_avg:61.25ms
+step:2117/2245 train_time:129668ms step_avg:61.25ms
+step:2118/2245 train_time:129729ms step_avg:61.25ms
+step:2119/2245 train_time:129791ms step_avg:61.25ms
+step:2120/2245 train_time:129852ms step_avg:61.25ms
+step:2121/2245 train_time:129915ms step_avg:61.25ms
+step:2122/2245 train_time:129975ms step_avg:61.25ms
+step:2123/2245 train_time:130039ms step_avg:61.25ms
+step:2124/2245 train_time:130101ms step_avg:61.25ms
+step:2125/2245 train_time:130164ms step_avg:61.25ms
+step:2126/2245 train_time:130224ms step_avg:61.25ms
+step:2127/2245 train_time:130287ms step_avg:61.25ms
+step:2128/2245 train_time:130347ms step_avg:61.25ms
+step:2129/2245 train_time:130409ms step_avg:61.25ms
+step:2130/2245 train_time:130470ms step_avg:61.25ms
+step:2131/2245 train_time:130532ms step_avg:61.25ms
+step:2132/2245 train_time:130593ms step_avg:61.25ms
+step:2133/2245 train_time:130656ms step_avg:61.25ms
+step:2134/2245 train_time:130716ms step_avg:61.25ms
+step:2135/2245 train_time:130779ms step_avg:61.25ms
+step:2136/2245 train_time:130840ms step_avg:61.25ms
+step:2137/2245 train_time:130903ms step_avg:61.26ms
+step:2138/2245 train_time:130964ms step_avg:61.26ms
+step:2139/2245 train_time:131027ms step_avg:61.26ms
+step:2140/2245 train_time:131087ms step_avg:61.26ms
+step:2141/2245 train_time:131150ms step_avg:61.26ms
+step:2142/2245 train_time:131211ms step_avg:61.26ms
+step:2143/2245 train_time:131273ms step_avg:61.26ms
+step:2144/2245 train_time:131333ms step_avg:61.26ms
+step:2145/2245 train_time:131396ms step_avg:61.26ms
+step:2146/2245 train_time:131456ms step_avg:61.26ms
+step:2147/2245 train_time:131519ms step_avg:61.26ms
+step:2148/2245 train_time:131580ms step_avg:61.26ms
+step:2149/2245 train_time:131644ms step_avg:61.26ms
+step:2150/2245 train_time:131705ms step_avg:61.26ms
+step:2151/2245 train_time:131767ms step_avg:61.26ms
+step:2152/2245 train_time:131828ms step_avg:61.26ms
+step:2153/2245 train_time:131890ms step_avg:61.26ms
+step:2154/2245 train_time:131951ms step_avg:61.26ms
+step:2155/2245 train_time:132014ms step_avg:61.26ms
+step:2156/2245 train_time:132074ms step_avg:61.26ms
+step:2157/2245 train_time:132137ms step_avg:61.26ms
+step:2158/2245 train_time:132198ms step_avg:61.26ms
+step:2159/2245 train_time:132262ms step_avg:61.26ms
+step:2160/2245 train_time:132322ms step_avg:61.26ms
+step:2161/2245 train_time:132385ms step_avg:61.26ms
+step:2162/2245 train_time:132445ms step_avg:61.26ms
+step:2163/2245 train_time:132508ms step_avg:61.26ms
+step:2164/2245 train_time:132568ms step_avg:61.26ms
+step:2165/2245 train_time:132631ms step_avg:61.26ms
+step:2166/2245 train_time:132691ms step_avg:61.26ms
+step:2167/2245 train_time:132754ms step_avg:61.26ms
+step:2168/2245 train_time:132814ms step_avg:61.26ms
+step:2169/2245 train_time:132877ms step_avg:61.26ms
+step:2170/2245 train_time:132938ms step_avg:61.26ms
+step:2171/2245 train_time:133002ms step_avg:61.26ms
+step:2172/2245 train_time:133063ms step_avg:61.26ms
+step:2173/2245 train_time:133126ms step_avg:61.26ms
+step:2174/2245 train_time:133186ms step_avg:61.26ms
+step:2175/2245 train_time:133248ms step_avg:61.26ms
+step:2176/2245 train_time:133309ms step_avg:61.26ms
+step:2177/2245 train_time:133371ms step_avg:61.26ms
+step:2178/2245 train_time:133431ms step_avg:61.26ms
+step:2179/2245 train_time:133493ms step_avg:61.26ms
+step:2180/2245 train_time:133554ms step_avg:61.26ms
+step:2181/2245 train_time:133616ms step_avg:61.26ms
+step:2182/2245 train_time:133677ms step_avg:61.26ms
+step:2183/2245 train_time:133740ms step_avg:61.26ms
+step:2184/2245 train_time:133802ms step_avg:61.26ms
+step:2185/2245 train_time:133865ms step_avg:61.27ms
+step:2186/2245 train_time:133925ms step_avg:61.27ms
+step:2187/2245 train_time:133988ms step_avg:61.27ms
+step:2188/2245 train_time:134048ms step_avg:61.27ms
+step:2189/2245 train_time:134111ms step_avg:61.27ms
+step:2190/2245 train_time:134172ms step_avg:61.27ms
+step:2191/2245 train_time:134235ms step_avg:61.27ms
+step:2192/2245 train_time:134295ms step_avg:61.27ms
+step:2193/2245 train_time:134357ms step_avg:61.27ms
+step:2194/2245 train_time:134418ms step_avg:61.27ms
+step:2195/2245 train_time:134482ms step_avg:61.27ms
+step:2196/2245 train_time:134542ms step_avg:61.27ms
+step:2197/2245 train_time:134606ms step_avg:61.27ms
+step:2198/2245 train_time:134666ms step_avg:61.27ms
+step:2199/2245 train_time:134728ms step_avg:61.27ms
+step:2200/2245 train_time:134789ms step_avg:61.27ms
+step:2201/2245 train_time:134851ms step_avg:61.27ms
+step:2202/2245
train_time:134912ms step_avg:61.27ms +step:2203/2245 train_time:134975ms step_avg:61.27ms +step:2204/2245 train_time:135035ms step_avg:61.27ms +step:2205/2245 train_time:135099ms step_avg:61.27ms +step:2206/2245 train_time:135160ms step_avg:61.27ms +step:2207/2245 train_time:135222ms step_avg:61.27ms +step:2208/2245 train_time:135283ms step_avg:61.27ms +step:2209/2245 train_time:135347ms step_avg:61.27ms +step:2210/2245 train_time:135408ms step_avg:61.27ms +step:2211/2245 train_time:135470ms step_avg:61.27ms +step:2212/2245 train_time:135531ms step_avg:61.27ms +step:2213/2245 train_time:135594ms step_avg:61.27ms +step:2214/2245 train_time:135655ms step_avg:61.27ms +step:2215/2245 train_time:135718ms step_avg:61.27ms +step:2216/2245 train_time:135779ms step_avg:61.27ms +step:2217/2245 train_time:135842ms step_avg:61.27ms +step:2218/2245 train_time:135904ms step_avg:61.27ms +step:2219/2245 train_time:135967ms step_avg:61.27ms +step:2220/2245 train_time:136027ms step_avg:61.27ms +step:2221/2245 train_time:136091ms step_avg:61.27ms +step:2222/2245 train_time:136151ms step_avg:61.27ms +step:2223/2245 train_time:136214ms step_avg:61.27ms +step:2224/2245 train_time:136274ms step_avg:61.27ms +step:2225/2245 train_time:136337ms step_avg:61.28ms +step:2226/2245 train_time:136398ms step_avg:61.27ms +step:2227/2245 train_time:136461ms step_avg:61.28ms +step:2228/2245 train_time:136523ms step_avg:61.28ms +step:2229/2245 train_time:136586ms step_avg:61.28ms +step:2230/2245 train_time:136647ms step_avg:61.28ms +step:2231/2245 train_time:136710ms step_avg:61.28ms +step:2232/2245 train_time:136770ms step_avg:61.28ms +step:2233/2245 train_time:136833ms step_avg:61.28ms +step:2234/2245 train_time:136894ms step_avg:61.28ms +step:2235/2245 train_time:136957ms step_avg:61.28ms +step:2236/2245 train_time:137017ms step_avg:61.28ms +step:2237/2245 train_time:137080ms step_avg:61.28ms +step:2238/2245 train_time:137142ms step_avg:61.28ms +step:2239/2245 train_time:137205ms step_avg:61.28ms +step:2240/2245 train_time:137265ms step_avg:61.28ms +step:2241/2245 train_time:137328ms step_avg:61.28ms +step:2242/2245 train_time:137388ms step_avg:61.28ms +step:2243/2245 train_time:137451ms step_avg:61.28ms +step:2244/2245 train_time:137512ms step_avg:61.28ms +step:2245/2245 train_time:137575ms step_avg:61.28ms +step:2245/2245 val_loss:3.2777 train_time:137636ms step_avg:61.31ms +peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/541679f5-b0e3-4a19-a5ae-34b6a4f2d896.txt b/records/track_1_short/2025-11-10_CautiousWD/541679f5-b0e3-4a19-a5ae-34b6a4f2d896.txt new file mode 100644 index 000000000..6fa786023 --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/541679f5-b0e3-4a19-a5ae-34b6a4f2d896.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 
30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of 
size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), 
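+        # A 3D input passes its real batch stride here; a 2D input passes 0 so
+        # every grid batch_idx resolves to the same single matrix.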
+ c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if 
out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    ba_plus_cAA_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+        alpha=alpha,
+        beta=beta,
+    )
+    return out
+
+# Computed for num_iters=5, safety_factor=2e-2, cushion=2
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
+]
+
+@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower
+def polar_express(G: torch.Tensor):
+    """
+    Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
+    by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
+    """
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
+
+    # Allocate buffers
+    X = X.contiguous()
+    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
+    B = torch.empty_like(A)
+    C = torch.empty_like(X)
+
+    aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm
+
+    # Perform the iterations
+    for a, b, c in polar_express_coeffs:
+        XXT(X, out=A) # A = X @ X.mT
+        ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A
+        aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X
+        X, C = C, X # Swap references to avoid unnecessary copies
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+    return X
+
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class NorMuon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor.
+    - small 1D parameters handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+        1. reduce scatter smear_gate (1 param, 7 padding params)
+        2. reduce scatter attn_gate (10 params, 6 padding params)
+        3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+        4. reduce scatter attn/mlp round 2 (16 mlp params)
+        5. wait on step 1, then compute update of 1 and schedule all gather
+        6. wait on step 2, then compute update of 2 and schedule all gather
+        7. wait on step 3, then compute update of 3 and schedule all gather
+           GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+           GPUs that receive params of type attn reshape before computing update
+        8. wait on step 4, then compute update of 4 and schedule all gather
+        9. wait for each all gather to complete and update params
+    Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and dist.get_world_size()==8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
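+        # Shape sketch for the default 8-GPU custom sizing: group sizes
+        # [1, 10, 16, 16] give chunk_size = ceil(size / 8) = [1, 2, 2, 2], so each
+        # reduce_scatter below sees a stack of chunk_size * 8 matrices, e.g.
+        # attn_gate: 10 real + 6 zero-padded slots, with 2 slots owned per rank.
+        # Ranks whose slots are all padding get num_params == 0, and their
+        # all-gathered outputs are never copied back into real params.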
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
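+            # Sketch of the NorMuon update below: with m the momentum-mixed grad,
+            #   v    = polar_express(m)                      # ~semi-orthogonal
+            #   ema  = lerp(ema, mean(v^2, one axis), 1-b2)  # Adafactor-style row/col stats
+            #   v   *= rsqrt(ema)                            # equalize per-row (or per-col) scale
+            #   v   *= ||v_before|| / ||v_after||            # restore the Frobenius norm
+            # so only the relative row/column scales change, not the update's total size.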
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
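+        # in-place copy_ rewrites the cos cache (and sin below) without
+        # reallocating the buffers, so existing references see the rescaled values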
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
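+        # e.g. next_multiple_of_n(50257, n=128) == 50304; the 47 extra logits
+        # correspond to token ids that never appear in the data.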
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? 
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
+
+# init the optimizer(s)
+# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+# learning rate schedule: flat at 1.0, then a linear decay to 0.1, then flat for the extension steps
+def get_lr(step: int):
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
+    return lr
+
+def get_ws(step: int):
+    # the short window size is always half of the long window size;
+    # "extension" steps (beyond the schedule) use the larger final ws
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
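+
+# Added spot-check (not in the original record): with num_scheduled_iterations=2205
+# and cooldown_frac=0.5, the lr multiplier holds at 1.0 for the first half of training,
+# decays linearly toward 0.1, and extension steps (>= 2205) stay pinned near the floor
+# because x is clamped to 0.9999.
+assert get_lr(0) == 1.0 and get_lr(1102) == 1.0  # flat phase (1102/2205 < 0.5)
+assert abs(get_lr(2204) - 0.101) < 1e-3          # end of the linear cooldown
+assert abs(get_lr(2244) - 0.100) < 1e-3          # extension step, clamped near 0.1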
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set muon momentum based on step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only the Muon params are stepped (and only their grads are cleared);
+    # the Adam params therefore accumulate gradients across two steps and are stepped,
+    # together with everything else, on odd steps
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
+
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size compiles to a new graph, so warm each one up (with the YaRN attn_scale applied)
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # the rotary buffer is not stored in the state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # the muon momentum buffers are not in the state_dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#       Training and validation        #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
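+
+# Added note (derived from the hyperparameters above; these numbers are not printed by the run):
+#   train tokens/step  = 2048 * 16 * 8             = 262,144
+#   total train tokens ~= 262,144 * 2,245 steps    ~= 588.5M
+#   val batch tokens   = 4 * 64 * 1024 * 8         = 2,097,152
+#   val loader calls   = grad_accum_steps * 10,485,760 // 2,097,152 = 5 per grad-accum microstep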
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 22:09:48 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 |
+| N/A 41C P0 131W / 700W | 5858MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 |
+| N/A 35C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 |
+| N/A 33C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 |
+| N/A 39C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 |
+| N/A 40C P0 131W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 |
+| N/A 34C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 |
+| N/A 40C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 |
+| N/A 33C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
++-----------------------------------------------------------------------------------------+
+
+====================================================================================================
+step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms
+step:1/2245 train_time:126ms step_avg:125.63ms
+step:2/2245 train_time:147ms step_avg:73.70ms
+step:3/2245 train_time:185ms step_avg:61.74ms
+step:4/2245 train_time:241ms step_avg:60.36ms
+step:5/2245 train_time:301ms step_avg:60.18ms
+step:6/2245 train_time:359ms step_avg:59.90ms
+step:7/2245 train_time:420ms step_avg:60.00ms
+step:8/2245 train_time:478ms step_avg:59.80ms
+step:9/2245 train_time:540ms step_avg:59.97ms
+step:10/2245 train_time:598ms step_avg:59.84ms
+step:11/2245 train_time:660ms step_avg:59.97ms
+step:12/2245 train_time:718ms step_avg:59.84ms
+step:13/2245 train_time:779ms step_avg:59.95ms
+step:14/2245 train_time:838ms step_avg:59.85ms
+step:15/2245 train_time:899ms step_avg:59.94ms +step:16/2245 train_time:958ms step_avg:59.89ms +step:17/2245 train_time:1024ms step_avg:60.22ms +step:18/2245 train_time:1087ms step_avg:60.37ms +step:19/2245 train_time:1152ms step_avg:60.61ms +step:20/2245 train_time:1211ms step_avg:60.57ms +step:21/2245 train_time:1274ms step_avg:60.66ms +step:22/2245 train_time:1334ms step_avg:60.63ms +step:23/2245 train_time:1396ms step_avg:60.69ms +step:24/2245 train_time:1456ms step_avg:60.65ms +step:25/2245 train_time:1518ms step_avg:60.71ms +step:26/2245 train_time:1577ms step_avg:60.63ms +step:27/2245 train_time:1638ms step_avg:60.67ms +step:28/2245 train_time:1697ms step_avg:60.60ms +step:29/2245 train_time:1758ms step_avg:60.62ms +step:30/2245 train_time:1816ms step_avg:60.55ms +step:31/2245 train_time:1877ms step_avg:60.56ms +step:32/2245 train_time:1936ms step_avg:60.51ms +step:33/2245 train_time:1999ms step_avg:60.58ms +step:34/2245 train_time:2058ms step_avg:60.54ms +step:35/2245 train_time:2121ms step_avg:60.60ms +step:36/2245 train_time:2180ms step_avg:60.56ms +step:37/2245 train_time:2244ms step_avg:60.64ms +step:38/2245 train_time:2303ms step_avg:60.62ms +step:39/2245 train_time:2366ms step_avg:60.66ms +step:40/2245 train_time:2425ms step_avg:60.63ms +step:41/2245 train_time:2487ms step_avg:60.66ms +step:42/2245 train_time:2546ms step_avg:60.63ms +step:43/2245 train_time:2608ms step_avg:60.66ms +step:44/2245 train_time:2668ms step_avg:60.63ms +step:45/2245 train_time:2730ms step_avg:60.66ms +step:46/2245 train_time:2789ms step_avg:60.63ms +step:47/2245 train_time:2850ms step_avg:60.65ms +step:48/2245 train_time:2910ms step_avg:60.63ms +step:49/2245 train_time:2972ms step_avg:60.66ms +step:50/2245 train_time:3032ms step_avg:60.64ms +step:51/2245 train_time:3095ms step_avg:60.68ms +step:52/2245 train_time:3155ms step_avg:60.67ms +step:53/2245 train_time:3217ms step_avg:60.71ms +step:54/2245 train_time:3277ms step_avg:60.68ms +step:55/2245 train_time:3339ms step_avg:60.70ms +step:56/2245 train_time:3397ms step_avg:60.67ms +step:57/2245 train_time:3459ms step_avg:60.69ms +step:58/2245 train_time:3518ms step_avg:60.66ms +step:59/2245 train_time:3579ms step_avg:60.66ms +step:60/2245 train_time:3638ms step_avg:60.64ms +step:61/2245 train_time:3700ms step_avg:60.66ms +step:62/2245 train_time:3760ms step_avg:60.64ms +step:63/2245 train_time:3821ms step_avg:60.65ms +step:64/2245 train_time:3880ms step_avg:60.63ms +step:65/2245 train_time:3942ms step_avg:60.65ms +step:66/2245 train_time:4002ms step_avg:60.63ms +step:67/2245 train_time:4064ms step_avg:60.66ms +step:68/2245 train_time:4123ms step_avg:60.64ms +step:69/2245 train_time:4186ms step_avg:60.66ms +step:70/2245 train_time:4245ms step_avg:60.65ms +step:71/2245 train_time:4307ms step_avg:60.66ms +step:72/2245 train_time:4367ms step_avg:60.65ms +step:73/2245 train_time:4428ms step_avg:60.66ms +step:74/2245 train_time:4488ms step_avg:60.64ms +step:75/2245 train_time:4550ms step_avg:60.66ms +step:76/2245 train_time:4609ms step_avg:60.65ms +step:77/2245 train_time:4670ms step_avg:60.66ms +step:78/2245 train_time:4729ms step_avg:60.63ms +step:79/2245 train_time:4791ms step_avg:60.65ms +step:80/2245 train_time:4852ms step_avg:60.65ms +step:81/2245 train_time:4914ms step_avg:60.67ms +step:82/2245 train_time:4974ms step_avg:60.65ms +step:83/2245 train_time:5036ms step_avg:60.67ms +step:84/2245 train_time:5095ms step_avg:60.66ms +step:85/2245 train_time:5157ms step_avg:60.67ms +step:86/2245 train_time:5216ms step_avg:60.65ms +step:87/2245 
train_time:5278ms step_avg:60.66ms +step:88/2245 train_time:5337ms step_avg:60.64ms +step:89/2245 train_time:5398ms step_avg:60.65ms +step:90/2245 train_time:5457ms step_avg:60.63ms +step:91/2245 train_time:5518ms step_avg:60.63ms +step:92/2245 train_time:5576ms step_avg:60.61ms +step:93/2245 train_time:5638ms step_avg:60.62ms +step:94/2245 train_time:5697ms step_avg:60.60ms +step:95/2245 train_time:5758ms step_avg:60.61ms +step:96/2245 train_time:5816ms step_avg:60.59ms +step:97/2245 train_time:5878ms step_avg:60.60ms +step:98/2245 train_time:5938ms step_avg:60.59ms +step:99/2245 train_time:6000ms step_avg:60.61ms +step:100/2245 train_time:6059ms step_avg:60.59ms +step:101/2245 train_time:6121ms step_avg:60.60ms +step:102/2245 train_time:6180ms step_avg:60.58ms +step:103/2245 train_time:6241ms step_avg:60.59ms +step:104/2245 train_time:6300ms step_avg:60.58ms +step:105/2245 train_time:6361ms step_avg:60.59ms +step:106/2245 train_time:6420ms step_avg:60.57ms +step:107/2245 train_time:6482ms step_avg:60.58ms +step:108/2245 train_time:6541ms step_avg:60.56ms +step:109/2245 train_time:6602ms step_avg:60.57ms +step:110/2245 train_time:6661ms step_avg:60.55ms +step:111/2245 train_time:6722ms step_avg:60.56ms +step:112/2245 train_time:6781ms step_avg:60.55ms +step:113/2245 train_time:6843ms step_avg:60.56ms +step:114/2245 train_time:6902ms step_avg:60.55ms +step:115/2245 train_time:6965ms step_avg:60.57ms +step:116/2245 train_time:7024ms step_avg:60.55ms +step:117/2245 train_time:7085ms step_avg:60.56ms +step:118/2245 train_time:7145ms step_avg:60.55ms +step:119/2245 train_time:7206ms step_avg:60.56ms +step:120/2245 train_time:7265ms step_avg:60.55ms +step:121/2245 train_time:7326ms step_avg:60.55ms +step:122/2245 train_time:7385ms step_avg:60.53ms +step:123/2245 train_time:7447ms step_avg:60.54ms +step:124/2245 train_time:7505ms step_avg:60.53ms +step:125/2245 train_time:7568ms step_avg:60.54ms +step:126/2245 train_time:7627ms step_avg:60.53ms +step:127/2245 train_time:7688ms step_avg:60.54ms +step:128/2245 train_time:7748ms step_avg:60.53ms +step:129/2245 train_time:7811ms step_avg:60.55ms +step:130/2245 train_time:7870ms step_avg:60.54ms +step:131/2245 train_time:7932ms step_avg:60.55ms +step:132/2245 train_time:7991ms step_avg:60.54ms +step:133/2245 train_time:8053ms step_avg:60.55ms +step:134/2245 train_time:8113ms step_avg:60.55ms +step:135/2245 train_time:8175ms step_avg:60.56ms +step:136/2245 train_time:8234ms step_avg:60.54ms +step:137/2245 train_time:8296ms step_avg:60.55ms +step:138/2245 train_time:8355ms step_avg:60.54ms +step:139/2245 train_time:8416ms step_avg:60.55ms +step:140/2245 train_time:8475ms step_avg:60.54ms +step:141/2245 train_time:8537ms step_avg:60.55ms +step:142/2245 train_time:8596ms step_avg:60.53ms +step:143/2245 train_time:8657ms step_avg:60.54ms +step:144/2245 train_time:8716ms step_avg:60.53ms +step:145/2245 train_time:8777ms step_avg:60.53ms +step:146/2245 train_time:8836ms step_avg:60.52ms +step:147/2245 train_time:8898ms step_avg:60.53ms +step:148/2245 train_time:8957ms step_avg:60.52ms +step:149/2245 train_time:9019ms step_avg:60.53ms +step:150/2245 train_time:9077ms step_avg:60.51ms +step:151/2245 train_time:9139ms step_avg:60.52ms +step:152/2245 train_time:9198ms step_avg:60.51ms +step:153/2245 train_time:9260ms step_avg:60.52ms +step:154/2245 train_time:9318ms step_avg:60.51ms +step:155/2245 train_time:9379ms step_avg:60.51ms +step:156/2245 train_time:9438ms step_avg:60.50ms +step:157/2245 train_time:9499ms step_avg:60.50ms +step:158/2245 
train_time:9558ms step_avg:60.49ms +step:159/2245 train_time:9618ms step_avg:60.49ms +step:160/2245 train_time:9677ms step_avg:60.48ms +step:161/2245 train_time:9739ms step_avg:60.49ms +step:162/2245 train_time:9798ms step_avg:60.48ms +step:163/2245 train_time:9858ms step_avg:60.48ms +step:164/2245 train_time:9917ms step_avg:60.47ms +step:165/2245 train_time:9978ms step_avg:60.47ms +step:166/2245 train_time:10037ms step_avg:60.46ms +step:167/2245 train_time:10098ms step_avg:60.47ms +step:168/2245 train_time:10157ms step_avg:60.46ms +step:169/2245 train_time:10218ms step_avg:60.46ms +step:170/2245 train_time:10277ms step_avg:60.45ms +step:171/2245 train_time:10339ms step_avg:60.46ms +step:172/2245 train_time:10398ms step_avg:60.45ms +step:173/2245 train_time:10459ms step_avg:60.46ms +step:174/2245 train_time:10517ms step_avg:60.44ms +step:175/2245 train_time:10579ms step_avg:60.45ms +step:176/2245 train_time:10637ms step_avg:60.44ms +step:177/2245 train_time:10699ms step_avg:60.44ms +step:178/2245 train_time:10757ms step_avg:60.43ms +step:179/2245 train_time:10818ms step_avg:60.44ms +step:180/2245 train_time:10877ms step_avg:60.43ms +step:181/2245 train_time:10938ms step_avg:60.43ms +step:182/2245 train_time:10997ms step_avg:60.42ms +step:183/2245 train_time:11058ms step_avg:60.43ms +step:184/2245 train_time:11117ms step_avg:60.42ms +step:185/2245 train_time:11178ms step_avg:60.42ms +step:186/2245 train_time:11237ms step_avg:60.42ms +step:187/2245 train_time:11299ms step_avg:60.42ms +step:188/2245 train_time:11358ms step_avg:60.41ms +step:189/2245 train_time:11419ms step_avg:60.42ms +step:190/2245 train_time:11478ms step_avg:60.41ms +step:191/2245 train_time:11539ms step_avg:60.41ms +step:192/2245 train_time:11598ms step_avg:60.40ms +step:193/2245 train_time:11659ms step_avg:60.41ms +step:194/2245 train_time:11717ms step_avg:60.40ms +step:195/2245 train_time:11778ms step_avg:60.40ms +step:196/2245 train_time:11837ms step_avg:60.39ms +step:197/2245 train_time:11898ms step_avg:60.40ms +step:198/2245 train_time:11957ms step_avg:60.39ms +step:199/2245 train_time:12018ms step_avg:60.39ms +step:200/2245 train_time:12076ms step_avg:60.38ms +step:201/2245 train_time:12138ms step_avg:60.39ms +step:202/2245 train_time:12197ms step_avg:60.38ms +step:203/2245 train_time:12258ms step_avg:60.39ms +step:204/2245 train_time:12317ms step_avg:60.38ms +step:205/2245 train_time:12377ms step_avg:60.38ms +step:206/2245 train_time:12436ms step_avg:60.37ms +step:207/2245 train_time:12497ms step_avg:60.37ms +step:208/2245 train_time:12556ms step_avg:60.37ms +step:209/2245 train_time:12617ms step_avg:60.37ms +step:210/2245 train_time:12676ms step_avg:60.36ms +step:211/2245 train_time:12737ms step_avg:60.37ms +step:212/2245 train_time:12796ms step_avg:60.36ms +step:213/2245 train_time:12857ms step_avg:60.36ms +step:214/2245 train_time:12916ms step_avg:60.36ms +step:215/2245 train_time:12977ms step_avg:60.36ms +step:216/2245 train_time:13036ms step_avg:60.35ms +step:217/2245 train_time:13097ms step_avg:60.36ms +step:218/2245 train_time:13156ms step_avg:60.35ms +step:219/2245 train_time:13217ms step_avg:60.35ms +step:220/2245 train_time:13276ms step_avg:60.35ms +step:221/2245 train_time:13337ms step_avg:60.35ms +step:222/2245 train_time:13396ms step_avg:60.34ms +step:223/2245 train_time:13458ms step_avg:60.35ms +step:224/2245 train_time:13517ms step_avg:60.34ms +step:225/2245 train_time:13577ms step_avg:60.34ms +step:226/2245 train_time:13636ms step_avg:60.34ms +step:227/2245 train_time:13697ms step_avg:60.34ms 
+step:228/2245 train_time:13756ms step_avg:60.33ms +step:229/2245 train_time:13818ms step_avg:60.34ms +step:230/2245 train_time:13876ms step_avg:60.33ms +step:231/2245 train_time:13938ms step_avg:60.34ms +step:232/2245 train_time:13996ms step_avg:60.33ms +step:233/2245 train_time:14058ms step_avg:60.33ms +step:234/2245 train_time:14116ms step_avg:60.33ms +step:235/2245 train_time:14177ms step_avg:60.33ms +step:236/2245 train_time:14236ms step_avg:60.32ms +step:237/2245 train_time:14297ms step_avg:60.33ms +step:238/2245 train_time:14356ms step_avg:60.32ms +step:239/2245 train_time:14418ms step_avg:60.33ms +step:240/2245 train_time:14476ms step_avg:60.32ms +step:241/2245 train_time:14537ms step_avg:60.32ms +step:242/2245 train_time:14595ms step_avg:60.31ms +step:243/2245 train_time:14657ms step_avg:60.32ms +step:244/2245 train_time:14716ms step_avg:60.31ms +step:245/2245 train_time:14777ms step_avg:60.32ms +step:246/2245 train_time:14836ms step_avg:60.31ms +step:247/2245 train_time:14897ms step_avg:60.31ms +step:248/2245 train_time:14957ms step_avg:60.31ms +step:249/2245 train_time:15017ms step_avg:60.31ms +step:250/2245 train_time:15076ms step_avg:60.30ms +step:250/2245 val_loss:4.0860 train_time:15138ms step_avg:60.55ms +step:251/2245 train_time:15158ms step_avg:60.39ms +step:252/2245 train_time:15200ms step_avg:60.32ms +step:253/2245 train_time:15266ms step_avg:60.34ms +step:254/2245 train_time:15330ms step_avg:60.36ms +step:255/2245 train_time:15395ms step_avg:60.37ms +step:256/2245 train_time:15453ms step_avg:60.36ms +step:257/2245 train_time:15515ms step_avg:60.37ms +step:258/2245 train_time:15573ms step_avg:60.36ms +step:259/2245 train_time:15634ms step_avg:60.36ms +step:260/2245 train_time:15692ms step_avg:60.36ms +step:261/2245 train_time:15753ms step_avg:60.36ms +step:262/2245 train_time:15811ms step_avg:60.35ms +step:263/2245 train_time:15872ms step_avg:60.35ms +step:264/2245 train_time:15930ms step_avg:60.34ms +step:265/2245 train_time:15990ms step_avg:60.34ms +step:266/2245 train_time:16048ms step_avg:60.33ms +step:267/2245 train_time:16110ms step_avg:60.34ms +step:268/2245 train_time:16169ms step_avg:60.33ms +step:269/2245 train_time:16232ms step_avg:60.34ms +step:270/2245 train_time:16293ms step_avg:60.34ms +step:271/2245 train_time:16356ms step_avg:60.35ms +step:272/2245 train_time:16416ms step_avg:60.35ms +step:273/2245 train_time:16477ms step_avg:60.36ms +step:274/2245 train_time:16536ms step_avg:60.35ms +step:275/2245 train_time:16597ms step_avg:60.35ms +step:276/2245 train_time:16656ms step_avg:60.35ms +step:277/2245 train_time:16717ms step_avg:60.35ms +step:278/2245 train_time:16775ms step_avg:60.34ms +step:279/2245 train_time:16837ms step_avg:60.35ms +step:280/2245 train_time:16895ms step_avg:60.34ms +step:281/2245 train_time:16957ms step_avg:60.34ms +step:282/2245 train_time:17015ms step_avg:60.34ms +step:283/2245 train_time:17077ms step_avg:60.34ms +step:284/2245 train_time:17137ms step_avg:60.34ms +step:285/2245 train_time:17200ms step_avg:60.35ms +step:286/2245 train_time:17260ms step_avg:60.35ms +step:287/2245 train_time:17323ms step_avg:60.36ms +step:288/2245 train_time:17382ms step_avg:60.35ms +step:289/2245 train_time:17443ms step_avg:60.36ms +step:290/2245 train_time:17502ms step_avg:60.35ms +step:291/2245 train_time:17564ms step_avg:60.36ms +step:292/2245 train_time:17622ms step_avg:60.35ms +step:293/2245 train_time:17683ms step_avg:60.35ms +step:294/2245 train_time:17742ms step_avg:60.35ms +step:295/2245 train_time:17803ms step_avg:60.35ms +step:296/2245 
train_time:17861ms step_avg:60.34ms +step:297/2245 train_time:17922ms step_avg:60.34ms +step:298/2245 train_time:17981ms step_avg:60.34ms +step:299/2245 train_time:18042ms step_avg:60.34ms +step:300/2245 train_time:18101ms step_avg:60.34ms +step:301/2245 train_time:18163ms step_avg:60.34ms +step:302/2245 train_time:18223ms step_avg:60.34ms +step:303/2245 train_time:18284ms step_avg:60.34ms +step:304/2245 train_time:18343ms step_avg:60.34ms +step:305/2245 train_time:18405ms step_avg:60.34ms +step:306/2245 train_time:18463ms step_avg:60.34ms +step:307/2245 train_time:18525ms step_avg:60.34ms +step:308/2245 train_time:18584ms step_avg:60.34ms +step:309/2245 train_time:18646ms step_avg:60.34ms +step:310/2245 train_time:18704ms step_avg:60.34ms +step:311/2245 train_time:18764ms step_avg:60.33ms +step:312/2245 train_time:18823ms step_avg:60.33ms +step:313/2245 train_time:18884ms step_avg:60.33ms +step:314/2245 train_time:18942ms step_avg:60.33ms +step:315/2245 train_time:19004ms step_avg:60.33ms +step:316/2245 train_time:19062ms step_avg:60.32ms +step:317/2245 train_time:19124ms step_avg:60.33ms +step:318/2245 train_time:19183ms step_avg:60.32ms +step:319/2245 train_time:19246ms step_avg:60.33ms +step:320/2245 train_time:19305ms step_avg:60.33ms +step:321/2245 train_time:19366ms step_avg:60.33ms +step:322/2245 train_time:19424ms step_avg:60.32ms +step:323/2245 train_time:19485ms step_avg:60.33ms +step:324/2245 train_time:19544ms step_avg:60.32ms +step:325/2245 train_time:19605ms step_avg:60.32ms +step:326/2245 train_time:19664ms step_avg:60.32ms +step:327/2245 train_time:19725ms step_avg:60.32ms +step:328/2245 train_time:19784ms step_avg:60.32ms +step:329/2245 train_time:19845ms step_avg:60.32ms +step:330/2245 train_time:19903ms step_avg:60.31ms +step:331/2245 train_time:19964ms step_avg:60.32ms +step:332/2245 train_time:20023ms step_avg:60.31ms +step:333/2245 train_time:20084ms step_avg:60.31ms +step:334/2245 train_time:20143ms step_avg:60.31ms +step:335/2245 train_time:20205ms step_avg:60.31ms +step:336/2245 train_time:20263ms step_avg:60.31ms +step:337/2245 train_time:20325ms step_avg:60.31ms +step:338/2245 train_time:20384ms step_avg:60.31ms +step:339/2245 train_time:20445ms step_avg:60.31ms +step:340/2245 train_time:20504ms step_avg:60.31ms +step:341/2245 train_time:20565ms step_avg:60.31ms +step:342/2245 train_time:20624ms step_avg:60.30ms +step:343/2245 train_time:20685ms step_avg:60.31ms +step:344/2245 train_time:20743ms step_avg:60.30ms +step:345/2245 train_time:20804ms step_avg:60.30ms +step:346/2245 train_time:20864ms step_avg:60.30ms +step:347/2245 train_time:20926ms step_avg:60.30ms +step:348/2245 train_time:20984ms step_avg:60.30ms +step:349/2245 train_time:21045ms step_avg:60.30ms +step:350/2245 train_time:21104ms step_avg:60.30ms +step:351/2245 train_time:21165ms step_avg:60.30ms +step:352/2245 train_time:21224ms step_avg:60.29ms +step:353/2245 train_time:21285ms step_avg:60.30ms +step:354/2245 train_time:21344ms step_avg:60.30ms +step:355/2245 train_time:21406ms step_avg:60.30ms +step:356/2245 train_time:21465ms step_avg:60.29ms +step:357/2245 train_time:21526ms step_avg:60.30ms +step:358/2245 train_time:21584ms step_avg:60.29ms +step:359/2245 train_time:21645ms step_avg:60.29ms +step:360/2245 train_time:21704ms step_avg:60.29ms +step:361/2245 train_time:21765ms step_avg:60.29ms +step:362/2245 train_time:21823ms step_avg:60.29ms +step:363/2245 train_time:21885ms step_avg:60.29ms +step:364/2245 train_time:21943ms step_avg:60.28ms +step:365/2245 train_time:22004ms step_avg:60.29ms 
+step:366/2245 train_time:22062ms step_avg:60.28ms +step:367/2245 train_time:22124ms step_avg:60.28ms +step:368/2245 train_time:22183ms step_avg:60.28ms +step:369/2245 train_time:22245ms step_avg:60.28ms +step:370/2245 train_time:22304ms step_avg:60.28ms +step:371/2245 train_time:22365ms step_avg:60.28ms +step:372/2245 train_time:22424ms step_avg:60.28ms +step:373/2245 train_time:22486ms step_avg:60.28ms +step:374/2245 train_time:22544ms step_avg:60.28ms +step:375/2245 train_time:22606ms step_avg:60.28ms +step:376/2245 train_time:22664ms step_avg:60.28ms +step:377/2245 train_time:22725ms step_avg:60.28ms +step:378/2245 train_time:22784ms step_avg:60.28ms +step:379/2245 train_time:22845ms step_avg:60.28ms +step:380/2245 train_time:22904ms step_avg:60.27ms +step:381/2245 train_time:22965ms step_avg:60.28ms +step:382/2245 train_time:23023ms step_avg:60.27ms +step:383/2245 train_time:23084ms step_avg:60.27ms +step:384/2245 train_time:23143ms step_avg:60.27ms +step:385/2245 train_time:23205ms step_avg:60.27ms +step:386/2245 train_time:23263ms step_avg:60.27ms +step:387/2245 train_time:23325ms step_avg:60.27ms +step:388/2245 train_time:23384ms step_avg:60.27ms +step:389/2245 train_time:23445ms step_avg:60.27ms +step:390/2245 train_time:23504ms step_avg:60.27ms +step:391/2245 train_time:23565ms step_avg:60.27ms +step:392/2245 train_time:23624ms step_avg:60.26ms +step:393/2245 train_time:23685ms step_avg:60.27ms +step:394/2245 train_time:23743ms step_avg:60.26ms +step:395/2245 train_time:23806ms step_avg:60.27ms +step:396/2245 train_time:23864ms step_avg:60.26ms +step:397/2245 train_time:23925ms step_avg:60.26ms +step:398/2245 train_time:23984ms step_avg:60.26ms +step:399/2245 train_time:24046ms step_avg:60.27ms +step:400/2245 train_time:24105ms step_avg:60.26ms +step:401/2245 train_time:24165ms step_avg:60.26ms +step:402/2245 train_time:24224ms step_avg:60.26ms +step:403/2245 train_time:24285ms step_avg:60.26ms +step:404/2245 train_time:24344ms step_avg:60.26ms +step:405/2245 train_time:24406ms step_avg:60.26ms +step:406/2245 train_time:24464ms step_avg:60.26ms +step:407/2245 train_time:24526ms step_avg:60.26ms +step:408/2245 train_time:24585ms step_avg:60.26ms +step:409/2245 train_time:24646ms step_avg:60.26ms +step:410/2245 train_time:24705ms step_avg:60.26ms +step:411/2245 train_time:24766ms step_avg:60.26ms +step:412/2245 train_time:24825ms step_avg:60.25ms +step:413/2245 train_time:24886ms step_avg:60.26ms +step:414/2245 train_time:24945ms step_avg:60.25ms +step:415/2245 train_time:25006ms step_avg:60.26ms +step:416/2245 train_time:25064ms step_avg:60.25ms +step:417/2245 train_time:25125ms step_avg:60.25ms +step:418/2245 train_time:25184ms step_avg:60.25ms +step:419/2245 train_time:25245ms step_avg:60.25ms +step:420/2245 train_time:25304ms step_avg:60.25ms +step:421/2245 train_time:25366ms step_avg:60.25ms +step:422/2245 train_time:25424ms step_avg:60.25ms +step:423/2245 train_time:25485ms step_avg:60.25ms +step:424/2245 train_time:25544ms step_avg:60.24ms +step:425/2245 train_time:25605ms step_avg:60.25ms +step:426/2245 train_time:25664ms step_avg:60.24ms +step:427/2245 train_time:25725ms step_avg:60.25ms +step:428/2245 train_time:25784ms step_avg:60.24ms +step:429/2245 train_time:25845ms step_avg:60.25ms +step:430/2245 train_time:25904ms step_avg:60.24ms +step:431/2245 train_time:25965ms step_avg:60.24ms +step:432/2245 train_time:26023ms step_avg:60.24ms +step:433/2245 train_time:26084ms step_avg:60.24ms +step:434/2245 train_time:26143ms step_avg:60.24ms +step:435/2245 train_time:26205ms 
step_avg:60.24ms +step:436/2245 train_time:26264ms step_avg:60.24ms +step:437/2245 train_time:26325ms step_avg:60.24ms +step:438/2245 train_time:26384ms step_avg:60.24ms +step:439/2245 train_time:26446ms step_avg:60.24ms +step:440/2245 train_time:26504ms step_avg:60.24ms +step:441/2245 train_time:26565ms step_avg:60.24ms +step:442/2245 train_time:26623ms step_avg:60.23ms +step:443/2245 train_time:26685ms step_avg:60.24ms +step:444/2245 train_time:26743ms step_avg:60.23ms +step:445/2245 train_time:26805ms step_avg:60.23ms +step:446/2245 train_time:26863ms step_avg:60.23ms +step:447/2245 train_time:26925ms step_avg:60.23ms +step:448/2245 train_time:26983ms step_avg:60.23ms +step:449/2245 train_time:27045ms step_avg:60.23ms +step:450/2245 train_time:27103ms step_avg:60.23ms +step:451/2245 train_time:27165ms step_avg:60.23ms +step:452/2245 train_time:27223ms step_avg:60.23ms +step:453/2245 train_time:27284ms step_avg:60.23ms +step:454/2245 train_time:27343ms step_avg:60.23ms +step:455/2245 train_time:27404ms step_avg:60.23ms +step:456/2245 train_time:27463ms step_avg:60.23ms +step:457/2245 train_time:27524ms step_avg:60.23ms +step:458/2245 train_time:27583ms step_avg:60.23ms +step:459/2245 train_time:27645ms step_avg:60.23ms +step:460/2245 train_time:27704ms step_avg:60.23ms +step:461/2245 train_time:27765ms step_avg:60.23ms +step:462/2245 train_time:27823ms step_avg:60.22ms +step:463/2245 train_time:27885ms step_avg:60.23ms +step:464/2245 train_time:27944ms step_avg:60.22ms +step:465/2245 train_time:28006ms step_avg:60.23ms +step:466/2245 train_time:28065ms step_avg:60.22ms +step:467/2245 train_time:28126ms step_avg:60.23ms +step:468/2245 train_time:28184ms step_avg:60.22ms +step:469/2245 train_time:28246ms step_avg:60.23ms +step:470/2245 train_time:28305ms step_avg:60.22ms +step:471/2245 train_time:28366ms step_avg:60.22ms +step:472/2245 train_time:28424ms step_avg:60.22ms +step:473/2245 train_time:28486ms step_avg:60.22ms +step:474/2245 train_time:28544ms step_avg:60.22ms +step:475/2245 train_time:28606ms step_avg:60.22ms +step:476/2245 train_time:28664ms step_avg:60.22ms +step:477/2245 train_time:28725ms step_avg:60.22ms +step:478/2245 train_time:28784ms step_avg:60.22ms +step:479/2245 train_time:28845ms step_avg:60.22ms +step:480/2245 train_time:28904ms step_avg:60.22ms +step:481/2245 train_time:28966ms step_avg:60.22ms +step:482/2245 train_time:29024ms step_avg:60.22ms +step:483/2245 train_time:29085ms step_avg:60.22ms +step:484/2245 train_time:29144ms step_avg:60.21ms +step:485/2245 train_time:29205ms step_avg:60.22ms +step:486/2245 train_time:29263ms step_avg:60.21ms +step:487/2245 train_time:29324ms step_avg:60.21ms +step:488/2245 train_time:29383ms step_avg:60.21ms +step:489/2245 train_time:29444ms step_avg:60.21ms +step:490/2245 train_time:29503ms step_avg:60.21ms +step:491/2245 train_time:29564ms step_avg:60.21ms +step:492/2245 train_time:29623ms step_avg:60.21ms +step:493/2245 train_time:29684ms step_avg:60.21ms +step:494/2245 train_time:29743ms step_avg:60.21ms +step:495/2245 train_time:29804ms step_avg:60.21ms +step:496/2245 train_time:29863ms step_avg:60.21ms +step:497/2245 train_time:29924ms step_avg:60.21ms +step:498/2245 train_time:29983ms step_avg:60.21ms +step:499/2245 train_time:30045ms step_avg:60.21ms +step:500/2245 train_time:30104ms step_avg:60.21ms +step:500/2245 val_loss:3.8269 train_time:30166ms step_avg:60.33ms +step:501/2245 train_time:30184ms step_avg:60.25ms +step:502/2245 train_time:30227ms step_avg:60.21ms +step:503/2245 train_time:30294ms step_avg:60.23ms 
+step:504/2245 train_time:30355ms step_avg:60.23ms +step:505/2245 train_time:30416ms step_avg:60.23ms +step:506/2245 train_time:30476ms step_avg:60.23ms +step:507/2245 train_time:30536ms step_avg:60.23ms +step:508/2245 train_time:30594ms step_avg:60.22ms +step:509/2245 train_time:30656ms step_avg:60.23ms +step:510/2245 train_time:30714ms step_avg:60.22ms +step:511/2245 train_time:30775ms step_avg:60.22ms +step:512/2245 train_time:30833ms step_avg:60.22ms +step:513/2245 train_time:30894ms step_avg:60.22ms +step:514/2245 train_time:30952ms step_avg:60.22ms +step:515/2245 train_time:31012ms step_avg:60.22ms +step:516/2245 train_time:31071ms step_avg:60.22ms +step:517/2245 train_time:31134ms step_avg:60.22ms +step:518/2245 train_time:31194ms step_avg:60.22ms +step:519/2245 train_time:31258ms step_avg:60.23ms +step:520/2245 train_time:31318ms step_avg:60.23ms +step:521/2245 train_time:31380ms step_avg:60.23ms +step:522/2245 train_time:31440ms step_avg:60.23ms +step:523/2245 train_time:31501ms step_avg:60.23ms +step:524/2245 train_time:31560ms step_avg:60.23ms +step:525/2245 train_time:31621ms step_avg:60.23ms +step:526/2245 train_time:31680ms step_avg:60.23ms +step:527/2245 train_time:31741ms step_avg:60.23ms +step:528/2245 train_time:31800ms step_avg:60.23ms +step:529/2245 train_time:31862ms step_avg:60.23ms +step:530/2245 train_time:31921ms step_avg:60.23ms +step:531/2245 train_time:31982ms step_avg:60.23ms +step:532/2245 train_time:32041ms step_avg:60.23ms +step:533/2245 train_time:32103ms step_avg:60.23ms +step:534/2245 train_time:32163ms step_avg:60.23ms +step:535/2245 train_time:32225ms step_avg:60.23ms +step:536/2245 train_time:32285ms step_avg:60.23ms +step:537/2245 train_time:32348ms step_avg:60.24ms +step:538/2245 train_time:32408ms step_avg:60.24ms +step:539/2245 train_time:32470ms step_avg:60.24ms +step:540/2245 train_time:32529ms step_avg:60.24ms +step:541/2245 train_time:32591ms step_avg:60.24ms +step:542/2245 train_time:32650ms step_avg:60.24ms +step:543/2245 train_time:32710ms step_avg:60.24ms +step:544/2245 train_time:32769ms step_avg:60.24ms +step:545/2245 train_time:32830ms step_avg:60.24ms +step:546/2245 train_time:32889ms step_avg:60.24ms +step:547/2245 train_time:32950ms step_avg:60.24ms +step:548/2245 train_time:33009ms step_avg:60.23ms +step:549/2245 train_time:33070ms step_avg:60.24ms +step:550/2245 train_time:33129ms step_avg:60.23ms +step:551/2245 train_time:33191ms step_avg:60.24ms +step:552/2245 train_time:33250ms step_avg:60.24ms +step:553/2245 train_time:33313ms step_avg:60.24ms +step:554/2245 train_time:33371ms step_avg:60.24ms +step:555/2245 train_time:33432ms step_avg:60.24ms +step:556/2245 train_time:33491ms step_avg:60.24ms +step:557/2245 train_time:33553ms step_avg:60.24ms +step:558/2245 train_time:33611ms step_avg:60.23ms +step:559/2245 train_time:33672ms step_avg:60.24ms +step:560/2245 train_time:33731ms step_avg:60.23ms +step:561/2245 train_time:33792ms step_avg:60.24ms +step:562/2245 train_time:33850ms step_avg:60.23ms +step:563/2245 train_time:33911ms step_avg:60.23ms +step:564/2245 train_time:33970ms step_avg:60.23ms +step:565/2245 train_time:34031ms step_avg:60.23ms +step:566/2245 train_time:34089ms step_avg:60.23ms +step:567/2245 train_time:34151ms step_avg:60.23ms +step:568/2245 train_time:34210ms step_avg:60.23ms +step:569/2245 train_time:34271ms step_avg:60.23ms +step:570/2245 train_time:34330ms step_avg:60.23ms +step:571/2245 train_time:34392ms step_avg:60.23ms +step:572/2245 train_time:34451ms step_avg:60.23ms +step:573/2245 train_time:34512ms 
step_avg:60.23ms +step:574/2245 train_time:34571ms step_avg:60.23ms +step:575/2245 train_time:34632ms step_avg:60.23ms +step:576/2245 train_time:34691ms step_avg:60.23ms +step:577/2245 train_time:34752ms step_avg:60.23ms +step:578/2245 train_time:34810ms step_avg:60.23ms +step:579/2245 train_time:34872ms step_avg:60.23ms +step:580/2245 train_time:34930ms step_avg:60.22ms +step:581/2245 train_time:34991ms step_avg:60.23ms +step:582/2245 train_time:35050ms step_avg:60.22ms +step:583/2245 train_time:35112ms step_avg:60.23ms +step:584/2245 train_time:35170ms step_avg:60.22ms +step:585/2245 train_time:35231ms step_avg:60.22ms +step:586/2245 train_time:35290ms step_avg:60.22ms +step:587/2245 train_time:35351ms step_avg:60.22ms +step:588/2245 train_time:35410ms step_avg:60.22ms +step:589/2245 train_time:35472ms step_avg:60.22ms +step:590/2245 train_time:35530ms step_avg:60.22ms +step:591/2245 train_time:35591ms step_avg:60.22ms +step:592/2245 train_time:35650ms step_avg:60.22ms +step:593/2245 train_time:35712ms step_avg:60.22ms +step:594/2245 train_time:35771ms step_avg:60.22ms +step:595/2245 train_time:35832ms step_avg:60.22ms +step:596/2245 train_time:35890ms step_avg:60.22ms +step:597/2245 train_time:35951ms step_avg:60.22ms +step:598/2245 train_time:36010ms step_avg:60.22ms +step:599/2245 train_time:36071ms step_avg:60.22ms +step:600/2245 train_time:36130ms step_avg:60.22ms +step:601/2245 train_time:36191ms step_avg:60.22ms +step:602/2245 train_time:36250ms step_avg:60.22ms +step:603/2245 train_time:36312ms step_avg:60.22ms +step:604/2245 train_time:36371ms step_avg:60.22ms +step:605/2245 train_time:36432ms step_avg:60.22ms +step:606/2245 train_time:36491ms step_avg:60.22ms +step:607/2245 train_time:36553ms step_avg:60.22ms +step:608/2245 train_time:36612ms step_avg:60.22ms +step:609/2245 train_time:36673ms step_avg:60.22ms +step:610/2245 train_time:36732ms step_avg:60.22ms +step:611/2245 train_time:36793ms step_avg:60.22ms +step:612/2245 train_time:36852ms step_avg:60.22ms +step:613/2245 train_time:36913ms step_avg:60.22ms +step:614/2245 train_time:36972ms step_avg:60.21ms +step:615/2245 train_time:37033ms step_avg:60.22ms +step:616/2245 train_time:37092ms step_avg:60.21ms +step:617/2245 train_time:37153ms step_avg:60.22ms +step:618/2245 train_time:37212ms step_avg:60.21ms +step:619/2245 train_time:37274ms step_avg:60.22ms +step:620/2245 train_time:37333ms step_avg:60.21ms +step:621/2245 train_time:37394ms step_avg:60.22ms +step:622/2245 train_time:37453ms step_avg:60.21ms +step:623/2245 train_time:37514ms step_avg:60.22ms +step:624/2245 train_time:37573ms step_avg:60.21ms +step:625/2245 train_time:37634ms step_avg:60.22ms +step:626/2245 train_time:37693ms step_avg:60.21ms +step:627/2245 train_time:37755ms step_avg:60.22ms +step:628/2245 train_time:37814ms step_avg:60.21ms +step:629/2245 train_time:37875ms step_avg:60.22ms +step:630/2245 train_time:37934ms step_avg:60.21ms +step:631/2245 train_time:37996ms step_avg:60.21ms +step:632/2245 train_time:38054ms step_avg:60.21ms +step:633/2245 train_time:38116ms step_avg:60.21ms +step:634/2245 train_time:38175ms step_avg:60.21ms +step:635/2245 train_time:38237ms step_avg:60.22ms +step:636/2245 train_time:38296ms step_avg:60.21ms +step:637/2245 train_time:38358ms step_avg:60.22ms +step:638/2245 train_time:38416ms step_avg:60.21ms +step:639/2245 train_time:38478ms step_avg:60.22ms +step:640/2245 train_time:38537ms step_avg:60.21ms +step:641/2245 train_time:38598ms step_avg:60.22ms +step:642/2245 train_time:38657ms step_avg:60.21ms +step:643/2245 
train_time:38719ms step_avg:60.22ms +step:644/2245 train_time:38778ms step_avg:60.21ms +step:645/2245 train_time:38839ms step_avg:60.22ms +step:646/2245 train_time:38898ms step_avg:60.21ms +step:647/2245 train_time:38960ms step_avg:60.22ms +step:648/2245 train_time:39018ms step_avg:60.21ms +step:649/2245 train_time:39080ms step_avg:60.22ms +step:650/2245 train_time:39139ms step_avg:60.21ms +step:651/2245 train_time:39201ms step_avg:60.22ms +step:652/2245 train_time:39260ms step_avg:60.21ms +step:653/2245 train_time:39322ms step_avg:60.22ms +step:654/2245 train_time:39381ms step_avg:60.22ms +step:655/2245 train_time:39443ms step_avg:60.22ms +step:656/2245 train_time:39503ms step_avg:60.22ms +step:657/2245 train_time:39565ms step_avg:60.22ms +step:658/2245 train_time:39624ms step_avg:60.22ms +step:659/2245 train_time:39686ms step_avg:60.22ms +step:660/2245 train_time:39746ms step_avg:60.22ms +step:661/2245 train_time:39808ms step_avg:60.22ms +step:662/2245 train_time:39867ms step_avg:60.22ms +step:663/2245 train_time:39928ms step_avg:60.22ms +step:664/2245 train_time:39988ms step_avg:60.22ms +step:665/2245 train_time:40049ms step_avg:60.22ms +step:666/2245 train_time:40108ms step_avg:60.22ms +step:667/2245 train_time:40170ms step_avg:60.22ms +step:668/2245 train_time:40229ms step_avg:60.22ms +step:669/2245 train_time:40290ms step_avg:60.22ms +step:670/2245 train_time:40349ms step_avg:60.22ms +step:671/2245 train_time:40411ms step_avg:60.23ms +step:672/2245 train_time:40470ms step_avg:60.22ms +step:673/2245 train_time:40531ms step_avg:60.22ms +step:674/2245 train_time:40590ms step_avg:60.22ms +step:675/2245 train_time:40651ms step_avg:60.22ms +step:676/2245 train_time:40710ms step_avg:60.22ms +step:677/2245 train_time:40771ms step_avg:60.22ms +step:678/2245 train_time:40830ms step_avg:60.22ms +step:679/2245 train_time:40891ms step_avg:60.22ms +step:680/2245 train_time:40950ms step_avg:60.22ms +step:681/2245 train_time:41011ms step_avg:60.22ms +step:682/2245 train_time:41070ms step_avg:60.22ms +step:683/2245 train_time:41131ms step_avg:60.22ms +step:684/2245 train_time:41190ms step_avg:60.22ms +step:685/2245 train_time:41252ms step_avg:60.22ms +step:686/2245 train_time:41311ms step_avg:60.22ms +step:687/2245 train_time:41372ms step_avg:60.22ms +step:688/2245 train_time:41431ms step_avg:60.22ms +step:689/2245 train_time:41492ms step_avg:60.22ms +step:690/2245 train_time:41550ms step_avg:60.22ms +step:691/2245 train_time:41612ms step_avg:60.22ms +step:692/2245 train_time:41670ms step_avg:60.22ms +step:693/2245 train_time:41731ms step_avg:60.22ms +step:694/2245 train_time:41790ms step_avg:60.22ms +step:695/2245 train_time:41852ms step_avg:60.22ms +step:696/2245 train_time:41911ms step_avg:60.22ms +step:697/2245 train_time:41972ms step_avg:60.22ms +step:698/2245 train_time:42031ms step_avg:60.22ms +step:699/2245 train_time:42092ms step_avg:60.22ms +step:700/2245 train_time:42151ms step_avg:60.22ms +step:701/2245 train_time:42213ms step_avg:60.22ms +step:702/2245 train_time:42271ms step_avg:60.22ms +step:703/2245 train_time:42333ms step_avg:60.22ms +step:704/2245 train_time:42392ms step_avg:60.22ms +step:705/2245 train_time:42453ms step_avg:60.22ms +step:706/2245 train_time:42512ms step_avg:60.21ms +step:707/2245 train_time:42573ms step_avg:60.22ms +step:708/2245 train_time:42631ms step_avg:60.21ms +step:709/2245 train_time:42693ms step_avg:60.22ms +step:710/2245 train_time:42752ms step_avg:60.21ms +step:711/2245 train_time:42812ms step_avg:60.21ms +step:712/2245 train_time:42871ms step_avg:60.21ms 
+step:713/2245 train_time:42933ms step_avg:60.21ms +step:714/2245 train_time:42991ms step_avg:60.21ms +step:715/2245 train_time:43053ms step_avg:60.21ms +step:716/2245 train_time:43112ms step_avg:60.21ms +step:717/2245 train_time:43173ms step_avg:60.21ms +step:718/2245 train_time:43655ms step_avg:60.80ms +step:719/2245 train_time:43715ms step_avg:60.80ms +step:720/2245 train_time:43773ms step_avg:60.80ms +step:721/2245 train_time:43833ms step_avg:60.79ms +step:722/2245 train_time:43891ms step_avg:60.79ms +step:723/2245 train_time:43952ms step_avg:60.79ms +step:724/2245 train_time:44010ms step_avg:60.79ms +step:725/2245 train_time:44070ms step_avg:60.79ms +step:726/2245 train_time:44128ms step_avg:60.78ms +step:727/2245 train_time:44189ms step_avg:60.78ms +step:728/2245 train_time:44247ms step_avg:60.78ms +step:729/2245 train_time:44308ms step_avg:60.78ms +step:730/2245 train_time:44366ms step_avg:60.78ms +step:731/2245 train_time:44427ms step_avg:60.78ms +step:732/2245 train_time:44485ms step_avg:60.77ms +step:733/2245 train_time:44551ms step_avg:60.78ms +step:734/2245 train_time:44613ms step_avg:60.78ms +step:735/2245 train_time:44676ms step_avg:60.78ms +step:736/2245 train_time:44735ms step_avg:60.78ms +step:737/2245 train_time:44797ms step_avg:60.78ms +step:738/2245 train_time:44858ms step_avg:60.78ms +step:739/2245 train_time:44921ms step_avg:60.79ms +step:740/2245 train_time:44980ms step_avg:60.78ms +step:741/2245 train_time:45042ms step_avg:60.79ms +step:742/2245 train_time:45102ms step_avg:60.78ms +step:743/2245 train_time:45163ms step_avg:60.79ms +step:744/2245 train_time:45222ms step_avg:60.78ms +step:745/2245 train_time:45284ms step_avg:60.78ms +step:746/2245 train_time:45343ms step_avg:60.78ms +step:747/2245 train_time:45405ms step_avg:60.78ms +step:748/2245 train_time:45465ms step_avg:60.78ms +step:749/2245 train_time:45529ms step_avg:60.79ms +step:750/2245 train_time:45591ms step_avg:60.79ms +step:750/2245 val_loss:3.6718 train_time:45656ms step_avg:60.87ms +step:751/2245 train_time:45676ms step_avg:60.82ms +step:752/2245 train_time:45716ms step_avg:60.79ms +step:753/2245 train_time:45777ms step_avg:60.79ms +step:754/2245 train_time:45836ms step_avg:60.79ms +step:755/2245 train_time:45898ms step_avg:60.79ms +step:756/2245 train_time:45958ms step_avg:60.79ms +step:757/2245 train_time:46019ms step_avg:60.79ms +step:758/2245 train_time:46078ms step_avg:60.79ms +step:759/2245 train_time:46140ms step_avg:60.79ms +step:760/2245 train_time:46199ms step_avg:60.79ms +step:761/2245 train_time:46260ms step_avg:60.79ms +step:762/2245 train_time:46319ms step_avg:60.79ms +step:763/2245 train_time:46381ms step_avg:60.79ms +step:764/2245 train_time:46440ms step_avg:60.79ms +step:765/2245 train_time:46502ms step_avg:60.79ms +step:766/2245 train_time:46566ms step_avg:60.79ms +step:767/2245 train_time:46633ms step_avg:60.80ms +step:768/2245 train_time:46695ms step_avg:60.80ms +step:769/2245 train_time:46758ms step_avg:60.80ms +step:770/2245 train_time:46817ms step_avg:60.80ms +step:771/2245 train_time:46878ms step_avg:60.80ms +step:772/2245 train_time:46938ms step_avg:60.80ms +step:773/2245 train_time:47000ms step_avg:60.80ms +step:774/2245 train_time:47059ms step_avg:60.80ms +step:775/2245 train_time:47121ms step_avg:60.80ms +step:776/2245 train_time:47180ms step_avg:60.80ms +step:777/2245 train_time:47241ms step_avg:60.80ms +step:778/2245 train_time:47301ms step_avg:60.80ms +step:779/2245 train_time:47362ms step_avg:60.80ms +step:780/2245 train_time:47421ms step_avg:60.80ms +step:781/2245 
train_time:47484ms step_avg:60.80ms +step:782/2245 train_time:47546ms step_avg:60.80ms +step:783/2245 train_time:47610ms step_avg:60.80ms +step:784/2245 train_time:47671ms step_avg:60.81ms +step:785/2245 train_time:47735ms step_avg:60.81ms +step:786/2245 train_time:47795ms step_avg:60.81ms +step:787/2245 train_time:47857ms step_avg:60.81ms +step:788/2245 train_time:47916ms step_avg:60.81ms +step:789/2245 train_time:47977ms step_avg:60.81ms +step:790/2245 train_time:48037ms step_avg:60.81ms +step:791/2245 train_time:48098ms step_avg:60.81ms +step:792/2245 train_time:48157ms step_avg:60.80ms +step:793/2245 train_time:48219ms step_avg:60.81ms +step:794/2245 train_time:48278ms step_avg:60.80ms +step:795/2245 train_time:48340ms step_avg:60.80ms +step:796/2245 train_time:48399ms step_avg:60.80ms +step:797/2245 train_time:48462ms step_avg:60.81ms +step:798/2245 train_time:48523ms step_avg:60.81ms +step:799/2245 train_time:48587ms step_avg:60.81ms +step:800/2245 train_time:48648ms step_avg:60.81ms +step:801/2245 train_time:48712ms step_avg:60.81ms +step:802/2245 train_time:48772ms step_avg:60.81ms +step:803/2245 train_time:48833ms step_avg:60.81ms +step:804/2245 train_time:48893ms step_avg:60.81ms +step:805/2245 train_time:48955ms step_avg:60.81ms +step:806/2245 train_time:49014ms step_avg:60.81ms +step:807/2245 train_time:49075ms step_avg:60.81ms +step:808/2245 train_time:49134ms step_avg:60.81ms +step:809/2245 train_time:49196ms step_avg:60.81ms +step:810/2245 train_time:49255ms step_avg:60.81ms +step:811/2245 train_time:49317ms step_avg:60.81ms +step:812/2245 train_time:49376ms step_avg:60.81ms +step:813/2245 train_time:49438ms step_avg:60.81ms +step:814/2245 train_time:49499ms step_avg:60.81ms +step:815/2245 train_time:49562ms step_avg:60.81ms +step:816/2245 train_time:49623ms step_avg:60.81ms +step:817/2245 train_time:49686ms step_avg:60.82ms +step:818/2245 train_time:49746ms step_avg:60.81ms +step:819/2245 train_time:49808ms step_avg:60.82ms +step:820/2245 train_time:49869ms step_avg:60.82ms +step:821/2245 train_time:49932ms step_avg:60.82ms +step:822/2245 train_time:49992ms step_avg:60.82ms +step:823/2245 train_time:50054ms step_avg:60.82ms +step:824/2245 train_time:50114ms step_avg:60.82ms +step:825/2245 train_time:50176ms step_avg:60.82ms +step:826/2245 train_time:50235ms step_avg:60.82ms +step:827/2245 train_time:50297ms step_avg:60.82ms +step:828/2245 train_time:50357ms step_avg:60.82ms +step:829/2245 train_time:50419ms step_avg:60.82ms +step:830/2245 train_time:50478ms step_avg:60.82ms +step:831/2245 train_time:50541ms step_avg:60.82ms +step:832/2245 train_time:50602ms step_avg:60.82ms +step:833/2245 train_time:50666ms step_avg:60.82ms +step:834/2245 train_time:50725ms step_avg:60.82ms +step:835/2245 train_time:50788ms step_avg:60.82ms +step:836/2245 train_time:50849ms step_avg:60.82ms +step:837/2245 train_time:50911ms step_avg:60.83ms +step:838/2245 train_time:50971ms step_avg:60.82ms +step:839/2245 train_time:51034ms step_avg:60.83ms +step:840/2245 train_time:51093ms step_avg:60.83ms +step:841/2245 train_time:51155ms step_avg:60.83ms +step:842/2245 train_time:51215ms step_avg:60.83ms +step:843/2245 train_time:51276ms step_avg:60.83ms +step:844/2245 train_time:51337ms step_avg:60.83ms +step:845/2245 train_time:51399ms step_avg:60.83ms +step:846/2245 train_time:51458ms step_avg:60.83ms +step:847/2245 train_time:51520ms step_avg:60.83ms +step:848/2245 train_time:51580ms step_avg:60.83ms +step:849/2245 train_time:51643ms step_avg:60.83ms +step:850/2245 train_time:51703ms step_avg:60.83ms 
+step:851/2245 train_time:51766ms step_avg:60.83ms
+step:852/2245 train_time:51826ms step_avg:60.83ms
+step:853/2245 train_time:51889ms step_avg:60.83ms
+step:854/2245 train_time:51950ms step_avg:60.83ms
+step:855/2245 train_time:52013ms step_avg:60.83ms
+step:856/2245 train_time:52073ms step_avg:60.83ms
+step:857/2245 train_time:52135ms step_avg:60.83ms
+step:858/2245 train_time:52194ms step_avg:60.83ms
+step:859/2245 train_time:52256ms step_avg:60.83ms
+step:860/2245 train_time:52316ms step_avg:60.83ms
+step:861/2245 train_time:52378ms step_avg:60.83ms
+step:862/2245 train_time:52438ms step_avg:60.83ms
+step:863/2245 train_time:52500ms step_avg:60.83ms
+step:864/2245 train_time:52559ms step_avg:60.83ms
+step:865/2245 train_time:52622ms step_avg:60.83ms
+step:866/2245 train_time:52682ms step_avg:60.83ms
+step:867/2245 train_time:52745ms step_avg:60.84ms
+step:868/2245 train_time:52805ms step_avg:60.84ms
+step:869/2245 train_time:52868ms step_avg:60.84ms
+step:870/2245 train_time:52928ms step_avg:60.84ms
+step:871/2245 train_time:52991ms step_avg:60.84ms
+step:872/2245 train_time:53051ms step_avg:60.84ms
+step:873/2245 train_time:53113ms step_avg:60.84ms
+step:874/2245 train_time:53173ms step_avg:60.84ms
+step:875/2245 train_time:53235ms step_avg:60.84ms
+step:876/2245 train_time:53294ms step_avg:60.84ms
+step:877/2245 train_time:53356ms step_avg:60.84ms
+step:878/2245 train_time:53417ms step_avg:60.84ms
+step:879/2245 train_time:53478ms step_avg:60.84ms
+step:880/2245 train_time:53538ms step_avg:60.84ms
+step:881/2245 train_time:53599ms step_avg:60.84ms
+step:882/2245 train_time:53659ms step_avg:60.84ms
+step:883/2245 train_time:53721ms step_avg:60.84ms
+step:884/2245 train_time:53781ms step_avg:60.84ms
+step:885/2245 train_time:53845ms step_avg:60.84ms
+step:886/2245 train_time:53905ms step_avg:60.84ms
+step:887/2245 train_time:53968ms step_avg:60.84ms
+step:888/2245 train_time:54028ms step_avg:60.84ms
+step:889/2245 train_time:54090ms step_avg:60.84ms
+step:890/2245 train_time:54151ms step_avg:60.84ms
+step:891/2245 train_time:54213ms step_avg:60.85ms
+step:892/2245 train_time:54273ms step_avg:60.84ms
+step:893/2245 train_time:54334ms step_avg:60.84ms
+step:894/2245 train_time:54394ms step_avg:60.84ms
+step:895/2245 train_time:54456ms step_avg:60.85ms
+step:896/2245 train_time:54516ms step_avg:60.84ms
+step:897/2245 train_time:54577ms step_avg:60.84ms
+step:898/2245 train_time:54637ms step_avg:60.84ms
+step:899/2245 train_time:54699ms step_avg:60.84ms
+step:900/2245 train_time:54758ms step_avg:60.84ms
+step:901/2245 train_time:54822ms step_avg:60.85ms
+step:902/2245 train_time:54882ms step_avg:60.84ms
+step:903/2245 train_time:54945ms step_avg:60.85ms
+step:904/2245 train_time:55005ms step_avg:60.85ms
+step:905/2245 train_time:55068ms step_avg:60.85ms
+step:906/2245 train_time:55128ms step_avg:60.85ms
+step:907/2245 train_time:55191ms step_avg:60.85ms
+step:908/2245 train_time:55252ms step_avg:60.85ms
+step:909/2245 train_time:55314ms step_avg:60.85ms
+step:910/2245 train_time:55374ms step_avg:60.85ms
+step:911/2245 train_time:55436ms step_avg:60.85ms
+step:912/2245 train_time:55496ms step_avg:60.85ms
+step:913/2245 train_time:55559ms step_avg:60.85ms
+step:914/2245 train_time:55618ms step_avg:60.85ms
+step:915/2245 train_time:55681ms step_avg:60.85ms
+step:916/2245 train_time:55740ms step_avg:60.85ms
+step:917/2245 train_time:55802ms step_avg:60.85ms
+step:918/2245 train_time:55863ms step_avg:60.85ms
+step:919/2245 train_time:55925ms step_avg:60.85ms
+step:920/2245 train_time:55985ms step_avg:60.85ms
+step:921/2245 train_time:56048ms step_avg:60.86ms
+step:922/2245 train_time:56107ms step_avg:60.85ms
+step:923/2245 train_time:56170ms step_avg:60.86ms
+step:924/2245 train_time:56230ms step_avg:60.86ms
+step:925/2245 train_time:56293ms step_avg:60.86ms
+step:926/2245 train_time:56353ms step_avg:60.86ms
+step:927/2245 train_time:56415ms step_avg:60.86ms
+step:928/2245 train_time:56475ms step_avg:60.86ms
+step:929/2245 train_time:56537ms step_avg:60.86ms
+step:930/2245 train_time:56596ms step_avg:60.86ms
+step:931/2245 train_time:56658ms step_avg:60.86ms
+step:932/2245 train_time:56717ms step_avg:60.86ms
+step:933/2245 train_time:56779ms step_avg:60.86ms
+step:934/2245 train_time:56839ms step_avg:60.86ms
+step:935/2245 train_time:56901ms step_avg:60.86ms
+step:936/2245 train_time:56961ms step_avg:60.86ms
+step:937/2245 train_time:57024ms step_avg:60.86ms
+step:938/2245 train_time:57085ms step_avg:60.86ms
+step:939/2245 train_time:57147ms step_avg:60.86ms
+step:940/2245 train_time:57208ms step_avg:60.86ms
+step:941/2245 train_time:57270ms step_avg:60.86ms
+step:942/2245 train_time:57329ms step_avg:60.86ms
+step:943/2245 train_time:57392ms step_avg:60.86ms
+step:944/2245 train_time:57453ms step_avg:60.86ms
+step:945/2245 train_time:57515ms step_avg:60.86ms
+step:946/2245 train_time:57575ms step_avg:60.86ms
+step:947/2245 train_time:57637ms step_avg:60.86ms
+step:948/2245 train_time:57697ms step_avg:60.86ms
+step:949/2245 train_time:57759ms step_avg:60.86ms
+step:950/2245 train_time:57818ms step_avg:60.86ms
+step:951/2245 train_time:57880ms step_avg:60.86ms
+step:952/2245 train_time:57941ms step_avg:60.86ms
+step:953/2245 train_time:58004ms step_avg:60.86ms
+step:954/2245 train_time:58065ms step_avg:60.86ms
+step:955/2245 train_time:58128ms step_avg:60.87ms
+step:956/2245 train_time:58188ms step_avg:60.87ms
+step:957/2245 train_time:58251ms step_avg:60.87ms
+step:958/2245 train_time:58311ms step_avg:60.87ms
+step:959/2245 train_time:58374ms step_avg:60.87ms
+step:960/2245 train_time:58433ms step_avg:60.87ms
+step:961/2245 train_time:58496ms step_avg:60.87ms
+step:962/2245 train_time:58556ms step_avg:60.87ms
+step:963/2245 train_time:58619ms step_avg:60.87ms
+step:964/2245 train_time:58678ms step_avg:60.87ms
+step:965/2245 train_time:58740ms step_avg:60.87ms
+step:966/2245 train_time:58800ms step_avg:60.87ms
+step:967/2245 train_time:58862ms step_avg:60.87ms
+step:968/2245 train_time:58922ms step_avg:60.87ms
+step:969/2245 train_time:58984ms step_avg:60.87ms
+step:970/2245 train_time:59044ms step_avg:60.87ms
+step:971/2245 train_time:59106ms step_avg:60.87ms
+step:972/2245 train_time:59167ms step_avg:60.87ms
+step:973/2245 train_time:59229ms step_avg:60.87ms
+step:974/2245 train_time:59290ms step_avg:60.87ms
+step:975/2245 train_time:59353ms step_avg:60.87ms
+step:976/2245 train_time:59413ms step_avg:60.87ms
+step:977/2245 train_time:59475ms step_avg:60.88ms
+step:978/2245 train_time:59536ms step_avg:60.87ms
+step:979/2245 train_time:59598ms step_avg:60.88ms
+step:980/2245 train_time:59659ms step_avg:60.88ms
+step:981/2245 train_time:59720ms step_avg:60.88ms
+step:982/2245 train_time:59781ms step_avg:60.88ms
+step:983/2245 train_time:59842ms step_avg:60.88ms
+step:984/2245 train_time:59902ms step_avg:60.88ms
+step:985/2245 train_time:59964ms step_avg:60.88ms
+step:986/2245 train_time:60024ms step_avg:60.88ms
+step:987/2245 train_time:60087ms step_avg:60.88ms
+step:988/2245 train_time:60147ms step_avg:60.88ms
+step:989/2245 train_time:60210ms step_avg:60.88ms
+step:990/2245 train_time:60271ms step_avg:60.88ms
+step:991/2245 train_time:60333ms step_avg:60.88ms
+step:992/2245 train_time:60392ms step_avg:60.88ms
+step:993/2245 train_time:60455ms step_avg:60.88ms
+step:994/2245 train_time:60515ms step_avg:60.88ms
+step:995/2245 train_time:60577ms step_avg:60.88ms
+step:996/2245 train_time:60637ms step_avg:60.88ms
+step:997/2245 train_time:60699ms step_avg:60.88ms
+step:998/2245 train_time:60760ms step_avg:60.88ms
+step:999/2245 train_time:60821ms step_avg:60.88ms
+step:1000/2245 train_time:60881ms step_avg:60.88ms
+step:1000/2245 val_loss:3.5943 train_time:60944ms step_avg:60.94ms
+step:1001/2245 train_time:60962ms step_avg:60.90ms
+step:1002/2245 train_time:61008ms step_avg:60.89ms
+step:1003/2245 train_time:61072ms step_avg:60.89ms
+step:1004/2245 train_time:61132ms step_avg:60.89ms
+step:1005/2245 train_time:61195ms step_avg:60.89ms
+step:1006/2245 train_time:61255ms step_avg:60.89ms
+step:1007/2245 train_time:61317ms step_avg:60.89ms
+step:1008/2245 train_time:61377ms step_avg:60.89ms
+step:1009/2245 train_time:61439ms step_avg:60.89ms
+step:1010/2245 train_time:61498ms step_avg:60.89ms
+step:1011/2245 train_time:61560ms step_avg:60.89ms
+step:1012/2245 train_time:61620ms step_avg:60.89ms
+step:1013/2245 train_time:61681ms step_avg:60.89ms
+step:1014/2245 train_time:61740ms step_avg:60.89ms
+step:1015/2245 train_time:61801ms step_avg:60.89ms
+step:1016/2245 train_time:61862ms step_avg:60.89ms
+step:1017/2245 train_time:61925ms step_avg:60.89ms
+step:1018/2245 train_time:61987ms step_avg:60.89ms
+step:1019/2245 train_time:62049ms step_avg:60.89ms
+step:1020/2245 train_time:62109ms step_avg:60.89ms
+step:1021/2245 train_time:62172ms step_avg:60.89ms
+step:1022/2245 train_time:62231ms step_avg:60.89ms
+step:1023/2245 train_time:62294ms step_avg:60.89ms
+step:1024/2245 train_time:62354ms step_avg:60.89ms
+step:1025/2245 train_time:62417ms step_avg:60.89ms
+step:1026/2245 train_time:62477ms step_avg:60.89ms
+step:1027/2245 train_time:62539ms step_avg:60.90ms
+step:1028/2245 train_time:62599ms step_avg:60.89ms
+step:1029/2245 train_time:62661ms step_avg:60.89ms
+step:1030/2245 train_time:62720ms step_avg:60.89ms
+step:1031/2245 train_time:62782ms step_avg:60.89ms
+step:1032/2245 train_time:62842ms step_avg:60.89ms
+step:1033/2245 train_time:62905ms step_avg:60.90ms
+step:1034/2245 train_time:62965ms step_avg:60.89ms
+step:1035/2245 train_time:63027ms step_avg:60.90ms
+step:1036/2245 train_time:63087ms step_avg:60.89ms
+step:1037/2245 train_time:63149ms step_avg:60.90ms
+step:1038/2245 train_time:63209ms step_avg:60.90ms
+step:1039/2245 train_time:63272ms step_avg:60.90ms
+step:1040/2245 train_time:63332ms step_avg:60.90ms
+step:1041/2245 train_time:63395ms step_avg:60.90ms
+step:1042/2245 train_time:63455ms step_avg:60.90ms
+step:1043/2245 train_time:63518ms step_avg:60.90ms
+step:1044/2245 train_time:63579ms step_avg:60.90ms
+step:1045/2245 train_time:63641ms step_avg:60.90ms
+step:1046/2245 train_time:63700ms step_avg:60.90ms
+step:1047/2245 train_time:63762ms step_avg:60.90ms
+step:1048/2245 train_time:63822ms step_avg:60.90ms
+step:1049/2245 train_time:63884ms step_avg:60.90ms
+step:1050/2245 train_time:63944ms step_avg:60.90ms
+step:1051/2245 train_time:64007ms step_avg:60.90ms
+step:1052/2245 train_time:64067ms step_avg:60.90ms
+step:1053/2245 train_time:64130ms step_avg:60.90ms
+step:1054/2245 train_time:64190ms step_avg:60.90ms
+step:1055/2245 train_time:64252ms step_avg:60.90ms
+step:1056/2245 train_time:64312ms step_avg:60.90ms
+step:1057/2245 train_time:64375ms step_avg:60.90ms
+step:1058/2245 train_time:64434ms step_avg:60.90ms
+step:1059/2245 train_time:64497ms step_avg:60.90ms
+step:1060/2245 train_time:64556ms step_avg:60.90ms
+step:1061/2245 train_time:64618ms step_avg:60.90ms
+step:1062/2245 train_time:64678ms step_avg:60.90ms
+step:1063/2245 train_time:64741ms step_avg:60.90ms
+step:1064/2245 train_time:64801ms step_avg:60.90ms
+step:1065/2245 train_time:64863ms step_avg:60.90ms
+step:1066/2245 train_time:64924ms step_avg:60.90ms
+step:1067/2245 train_time:64987ms step_avg:60.91ms
+step:1068/2245 train_time:65046ms step_avg:60.90ms
+step:1069/2245 train_time:65108ms step_avg:60.91ms
+step:1070/2245 train_time:65168ms step_avg:60.90ms
+step:1071/2245 train_time:65230ms step_avg:60.91ms
+step:1072/2245 train_time:65290ms step_avg:60.91ms
+step:1073/2245 train_time:65353ms step_avg:60.91ms
+step:1074/2245 train_time:65414ms step_avg:60.91ms
+step:1075/2245 train_time:65476ms step_avg:60.91ms
+step:1076/2245 train_time:65536ms step_avg:60.91ms
+step:1077/2245 train_time:65598ms step_avg:60.91ms
+step:1078/2245 train_time:65658ms step_avg:60.91ms
+step:1079/2245 train_time:65721ms step_avg:60.91ms
+step:1080/2245 train_time:65781ms step_avg:60.91ms
+step:1081/2245 train_time:65843ms step_avg:60.91ms
+step:1082/2245 train_time:65903ms step_avg:60.91ms
+step:1083/2245 train_time:65966ms step_avg:60.91ms
+step:1084/2245 train_time:66026ms step_avg:60.91ms
+step:1085/2245 train_time:66088ms step_avg:60.91ms
+step:1086/2245 train_time:66148ms step_avg:60.91ms
+step:1087/2245 train_time:66210ms step_avg:60.91ms
+step:1088/2245 train_time:66270ms step_avg:60.91ms
+step:1089/2245 train_time:66333ms step_avg:60.91ms
+step:1090/2245 train_time:66393ms step_avg:60.91ms
+step:1091/2245 train_time:66455ms step_avg:60.91ms
+step:1092/2245 train_time:66515ms step_avg:60.91ms
+step:1093/2245 train_time:66578ms step_avg:60.91ms
+step:1094/2245 train_time:66638ms step_avg:60.91ms
+step:1095/2245 train_time:66701ms step_avg:60.91ms
+step:1096/2245 train_time:66761ms step_avg:60.91ms
+step:1097/2245 train_time:66824ms step_avg:60.92ms
+step:1098/2245 train_time:66885ms step_avg:60.91ms
+step:1099/2245 train_time:66947ms step_avg:60.92ms
+step:1100/2245 train_time:67007ms step_avg:60.92ms
+step:1101/2245 train_time:67069ms step_avg:60.92ms
+step:1102/2245 train_time:67128ms step_avg:60.91ms
+step:1103/2245 train_time:67190ms step_avg:60.92ms
+step:1104/2245 train_time:67250ms step_avg:60.91ms
+step:1105/2245 train_time:67312ms step_avg:60.92ms
+step:1106/2245 train_time:67372ms step_avg:60.92ms
+step:1107/2245 train_time:67435ms step_avg:60.92ms
+step:1108/2245 train_time:67495ms step_avg:60.92ms
+step:1109/2245 train_time:67557ms step_avg:60.92ms
+step:1110/2245 train_time:67618ms step_avg:60.92ms
+step:1111/2245 train_time:67681ms step_avg:60.92ms
+step:1112/2245 train_time:67741ms step_avg:60.92ms
+step:1113/2245 train_time:67803ms step_avg:60.92ms
+step:1114/2245 train_time:67863ms step_avg:60.92ms
+step:1115/2245 train_time:67926ms step_avg:60.92ms
+step:1116/2245 train_time:67987ms step_avg:60.92ms
+step:1117/2245 train_time:68048ms step_avg:60.92ms
+step:1118/2245 train_time:68108ms step_avg:60.92ms
+step:1119/2245 train_time:68170ms step_avg:60.92ms
+step:1120/2245 train_time:68230ms step_avg:60.92ms
+step:1121/2245 train_time:68292ms step_avg:60.92ms
+step:1122/2245 train_time:68352ms step_avg:60.92ms
+step:1123/2245 train_time:68415ms step_avg:60.92ms
+step:1124/2245 train_time:68475ms step_avg:60.92ms
+step:1125/2245 train_time:68537ms step_avg:60.92ms
+step:1126/2245 train_time:68598ms step_avg:60.92ms
+step:1127/2245 train_time:68662ms step_avg:60.92ms
+step:1128/2245 train_time:68721ms step_avg:60.92ms
+step:1129/2245 train_time:68784ms step_avg:60.92ms
+step:1130/2245 train_time:68844ms step_avg:60.92ms
+step:1131/2245 train_time:68906ms step_avg:60.92ms
+step:1132/2245 train_time:68965ms step_avg:60.92ms
+step:1133/2245 train_time:69028ms step_avg:60.93ms
+step:1134/2245 train_time:69088ms step_avg:60.92ms
+step:1135/2245 train_time:69151ms step_avg:60.93ms
+step:1136/2245 train_time:69210ms step_avg:60.92ms
+step:1137/2245 train_time:69272ms step_avg:60.93ms
+step:1138/2245 train_time:69332ms step_avg:60.92ms
+step:1139/2245 train_time:69395ms step_avg:60.93ms
+step:1140/2245 train_time:69455ms step_avg:60.93ms
+step:1141/2245 train_time:69518ms step_avg:60.93ms
+step:1142/2245 train_time:69579ms step_avg:60.93ms
+step:1143/2245 train_time:69641ms step_avg:60.93ms
+step:1144/2245 train_time:69701ms step_avg:60.93ms
+step:1145/2245 train_time:69763ms step_avg:60.93ms
+step:1146/2245 train_time:69822ms step_avg:60.93ms
+step:1147/2245 train_time:69884ms step_avg:60.93ms
+step:1148/2245 train_time:69944ms step_avg:60.93ms
+step:1149/2245 train_time:70007ms step_avg:60.93ms
+step:1150/2245 train_time:70066ms step_avg:60.93ms
+step:1151/2245 train_time:70128ms step_avg:60.93ms
+step:1152/2245 train_time:70188ms step_avg:60.93ms
+step:1153/2245 train_time:70250ms step_avg:60.93ms
+step:1154/2245 train_time:70310ms step_avg:60.93ms
+step:1155/2245 train_time:70373ms step_avg:60.93ms
+step:1156/2245 train_time:70433ms step_avg:60.93ms
+step:1157/2245 train_time:70496ms step_avg:60.93ms
+step:1158/2245 train_time:70556ms step_avg:60.93ms
+step:1159/2245 train_time:70618ms step_avg:60.93ms
+step:1160/2245 train_time:70679ms step_avg:60.93ms
+step:1161/2245 train_time:70742ms step_avg:60.93ms
+step:1162/2245 train_time:70802ms step_avg:60.93ms
+step:1163/2245 train_time:70864ms step_avg:60.93ms
+step:1164/2245 train_time:70924ms step_avg:60.93ms
+step:1165/2245 train_time:70986ms step_avg:60.93ms
+step:1166/2245 train_time:71045ms step_avg:60.93ms
+step:1167/2245 train_time:71107ms step_avg:60.93ms
+step:1168/2245 train_time:71167ms step_avg:60.93ms
+step:1169/2245 train_time:71229ms step_avg:60.93ms
+step:1170/2245 train_time:71289ms step_avg:60.93ms
+step:1171/2245 train_time:71351ms step_avg:60.93ms
+step:1172/2245 train_time:71411ms step_avg:60.93ms
+step:1173/2245 train_time:71474ms step_avg:60.93ms
+step:1174/2245 train_time:71534ms step_avg:60.93ms
+step:1175/2245 train_time:71597ms step_avg:60.93ms
+step:1176/2245 train_time:71658ms step_avg:60.93ms
+step:1177/2245 train_time:71721ms step_avg:60.94ms
+step:1178/2245 train_time:71781ms step_avg:60.93ms
+step:1179/2245 train_time:71842ms step_avg:60.94ms
+step:1180/2245 train_time:71902ms step_avg:60.93ms
+step:1181/2245 train_time:71964ms step_avg:60.93ms
+step:1182/2245 train_time:72024ms step_avg:60.93ms
+step:1183/2245 train_time:72086ms step_avg:60.94ms
+step:1184/2245 train_time:72146ms step_avg:60.93ms
+step:1185/2245 train_time:72208ms step_avg:60.94ms
+step:1186/2245 train_time:72268ms step_avg:60.93ms
+step:1187/2245 train_time:72331ms step_avg:60.94ms
+step:1188/2245 train_time:72391ms step_avg:60.94ms
+step:1189/2245 train_time:72454ms step_avg:60.94ms
+step:1190/2245 train_time:72514ms step_avg:60.94ms
+step:1191/2245 train_time:72576ms step_avg:60.94ms
+step:1192/2245 train_time:72636ms step_avg:60.94ms
+step:1193/2245 train_time:72700ms step_avg:60.94ms
+step:1194/2245 train_time:72760ms step_avg:60.94ms
+step:1195/2245 train_time:72823ms step_avg:60.94ms
+step:1196/2245 train_time:72883ms step_avg:60.94ms
+step:1197/2245 train_time:72945ms step_avg:60.94ms
+step:1198/2245 train_time:73004ms step_avg:60.94ms
+step:1199/2245 train_time:73066ms step_avg:60.94ms
+step:1200/2245 train_time:73126ms step_avg:60.94ms
+step:1201/2245 train_time:73188ms step_avg:60.94ms
+step:1202/2245 train_time:73247ms step_avg:60.94ms
+step:1203/2245 train_time:73310ms step_avg:60.94ms
+step:1204/2245 train_time:73370ms step_avg:60.94ms
+step:1205/2245 train_time:73433ms step_avg:60.94ms
+step:1206/2245 train_time:73493ms step_avg:60.94ms
+step:1207/2245 train_time:73556ms step_avg:60.94ms
+step:1208/2245 train_time:73616ms step_avg:60.94ms
+step:1209/2245 train_time:73679ms step_avg:60.94ms
+step:1210/2245 train_time:73740ms step_avg:60.94ms
+step:1211/2245 train_time:73803ms step_avg:60.94ms
+step:1212/2245 train_time:73862ms step_avg:60.94ms
+step:1213/2245 train_time:73925ms step_avg:60.94ms
+step:1214/2245 train_time:73985ms step_avg:60.94ms
+step:1215/2245 train_time:74047ms step_avg:60.94ms
+step:1216/2245 train_time:74107ms step_avg:60.94ms
+step:1217/2245 train_time:74169ms step_avg:60.94ms
+step:1218/2245 train_time:74229ms step_avg:60.94ms
+step:1219/2245 train_time:74290ms step_avg:60.94ms
+step:1220/2245 train_time:74350ms step_avg:60.94ms
+step:1221/2245 train_time:74413ms step_avg:60.94ms
+step:1222/2245 train_time:74473ms step_avg:60.94ms
+step:1223/2245 train_time:74535ms step_avg:60.94ms
+step:1224/2245 train_time:74595ms step_avg:60.94ms
+step:1225/2245 train_time:74657ms step_avg:60.94ms
+step:1226/2245 train_time:74717ms step_avg:60.94ms
+step:1227/2245 train_time:74781ms step_avg:60.95ms
+step:1228/2245 train_time:74841ms step_avg:60.95ms
+step:1229/2245 train_time:74903ms step_avg:60.95ms
+step:1230/2245 train_time:74963ms step_avg:60.95ms
+step:1231/2245 train_time:75026ms step_avg:60.95ms
+step:1232/2245 train_time:75085ms step_avg:60.95ms
+step:1233/2245 train_time:75147ms step_avg:60.95ms
+step:1234/2245 train_time:75206ms step_avg:60.95ms
+step:1235/2245 train_time:75268ms step_avg:60.95ms
+step:1236/2245 train_time:75328ms step_avg:60.94ms
+step:1237/2245 train_time:75390ms step_avg:60.95ms
+step:1238/2245 train_time:75450ms step_avg:60.94ms
+step:1239/2245 train_time:75513ms step_avg:60.95ms
+step:1240/2245 train_time:75573ms step_avg:60.95ms
+step:1241/2245 train_time:75635ms step_avg:60.95ms
+step:1242/2245 train_time:75695ms step_avg:60.95ms
+step:1243/2245 train_time:75758ms step_avg:60.95ms
+step:1244/2245 train_time:75818ms step_avg:60.95ms
+step:1245/2245 train_time:75881ms step_avg:60.95ms
+step:1246/2245 train_time:75941ms step_avg:60.95ms
+step:1247/2245 train_time:76004ms step_avg:60.95ms
+step:1248/2245 train_time:76064ms step_avg:60.95ms
+step:1249/2245 train_time:76126ms step_avg:60.95ms
+step:1250/2245 train_time:76186ms step_avg:60.95ms
+step:1250/2245 val_loss:3.5237 train_time:76250ms step_avg:61.00ms
+step:1251/2245 train_time:76268ms step_avg:60.97ms
+step:1252/2245 train_time:76312ms step_avg:60.95ms
+step:1253/2245 train_time:76381ms step_avg:60.96ms
+step:1254/2245 train_time:76441ms step_avg:60.96ms
+step:1255/2245 train_time:76503ms step_avg:60.96ms
+step:1256/2245 train_time:76563ms step_avg:60.96ms
+step:1257/2245 train_time:76625ms step_avg:60.96ms
+step:1258/2245 train_time:76684ms step_avg:60.96ms
+step:1259/2245 train_time:76745ms step_avg:60.96ms
+step:1260/2245 train_time:76804ms step_avg:60.96ms
+step:1261/2245 train_time:76865ms step_avg:60.96ms
+step:1262/2245 train_time:76925ms step_avg:60.95ms
+step:1263/2245 train_time:76987ms step_avg:60.96ms
+step:1264/2245 train_time:77046ms step_avg:60.95ms
+step:1265/2245 train_time:77108ms step_avg:60.95ms
+step:1266/2245 train_time:77167ms step_avg:60.95ms
+step:1267/2245 train_time:77231ms step_avg:60.96ms
+step:1268/2245 train_time:77295ms step_avg:60.96ms
+step:1269/2245 train_time:77359ms step_avg:60.96ms
+step:1270/2245 train_time:77420ms step_avg:60.96ms
+step:1271/2245 train_time:77483ms step_avg:60.96ms
+step:1272/2245 train_time:77543ms step_avg:60.96ms
+step:1273/2245 train_time:77604ms step_avg:60.96ms
+step:1274/2245 train_time:77664ms step_avg:60.96ms
+step:1275/2245 train_time:77726ms step_avg:60.96ms
+step:1276/2245 train_time:77785ms step_avg:60.96ms
+step:1277/2245 train_time:77846ms step_avg:60.96ms
+step:1278/2245 train_time:77906ms step_avg:60.96ms
+step:1279/2245 train_time:77968ms step_avg:60.96ms
+step:1280/2245 train_time:78027ms step_avg:60.96ms
+step:1281/2245 train_time:78089ms step_avg:60.96ms
+step:1282/2245 train_time:78150ms step_avg:60.96ms
+step:1283/2245 train_time:78213ms step_avg:60.96ms
+step:1284/2245 train_time:78275ms step_avg:60.96ms
+step:1285/2245 train_time:78338ms step_avg:60.96ms
+step:1286/2245 train_time:78399ms step_avg:60.96ms
+step:1287/2245 train_time:78462ms step_avg:60.97ms
+step:1288/2245 train_time:78522ms step_avg:60.96ms
+step:1289/2245 train_time:78584ms step_avg:60.96ms
+step:1290/2245 train_time:78643ms step_avg:60.96ms
+step:1291/2245 train_time:78705ms step_avg:60.96ms
+step:1292/2245 train_time:78764ms step_avg:60.96ms
+step:1293/2245 train_time:78826ms step_avg:60.96ms
+step:1294/2245 train_time:78885ms step_avg:60.96ms
+step:1295/2245 train_time:78947ms step_avg:60.96ms
+step:1296/2245 train_time:79007ms step_avg:60.96ms
+step:1297/2245 train_time:79070ms step_avg:60.96ms
+step:1298/2245 train_time:79130ms step_avg:60.96ms
+step:1299/2245 train_time:79193ms step_avg:60.96ms
+step:1300/2245 train_time:79254ms step_avg:60.96ms
+step:1301/2245 train_time:79318ms step_avg:60.97ms
+step:1302/2245 train_time:79379ms step_avg:60.97ms
+step:1303/2245 train_time:79442ms step_avg:60.97ms
+step:1304/2245 train_time:79502ms step_avg:60.97ms
+step:1305/2245 train_time:79564ms step_avg:60.97ms
+step:1306/2245 train_time:79625ms step_avg:60.97ms
+step:1307/2245 train_time:79688ms step_avg:60.97ms
+step:1308/2245 train_time:79747ms step_avg:60.97ms
+step:1309/2245 train_time:79809ms step_avg:60.97ms
+step:1310/2245 train_time:79869ms step_avg:60.97ms
+step:1311/2245 train_time:79931ms step_avg:60.97ms
+step:1312/2245 train_time:79991ms step_avg:60.97ms
+step:1313/2245 train_time:80052ms step_avg:60.97ms
+step:1314/2245 train_time:80112ms step_avg:60.97ms
+step:1315/2245 train_time:80175ms step_avg:60.97ms
+step:1316/2245 train_time:80236ms step_avg:60.97ms
+step:1317/2245 train_time:80299ms step_avg:60.97ms
+step:1318/2245 train_time:80359ms step_avg:60.97ms
+step:1319/2245 train_time:80422ms step_avg:60.97ms
+step:1320/2245 train_time:80482ms step_avg:60.97ms
+step:1321/2245 train_time:80545ms step_avg:60.97ms
+step:1322/2245 train_time:80604ms step_avg:60.97ms
+step:1323/2245 train_time:80667ms step_avg:60.97ms
+step:1324/2245 train_time:80726ms step_avg:60.97ms
+step:1325/2245 train_time:80788ms step_avg:60.97ms
+step:1326/2245 train_time:80848ms step_avg:60.97ms
+step:1327/2245 train_time:80910ms step_avg:60.97ms
+step:1328/2245 train_time:80970ms step_avg:60.97ms
+step:1329/2245 train_time:81032ms step_avg:60.97ms
+step:1330/2245 train_time:81092ms step_avg:60.97ms
+step:1331/2245 train_time:81155ms step_avg:60.97ms
+step:1332/2245 train_time:81215ms step_avg:60.97ms
+step:1333/2245 train_time:81278ms step_avg:60.97ms
+step:1334/2245 train_time:81339ms step_avg:60.97ms
+step:1335/2245 train_time:81402ms step_avg:60.97ms
+step:1336/2245 train_time:81462ms step_avg:60.97ms
+step:1337/2245 train_time:81524ms step_avg:60.98ms
+step:1338/2245 train_time:81584ms step_avg:60.97ms
+step:1339/2245 train_time:81645ms step_avg:60.97ms
+step:1340/2245 train_time:81705ms step_avg:60.97ms
+step:1341/2245 train_time:81767ms step_avg:60.97ms
+step:1342/2245 train_time:81827ms step_avg:60.97ms
+step:1343/2245 train_time:81889ms step_avg:60.97ms
+step:1344/2245 train_time:81948ms step_avg:60.97ms
+step:1345/2245 train_time:82011ms step_avg:60.97ms
+step:1346/2245 train_time:82070ms step_avg:60.97ms
+step:1347/2245 train_time:82133ms step_avg:60.97ms
+step:1348/2245 train_time:82193ms step_avg:60.97ms
+step:1349/2245 train_time:82256ms step_avg:60.98ms
+step:1350/2245 train_time:82317ms step_avg:60.98ms
+step:1351/2245 train_time:82379ms step_avg:60.98ms
+step:1352/2245 train_time:82439ms step_avg:60.98ms
+step:1353/2245 train_time:82502ms step_avg:60.98ms
+step:1354/2245 train_time:82561ms step_avg:60.98ms
+step:1355/2245 train_time:82624ms step_avg:60.98ms
+step:1356/2245 train_time:82684ms step_avg:60.98ms
+step:1357/2245 train_time:82746ms step_avg:60.98ms
+step:1358/2245 train_time:82805ms step_avg:60.98ms
+step:1359/2245 train_time:82868ms step_avg:60.98ms
+step:1360/2245 train_time:82927ms step_avg:60.98ms
+step:1361/2245 train_time:82990ms step_avg:60.98ms
+step:1362/2245 train_time:83050ms step_avg:60.98ms
+step:1363/2245 train_time:83112ms step_avg:60.98ms
+step:1364/2245 train_time:83172ms step_avg:60.98ms
+step:1365/2245 train_time:83235ms step_avg:60.98ms
+step:1366/2245 train_time:83295ms step_avg:60.98ms
+step:1367/2245 train_time:83358ms step_avg:60.98ms
+step:1368/2245 train_time:83419ms step_avg:60.98ms
+step:1369/2245 train_time:83481ms step_avg:60.98ms
+step:1370/2245 train_time:83541ms step_avg:60.98ms
+step:1371/2245 train_time:83603ms step_avg:60.98ms
+step:1372/2245 train_time:83662ms step_avg:60.98ms
+step:1373/2245 train_time:83724ms step_avg:60.98ms
+step:1374/2245 train_time:83783ms step_avg:60.98ms
+step:1375/2245 train_time:83846ms step_avg:60.98ms
+step:1376/2245 train_time:83905ms step_avg:60.98ms
+step:1377/2245 train_time:83968ms step_avg:60.98ms
+step:1378/2245 train_time:84028ms step_avg:60.98ms
+step:1379/2245 train_time:84091ms step_avg:60.98ms
+step:1380/2245 train_time:84151ms step_avg:60.98ms
+step:1381/2245 train_time:84214ms step_avg:60.98ms
+step:1382/2245 train_time:84275ms step_avg:60.98ms
+step:1383/2245 train_time:84339ms step_avg:60.98ms
+step:1384/2245 train_time:84399ms step_avg:60.98ms
+step:1385/2245 train_time:84462ms step_avg:60.98ms
+step:1386/2245 train_time:84522ms step_avg:60.98ms
+step:1387/2245 train_time:84584ms step_avg:60.98ms
+step:1388/2245 train_time:84643ms step_avg:60.98ms
+step:1389/2245 train_time:84706ms step_avg:60.98ms
+step:1390/2245 train_time:84765ms step_avg:60.98ms
+step:1391/2245 train_time:84827ms step_avg:60.98ms
+step:1392/2245 train_time:84887ms step_avg:60.98ms
+step:1393/2245 train_time:84950ms step_avg:60.98ms
+step:1394/2245 train_time:85009ms step_avg:60.98ms
+step:1395/2245 train_time:85072ms step_avg:60.98ms
+step:1396/2245 train_time:85132ms step_avg:60.98ms
+step:1397/2245 train_time:85195ms step_avg:60.98ms
+step:1398/2245 train_time:85255ms step_avg:60.98ms
+step:1399/2245 train_time:85318ms step_avg:60.99ms
+step:1400/2245 train_time:85378ms step_avg:60.98ms
+step:1401/2245 train_time:85441ms step_avg:60.99ms
+step:1402/2245 train_time:85501ms step_avg:60.99ms
+step:1403/2245 train_time:85563ms step_avg:60.99ms
+step:1404/2245 train_time:85623ms step_avg:60.99ms
+step:1405/2245 train_time:85685ms step_avg:60.99ms
+step:1406/2245 train_time:85744ms step_avg:60.98ms
+step:1407/2245 train_time:85807ms step_avg:60.99ms
+step:1408/2245 train_time:85866ms step_avg:60.98ms
+step:1409/2245 train_time:85929ms step_avg:60.99ms
+step:1410/2245 train_time:85988ms step_avg:60.98ms
+step:1411/2245 train_time:86050ms step_avg:60.99ms
+step:1412/2245 train_time:86111ms step_avg:60.98ms
+step:1413/2245 train_time:86173ms step_avg:60.99ms
+step:1414/2245 train_time:86235ms step_avg:60.99ms
+step:1415/2245 train_time:86298ms step_avg:60.99ms
+step:1416/2245 train_time:86358ms step_avg:60.99ms
+step:1417/2245 train_time:86421ms step_avg:60.99ms
+step:1418/2245 train_time:86481ms step_avg:60.99ms
+step:1419/2245 train_time:86543ms step_avg:60.99ms
+step:1420/2245 train_time:86603ms step_avg:60.99ms
+step:1421/2245 train_time:86665ms step_avg:60.99ms
+step:1422/2245 train_time:86724ms step_avg:60.99ms
+step:1423/2245 train_time:86786ms step_avg:60.99ms
+step:1424/2245 train_time:86846ms step_avg:60.99ms
+step:1425/2245 train_time:86908ms step_avg:60.99ms
+step:1426/2245 train_time:86969ms step_avg:60.99ms
+step:1427/2245 train_time:87031ms step_avg:60.99ms
+step:1428/2245 train_time:87091ms step_avg:60.99ms
+step:1429/2245 train_time:87154ms step_avg:60.99ms
+step:1430/2245 train_time:87215ms step_avg:60.99ms
+step:1431/2245 train_time:87278ms step_avg:60.99ms
+step:1432/2245 train_time:87338ms step_avg:60.99ms
+step:1433/2245 train_time:87400ms step_avg:60.99ms
+step:1434/2245 train_time:87460ms step_avg:60.99ms
+step:1435/2245 train_time:87523ms step_avg:60.99ms
+step:1436/2245 train_time:87583ms step_avg:60.99ms
+step:1437/2245 train_time:87645ms step_avg:60.99ms
+step:1438/2245 train_time:87704ms step_avg:60.99ms
+step:1439/2245 train_time:87766ms step_avg:60.99ms
+step:1440/2245 train_time:87826ms step_avg:60.99ms
+step:1441/2245 train_time:87888ms step_avg:60.99ms
+step:1442/2245 train_time:87948ms step_avg:60.99ms
+step:1443/2245 train_time:88010ms step_avg:60.99ms
+step:1444/2245 train_time:88070ms step_avg:60.99ms
+step:1445/2245 train_time:88133ms step_avg:60.99ms
+step:1446/2245 train_time:88193ms step_avg:60.99ms
+step:1447/2245 train_time:88257ms step_avg:60.99ms
+step:1448/2245 train_time:88317ms step_avg:60.99ms
+step:1449/2245 train_time:88379ms step_avg:60.99ms
+step:1450/2245 train_time:88440ms step_avg:60.99ms
+step:1451/2245 train_time:88501ms step_avg:60.99ms
+step:1452/2245 train_time:88562ms step_avg:60.99ms
+step:1453/2245 train_time:88624ms step_avg:60.99ms
+step:1454/2245 train_time:88684ms step_avg:60.99ms
+step:1455/2245 train_time:88746ms step_avg:60.99ms
+step:1456/2245 train_time:88806ms step_avg:60.99ms
+step:1457/2245 train_time:88867ms step_avg:60.99ms
+step:1458/2245 train_time:88928ms step_avg:60.99ms
+step:1459/2245 train_time:88990ms step_avg:60.99ms
+step:1460/2245 train_time:89050ms step_avg:60.99ms
+step:1461/2245 train_time:89112ms step_avg:60.99ms
+step:1462/2245 train_time:89173ms step_avg:60.99ms
+step:1463/2245 train_time:89237ms step_avg:61.00ms
+step:1464/2245 train_time:89297ms step_avg:60.99ms
+step:1465/2245 train_time:89360ms step_avg:61.00ms
+step:1466/2245 train_time:89421ms step_avg:61.00ms
+step:1467/2245 train_time:89483ms step_avg:61.00ms
+step:1468/2245 train_time:89542ms step_avg:61.00ms
+step:1469/2245 train_time:89604ms step_avg:61.00ms
+step:1470/2245 train_time:89664ms step_avg:61.00ms
+step:1471/2245 train_time:89726ms step_avg:61.00ms
+step:1472/2245 train_time:89786ms step_avg:61.00ms
+step:1473/2245 train_time:89849ms step_avg:61.00ms
+step:1474/2245 train_time:89909ms step_avg:61.00ms
+step:1475/2245 train_time:89972ms step_avg:61.00ms
+step:1476/2245 train_time:90033ms step_avg:61.00ms
+step:1477/2245 train_time:90095ms step_avg:61.00ms
+step:1478/2245 train_time:90155ms step_avg:61.00ms
+step:1479/2245 train_time:90218ms step_avg:61.00ms
+step:1480/2245 train_time:90278ms step_avg:61.00ms
+step:1481/2245 train_time:90341ms step_avg:61.00ms
+step:1482/2245 train_time:90402ms step_avg:61.00ms
+step:1483/2245 train_time:90465ms step_avg:61.00ms
+step:1484/2245 train_time:90526ms step_avg:61.00ms
+step:1485/2245 train_time:90589ms step_avg:61.00ms
+step:1486/2245 train_time:90650ms step_avg:61.00ms
+step:1487/2245 train_time:90712ms step_avg:61.00ms
+step:1488/2245 train_time:90772ms step_avg:61.00ms
+step:1489/2245 train_time:90835ms step_avg:61.00ms
+step:1490/2245 train_time:90896ms step_avg:61.00ms
+step:1491/2245 train_time:90958ms step_avg:61.00ms
+step:1492/2245 train_time:91018ms step_avg:61.00ms
+step:1493/2245 train_time:91080ms step_avg:61.00ms
+step:1494/2245 train_time:91140ms step_avg:61.00ms
+step:1495/2245 train_time:91203ms step_avg:61.01ms
+step:1496/2245 train_time:91264ms step_avg:61.01ms
+step:1497/2245 train_time:91327ms step_avg:61.01ms
+step:1498/2245 train_time:91387ms step_avg:61.01ms
+step:1499/2245 train_time:91451ms step_avg:61.01ms
+step:1500/2245 train_time:91512ms step_avg:61.01ms
+step:1500/2245 val_loss:3.4445 train_time:91577ms step_avg:61.05ms
+step:1501/2245 train_time:91595ms step_avg:61.02ms
+step:1502/2245 train_time:91638ms step_avg:61.01ms
+step:1503/2245 train_time:91701ms step_avg:61.01ms
+step:1504/2245 train_time:91762ms step_avg:61.01ms
+step:1505/2245 train_time:91827ms step_avg:61.01ms
+step:1506/2245 train_time:91886ms step_avg:61.01ms
+step:1507/2245 train_time:91948ms step_avg:61.01ms
+step:1508/2245 train_time:92007ms step_avg:61.01ms
+step:1509/2245 train_time:92069ms step_avg:61.01ms
+step:1510/2245 train_time:92128ms step_avg:61.01ms
+step:1511/2245 train_time:92190ms step_avg:61.01ms
+step:1512/2245 train_time:92250ms step_avg:61.01ms
+step:1513/2245 train_time:92313ms step_avg:61.01ms
+step:1514/2245 train_time:92374ms step_avg:61.01ms
+step:1515/2245 train_time:92437ms step_avg:61.01ms
+step:1516/2245 train_time:92499ms step_avg:61.02ms
+step:1517/2245 train_time:92565ms step_avg:61.02ms
+step:1518/2245 train_time:92627ms step_avg:61.02ms
+step:1519/2245 train_time:92690ms step_avg:61.02ms
+step:1520/2245 train_time:92751ms step_avg:61.02ms
+step:1521/2245 train_time:92815ms step_avg:61.02ms
+step:1522/2245 train_time:92875ms step_avg:61.02ms
+step:1523/2245 train_time:92938ms step_avg:61.02ms
+step:1524/2245 train_time:92999ms step_avg:61.02ms
+step:1525/2245 train_time:93062ms step_avg:61.02ms
+step:1526/2245 train_time:93122ms step_avg:61.02ms
+step:1527/2245 train_time:93184ms step_avg:61.02ms
+step:1528/2245 train_time:93244ms step_avg:61.02ms
+step:1529/2245 train_time:93306ms step_avg:61.02ms
+step:1530/2245 train_time:93366ms step_avg:61.02ms
+step:1531/2245 train_time:93429ms step_avg:61.02ms
+step:1532/2245 train_time:93490ms step_avg:61.02ms
+step:1533/2245 train_time:93554ms step_avg:61.03ms
+step:1534/2245 train_time:93615ms step_avg:61.03ms
+step:1535/2245 train_time:93679ms step_avg:61.03ms
+step:1536/2245 train_time:93740ms step_avg:61.03ms
+step:1537/2245 train_time:93803ms step_avg:61.03ms
+step:1538/2245 train_time:93863ms step_avg:61.03ms
+step:1539/2245 train_time:93925ms step_avg:61.03ms
+step:1540/2245 train_time:93986ms step_avg:61.03ms
+step:1541/2245 train_time:94048ms step_avg:61.03ms
+step:1542/2245 train_time:94108ms step_avg:61.03ms
+step:1543/2245 train_time:94170ms step_avg:61.03ms
+step:1544/2245 train_time:94230ms step_avg:61.03ms
+step:1545/2245 train_time:94293ms step_avg:61.03ms
+step:1546/2245 train_time:94353ms step_avg:61.03ms
+step:1547/2245 train_time:94415ms step_avg:61.03ms
+step:1548/2245 train_time:94476ms step_avg:61.03ms
+step:1549/2245 train_time:94540ms step_avg:61.03ms
+step:1550/2245 train_time:94601ms step_avg:61.03ms
+step:1551/2245 train_time:94664ms step_avg:61.03ms
+step:1552/2245 train_time:94724ms step_avg:61.03ms
+step:1553/2245 train_time:94788ms step_avg:61.04ms
+step:1554/2245 train_time:94848ms step_avg:61.03ms
+step:1555/2245 train_time:94911ms step_avg:61.04ms
+step:1556/2245 train_time:94971ms step_avg:61.04ms
+step:1557/2245 train_time:95034ms step_avg:61.04ms
+step:1558/2245 train_time:95094ms step_avg:61.04ms
+step:1559/2245 train_time:95158ms step_avg:61.04ms
+step:1560/2245 train_time:95217ms step_avg:61.04ms
+step:1561/2245 train_time:95280ms step_avg:61.04ms
+step:1562/2245 train_time:95340ms step_avg:61.04ms
+step:1563/2245 train_time:95403ms step_avg:61.04ms
+step:1564/2245 train_time:95463ms step_avg:61.04ms
+step:1565/2245 train_time:95526ms step_avg:61.04ms
+step:1566/2245 train_time:95586ms step_avg:61.04ms
+step:1567/2245 train_time:95649ms step_avg:61.04ms
+step:1568/2245 train_time:95709ms step_avg:61.04ms
+step:1569/2245 train_time:95772ms step_avg:61.04ms
+step:1570/2245 train_time:95833ms step_avg:61.04ms
+step:1571/2245 train_time:95896ms step_avg:61.04ms
+step:1572/2245 train_time:95956ms step_avg:61.04ms
+step:1573/2245 train_time:96019ms step_avg:61.04ms
+step:1574/2245 train_time:96080ms step_avg:61.04ms
+step:1575/2245 train_time:96143ms step_avg:61.04ms
+step:1576/2245 train_time:96203ms step_avg:61.04ms
+step:1577/2245 train_time:96265ms step_avg:61.04ms
+step:1578/2245 train_time:96325ms step_avg:61.04ms
+step:1579/2245 train_time:96388ms step_avg:61.04ms
+step:1580/2245 train_time:96449ms step_avg:61.04ms
+step:1581/2245 train_time:96511ms step_avg:61.04ms
+step:1582/2245 train_time:96572ms step_avg:61.04ms
+step:1583/2245 train_time:96636ms step_avg:61.05ms
+step:1584/2245 train_time:96698ms step_avg:61.05ms
+step:1585/2245 train_time:96761ms step_avg:61.05ms
+step:1586/2245 train_time:96821ms step_avg:61.05ms
+step:1587/2245 train_time:96884ms step_avg:61.05ms
+step:1588/2245 train_time:96943ms step_avg:61.05ms
+step:1589/2245 train_time:97006ms step_avg:61.05ms
+step:1590/2245 train_time:97066ms step_avg:61.05ms
+step:1591/2245 train_time:97128ms step_avg:61.05ms
+step:1592/2245 train_time:97188ms step_avg:61.05ms
+step:1593/2245 train_time:97251ms step_avg:61.05ms
+step:1594/2245 train_time:97311ms step_avg:61.05ms
+step:1595/2245 train_time:97374ms step_avg:61.05ms
+step:1596/2245 train_time:97434ms step_avg:61.05ms
+step:1597/2245 train_time:97497ms step_avg:61.05ms
+step:1598/2245 train_time:97557ms step_avg:61.05ms
+step:1599/2245 train_time:97620ms step_avg:61.05ms
+step:1600/2245 train_time:97681ms step_avg:61.05ms
+step:1601/2245 train_time:97744ms step_avg:61.05ms
+step:1602/2245 train_time:97804ms step_avg:61.05ms
+step:1603/2245 train_time:97867ms step_avg:61.05ms
+step:1604/2245 train_time:97928ms step_avg:61.05ms
+step:1605/2245 train_time:97990ms step_avg:61.05ms
+step:1606/2245 train_time:98050ms step_avg:61.05ms
+step:1607/2245 train_time:98113ms step_avg:61.05ms
+step:1608/2245 train_time:98173ms step_avg:61.05ms
+step:1609/2245 train_time:98235ms step_avg:61.05ms
+step:1610/2245 train_time:98296ms step_avg:61.05ms
+step:1611/2245 train_time:98358ms step_avg:61.05ms
+step:1612/2245 train_time:98419ms step_avg:61.05ms
+step:1613/2245 train_time:98481ms step_avg:61.05ms
+step:1614/2245 train_time:98542ms step_avg:61.05ms
+step:1615/2245 train_time:98605ms step_avg:61.06ms
+step:1616/2245 train_time:98665ms step_avg:61.06ms
+step:1617/2245 train_time:98728ms step_avg:61.06ms
+step:1618/2245 train_time:98788ms step_avg:61.06ms
+step:1619/2245 train_time:98851ms step_avg:61.06ms
+step:1620/2245 train_time:98911ms step_avg:61.06ms
+step:1621/2245 train_time:98974ms step_avg:61.06ms
+step:1622/2245 train_time:99034ms step_avg:61.06ms
+step:1623/2245 train_time:99097ms step_avg:61.06ms
+step:1624/2245 train_time:99158ms step_avg:61.06ms
+step:1625/2245 train_time:99221ms step_avg:61.06ms
+step:1626/2245 train_time:99281ms step_avg:61.06ms
+step:1627/2245 train_time:99344ms step_avg:61.06ms
+step:1628/2245 train_time:99404ms step_avg:61.06ms
+step:1629/2245 train_time:99467ms step_avg:61.06ms
+step:1630/2245 train_time:99526ms step_avg:61.06ms
+step:1631/2245 train_time:99589ms step_avg:61.06ms
+step:1632/2245 train_time:99650ms step_avg:61.06ms
+step:1633/2245 train_time:99713ms step_avg:61.06ms
+step:1634/2245 train_time:99773ms step_avg:61.06ms
+step:1635/2245 train_time:99836ms step_avg:61.06ms
+step:1636/2245 train_time:99897ms step_avg:61.06ms
+step:1637/2245 train_time:99960ms step_avg:61.06ms
+step:1638/2245 train_time:100020ms step_avg:61.06ms
+step:1639/2245 train_time:100082ms step_avg:61.06ms
+step:1640/2245 train_time:100143ms step_avg:61.06ms
+step:1641/2245 train_time:100205ms step_avg:61.06ms
+step:1642/2245 train_time:100265ms step_avg:61.06ms
+step:1643/2245 train_time:100328ms step_avg:61.06ms
+step:1644/2245 train_time:100388ms step_avg:61.06ms
+step:1645/2245 train_time:100450ms step_avg:61.06ms
+step:1646/2245 train_time:100510ms step_avg:61.06ms
+step:1647/2245 train_time:100574ms step_avg:61.06ms
+step:1648/2245 train_time:100634ms step_avg:61.06ms
+step:1649/2245 train_time:100697ms step_avg:61.07ms
+step:1650/2245 train_time:100758ms step_avg:61.07ms
+step:1651/2245 train_time:100820ms step_avg:61.07ms
+step:1652/2245 train_time:100881ms step_avg:61.07ms
+step:1653/2245 train_time:100944ms step_avg:61.07ms
+step:1654/2245 train_time:101003ms step_avg:61.07ms
+step:1655/2245 train_time:101067ms step_avg:61.07ms
+step:1656/2245 train_time:101127ms step_avg:61.07ms
+step:1657/2245 train_time:101190ms step_avg:61.07ms
+step:1658/2245 train_time:101250ms step_avg:61.07ms
+step:1659/2245 train_time:101313ms step_avg:61.07ms
+step:1660/2245 train_time:101373ms step_avg:61.07ms
+step:1661/2245 train_time:101436ms step_avg:61.07ms
+step:1662/2245 train_time:101496ms step_avg:61.07ms
+step:1663/2245 train_time:101559ms step_avg:61.07ms
+step:1664/2245 train_time:101620ms step_avg:61.07ms
+step:1665/2245 train_time:101683ms step_avg:61.07ms
+step:1666/2245 train_time:101743ms step_avg:61.07ms
+step:1667/2245 train_time:101806ms step_avg:61.07ms
+step:1668/2245 train_time:101866ms step_avg:61.07ms
+step:1669/2245 train_time:101928ms step_avg:61.07ms
+step:1670/2245 train_time:101988ms step_avg:61.07ms
+step:1671/2245 train_time:102052ms step_avg:61.07ms
+step:1672/2245 train_time:102113ms step_avg:61.07ms
+step:1673/2245 train_time:102175ms step_avg:61.07ms
+step:1674/2245 train_time:102236ms step_avg:61.07ms
+step:1675/2245 train_time:102300ms step_avg:61.07ms
+step:1676/2245 train_time:102360ms step_avg:61.07ms
+step:1677/2245 train_time:102422ms step_avg:61.07ms
+step:1678/2245 train_time:102482ms step_avg:61.07ms
+step:1679/2245 train_time:102545ms step_avg:61.07ms
+step:1680/2245 train_time:102604ms step_avg:61.07ms
+step:1681/2245 train_time:102667ms step_avg:61.07ms
+step:1682/2245 train_time:102727ms step_avg:61.07ms
+step:1683/2245 train_time:102790ms step_avg:61.08ms
+step:1684/2245 train_time:102850ms step_avg:61.07ms
+step:1685/2245 train_time:102913ms step_avg:61.08ms
+step:1686/2245 train_time:102973ms step_avg:61.08ms
+step:1687/2245 train_time:103037ms step_avg:61.08ms
+step:1688/2245 train_time:103098ms step_avg:61.08ms
+step:1689/2245 train_time:103161ms step_avg:61.08ms
+step:1690/2245 train_time:103222ms step_avg:61.08ms
+step:1691/2245 train_time:103284ms step_avg:61.08ms
+step:1692/2245 train_time:103345ms step_avg:61.08ms
+step:1693/2245 train_time:103407ms step_avg:61.08ms
+step:1694/2245 train_time:103468ms step_avg:61.08ms
+step:1695/2245 train_time:103531ms step_avg:61.08ms
+step:1696/2245 train_time:103591ms step_avg:61.08ms
+step:1697/2245 train_time:103654ms step_avg:61.08ms
+step:1698/2245 train_time:103714ms step_avg:61.08ms
+step:1699/2245 train_time:103777ms step_avg:61.08ms
+step:1700/2245 train_time:103838ms step_avg:61.08ms
+step:1701/2245 train_time:103901ms step_avg:61.08ms
+step:1702/2245 train_time:103961ms step_avg:61.08ms
+step:1703/2245 train_time:104024ms step_avg:61.08ms
+step:1704/2245 train_time:104083ms step_avg:61.08ms
+step:1705/2245 train_time:104146ms step_avg:61.08ms
+step:1706/2245 train_time:104206ms step_avg:61.08ms
+step:1707/2245 train_time:104269ms step_avg:61.08ms
+step:1708/2245 train_time:104329ms step_avg:61.08ms
+step:1709/2245 train_time:104391ms step_avg:61.08ms
+step:1710/2245 train_time:104452ms step_avg:61.08ms
+step:1711/2245 train_time:104515ms step_avg:61.08ms
+step:1712/2245 train_time:104575ms step_avg:61.08ms
+step:1713/2245 train_time:104638ms step_avg:61.08ms
+step:1714/2245 train_time:104698ms step_avg:61.08ms
+step:1715/2245 train_time:104761ms step_avg:61.09ms
+step:1716/2245 train_time:104822ms step_avg:61.09ms
+step:1717/2245 train_time:104885ms step_avg:61.09ms
+step:1718/2245 train_time:104945ms step_avg:61.09ms
+step:1719/2245 train_time:105007ms step_avg:61.09ms
+step:1720/2245 train_time:105067ms step_avg:61.09ms
+step:1721/2245 train_time:105129ms step_avg:61.09ms
+step:1722/2245 train_time:105190ms step_avg:61.09ms
+step:1723/2245 train_time:105253ms step_avg:61.09ms
+step:1724/2245 train_time:105313ms step_avg:61.09ms
+step:1725/2245 train_time:105375ms step_avg:61.09ms
+step:1726/2245 train_time:105436ms step_avg:61.09ms
+step:1727/2245 train_time:105500ms step_avg:61.09ms
+step:1728/2245 train_time:105560ms step_avg:61.09ms
+step:1729/2245 train_time:105623ms step_avg:61.09ms
+step:1730/2245 train_time:105682ms step_avg:61.09ms
+step:1731/2245 train_time:105745ms step_avg:61.09ms
+step:1732/2245 train_time:105805ms step_avg:61.09ms
+step:1733/2245 train_time:105867ms step_avg:61.09ms
+step:1734/2245 train_time:105928ms step_avg:61.09ms
+step:1735/2245 train_time:105990ms step_avg:61.09ms
+step:1736/2245 train_time:106050ms step_avg:61.09ms
+step:1737/2245 train_time:106113ms step_avg:61.09ms
+step:1738/2245 train_time:106173ms step_avg:61.09ms
+step:1739/2245 train_time:106237ms step_avg:61.09ms
+step:1740/2245 train_time:106298ms step_avg:61.09ms
+step:1741/2245 train_time:106361ms step_avg:61.09ms
+step:1742/2245 train_time:106421ms step_avg:61.09ms
+step:1743/2245 train_time:106483ms step_avg:61.09ms
+step:1744/2245 train_time:106543ms step_avg:61.09ms
+step:1745/2245 train_time:106606ms step_avg:61.09ms
+step:1746/2245 train_time:106666ms step_avg:61.09ms
+step:1747/2245 train_time:106729ms step_avg:61.09ms
+step:1748/2245 train_time:106789ms step_avg:61.09ms
+step:1749/2245 train_time:106852ms step_avg:61.09ms
+step:1750/2245 train_time:106912ms step_avg:61.09ms
+step:1750/2245 val_loss:3.3798 train_time:106976ms step_avg:61.13ms
+step:1751/2245 train_time:106995ms step_avg:61.11ms
+step:1752/2245 train_time:107040ms step_avg:61.10ms
+step:1753/2245 train_time:107107ms step_avg:61.10ms
+step:1754/2245 train_time:107169ms step_avg:61.10ms
+step:1755/2245 train_time:107231ms step_avg:61.10ms
+step:1756/2245 train_time:107291ms step_avg:61.10ms
+step:1757/2245 train_time:107353ms step_avg:61.10ms
+step:1758/2245 train_time:107412ms step_avg:61.10ms
+step:1759/2245 train_time:107474ms step_avg:61.10ms
+step:1760/2245 train_time:107534ms step_avg:61.10ms
+step:1761/2245 train_time:107596ms step_avg:61.10ms
+step:1762/2245 train_time:107656ms step_avg:61.10ms
+step:1763/2245 train_time:107718ms step_avg:61.10ms
+step:1764/2245 train_time:107778ms step_avg:61.10ms
+step:1765/2245 train_time:107840ms step_avg:61.10ms
+step:1766/2245 train_time:107901ms step_avg:61.10ms
+step:1767/2245 train_time:107966ms step_avg:61.10ms
+step:1768/2245 train_time:108028ms step_avg:61.10ms
+step:1769/2245 train_time:108091ms step_avg:61.10ms
+step:1770/2245 train_time:108153ms step_avg:61.10ms
+step:1771/2245 train_time:108216ms step_avg:61.10ms
+step:1772/2245 train_time:108276ms step_avg:61.10ms
+step:1773/2245 train_time:108339ms step_avg:61.10ms
+step:1774/2245 train_time:108399ms step_avg:61.10ms
+step:1775/2245 train_time:108462ms step_avg:61.11ms
+step:1776/2245 train_time:108522ms step_avg:61.10ms
+step:1777/2245 train_time:108584ms step_avg:61.11ms
+step:1778/2245 train_time:108644ms step_avg:61.10ms
+step:1779/2245 train_time:108707ms step_avg:61.11ms
+step:1780/2245 train_time:108766ms step_avg:61.10ms
+step:1781/2245 train_time:108829ms step_avg:61.11ms
+step:1782/2245 train_time:108888ms step_avg:61.10ms
+step:1783/2245 train_time:108951ms step_avg:61.11ms
+step:1784/2245 train_time:109012ms step_avg:61.11ms
+step:1785/2245 train_time:109076ms step_avg:61.11ms
+step:1786/2245 train_time:109136ms step_avg:61.11ms
+step:1787/2245 train_time:109200ms step_avg:61.11ms
+step:1788/2245 train_time:109263ms step_avg:61.11ms
+step:1789/2245 train_time:109326ms step_avg:61.11ms
+step:1790/2245 train_time:109385ms step_avg:61.11ms
+step:1791/2245 train_time:109448ms step_avg:61.11ms
+step:1792/2245 train_time:109507ms step_avg:61.11ms
+step:1793/2245 train_time:109570ms step_avg:61.11ms
+step:1794/2245 train_time:109630ms step_avg:61.11ms
+step:1795/2245 train_time:109692ms step_avg:61.11ms
+step:1796/2245 train_time:109751ms step_avg:61.11ms
+step:1797/2245 train_time:109814ms step_avg:61.11ms
+step:1798/2245 train_time:109874ms step_avg:61.11ms
+step:1799/2245 train_time:109938ms step_avg:61.11ms
+step:1800/2245 train_time:109999ms step_avg:61.11ms
+step:1801/2245 train_time:110063ms step_avg:61.11ms
+step:1802/2245 train_time:110125ms step_avg:61.11ms
+step:1803/2245 train_time:110188ms step_avg:61.11ms
+step:1804/2245 train_time:110248ms step_avg:61.11ms
+step:1805/2245 train_time:110310ms step_avg:61.11ms
+step:1806/2245 train_time:110371ms step_avg:61.11ms
+step:1807/2245 train_time:110433ms step_avg:61.11ms
+step:1808/2245 train_time:110493ms step_avg:61.11ms
+step:1809/2245 train_time:110555ms step_avg:61.11ms
+step:1810/2245 train_time:110615ms step_avg:61.11ms
+step:1811/2245 train_time:110678ms step_avg:61.11ms
+step:1812/2245 train_time:110738ms step_avg:61.11ms
+step:1813/2245 train_time:110801ms step_avg:61.11ms
+step:1814/2245 train_time:110862ms step_avg:61.11ms
+step:1815/2245 train_time:110926ms step_avg:61.12ms
+step:1816/2245 train_time:110986ms step_avg:61.12ms
+step:1817/2245 train_time:111049ms step_avg:61.12ms
+step:1818/2245 train_time:111109ms step_avg:61.12ms
+step:1819/2245 train_time:111173ms step_avg:61.12ms
+step:1820/2245 train_time:111233ms step_avg:61.12ms
+step:1821/2245 train_time:111296ms step_avg:61.12ms
+step:1822/2245 train_time:111357ms step_avg:61.12ms
+step:1823/2245 train_time:111420ms step_avg:61.12ms
+step:1824/2245 train_time:111480ms step_avg:61.12ms
+step:1825/2245 train_time:111543ms step_avg:61.12ms
+step:1826/2245 train_time:111604ms step_avg:61.12ms
+step:1827/2245 train_time:111666ms step_avg:61.12ms
+step:1828/2245 train_time:111726ms step_avg:61.12ms
+step:1829/2245 train_time:111788ms step_avg:61.12ms
+step:1830/2245 train_time:111848ms step_avg:61.12ms
+step:1831/2245 train_time:111911ms step_avg:61.12ms
+step:1832/2245 train_time:111972ms step_avg:61.12ms
+step:1833/2245 train_time:112034ms step_avg:61.12ms
+step:1834/2245 train_time:112094ms step_avg:61.12ms
+step:1835/2245 train_time:112158ms step_avg:61.12ms
+step:1836/2245 train_time:112218ms step_avg:61.12ms
+step:1837/2245 train_time:112281ms step_avg:61.12ms
+step:1838/2245 train_time:112342ms step_avg:61.12ms
+step:1839/2245 train_time:112405ms step_avg:61.12ms
+step:1840/2245 train_time:112466ms step_avg:61.12ms
+step:1841/2245 train_time:112529ms step_avg:61.12ms
+step:1842/2245 train_time:112589ms step_avg:61.12ms
+step:1843/2245 train_time:112651ms step_avg:61.12ms
+step:1844/2245 train_time:112712ms step_avg:61.12ms
+step:1845/2245 train_time:112774ms step_avg:61.12ms
+step:1846/2245 train_time:112834ms step_avg:61.12ms
+step:1847/2245 train_time:112897ms step_avg:61.12ms
+step:1848/2245 train_time:112958ms step_avg:61.12ms
+step:1849/2245 train_time:113021ms step_avg:61.13ms
+step:1850/2245 train_time:113082ms step_avg:61.13ms
+step:1851/2245 train_time:113145ms step_avg:61.13ms
+step:1852/2245 train_time:113204ms step_avg:61.13ms
+step:1853/2245 train_time:113267ms step_avg:61.13ms
+step:1854/2245 train_time:113327ms step_avg:61.13ms
+step:1855/2245 train_time:113389ms step_avg:61.13ms
+step:1856/2245 train_time:113450ms step_avg:61.13ms
+step:1857/2245 train_time:113512ms step_avg:61.13ms
+step:1858/2245 train_time:113572ms step_avg:61.13ms
+step:1859/2245 train_time:113635ms step_avg:61.13ms
+step:1860/2245 train_time:113695ms step_avg:61.13ms
+step:1861/2245 train_time:113758ms step_avg:61.13ms
+step:1862/2245 train_time:113819ms step_avg:61.13ms
+step:1863/2245 train_time:113882ms step_avg:61.13ms
+step:1864/2245 train_time:113942ms step_avg:61.13ms
+step:1865/2245 train_time:114005ms step_avg:61.13ms
+step:1866/2245 train_time:114066ms step_avg:61.13ms
+step:1867/2245 train_time:114128ms step_avg:61.13ms
+step:1868/2245 train_time:114188ms step_avg:61.13ms
+step:1869/2245 train_time:114251ms step_avg:61.13ms
+step:1870/2245 train_time:114311ms step_avg:61.13ms
+step:1871/2245 train_time:114374ms step_avg:61.13ms
+step:1872/2245 train_time:114434ms step_avg:61.13ms
+step:1873/2245 train_time:114497ms step_avg:61.13ms
+step:1874/2245 train_time:114558ms step_avg:61.13ms
+step:1875/2245 train_time:114621ms step_avg:61.13ms
+step:1876/2245 train_time:114682ms step_avg:61.13ms
+step:1877/2245 train_time:114745ms step_avg:61.13ms
+step:1878/2245 train_time:114805ms step_avg:61.13ms
+step:1879/2245 train_time:114868ms step_avg:61.13ms
+step:1880/2245 train_time:114928ms step_avg:61.13ms
+step:1881/2245 train_time:114990ms step_avg:61.13ms
+step:1882/2245 train_time:115050ms step_avg:61.13ms
+step:1883/2245 train_time:115113ms step_avg:61.13ms
+step:1884/2245 train_time:115174ms step_avg:61.13ms
+step:1885/2245 train_time:115237ms step_avg:61.13ms
+step:1886/2245 train_time:115297ms step_avg:61.13ms
+step:1887/2245 train_time:115361ms step_avg:61.13ms
+step:1888/2245 train_time:115422ms step_avg:61.13ms
+step:1889/2245 train_time:115485ms step_avg:61.14ms
+step:1890/2245 train_time:115545ms step_avg:61.14ms
+step:1891/2245 train_time:115608ms step_avg:61.14ms
+step:1892/2245 train_time:115669ms step_avg:61.14ms
+step:1893/2245 train_time:115731ms step_avg:61.14ms
+step:1894/2245 train_time:115791ms step_avg:61.14ms
+step:1895/2245 train_time:115853ms step_avg:61.14ms
+step:1896/2245 train_time:115913ms step_avg:61.14ms
+step:1897/2245 train_time:115976ms step_avg:61.14ms
+step:1898/2245 train_time:116037ms step_avg:61.14ms
+step:1899/2245 train_time:116100ms step_avg:61.14ms
+step:1900/2245 train_time:116161ms step_avg:61.14ms
+step:1901/2245 train_time:116224ms step_avg:61.14ms
+step:1902/2245 train_time:116285ms step_avg:61.14ms
+step:1903/2245 train_time:116348ms step_avg:61.14ms
+step:1904/2245 train_time:116408ms step_avg:61.14ms
+step:1905/2245 train_time:116470ms step_avg:61.14ms
+step:1906/2245 train_time:116530ms step_avg:61.14ms
+step:1907/2245 train_time:116593ms step_avg:61.14ms
+step:1908/2245 train_time:116653ms step_avg:61.14ms
+step:1909/2245 train_time:116715ms step_avg:61.14ms
+step:1910/2245 train_time:116776ms step_avg:61.14ms
+step:1911/2245 train_time:116839ms step_avg:61.14ms
+step:1912/2245 train_time:116900ms step_avg:61.14ms
+step:1913/2245 train_time:116964ms step_avg:61.14ms
+step:1914/2245 train_time:117025ms step_avg:61.14ms
+step:1915/2245 train_time:117087ms step_avg:61.14ms
+step:1916/2245 train_time:117147ms step_avg:61.14ms
+step:1917/2245 train_time:117209ms step_avg:61.14ms
+step:1918/2245 train_time:117270ms step_avg:61.14ms
+step:1919/2245 train_time:117333ms step_avg:61.14ms
+step:1920/2245 train_time:117393ms step_avg:61.14ms
+step:1921/2245 train_time:117456ms step_avg:61.14ms
+step:1922/2245 train_time:117516ms step_avg:61.14ms
+step:1923/2245 train_time:117579ms step_avg:61.14ms
+step:1924/2245 train_time:117639ms step_avg:61.14ms
+step:1925/2245 train_time:117702ms step_avg:61.14ms
+step:1926/2245 train_time:117762ms step_avg:61.14ms
+step:1927/2245 train_time:117825ms step_avg:61.14ms
+step:1928/2245 train_time:117886ms step_avg:61.14ms
+step:1929/2245 train_time:117949ms step_avg:61.15ms
+step:1930/2245 train_time:118010ms step_avg:61.14ms
+step:1931/2245 train_time:118072ms step_avg:61.15ms
+step:1932/2245 train_time:118133ms step_avg:61.15ms
+step:1933/2245 train_time:118196ms step_avg:61.15ms
+step:1934/2245 train_time:118256ms step_avg:61.15ms
+step:1935/2245 train_time:118319ms step_avg:61.15ms
+step:1936/2245 train_time:118380ms step_avg:61.15ms
+step:1937/2245 train_time:118442ms step_avg:61.15ms
+step:1938/2245 train_time:118503ms step_avg:61.15ms
+step:1939/2245 train_time:118566ms step_avg:61.15ms
+step:1940/2245 train_time:118626ms step_avg:61.15ms
+step:1941/2245 train_time:118688ms step_avg:61.15ms
+step:1942/2245 train_time:118748ms step_avg:61.15ms
+step:1943/2245 train_time:118810ms step_avg:61.15ms
+step:1944/2245 train_time:118871ms step_avg:61.15ms
+step:1945/2245 train_time:118933ms step_avg:61.15ms
+step:1946/2245 train_time:118994ms step_avg:61.15ms
+step:1947/2245 train_time:119056ms step_avg:61.15ms
+step:1948/2245 train_time:119117ms step_avg:61.15ms
+step:1949/2245 train_time:119180ms step_avg:61.15ms
+step:1950/2245 train_time:119241ms step_avg:61.15ms
+step:1951/2245 train_time:119304ms step_avg:61.15ms
+step:1952/2245 train_time:119364ms step_avg:61.15ms
+step:1953/2245 train_time:119427ms step_avg:61.15ms
+step:1954/2245 train_time:119486ms step_avg:61.15ms
+step:1955/2245 train_time:119549ms step_avg:61.15ms
+step:1956/2245 train_time:119609ms step_avg:61.15ms
+step:1957/2245 train_time:119672ms step_avg:61.15ms
+step:1958/2245 train_time:119732ms step_avg:61.15ms
+step:1959/2245 train_time:119795ms step_avg:61.15ms
+step:1960/2245 train_time:119855ms step_avg:61.15ms
+step:1961/2245 train_time:119918ms step_avg:61.15ms
+step:1962/2245 train_time:119978ms step_avg:61.15ms
+step:1963/2245 train_time:120041ms step_avg:61.15ms
+step:1964/2245 train_time:120101ms step_avg:61.15ms
+step:1965/2245 train_time:120164ms step_avg:61.15ms
+step:1966/2245 train_time:120225ms step_avg:61.15ms
+step:1967/2245 train_time:120288ms step_avg:61.15ms
+step:1968/2245 train_time:120347ms step_avg:61.15ms
+step:1969/2245 train_time:120411ms step_avg:61.15ms
+step:1970/2245 train_time:120471ms step_avg:61.15ms
+step:1971/2245 train_time:120533ms step_avg:61.15ms
+step:1972/2245 train_time:120593ms step_avg:61.15ms
+step:1973/2245 train_time:120655ms step_avg:61.15ms
+step:1974/2245 train_time:120716ms step_avg:61.15ms
+step:1975/2245 train_time:120779ms step_avg:61.15ms
+step:1976/2245 train_time:120840ms step_avg:61.15ms
+step:1977/2245 train_time:120903ms step_avg:61.15ms
+step:1978/2245 train_time:120964ms step_avg:61.15ms
+step:1979/2245 train_time:121028ms step_avg:61.16ms
+step:1980/2245 train_time:121088ms step_avg:61.16ms
+step:1981/2245 train_time:121150ms step_avg:61.16ms
+step:1982/2245 train_time:121211ms step_avg:61.16ms
+step:1983/2245 train_time:121274ms step_avg:61.16ms
+step:1984/2245 train_time:121334ms step_avg:61.16ms
+step:1985/2245 train_time:121397ms step_avg:61.16ms
+step:1986/2245 train_time:121458ms step_avg:61.16ms
+step:1987/2245 train_time:121521ms step_avg:61.16ms
+step:1988/2245 train_time:121582ms step_avg:61.16ms
+step:1989/2245 train_time:121645ms step_avg:61.16ms
+step:1990/2245 train_time:121704ms step_avg:61.16ms
+step:1991/2245 train_time:121767ms step_avg:61.16ms
+step:1992/2245 train_time:121827ms step_avg:61.16ms
+step:1993/2245 train_time:121890ms step_avg:61.16ms
+step:1994/2245 train_time:121951ms step_avg:61.16ms
+step:1995/2245 train_time:122013ms step_avg:61.16ms
+step:1996/2245 train_time:122074ms step_avg:61.16ms
+step:1997/2245 train_time:122137ms step_avg:61.16ms
+step:1998/2245 train_time:122198ms step_avg:61.16ms
+step:1999/2245 train_time:122260ms step_avg:61.16ms
+step:2000/2245 train_time:122321ms step_avg:61.16ms
+step:2000/2245 val_loss:3.3249 train_time:122385ms step_avg:61.19ms
+step:2001/2245 train_time:122403ms step_avg:61.17ms
+step:2002/2245 train_time:122447ms step_avg:61.16ms
+step:2003/2245 train_time:122514ms step_avg:61.17ms
+step:2004/2245 train_time:122576ms step_avg:61.17ms
+step:2005/2245 train_time:122639ms step_avg:61.17ms
+step:2006/2245 train_time:122699ms step_avg:61.17ms
+step:2007/2245 train_time:122762ms step_avg:61.17ms
+step:2008/2245 train_time:122821ms step_avg:61.17ms
+step:2009/2245 train_time:122884ms step_avg:61.17ms
+step:2010/2245 train_time:122943ms step_avg:61.17ms
+step:2011/2245 train_time:123006ms step_avg:61.17ms
+step:2012/2245 train_time:123066ms step_avg:61.17ms
+step:2013/2245 train_time:123128ms step_avg:61.17ms
+step:2014/2245 train_time:123189ms step_avg:61.17ms
+step:2015/2245 train_time:123251ms step_avg:61.17ms
+step:2016/2245 train_time:123311ms step_avg:61.17ms
+step:2017/2245 train_time:123375ms step_avg:61.17ms
+step:2018/2245 train_time:123437ms step_avg:61.17ms
+step:2019/2245 train_time:123501ms step_avg:61.17ms
+step:2020/2245 train_time:123562ms step_avg:61.17ms
+step:2021/2245 train_time:123626ms step_avg:61.17ms
+step:2022/2245 train_time:123687ms step_avg:61.17ms
+step:2023/2245 train_time:123750ms step_avg:61.17ms
+step:2024/2245 train_time:123811ms step_avg:61.17ms
+step:2025/2245 train_time:123873ms step_avg:61.17ms
+step:2026/2245 train_time:123933ms step_avg:61.17ms
+step:2027/2245 train_time:123996ms step_avg:61.17ms
+step:2028/2245 train_time:124055ms step_avg:61.17ms
+step:2029/2245 train_time:124117ms step_avg:61.17ms
+step:2030/2245 train_time:124176ms step_avg:61.17ms
+step:2031/2245 train_time:124239ms step_avg:61.17ms
+step:2032/2245 train_time:124299ms step_avg:61.17ms
+step:2033/2245 train_time:124362ms step_avg:61.17ms
+step:2034/2245 train_time:124424ms step_avg:61.17ms
+step:2035/2245 train_time:124487ms step_avg:61.17ms
+step:2036/2245 train_time:124549ms step_avg:61.17ms
+step:2037/2245 train_time:124612ms step_avg:61.17ms
+step:2038/2245 train_time:124673ms step_avg:61.17ms
+step:2039/2245 train_time:124735ms step_avg:61.17ms
+step:2040/2245 train_time:124796ms step_avg:61.17ms
+step:2041/2245 train_time:124858ms step_avg:61.17ms
+step:2042/2245 train_time:124918ms step_avg:61.17ms
+step:2043/2245 train_time:124981ms step_avg:61.18ms
+step:2044/2245 train_time:125041ms step_avg:61.17ms
+step:2045/2245 train_time:125103ms step_avg:61.18ms
+step:2046/2245 train_time:125163ms step_avg:61.17ms
+step:2047/2245 train_time:125227ms step_avg:61.18ms
+step:2048/2245 train_time:125287ms step_avg:61.18ms
+step:2049/2245 train_time:125351ms step_avg:61.18ms
+step:2050/2245 train_time:125412ms step_avg:61.18ms
+step:2051/2245 train_time:125476ms step_avg:61.18ms
+step:2052/2245 train_time:125536ms step_avg:61.18ms
+step:2053/2245 train_time:125599ms step_avg:61.18ms
+step:2054/2245 train_time:125659ms step_avg:61.18ms
+step:2055/2245 train_time:125723ms step_avg:61.18ms
+step:2056/2245 train_time:125783ms step_avg:61.18ms
+step:2057/2245 train_time:125846ms step_avg:61.18ms
+step:2058/2245 train_time:125906ms step_avg:61.18ms
+step:2059/2245 train_time:125970ms step_avg:61.18ms
+step:2060/2245 train_time:126030ms step_avg:61.18ms
+step:2061/2245 train_time:126093ms step_avg:61.18ms
+step:2062/2245 train_time:126153ms step_avg:61.18ms
+step:2063/2245 train_time:126216ms step_avg:61.18ms
+step:2064/2245 train_time:126276ms step_avg:61.18ms
+step:2065/2245 train_time:126338ms step_avg:61.18ms
+step:2066/2245 train_time:126399ms step_avg:61.18ms
+step:2067/2245 train_time:126463ms step_avg:61.18ms
+step:2068/2245 train_time:126523ms step_avg:61.18ms
+step:2069/2245 train_time:126586ms step_avg:61.18ms
+step:2070/2245 train_time:126646ms step_avg:61.18ms
+step:2071/2245 train_time:126710ms step_avg:61.18ms
+step:2072/2245 train_time:126770ms step_avg:61.18ms
+step:2073/2245 train_time:126833ms step_avg:61.18ms
+step:2074/2245 train_time:126894ms step_avg:61.18ms
+step:2075/2245 train_time:126957ms step_avg:61.18ms
+step:2076/2245 train_time:127018ms step_avg:61.18ms
+step:2077/2245 train_time:127080ms step_avg:61.18ms
+step:2078/2245 train_time:127140ms step_avg:61.18ms
+step:2079/2245 train_time:127204ms step_avg:61.19ms
+step:2080/2245 train_time:127264ms step_avg:61.18ms
+step:2081/2245 train_time:127327ms step_avg:61.19ms
+step:2082/2245 train_time:127388ms step_avg:61.19ms
+step:2083/2245 train_time:127451ms step_avg:61.19ms
+step:2084/2245 train_time:127512ms step_avg:61.19ms
+step:2085/2245 train_time:127574ms step_avg:61.19ms
+step:2086/2245 train_time:127634ms step_avg:61.19ms
+step:2087/2245 train_time:127697ms step_avg:61.19ms
+step:2088/2245 train_time:127757ms step_avg:61.19ms
+step:2089/2245 train_time:127820ms step_avg:61.19ms
+step:2090/2245 train_time:127880ms step_avg:61.19ms
+step:2091/2245 train_time:127943ms step_avg:61.19ms
+step:2092/2245 train_time:128004ms step_avg:61.19ms
+step:2093/2245 train_time:128068ms step_avg:61.19ms
+step:2094/2245 train_time:128129ms step_avg:61.19ms
+step:2095/2245 train_time:128192ms step_avg:61.19ms
+step:2096/2245 train_time:128253ms step_avg:61.19ms
+step:2097/2245 train_time:128315ms step_avg:61.19ms
+step:2098/2245 train_time:128376ms step_avg:61.19ms
+step:2099/2245 train_time:128438ms step_avg:61.19ms
+step:2100/2245 train_time:128499ms step_avg:61.19ms
+step:2101/2245 train_time:128561ms step_avg:61.19ms
+step:2102/2245 train_time:128621ms step_avg:61.19ms
+step:2103/2245 train_time:128684ms step_avg:61.19ms
+step:2104/2245 train_time:128745ms step_avg:61.19ms
+step:2105/2245 train_time:128809ms step_avg:61.19ms
+step:2106/2245 train_time:128869ms step_avg:61.19ms
+step:2107/2245 train_time:128933ms step_avg:61.19ms
+step:2108/2245 train_time:128994ms step_avg:61.19ms
+step:2109/2245 train_time:129057ms step_avg:61.19ms
+step:2110/2245 train_time:129117ms step_avg:61.19ms
+step:2111/2245 train_time:129179ms step_avg:61.19ms
+step:2112/2245 train_time:129240ms step_avg:61.19ms
+step:2113/2245 train_time:129302ms step_avg:61.19ms
+step:2114/2245 train_time:129363ms step_avg:61.19ms
+step:2115/2245 train_time:129426ms step_avg:61.19ms
+step:2116/2245 train_time:129486ms step_avg:61.19ms
+step:2117/2245 train_time:129548ms step_avg:61.19ms
+step:2118/2245 train_time:129609ms step_avg:61.19ms
+step:2119/2245 train_time:129672ms step_avg:61.19ms
+step:2120/2245 train_time:129732ms step_avg:61.19ms
+step:2121/2245 train_time:129795ms step_avg:61.20ms
+step:2122/2245 train_time:129855ms step_avg:61.19ms
+step:2123/2245 train_time:129918ms step_avg:61.20ms
+step:2124/2245 train_time:129979ms step_avg:61.20ms
+step:2125/2245 train_time:130041ms step_avg:61.20ms
+step:2126/2245 train_time:130102ms step_avg:61.20ms
+step:2127/2245 train_time:130164ms step_avg:61.20ms
+step:2128/2245 train_time:130224ms step_avg:61.20ms
+step:2129/2245 train_time:130287ms step_avg:61.20ms
+step:2130/2245 train_time:130347ms step_avg:61.20ms
+step:2131/2245 train_time:130411ms step_avg:61.20ms
+step:2132/2245 train_time:130472ms step_avg:61.20ms
+step:2133/2245 train_time:130536ms step_avg:61.20ms
+step:2134/2245 train_time:130596ms step_avg:61.20ms
+step:2135/2245 train_time:130658ms step_avg:61.20ms
+step:2136/2245 train_time:130718ms step_avg:61.20ms
+step:2137/2245 train_time:130781ms step_avg:61.20ms
+step:2138/2245 train_time:130841ms step_avg:61.20ms
+step:2139/2245 train_time:130904ms step_avg:61.20ms
+step:2140/2245 train_time:130964ms step_avg:61.20ms
+step:2141/2245 train_time:131027ms step_avg:61.20ms
+step:2142/2245 train_time:131088ms step_avg:61.20ms
+step:2143/2245 train_time:131151ms step_avg:61.20ms
+step:2144/2245 train_time:131212ms step_avg:61.20ms
+step:2145/2245 train_time:131276ms step_avg:61.20ms
+step:2146/2245 train_time:131335ms step_avg:61.20ms
+step:2147/2245 train_time:131398ms step_avg:61.20ms
+step:2148/2245 train_time:131458ms step_avg:61.20ms
+step:2149/2245 train_time:131521ms step_avg:61.20ms
+step:2150/2245 train_time:131581ms step_avg:61.20ms
+step:2151/2245 train_time:131644ms step_avg:61.20ms
+step:2152/2245 train_time:131704ms step_avg:61.20ms
+step:2153/2245 train_time:131768ms step_avg:61.20ms
+step:2154/2245 train_time:131829ms step_avg:61.20ms
+step:2155/2245 train_time:131891ms step_avg:61.20ms
+step:2156/2245 train_time:131951ms step_avg:61.20ms
+step:2157/2245 train_time:132014ms step_avg:61.20ms
+step:2158/2245 train_time:132075ms step_avg:61.20ms
+step:2159/2245 train_time:132138ms step_avg:61.20ms
+step:2160/2245 train_time:132198ms step_avg:61.20ms
+step:2161/2245 train_time:132260ms step_avg:61.20ms
+step:2162/2245 train_time:132321ms step_avg:61.20ms
+step:2163/2245 train_time:132383ms step_avg:61.20ms
+step:2164/2245 train_time:132444ms step_avg:61.20ms
+step:2165/2245 train_time:132507ms step_avg:61.20ms
+step:2166/2245 train_time:132568ms step_avg:61.20ms
+step:2167/2245 train_time:132630ms step_avg:61.20ms
+step:2168/2245 train_time:132691ms step_avg:61.20ms
+step:2169/2245 train_time:132754ms step_avg:61.21ms
+step:2170/2245 train_time:132814ms step_avg:61.20ms
+step:2171/2245 train_time:132878ms step_avg:61.21ms
+step:2172/2245 train_time:132938ms step_avg:61.21ms
+step:2173/2245 train_time:133000ms step_avg:61.21ms
+step:2174/2245 train_time:133060ms step_avg:61.21ms
+step:2175/2245 train_time:133123ms step_avg:61.21ms
+step:2176/2245 train_time:133183ms step_avg:61.21ms
+step:2177/2245 train_time:133246ms step_avg:61.21ms
+step:2178/2245 train_time:133307ms step_avg:61.21ms
+step:2179/2245 train_time:133370ms step_avg:61.21ms
+step:2180/2245 train_time:133430ms step_avg:61.21ms
+step:2181/2245 train_time:133494ms step_avg:61.21ms
+step:2182/2245 train_time:133554ms step_avg:61.21ms
+step:2183/2245 train_time:133616ms step_avg:61.21ms
+step:2184/2245 train_time:133676ms step_avg:61.21ms
+step:2185/2245 train_time:133740ms step_avg:61.21ms
+step:2186/2245 train_time:133801ms step_avg:61.21ms
+step:2187/2245 train_time:133863ms step_avg:61.21ms
+step:2188/2245 train_time:133923ms step_avg:61.21ms
+step:2189/2245 train_time:133986ms step_avg:61.21ms
+step:2190/2245 train_time:134047ms step_avg:61.21ms
+step:2191/2245 train_time:134110ms step_avg:61.21ms
+step:2192/2245 train_time:134170ms step_avg:61.21ms
+step:2193/2245 train_time:134234ms step_avg:61.21ms
+step:2194/2245 train_time:134295ms step_avg:61.21ms
+step:2195/2245 train_time:134357ms step_avg:61.21ms
+step:2196/2245 train_time:134417ms step_avg:61.21ms
+step:2197/2245 train_time:134480ms step_avg:61.21ms
+step:2198/2245 train_time:134541ms step_avg:61.21ms
+step:2199/2245 train_time:134603ms step_avg:61.21ms
+step:2200/2245 train_time:134664ms step_avg:61.21ms
+step:2201/2245 train_time:134727ms step_avg:61.21ms
+step:2202/2245 train_time:134787ms step_avg:61.21ms
+step:2203/2245 train_time:134850ms step_avg:61.21ms
+step:2204/2245 train_time:134911ms step_avg:61.21ms
+step:2205/2245 train_time:134974ms step_avg:61.21ms
+step:2206/2245 train_time:135035ms step_avg:61.21ms
+step:2207/2245 train_time:135097ms step_avg:61.21ms
+step:2208/2245 train_time:135158ms step_avg:61.21ms
+step:2209/2245 train_time:135221ms step_avg:61.21ms
+step:2210/2245 train_time:135281ms step_avg:61.21ms
+step:2211/2245 train_time:135344ms step_avg:61.21ms
+step:2212/2245 train_time:135405ms step_avg:61.21ms
+step:2213/2245 train_time:135468ms step_avg:61.21ms
+step:2214/2245 train_time:135529ms step_avg:61.21ms
+step:2215/2245 train_time:135592ms step_avg:61.22ms
+step:2216/2245 train_time:135652ms step_avg:61.21ms
+step:2217/2245 train_time:135716ms step_avg:61.22ms
+step:2218/2245 train_time:135776ms step_avg:61.22ms
+step:2219/2245 train_time:135838ms step_avg:61.22ms
+step:2220/2245 train_time:135899ms step_avg:61.22ms
+step:2221/2245 train_time:135961ms step_avg:61.22ms
+step:2222/2245 train_time:136022ms step_avg:61.22ms
+step:2223/2245 train_time:136085ms step_avg:61.22ms
+step:2224/2245 train_time:136146ms step_avg:61.22ms
+step:2225/2245 train_time:136209ms step_avg:61.22ms
+step:2226/2245 train_time:136269ms step_avg:61.22ms
+step:2227/2245 train_time:136333ms step_avg:61.22ms
+step:2228/2245 train_time:136394ms step_avg:61.22ms
+step:2229/2245 train_time:136456ms step_avg:61.22ms
+step:2230/2245 train_time:136516ms step_avg:61.22ms
+step:2231/2245 train_time:136580ms step_avg:61.22ms
+step:2232/2245 train_time:136640ms step_avg:61.22ms
+step:2233/2245 train_time:136704ms step_avg:61.22ms
+step:2234/2245 train_time:136764ms step_avg:61.22ms
+step:2235/2245 train_time:136827ms step_avg:61.22ms
+step:2236/2245 train_time:136888ms step_avg:61.22ms
+step:2237/2245 train_time:136951ms step_avg:61.22ms
+step:2238/2245 train_time:137013ms step_avg:61.22ms
+step:2239/2245 train_time:137075ms step_avg:61.22ms
+step:2240/2245 train_time:137136ms step_avg:61.22ms
+step:2241/2245 train_time:137199ms step_avg:61.22ms
+step:2242/2245 train_time:137259ms step_avg:61.22ms
+step:2243/2245 train_time:137322ms step_avg:61.22ms
+step:2244/2245 train_time:137383ms step_avg:61.22ms
+step:2245/2245 train_time:137446ms step_avg:61.22ms
+step:2245/2245 val_loss:3.2790 train_time:137507ms step_avg:61.25ms
+peak memory allocated: 29249 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-11-10_CautiousWD/7f53cbe9-4553-44fd-97e6-7e479337fdab.txt b/records/track_1_short/2025-11-10_CautiousWD/7f53cbe9-4553-44fd-97e6-7e479337fdab.txt
new file mode 100644
index 000000000..602a6adf6
--- /dev/null
+++ b/records/track_1_short/2025-11-10_CautiousWD/7f53cbe9-4553-44fd-97e6-7e479337fdab.txt
@@ -0,0 +1,3772 @@
+import os
+import sys
+
+with open(sys.argv[0]) as f:
+    code = f.read()  # read the code of this file ASAP, for logging
+import copy
+import glob
+import math
+import threading
+import time
+import uuid
+from dataclasses import dataclass
+from collections import defaultdict
+from itertools import accumulate
+from pathlib import Path
+
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
+import torch
+
+torch.empty(
+    1, device="cuda", requires_grad=True
+).backward()  # prevents a bug on some systems
+import torch._dynamo as dynamo
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take
30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of 
size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), 
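+        # strides of the caller-allocated output buffer that receives C = A @ A.T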
+ c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if 
out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    ba_plus_cAA_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+        alpha=alpha,
+        beta=beta,
+    )
+    return out
+
+# Computed for num_iters=5, safety_factor=2e-2, cushion=2
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
+]
+
+@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower
+def polar_express(G: torch.Tensor):
+    """
+    Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
+    by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
+    """
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
+
+    # Allocate buffers
+    X = X.contiguous()
+    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
+    B = torch.empty_like(A)
+    C = torch.empty_like(X)
+
+    aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm
+
+    # Perform the iterations
+    for a, b, c in polar_express_coeffs:
+        XXT(X, out=A)                           # A = X @ X.mT
+        ba_plus_cAA(A, alpha=c, beta=b, out=B)  # B = b * A + c * A @ A
+        aX_plus_BX(X, B, X, beta=a, out=C)      # C = a * X + B @ X
+        X, C = C, X                             # Swap references to avoid unnecessary copies
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+    return X
+
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class NorMuon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor.
+    - small 1D parameters handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+        1. reduce scatter smear_gate (1 param, 7 padding params)
+        2. reduce scatter attn_gate (10 params, 6 padding params)
+        3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+        4. reduce scatter attn/mlp round 2 (16 mlp params)
+        5. wait on step 1, then compute update of 1 and schedule all gather
+        6. wait on step 2, then compute update of 2 and schedule all gather
+        7. wait on step 3, then compute update of 3 and schedule all gather
+           GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+           GPUs that receive params of type attn reshape before computing update
+        8. wait on step 4, then compute update of 4 and schedule all gather
+        9. wait for each all gather to complete and update params
+    Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and dist.get_world_size()==8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
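+        # The step proceeds in three overlapping passes:
+        #   1. launch an async reduce_scatter per param group, averaging gradients
+        #      across ranks and leaving each rank with one shard of the group
+        #   2. as each shard arrives, compute its update locally (momentum ->
+        #      polar_express -> NorMuon normalization -> cautious weight decay)
+        #      and launch an async all_gather of the updated parameters
+        #   3. wait on the gathers and copy results back into the full parameters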
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
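+            # "zeropower" = the polar factor: for G = U S V^T, polar_express returns
+            # approximately U V^T, the nearest semi-orthogonal matrix to G
+            # (e.g. a diagonal G = diag(3.0, 0.5) maps to ~diag(1.0, 1.0)).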
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
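+        # e.g. next_multiple_of_n(50257, n=128) == 50304, which is the vocab_size used for lm_head below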
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? 
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +# learning rate schedule: flat, then linear decay, then flat +def get_lr(step: int): + x = min(0.9999, step / args.num_scheduled_iterations) + assert 0 <= x < 1 + lr = 1.0 + if x >= 1 - args.cooldown_frac: + w = (1 - x) / args.cooldown_frac + lr = w * 1.0 + (1 - w) * 0.1 + return lr + +def get_ws(step: int): + # set short window size to half of long window size + # Higher ws on "extension" steps + if step >= args.num_scheduled_iterations: + return args.ws_final // 2, args.ws_final + x = step / args.num_scheduled_iterations + assert 0 <= x < 1 + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +ws_schedule = list(args.ws_schedule) + [args.ws_final] +ws_long = ws_schedule[0] +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = ws_schedule[0] + else: + new_ws_long = ws_schedule[ws_idx] + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt 
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph, need to warm up each with Yarn.attn_scale
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # the rotary buffer is not stored in the state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # Muon momentum buffers are not in the state_dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#        Training and validation       #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext  # validate the final model at the post-YaRN-extension window size
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 21:53:41 2025
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 35C P0 126W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 31C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 34C P0 119W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 34C P0 125W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 34C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 117W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:118ms step_avg:117.70ms +step:2/2245 train_time:139ms step_avg:69.31ms +step:3/2245 train_time:178ms step_avg:59.30ms +step:4/2245 train_time:234ms step_avg:58.45ms +step:5/2245 train_time:294ms step_avg:58.71ms +step:6/2245 train_time:352ms step_avg:58.66ms +step:7/2245 train_time:413ms step_avg:58.98ms +step:8/2245 train_time:471ms step_avg:58.92ms +step:9/2245 train_time:532ms step_avg:59.17ms +step:10/2245 train_time:591ms step_avg:59.10ms +step:11/2245 train_time:652ms step_avg:59.27ms +step:12/2245 train_time:710ms step_avg:59.20ms +step:13/2245 train_time:772ms step_avg:59.38ms +step:14/2245 train_time:831ms step_avg:59.35ms 
+step:15/2245 train_time:894ms step_avg:59.58ms +step:16/2245 train_time:951ms step_avg:59.43ms +step:17/2245 train_time:1014ms step_avg:59.65ms +step:18/2245 train_time:1077ms step_avg:59.84ms +step:19/2245 train_time:1141ms step_avg:60.07ms +step:20/2245 train_time:1201ms step_avg:60.07ms +step:21/2245 train_time:1264ms step_avg:60.19ms +step:22/2245 train_time:1324ms step_avg:60.17ms +step:23/2245 train_time:1385ms step_avg:60.23ms +step:24/2245 train_time:1445ms step_avg:60.19ms +step:25/2245 train_time:1507ms step_avg:60.29ms +step:26/2245 train_time:1567ms step_avg:60.27ms +step:27/2245 train_time:1630ms step_avg:60.36ms +step:28/2245 train_time:1689ms step_avg:60.33ms +step:29/2245 train_time:1750ms step_avg:60.36ms +step:30/2245 train_time:1809ms step_avg:60.31ms +step:31/2245 train_time:1870ms step_avg:60.34ms +step:32/2245 train_time:1929ms step_avg:60.29ms +step:33/2245 train_time:1993ms step_avg:60.40ms +step:34/2245 train_time:2053ms step_avg:60.39ms +step:35/2245 train_time:2116ms step_avg:60.44ms +step:36/2245 train_time:2175ms step_avg:60.43ms +step:37/2245 train_time:2237ms step_avg:60.46ms +step:38/2245 train_time:2296ms step_avg:60.43ms +step:39/2245 train_time:2358ms step_avg:60.47ms +step:40/2245 train_time:2418ms step_avg:60.45ms +step:41/2245 train_time:2480ms step_avg:60.49ms +step:42/2245 train_time:2540ms step_avg:60.47ms +step:43/2245 train_time:2602ms step_avg:60.51ms +step:44/2245 train_time:2661ms step_avg:60.47ms +step:45/2245 train_time:2722ms step_avg:60.50ms +step:46/2245 train_time:2781ms step_avg:60.47ms +step:47/2245 train_time:2843ms step_avg:60.49ms +step:48/2245 train_time:2902ms step_avg:60.46ms +step:49/2245 train_time:2964ms step_avg:60.49ms +step:50/2245 train_time:3024ms step_avg:60.48ms +step:51/2245 train_time:3087ms step_avg:60.53ms +step:52/2245 train_time:3147ms step_avg:60.52ms +step:53/2245 train_time:3210ms step_avg:60.56ms +step:54/2245 train_time:3270ms step_avg:60.56ms +step:55/2245 train_time:3332ms step_avg:60.58ms +step:56/2245 train_time:3391ms step_avg:60.56ms +step:57/2245 train_time:3453ms step_avg:60.58ms +step:58/2245 train_time:3513ms step_avg:60.57ms +step:59/2245 train_time:3574ms step_avg:60.58ms +step:60/2245 train_time:3633ms step_avg:60.55ms +step:61/2245 train_time:3695ms step_avg:60.57ms +step:62/2245 train_time:3754ms step_avg:60.55ms +step:63/2245 train_time:3816ms step_avg:60.56ms +step:64/2245 train_time:3875ms step_avg:60.55ms +step:65/2245 train_time:3937ms step_avg:60.57ms +step:66/2245 train_time:3997ms step_avg:60.56ms +step:67/2245 train_time:4060ms step_avg:60.60ms +step:68/2245 train_time:4119ms step_avg:60.58ms +step:69/2245 train_time:4181ms step_avg:60.60ms +step:70/2245 train_time:4240ms step_avg:60.58ms +step:71/2245 train_time:4302ms step_avg:60.60ms +step:72/2245 train_time:4362ms step_avg:60.58ms +step:73/2245 train_time:4424ms step_avg:60.61ms +step:74/2245 train_time:4484ms step_avg:60.59ms +step:75/2245 train_time:4547ms step_avg:60.63ms +step:76/2245 train_time:4607ms step_avg:60.61ms +step:77/2245 train_time:4668ms step_avg:60.63ms +step:78/2245 train_time:4728ms step_avg:60.62ms +step:79/2245 train_time:4790ms step_avg:60.63ms +step:80/2245 train_time:4850ms step_avg:60.62ms +step:81/2245 train_time:4911ms step_avg:60.63ms +step:82/2245 train_time:4970ms step_avg:60.61ms +step:83/2245 train_time:5032ms step_avg:60.63ms +step:84/2245 train_time:5091ms step_avg:60.60ms +step:85/2245 train_time:5153ms step_avg:60.62ms +step:86/2245 train_time:5211ms step_avg:60.60ms +step:87/2245 
train_time:5273ms step_avg:60.61ms +step:88/2245 train_time:5332ms step_avg:60.59ms +step:89/2245 train_time:5394ms step_avg:60.60ms +step:90/2245 train_time:5453ms step_avg:60.59ms +step:91/2245 train_time:5515ms step_avg:60.61ms +step:92/2245 train_time:5575ms step_avg:60.59ms +step:93/2245 train_time:5637ms step_avg:60.61ms +step:94/2245 train_time:5696ms step_avg:60.59ms +step:95/2245 train_time:5757ms step_avg:60.60ms +step:96/2245 train_time:5816ms step_avg:60.59ms +step:97/2245 train_time:5878ms step_avg:60.60ms +step:98/2245 train_time:5937ms step_avg:60.58ms +step:99/2245 train_time:5998ms step_avg:60.59ms +step:100/2245 train_time:6057ms step_avg:60.57ms +step:101/2245 train_time:6118ms step_avg:60.58ms +step:102/2245 train_time:6178ms step_avg:60.57ms +step:103/2245 train_time:6240ms step_avg:60.58ms +step:104/2245 train_time:6299ms step_avg:60.57ms +step:105/2245 train_time:6360ms step_avg:60.58ms +step:106/2245 train_time:6419ms step_avg:60.56ms +step:107/2245 train_time:6481ms step_avg:60.57ms +step:108/2245 train_time:6540ms step_avg:60.56ms +step:109/2245 train_time:6602ms step_avg:60.57ms +step:110/2245 train_time:6662ms step_avg:60.57ms +step:111/2245 train_time:6724ms step_avg:60.58ms +step:112/2245 train_time:6783ms step_avg:60.56ms +step:113/2245 train_time:6845ms step_avg:60.58ms +step:114/2245 train_time:6905ms step_avg:60.57ms +step:115/2245 train_time:6967ms step_avg:60.58ms +step:116/2245 train_time:7027ms step_avg:60.58ms +step:117/2245 train_time:7089ms step_avg:60.59ms +step:118/2245 train_time:7149ms step_avg:60.58ms +step:119/2245 train_time:7211ms step_avg:60.60ms +step:120/2245 train_time:7270ms step_avg:60.58ms +step:121/2245 train_time:7332ms step_avg:60.59ms +step:122/2245 train_time:7390ms step_avg:60.58ms +step:123/2245 train_time:7452ms step_avg:60.59ms +step:124/2245 train_time:7511ms step_avg:60.57ms +step:125/2245 train_time:7572ms step_avg:60.58ms +step:126/2245 train_time:7632ms step_avg:60.57ms +step:127/2245 train_time:7693ms step_avg:60.57ms +step:128/2245 train_time:7752ms step_avg:60.56ms +step:129/2245 train_time:7813ms step_avg:60.56ms +step:130/2245 train_time:7872ms step_avg:60.55ms +step:131/2245 train_time:7933ms step_avg:60.56ms +step:132/2245 train_time:7992ms step_avg:60.55ms +step:133/2245 train_time:8055ms step_avg:60.56ms +step:134/2245 train_time:8114ms step_avg:60.55ms +step:135/2245 train_time:8176ms step_avg:60.56ms +step:136/2245 train_time:8235ms step_avg:60.55ms +step:137/2245 train_time:8298ms step_avg:60.57ms +step:138/2245 train_time:8357ms step_avg:60.56ms +step:139/2245 train_time:8418ms step_avg:60.56ms +step:140/2245 train_time:8477ms step_avg:60.55ms +step:141/2245 train_time:8539ms step_avg:60.56ms +step:142/2245 train_time:8598ms step_avg:60.55ms +step:143/2245 train_time:8659ms step_avg:60.55ms +step:144/2245 train_time:8718ms step_avg:60.54ms +step:145/2245 train_time:8779ms step_avg:60.55ms +step:146/2245 train_time:8838ms step_avg:60.53ms +step:147/2245 train_time:8899ms step_avg:60.54ms +step:148/2245 train_time:8958ms step_avg:60.53ms +step:149/2245 train_time:9020ms step_avg:60.53ms +step:150/2245 train_time:9079ms step_avg:60.53ms +step:151/2245 train_time:9141ms step_avg:60.54ms +step:152/2245 train_time:9200ms step_avg:60.52ms +step:153/2245 train_time:9261ms step_avg:60.53ms +step:154/2245 train_time:9320ms step_avg:60.52ms +step:155/2245 train_time:9383ms step_avg:60.53ms +step:156/2245 train_time:9442ms step_avg:60.53ms +step:157/2245 train_time:9504ms step_avg:60.53ms +step:158/2245 
train_time:9563ms step_avg:60.52ms +step:159/2245 train_time:9624ms step_avg:60.53ms +step:160/2245 train_time:9684ms step_avg:60.53ms +step:161/2245 train_time:9746ms step_avg:60.54ms +step:162/2245 train_time:9806ms step_avg:60.53ms +step:163/2245 train_time:9869ms step_avg:60.54ms +step:164/2245 train_time:9927ms step_avg:60.53ms +step:165/2245 train_time:9989ms step_avg:60.54ms +step:166/2245 train_time:10048ms step_avg:60.53ms +step:167/2245 train_time:10110ms step_avg:60.54ms +step:168/2245 train_time:10169ms step_avg:60.53ms +step:169/2245 train_time:10230ms step_avg:60.53ms +step:170/2245 train_time:10289ms step_avg:60.53ms +step:171/2245 train_time:10351ms step_avg:60.53ms +step:172/2245 train_time:10410ms step_avg:60.52ms +step:173/2245 train_time:10472ms step_avg:60.53ms +step:174/2245 train_time:10531ms step_avg:60.52ms +step:175/2245 train_time:10592ms step_avg:60.53ms +step:176/2245 train_time:10651ms step_avg:60.51ms +step:177/2245 train_time:10711ms step_avg:60.52ms +step:178/2245 train_time:10770ms step_avg:60.51ms +step:179/2245 train_time:10832ms step_avg:60.52ms +step:180/2245 train_time:10891ms step_avg:60.51ms +step:181/2245 train_time:10952ms step_avg:60.51ms +step:182/2245 train_time:11011ms step_avg:60.50ms +step:183/2245 train_time:11072ms step_avg:60.51ms +step:184/2245 train_time:11131ms step_avg:60.49ms +step:185/2245 train_time:11192ms step_avg:60.50ms +step:186/2245 train_time:11251ms step_avg:60.49ms +step:187/2245 train_time:11312ms step_avg:60.49ms +step:188/2245 train_time:11371ms step_avg:60.48ms +step:189/2245 train_time:11432ms step_avg:60.49ms +step:190/2245 train_time:11491ms step_avg:60.48ms +step:191/2245 train_time:11553ms step_avg:60.49ms +step:192/2245 train_time:11611ms step_avg:60.48ms +step:193/2245 train_time:11673ms step_avg:60.48ms +step:194/2245 train_time:11731ms step_avg:60.47ms +step:195/2245 train_time:11793ms step_avg:60.48ms +step:196/2245 train_time:11852ms step_avg:60.47ms +step:197/2245 train_time:11913ms step_avg:60.47ms +step:198/2245 train_time:11972ms step_avg:60.47ms +step:199/2245 train_time:12034ms step_avg:60.47ms +step:200/2245 train_time:12092ms step_avg:60.46ms +step:201/2245 train_time:12153ms step_avg:60.46ms +step:202/2245 train_time:12212ms step_avg:60.46ms +step:203/2245 train_time:12273ms step_avg:60.46ms +step:204/2245 train_time:12332ms step_avg:60.45ms +step:205/2245 train_time:12393ms step_avg:60.45ms +step:206/2245 train_time:12452ms step_avg:60.45ms +step:207/2245 train_time:12513ms step_avg:60.45ms +step:208/2245 train_time:12573ms step_avg:60.45ms +step:209/2245 train_time:12634ms step_avg:60.45ms +step:210/2245 train_time:12693ms step_avg:60.44ms +step:211/2245 train_time:12754ms step_avg:60.45ms +step:212/2245 train_time:12814ms step_avg:60.44ms +step:213/2245 train_time:12875ms step_avg:60.45ms +step:214/2245 train_time:12934ms step_avg:60.44ms +step:215/2245 train_time:12995ms step_avg:60.44ms +step:216/2245 train_time:13054ms step_avg:60.44ms +step:217/2245 train_time:13115ms step_avg:60.44ms +step:218/2245 train_time:13175ms step_avg:60.44ms +step:219/2245 train_time:13236ms step_avg:60.44ms +step:220/2245 train_time:13294ms step_avg:60.43ms +step:221/2245 train_time:13356ms step_avg:60.43ms +step:222/2245 train_time:13415ms step_avg:60.43ms +step:223/2245 train_time:13477ms step_avg:60.43ms +step:224/2245 train_time:13536ms step_avg:60.43ms +step:225/2245 train_time:13597ms step_avg:60.43ms +step:226/2245 train_time:13656ms step_avg:60.43ms +step:227/2245 train_time:13717ms step_avg:60.43ms 
+step:228/2245 train_time:13776ms step_avg:60.42ms +step:229/2245 train_time:13839ms step_avg:60.43ms +step:230/2245 train_time:13897ms step_avg:60.42ms +step:231/2245 train_time:13958ms step_avg:60.43ms +step:232/2245 train_time:14017ms step_avg:60.42ms +step:233/2245 train_time:14079ms step_avg:60.42ms +step:234/2245 train_time:14138ms step_avg:60.42ms +step:235/2245 train_time:14200ms step_avg:60.42ms +step:236/2245 train_time:14259ms step_avg:60.42ms +step:237/2245 train_time:14320ms step_avg:60.42ms +step:238/2245 train_time:14379ms step_avg:60.42ms +step:239/2245 train_time:14440ms step_avg:60.42ms +step:240/2245 train_time:14500ms step_avg:60.42ms +step:241/2245 train_time:14562ms step_avg:60.42ms +step:242/2245 train_time:14621ms step_avg:60.42ms +step:243/2245 train_time:14683ms step_avg:60.42ms +step:244/2245 train_time:14742ms step_avg:60.42ms +step:245/2245 train_time:14804ms step_avg:60.42ms +step:246/2245 train_time:14864ms step_avg:60.42ms +step:247/2245 train_time:14926ms step_avg:60.43ms +step:248/2245 train_time:14985ms step_avg:60.42ms +step:249/2245 train_time:15048ms step_avg:60.43ms +step:250/2245 train_time:15107ms step_avg:60.43ms +step:250/2245 val_loss:4.0823 train_time:15170ms step_avg:60.68ms +step:251/2245 train_time:15188ms step_avg:60.51ms +step:252/2245 train_time:15232ms step_avg:60.44ms +step:253/2245 train_time:15298ms step_avg:60.47ms +step:254/2245 train_time:15361ms step_avg:60.48ms +step:255/2245 train_time:15424ms step_avg:60.49ms +step:256/2245 train_time:15484ms step_avg:60.48ms +step:257/2245 train_time:15546ms step_avg:60.49ms +step:258/2245 train_time:15604ms step_avg:60.48ms +step:259/2245 train_time:15664ms step_avg:60.48ms +step:260/2245 train_time:15722ms step_avg:60.47ms +step:261/2245 train_time:15783ms step_avg:60.47ms +step:262/2245 train_time:15841ms step_avg:60.46ms +step:263/2245 train_time:15901ms step_avg:60.46ms +step:264/2245 train_time:15959ms step_avg:60.45ms +step:265/2245 train_time:16020ms step_avg:60.45ms +step:266/2245 train_time:16078ms step_avg:60.44ms +step:267/2245 train_time:16140ms step_avg:60.45ms +step:268/2245 train_time:16199ms step_avg:60.44ms +step:269/2245 train_time:16263ms step_avg:60.46ms +step:270/2245 train_time:16324ms step_avg:60.46ms +step:271/2245 train_time:16387ms step_avg:60.47ms +step:272/2245 train_time:16446ms step_avg:60.46ms +step:273/2245 train_time:16508ms step_avg:60.47ms +step:274/2245 train_time:16566ms step_avg:60.46ms +step:275/2245 train_time:16628ms step_avg:60.46ms +step:276/2245 train_time:16686ms step_avg:60.46ms +step:277/2245 train_time:16747ms step_avg:60.46ms +step:278/2245 train_time:16806ms step_avg:60.45ms +step:279/2245 train_time:16867ms step_avg:60.45ms +step:280/2245 train_time:16925ms step_avg:60.45ms +step:281/2245 train_time:16987ms step_avg:60.45ms +step:282/2245 train_time:17046ms step_avg:60.45ms +step:283/2245 train_time:17107ms step_avg:60.45ms +step:284/2245 train_time:17167ms step_avg:60.45ms +step:285/2245 train_time:17228ms step_avg:60.45ms +step:286/2245 train_time:17288ms step_avg:60.45ms +step:287/2245 train_time:17351ms step_avg:60.46ms +step:288/2245 train_time:17411ms step_avg:60.45ms +step:289/2245 train_time:17473ms step_avg:60.46ms +step:290/2245 train_time:17532ms step_avg:60.46ms +step:291/2245 train_time:17593ms step_avg:60.46ms +step:292/2245 train_time:17652ms step_avg:60.45ms +step:293/2245 train_time:17713ms step_avg:60.45ms +step:294/2245 train_time:17772ms step_avg:60.45ms +step:295/2245 train_time:17833ms step_avg:60.45ms +step:296/2245 
train_time:17892ms step_avg:60.45ms +step:297/2245 train_time:17954ms step_avg:60.45ms +step:298/2245 train_time:18013ms step_avg:60.45ms +step:299/2245 train_time:18074ms step_avg:60.45ms +step:300/2245 train_time:18133ms step_avg:60.44ms +step:301/2245 train_time:18194ms step_avg:60.45ms +step:302/2245 train_time:18253ms step_avg:60.44ms +step:303/2245 train_time:18315ms step_avg:60.45ms +step:304/2245 train_time:18374ms step_avg:60.44ms +step:305/2245 train_time:18436ms step_avg:60.45ms +step:306/2245 train_time:18494ms step_avg:60.44ms +step:307/2245 train_time:18555ms step_avg:60.44ms +step:308/2245 train_time:18614ms step_avg:60.43ms +step:309/2245 train_time:18675ms step_avg:60.44ms +step:310/2245 train_time:18734ms step_avg:60.43ms +step:311/2245 train_time:18795ms step_avg:60.43ms +step:312/2245 train_time:18853ms step_avg:60.43ms +step:313/2245 train_time:18915ms step_avg:60.43ms +step:314/2245 train_time:18973ms step_avg:60.42ms +step:315/2245 train_time:19034ms step_avg:60.43ms +step:316/2245 train_time:19093ms step_avg:60.42ms +step:317/2245 train_time:19155ms step_avg:60.43ms +step:318/2245 train_time:19214ms step_avg:60.42ms +step:319/2245 train_time:19275ms step_avg:60.42ms +step:320/2245 train_time:19334ms step_avg:60.42ms +step:321/2245 train_time:19396ms step_avg:60.42ms +step:322/2245 train_time:19455ms step_avg:60.42ms +step:323/2245 train_time:19516ms step_avg:60.42ms +step:324/2245 train_time:19575ms step_avg:60.42ms +step:325/2245 train_time:19636ms step_avg:60.42ms +step:326/2245 train_time:19694ms step_avg:60.41ms +step:327/2245 train_time:19756ms step_avg:60.41ms +step:328/2245 train_time:19814ms step_avg:60.41ms +step:329/2245 train_time:19876ms step_avg:60.41ms +step:330/2245 train_time:19934ms step_avg:60.41ms +step:331/2245 train_time:19996ms step_avg:60.41ms +step:332/2245 train_time:20054ms step_avg:60.40ms +step:333/2245 train_time:20115ms step_avg:60.41ms +step:334/2245 train_time:20174ms step_avg:60.40ms +step:335/2245 train_time:20236ms step_avg:60.41ms +step:336/2245 train_time:20294ms step_avg:60.40ms +step:337/2245 train_time:20356ms step_avg:60.40ms +step:338/2245 train_time:20415ms step_avg:60.40ms +step:339/2245 train_time:20476ms step_avg:60.40ms +step:340/2245 train_time:20535ms step_avg:60.40ms +step:341/2245 train_time:20596ms step_avg:60.40ms +step:342/2245 train_time:20655ms step_avg:60.39ms +step:343/2245 train_time:20716ms step_avg:60.40ms +step:344/2245 train_time:20774ms step_avg:60.39ms +step:345/2245 train_time:20835ms step_avg:60.39ms +step:346/2245 train_time:20894ms step_avg:60.39ms +step:347/2245 train_time:20955ms step_avg:60.39ms +step:348/2245 train_time:21013ms step_avg:60.38ms +step:349/2245 train_time:21075ms step_avg:60.39ms +step:350/2245 train_time:21134ms step_avg:60.38ms +step:351/2245 train_time:21195ms step_avg:60.39ms +step:352/2245 train_time:21254ms step_avg:60.38ms +step:353/2245 train_time:21315ms step_avg:60.38ms +step:354/2245 train_time:21374ms step_avg:60.38ms +step:355/2245 train_time:21435ms step_avg:60.38ms +step:356/2245 train_time:21494ms step_avg:60.38ms +step:357/2245 train_time:21556ms step_avg:60.38ms +step:358/2245 train_time:21615ms step_avg:60.38ms +step:359/2245 train_time:21676ms step_avg:60.38ms +step:360/2245 train_time:21735ms step_avg:60.37ms +step:361/2245 train_time:21796ms step_avg:60.38ms +step:362/2245 train_time:21854ms step_avg:60.37ms +step:363/2245 train_time:21916ms step_avg:60.37ms +step:364/2245 train_time:21974ms step_avg:60.37ms +step:365/2245 train_time:22036ms step_avg:60.37ms 
+step:366/2245 train_time:22094ms step_avg:60.37ms +step:367/2245 train_time:22156ms step_avg:60.37ms +step:368/2245 train_time:22214ms step_avg:60.36ms +step:369/2245 train_time:22275ms step_avg:60.37ms +step:370/2245 train_time:22334ms step_avg:60.36ms +step:371/2245 train_time:22395ms step_avg:60.36ms +step:372/2245 train_time:22454ms step_avg:60.36ms +step:373/2245 train_time:22515ms step_avg:60.36ms +step:374/2245 train_time:22574ms step_avg:60.36ms +step:375/2245 train_time:22636ms step_avg:60.36ms +step:376/2245 train_time:22695ms step_avg:60.36ms +step:377/2245 train_time:22756ms step_avg:60.36ms +step:378/2245 train_time:22815ms step_avg:60.36ms +step:379/2245 train_time:22876ms step_avg:60.36ms +step:380/2245 train_time:22934ms step_avg:60.35ms +step:381/2245 train_time:22995ms step_avg:60.36ms +step:382/2245 train_time:23054ms step_avg:60.35ms +step:383/2245 train_time:23115ms step_avg:60.35ms +step:384/2245 train_time:23174ms step_avg:60.35ms +step:385/2245 train_time:23236ms step_avg:60.35ms +step:386/2245 train_time:23295ms step_avg:60.35ms +step:387/2245 train_time:23356ms step_avg:60.35ms +step:388/2245 train_time:23415ms step_avg:60.35ms +step:389/2245 train_time:23476ms step_avg:60.35ms +step:390/2245 train_time:23534ms step_avg:60.34ms +step:391/2245 train_time:23596ms step_avg:60.35ms +step:392/2245 train_time:23654ms step_avg:60.34ms +step:393/2245 train_time:23715ms step_avg:60.34ms +step:394/2245 train_time:23774ms step_avg:60.34ms +step:395/2245 train_time:23835ms step_avg:60.34ms +step:396/2245 train_time:23895ms step_avg:60.34ms +step:397/2245 train_time:23955ms step_avg:60.34ms +step:398/2245 train_time:24014ms step_avg:60.34ms +step:399/2245 train_time:24075ms step_avg:60.34ms +step:400/2245 train_time:24134ms step_avg:60.33ms +step:401/2245 train_time:24195ms step_avg:60.34ms +step:402/2245 train_time:24253ms step_avg:60.33ms +step:403/2245 train_time:24315ms step_avg:60.33ms +step:404/2245 train_time:24373ms step_avg:60.33ms +step:405/2245 train_time:24435ms step_avg:60.33ms +step:406/2245 train_time:24494ms step_avg:60.33ms +step:407/2245 train_time:24555ms step_avg:60.33ms +step:408/2245 train_time:24614ms step_avg:60.33ms +step:409/2245 train_time:24676ms step_avg:60.33ms +step:410/2245 train_time:24734ms step_avg:60.33ms +step:411/2245 train_time:24796ms step_avg:60.33ms +step:412/2245 train_time:24855ms step_avg:60.33ms +step:413/2245 train_time:24916ms step_avg:60.33ms +step:414/2245 train_time:24976ms step_avg:60.33ms +step:415/2245 train_time:25036ms step_avg:60.33ms +step:416/2245 train_time:25095ms step_avg:60.33ms +step:417/2245 train_time:25157ms step_avg:60.33ms +step:418/2245 train_time:25215ms step_avg:60.32ms +step:419/2245 train_time:25276ms step_avg:60.33ms +step:420/2245 train_time:25335ms step_avg:60.32ms +step:421/2245 train_time:25396ms step_avg:60.32ms +step:422/2245 train_time:25455ms step_avg:60.32ms +step:423/2245 train_time:25516ms step_avg:60.32ms +step:424/2245 train_time:25575ms step_avg:60.32ms +step:425/2245 train_time:25637ms step_avg:60.32ms +step:426/2245 train_time:25695ms step_avg:60.32ms +step:427/2245 train_time:25756ms step_avg:60.32ms +step:428/2245 train_time:25815ms step_avg:60.31ms +step:429/2245 train_time:25876ms step_avg:60.32ms +step:430/2245 train_time:25935ms step_avg:60.31ms +step:431/2245 train_time:25997ms step_avg:60.32ms +step:432/2245 train_time:26055ms step_avg:60.31ms +step:433/2245 train_time:26116ms step_avg:60.31ms +step:434/2245 train_time:26175ms step_avg:60.31ms +step:435/2245 train_time:26237ms 
step_avg:60.31ms +step:436/2245 train_time:26295ms step_avg:60.31ms +step:437/2245 train_time:26356ms step_avg:60.31ms +step:438/2245 train_time:26415ms step_avg:60.31ms +step:439/2245 train_time:26477ms step_avg:60.31ms +step:440/2245 train_time:26535ms step_avg:60.31ms +step:441/2245 train_time:26596ms step_avg:60.31ms +step:442/2245 train_time:26655ms step_avg:60.31ms +step:443/2245 train_time:26716ms step_avg:60.31ms +step:444/2245 train_time:26775ms step_avg:60.30ms +step:445/2245 train_time:26836ms step_avg:60.31ms +step:446/2245 train_time:26895ms step_avg:60.30ms +step:447/2245 train_time:26956ms step_avg:60.30ms +step:448/2245 train_time:27015ms step_avg:60.30ms +step:449/2245 train_time:27076ms step_avg:60.30ms +step:450/2245 train_time:27134ms step_avg:60.30ms +step:451/2245 train_time:27196ms step_avg:60.30ms +step:452/2245 train_time:27254ms step_avg:60.30ms +step:453/2245 train_time:27316ms step_avg:60.30ms +step:454/2245 train_time:27375ms step_avg:60.30ms +step:455/2245 train_time:27436ms step_avg:60.30ms +step:456/2245 train_time:27495ms step_avg:60.30ms +step:457/2245 train_time:27556ms step_avg:60.30ms +step:458/2245 train_time:27615ms step_avg:60.29ms +step:459/2245 train_time:27676ms step_avg:60.30ms +step:460/2245 train_time:27735ms step_avg:60.29ms +step:461/2245 train_time:27796ms step_avg:60.30ms +step:462/2245 train_time:27855ms step_avg:60.29ms +step:463/2245 train_time:27916ms step_avg:60.29ms +step:464/2245 train_time:27974ms step_avg:60.29ms +step:465/2245 train_time:28036ms step_avg:60.29ms +step:466/2245 train_time:28094ms step_avg:60.29ms +step:467/2245 train_time:28155ms step_avg:60.29ms +step:468/2245 train_time:28214ms step_avg:60.29ms +step:469/2245 train_time:28275ms step_avg:60.29ms +step:470/2245 train_time:28334ms step_avg:60.29ms +step:471/2245 train_time:28396ms step_avg:60.29ms +step:472/2245 train_time:28454ms step_avg:60.28ms +step:473/2245 train_time:28516ms step_avg:60.29ms +step:474/2245 train_time:28575ms step_avg:60.28ms +step:475/2245 train_time:28636ms step_avg:60.29ms +step:476/2245 train_time:28694ms step_avg:60.28ms +step:477/2245 train_time:28756ms step_avg:60.28ms +step:478/2245 train_time:28815ms step_avg:60.28ms +step:479/2245 train_time:28876ms step_avg:60.28ms +step:480/2245 train_time:28934ms step_avg:60.28ms +step:481/2245 train_time:28996ms step_avg:60.28ms +step:482/2245 train_time:29055ms step_avg:60.28ms +step:483/2245 train_time:29116ms step_avg:60.28ms +step:484/2245 train_time:29174ms step_avg:60.28ms +step:485/2245 train_time:29236ms step_avg:60.28ms +step:486/2245 train_time:29295ms step_avg:60.28ms +step:487/2245 train_time:29356ms step_avg:60.28ms +step:488/2245 train_time:29414ms step_avg:60.28ms +step:489/2245 train_time:29476ms step_avg:60.28ms +step:490/2245 train_time:29535ms step_avg:60.27ms +step:491/2245 train_time:29596ms step_avg:60.28ms +step:492/2245 train_time:29654ms step_avg:60.27ms +step:493/2245 train_time:29715ms step_avg:60.27ms +step:494/2245 train_time:29774ms step_avg:60.27ms +step:495/2245 train_time:29836ms step_avg:60.27ms +step:496/2245 train_time:29895ms step_avg:60.27ms +step:497/2245 train_time:29956ms step_avg:60.27ms +step:498/2245 train_time:30014ms step_avg:60.27ms +step:499/2245 train_time:30076ms step_avg:60.27ms +step:500/2245 train_time:30134ms step_avg:60.27ms +step:500/2245 val_loss:3.8249 train_time:30196ms step_avg:60.39ms +step:501/2245 train_time:30214ms step_avg:60.31ms +step:502/2245 train_time:30258ms step_avg:60.27ms +step:503/2245 train_time:30323ms step_avg:60.28ms 
+step:504/2245 train_time:30384ms step_avg:60.29ms +step:505/2245 train_time:30446ms step_avg:60.29ms +step:506/2245 train_time:30505ms step_avg:60.29ms +step:507/2245 train_time:30567ms step_avg:60.29ms +step:508/2245 train_time:30625ms step_avg:60.29ms +step:509/2245 train_time:30686ms step_avg:60.29ms +step:510/2245 train_time:30745ms step_avg:60.28ms +step:511/2245 train_time:30807ms step_avg:60.29ms +step:512/2245 train_time:30866ms step_avg:60.29ms +step:513/2245 train_time:30927ms step_avg:60.29ms +step:514/2245 train_time:30986ms step_avg:60.28ms +step:515/2245 train_time:31047ms step_avg:60.29ms +step:516/2245 train_time:31105ms step_avg:60.28ms +step:517/2245 train_time:31167ms step_avg:60.29ms +step:518/2245 train_time:31228ms step_avg:60.28ms +step:519/2245 train_time:31290ms step_avg:60.29ms +step:520/2245 train_time:31350ms step_avg:60.29ms +step:521/2245 train_time:31412ms step_avg:60.29ms +step:522/2245 train_time:31471ms step_avg:60.29ms +step:523/2245 train_time:31532ms step_avg:60.29ms +step:524/2245 train_time:31590ms step_avg:60.29ms +step:525/2245 train_time:31651ms step_avg:60.29ms +step:526/2245 train_time:31710ms step_avg:60.28ms +step:527/2245 train_time:31771ms step_avg:60.29ms +step:528/2245 train_time:31830ms step_avg:60.28ms +step:529/2245 train_time:31891ms step_avg:60.29ms +step:530/2245 train_time:31949ms step_avg:60.28ms +step:531/2245 train_time:32011ms step_avg:60.28ms +step:532/2245 train_time:32070ms step_avg:60.28ms +step:533/2245 train_time:32131ms step_avg:60.28ms +step:534/2245 train_time:32190ms step_avg:60.28ms +step:535/2245 train_time:32252ms step_avg:60.28ms +step:536/2245 train_time:32311ms step_avg:60.28ms +step:537/2245 train_time:32373ms step_avg:60.29ms +step:538/2245 train_time:32432ms step_avg:60.28ms +step:539/2245 train_time:32494ms step_avg:60.28ms +step:540/2245 train_time:32553ms step_avg:60.28ms +step:541/2245 train_time:32614ms step_avg:60.28ms +step:542/2245 train_time:32672ms step_avg:60.28ms +step:543/2245 train_time:32734ms step_avg:60.28ms +step:544/2245 train_time:32792ms step_avg:60.28ms +step:545/2245 train_time:32854ms step_avg:60.28ms +step:546/2245 train_time:32913ms step_avg:60.28ms +step:547/2245 train_time:32974ms step_avg:60.28ms +step:548/2245 train_time:33033ms step_avg:60.28ms +step:549/2245 train_time:33095ms step_avg:60.28ms +step:550/2245 train_time:33154ms step_avg:60.28ms +step:551/2245 train_time:33216ms step_avg:60.28ms +step:552/2245 train_time:33275ms step_avg:60.28ms +step:553/2245 train_time:33337ms step_avg:60.28ms +step:554/2245 train_time:33396ms step_avg:60.28ms +step:555/2245 train_time:33457ms step_avg:60.28ms +step:556/2245 train_time:33516ms step_avg:60.28ms +step:557/2245 train_time:33578ms step_avg:60.28ms +step:558/2245 train_time:33636ms step_avg:60.28ms +step:559/2245 train_time:33697ms step_avg:60.28ms +step:560/2245 train_time:33757ms step_avg:60.28ms +step:561/2245 train_time:33818ms step_avg:60.28ms +step:562/2245 train_time:33877ms step_avg:60.28ms +step:563/2245 train_time:33939ms step_avg:60.28ms +step:564/2245 train_time:33998ms step_avg:60.28ms +step:565/2245 train_time:34060ms step_avg:60.28ms +step:566/2245 train_time:34119ms step_avg:60.28ms +step:567/2245 train_time:34181ms step_avg:60.28ms +step:568/2245 train_time:34240ms step_avg:60.28ms +step:569/2245 train_time:34302ms step_avg:60.28ms +step:570/2245 train_time:34362ms step_avg:60.28ms +step:571/2245 train_time:34423ms step_avg:60.29ms +step:572/2245 train_time:34483ms step_avg:60.28ms +step:573/2245 train_time:34545ms 
step_avg:60.29ms +step:574/2245 train_time:34605ms step_avg:60.29ms +step:575/2245 train_time:34667ms step_avg:60.29ms +step:576/2245 train_time:34726ms step_avg:60.29ms +step:577/2245 train_time:34788ms step_avg:60.29ms +step:578/2245 train_time:34847ms step_avg:60.29ms +step:579/2245 train_time:34908ms step_avg:60.29ms +step:580/2245 train_time:34967ms step_avg:60.29ms +step:581/2245 train_time:35028ms step_avg:60.29ms +step:582/2245 train_time:35087ms step_avg:60.29ms +step:583/2245 train_time:35149ms step_avg:60.29ms +step:584/2245 train_time:35208ms step_avg:60.29ms +step:585/2245 train_time:35269ms step_avg:60.29ms +step:586/2245 train_time:35328ms step_avg:60.29ms +step:587/2245 train_time:35389ms step_avg:60.29ms +step:588/2245 train_time:35448ms step_avg:60.29ms +step:589/2245 train_time:35510ms step_avg:60.29ms +step:590/2245 train_time:35568ms step_avg:60.29ms +step:591/2245 train_time:35630ms step_avg:60.29ms +step:592/2245 train_time:35688ms step_avg:60.28ms +step:593/2245 train_time:35750ms step_avg:60.29ms +step:594/2245 train_time:35809ms step_avg:60.28ms +step:595/2245 train_time:35870ms step_avg:60.29ms +step:596/2245 train_time:35929ms step_avg:60.28ms +step:597/2245 train_time:35991ms step_avg:60.29ms +step:598/2245 train_time:36049ms step_avg:60.28ms +step:599/2245 train_time:36110ms step_avg:60.28ms +step:600/2245 train_time:36169ms step_avg:60.28ms +step:601/2245 train_time:36230ms step_avg:60.28ms +step:602/2245 train_time:36289ms step_avg:60.28ms +step:603/2245 train_time:36351ms step_avg:60.28ms +step:604/2245 train_time:36410ms step_avg:60.28ms +step:605/2245 train_time:36471ms step_avg:60.28ms +step:606/2245 train_time:36530ms step_avg:60.28ms +step:607/2245 train_time:36591ms step_avg:60.28ms +step:608/2245 train_time:36650ms step_avg:60.28ms +step:609/2245 train_time:36711ms step_avg:60.28ms +step:610/2245 train_time:36770ms step_avg:60.28ms +step:611/2245 train_time:36831ms step_avg:60.28ms +step:612/2245 train_time:36890ms step_avg:60.28ms +step:613/2245 train_time:36952ms step_avg:60.28ms +step:614/2245 train_time:37011ms step_avg:60.28ms +step:615/2245 train_time:37072ms step_avg:60.28ms +step:616/2245 train_time:37131ms step_avg:60.28ms +step:617/2245 train_time:37192ms step_avg:60.28ms +step:618/2245 train_time:37251ms step_avg:60.28ms +step:619/2245 train_time:37313ms step_avg:60.28ms +step:620/2245 train_time:37372ms step_avg:60.28ms +step:621/2245 train_time:37434ms step_avg:60.28ms +step:622/2245 train_time:37492ms step_avg:60.28ms +step:623/2245 train_time:37554ms step_avg:60.28ms +step:624/2245 train_time:37612ms step_avg:60.28ms +step:625/2245 train_time:37674ms step_avg:60.28ms +step:626/2245 train_time:37733ms step_avg:60.28ms +step:627/2245 train_time:37795ms step_avg:60.28ms +step:628/2245 train_time:37854ms step_avg:60.28ms +step:629/2245 train_time:37915ms step_avg:60.28ms +step:630/2245 train_time:37974ms step_avg:60.28ms +step:631/2245 train_time:38036ms step_avg:60.28ms +step:632/2245 train_time:38095ms step_avg:60.28ms +step:633/2245 train_time:38156ms step_avg:60.28ms +step:634/2245 train_time:38215ms step_avg:60.28ms +step:635/2245 train_time:38277ms step_avg:60.28ms +step:636/2245 train_time:38336ms step_avg:60.28ms +step:637/2245 train_time:38398ms step_avg:60.28ms +step:638/2245 train_time:38457ms step_avg:60.28ms +step:639/2245 train_time:38518ms step_avg:60.28ms +step:640/2245 train_time:38578ms step_avg:60.28ms +step:641/2245 train_time:38639ms step_avg:60.28ms +step:642/2245 train_time:38699ms step_avg:60.28ms +step:643/2245 
train_time:38761ms step_avg:60.28ms +step:644/2245 train_time:38821ms step_avg:60.28ms +step:645/2245 train_time:38882ms step_avg:60.28ms +step:646/2245 train_time:38942ms step_avg:60.28ms +step:647/2245 train_time:39003ms step_avg:60.28ms +step:648/2245 train_time:39063ms step_avg:60.28ms +step:649/2245 train_time:39125ms step_avg:60.29ms +step:650/2245 train_time:39185ms step_avg:60.28ms +step:651/2245 train_time:39246ms step_avg:60.29ms +step:652/2245 train_time:39306ms step_avg:60.29ms +step:653/2245 train_time:39368ms step_avg:60.29ms +step:654/2245 train_time:39427ms step_avg:60.29ms +step:655/2245 train_time:39489ms step_avg:60.29ms +step:656/2245 train_time:39547ms step_avg:60.29ms +step:657/2245 train_time:39609ms step_avg:60.29ms +step:658/2245 train_time:39668ms step_avg:60.29ms +step:659/2245 train_time:39729ms step_avg:60.29ms +step:660/2245 train_time:39788ms step_avg:60.28ms +step:661/2245 train_time:39850ms step_avg:60.29ms +step:662/2245 train_time:39909ms step_avg:60.29ms +step:663/2245 train_time:39972ms step_avg:60.29ms +step:664/2245 train_time:40030ms step_avg:60.29ms +step:665/2245 train_time:40091ms step_avg:60.29ms +step:666/2245 train_time:40150ms step_avg:60.29ms +step:667/2245 train_time:40212ms step_avg:60.29ms +step:668/2245 train_time:40271ms step_avg:60.29ms +step:669/2245 train_time:40332ms step_avg:60.29ms +step:670/2245 train_time:40391ms step_avg:60.29ms +step:671/2245 train_time:40452ms step_avg:60.29ms +step:672/2245 train_time:40511ms step_avg:60.28ms +step:673/2245 train_time:40572ms step_avg:60.29ms +step:674/2245 train_time:40631ms step_avg:60.28ms +step:675/2245 train_time:40692ms step_avg:60.28ms +step:676/2245 train_time:40751ms step_avg:60.28ms +step:677/2245 train_time:40813ms step_avg:60.28ms +step:678/2245 train_time:40871ms step_avg:60.28ms +step:679/2245 train_time:40933ms step_avg:60.28ms +step:680/2245 train_time:40992ms step_avg:60.28ms +step:681/2245 train_time:41054ms step_avg:60.28ms +step:682/2245 train_time:41113ms step_avg:60.28ms +step:683/2245 train_time:41175ms step_avg:60.28ms +step:684/2245 train_time:41234ms step_avg:60.28ms +step:685/2245 train_time:41295ms step_avg:60.29ms +step:686/2245 train_time:41355ms step_avg:60.28ms +step:687/2245 train_time:41416ms step_avg:60.29ms +step:688/2245 train_time:41475ms step_avg:60.28ms +step:689/2245 train_time:41536ms step_avg:60.28ms +step:690/2245 train_time:41595ms step_avg:60.28ms +step:691/2245 train_time:41656ms step_avg:60.28ms +step:692/2245 train_time:41716ms step_avg:60.28ms +step:693/2245 train_time:41777ms step_avg:60.28ms +step:694/2245 train_time:41836ms step_avg:60.28ms +step:695/2245 train_time:41899ms step_avg:60.29ms +step:696/2245 train_time:41957ms step_avg:60.28ms +step:697/2245 train_time:42019ms step_avg:60.29ms +step:698/2245 train_time:42078ms step_avg:60.28ms +step:699/2245 train_time:42140ms step_avg:60.29ms +step:700/2245 train_time:42199ms step_avg:60.28ms +step:701/2245 train_time:42261ms step_avg:60.29ms +step:702/2245 train_time:42320ms step_avg:60.29ms +step:703/2245 train_time:42381ms step_avg:60.29ms +step:704/2245 train_time:42441ms step_avg:60.29ms +step:705/2245 train_time:42503ms step_avg:60.29ms +step:706/2245 train_time:42562ms step_avg:60.29ms +step:707/2245 train_time:42624ms step_avg:60.29ms +step:708/2245 train_time:42684ms step_avg:60.29ms +step:709/2245 train_time:42746ms step_avg:60.29ms +step:710/2245 train_time:42806ms step_avg:60.29ms +step:711/2245 train_time:42869ms step_avg:60.29ms +step:712/2245 train_time:42927ms step_avg:60.29ms 
+step:713/2245 train_time:42988ms step_avg:60.29ms +step:714/2245 train_time:43047ms step_avg:60.29ms +step:715/2245 train_time:43108ms step_avg:60.29ms +step:716/2245 train_time:43167ms step_avg:60.29ms +step:717/2245 train_time:43229ms step_avg:60.29ms +step:718/2245 train_time:43678ms step_avg:60.83ms +step:719/2245 train_time:43738ms step_avg:60.83ms +step:720/2245 train_time:43796ms step_avg:60.83ms +step:721/2245 train_time:43857ms step_avg:60.83ms +step:722/2245 train_time:43915ms step_avg:60.82ms +step:723/2245 train_time:43975ms step_avg:60.82ms +step:724/2245 train_time:44034ms step_avg:60.82ms +step:725/2245 train_time:44094ms step_avg:60.82ms +step:726/2245 train_time:44153ms step_avg:60.82ms +step:727/2245 train_time:44213ms step_avg:60.82ms +step:728/2245 train_time:44272ms step_avg:60.81ms +step:729/2245 train_time:44333ms step_avg:60.81ms +step:730/2245 train_time:44391ms step_avg:60.81ms +step:731/2245 train_time:44451ms step_avg:60.81ms +step:732/2245 train_time:44511ms step_avg:60.81ms +step:733/2245 train_time:44580ms step_avg:60.82ms +step:734/2245 train_time:44642ms step_avg:60.82ms +step:735/2245 train_time:44706ms step_avg:60.82ms +step:736/2245 train_time:44766ms step_avg:60.82ms +step:737/2245 train_time:44829ms step_avg:60.83ms +step:738/2245 train_time:44888ms step_avg:60.82ms +step:739/2245 train_time:44951ms step_avg:60.83ms +step:740/2245 train_time:45010ms step_avg:60.82ms +step:741/2245 train_time:45071ms step_avg:60.83ms +step:742/2245 train_time:45131ms step_avg:60.82ms +step:743/2245 train_time:45193ms step_avg:60.82ms +step:744/2245 train_time:45252ms step_avg:60.82ms +step:745/2245 train_time:45314ms step_avg:60.82ms +step:746/2245 train_time:45374ms step_avg:60.82ms +step:747/2245 train_time:45435ms step_avg:60.82ms +step:748/2245 train_time:45495ms step_avg:60.82ms +step:749/2245 train_time:45559ms step_avg:60.83ms +step:750/2245 train_time:45620ms step_avg:60.83ms +step:750/2245 val_loss:3.6714 train_time:45685ms step_avg:60.91ms +step:751/2245 train_time:45704ms step_avg:60.86ms +step:752/2245 train_time:45745ms step_avg:60.83ms +step:753/2245 train_time:45807ms step_avg:60.83ms +step:754/2245 train_time:45867ms step_avg:60.83ms +step:755/2245 train_time:45931ms step_avg:60.84ms +step:756/2245 train_time:45992ms step_avg:60.84ms +step:757/2245 train_time:46053ms step_avg:60.84ms +step:758/2245 train_time:46113ms step_avg:60.84ms +step:759/2245 train_time:46174ms step_avg:60.84ms +step:760/2245 train_time:46233ms step_avg:60.83ms +step:761/2245 train_time:46295ms step_avg:60.83ms +step:762/2245 train_time:46354ms step_avg:60.83ms +step:763/2245 train_time:46416ms step_avg:60.83ms +step:764/2245 train_time:46474ms step_avg:60.83ms +step:765/2245 train_time:46536ms step_avg:60.83ms +step:766/2245 train_time:46599ms step_avg:60.83ms +step:767/2245 train_time:46665ms step_avg:60.84ms +step:768/2245 train_time:46727ms step_avg:60.84ms +step:769/2245 train_time:46788ms step_avg:60.84ms +step:770/2245 train_time:46848ms step_avg:60.84ms +step:771/2245 train_time:46910ms step_avg:60.84ms +step:772/2245 train_time:46970ms step_avg:60.84ms +step:773/2245 train_time:47033ms step_avg:60.84ms +step:774/2245 train_time:47092ms step_avg:60.84ms +step:775/2245 train_time:47154ms step_avg:60.84ms +step:776/2245 train_time:47213ms step_avg:60.84ms +step:777/2245 train_time:47274ms step_avg:60.84ms +step:778/2245 train_time:47333ms step_avg:60.84ms +step:779/2245 train_time:47395ms step_avg:60.84ms +step:780/2245 train_time:47455ms step_avg:60.84ms +step:781/2245 
train_time:47518ms step_avg:60.84ms +step:782/2245 train_time:47579ms step_avg:60.84ms +step:783/2245 train_time:47643ms step_avg:60.85ms +step:784/2245 train_time:47704ms step_avg:60.85ms +step:785/2245 train_time:47767ms step_avg:60.85ms +step:786/2245 train_time:47828ms step_avg:60.85ms +step:787/2245 train_time:47890ms step_avg:60.85ms +step:788/2245 train_time:47951ms step_avg:60.85ms +step:789/2245 train_time:48013ms step_avg:60.85ms +step:790/2245 train_time:48072ms step_avg:60.85ms +step:791/2245 train_time:48134ms step_avg:60.85ms +step:792/2245 train_time:48194ms step_avg:60.85ms +step:793/2245 train_time:48255ms step_avg:60.85ms +step:794/2245 train_time:48314ms step_avg:60.85ms +step:795/2245 train_time:48376ms step_avg:60.85ms +step:796/2245 train_time:48436ms step_avg:60.85ms +step:797/2245 train_time:48498ms step_avg:60.85ms +step:798/2245 train_time:48558ms step_avg:60.85ms +step:799/2245 train_time:48621ms step_avg:60.85ms +step:800/2245 train_time:48683ms step_avg:60.85ms +step:801/2245 train_time:48746ms step_avg:60.86ms +step:802/2245 train_time:48807ms step_avg:60.86ms +step:803/2245 train_time:48869ms step_avg:60.86ms +step:804/2245 train_time:48929ms step_avg:60.86ms +step:805/2245 train_time:48991ms step_avg:60.86ms +step:806/2245 train_time:49050ms step_avg:60.86ms +step:807/2245 train_time:49112ms step_avg:60.86ms +step:808/2245 train_time:49172ms step_avg:60.86ms +step:809/2245 train_time:49233ms step_avg:60.86ms +step:810/2245 train_time:49292ms step_avg:60.85ms +step:811/2245 train_time:49355ms step_avg:60.86ms +step:812/2245 train_time:49414ms step_avg:60.85ms +step:813/2245 train_time:49477ms step_avg:60.86ms +step:814/2245 train_time:49537ms step_avg:60.86ms +step:815/2245 train_time:49600ms step_avg:60.86ms +step:816/2245 train_time:49660ms step_avg:60.86ms +step:817/2245 train_time:49724ms step_avg:60.86ms +step:818/2245 train_time:49784ms step_avg:60.86ms +step:819/2245 train_time:49847ms step_avg:60.86ms +step:820/2245 train_time:49906ms step_avg:60.86ms +step:821/2245 train_time:49968ms step_avg:60.86ms +step:822/2245 train_time:50029ms step_avg:60.86ms +step:823/2245 train_time:50090ms step_avg:60.86ms +step:824/2245 train_time:50149ms step_avg:60.86ms +step:825/2245 train_time:50212ms step_avg:60.86ms +step:826/2245 train_time:50271ms step_avg:60.86ms +step:827/2245 train_time:50333ms step_avg:60.86ms +step:828/2245 train_time:50392ms step_avg:60.86ms +step:829/2245 train_time:50454ms step_avg:60.86ms +step:830/2245 train_time:50515ms step_avg:60.86ms +step:831/2245 train_time:50577ms step_avg:60.86ms +step:832/2245 train_time:50638ms step_avg:60.86ms +step:833/2245 train_time:50700ms step_avg:60.86ms +step:834/2245 train_time:50760ms step_avg:60.86ms +step:835/2245 train_time:50823ms step_avg:60.87ms +step:836/2245 train_time:50884ms step_avg:60.87ms +step:837/2245 train_time:50947ms step_avg:60.87ms +step:838/2245 train_time:51008ms step_avg:60.87ms +step:839/2245 train_time:51070ms step_avg:60.87ms +step:840/2245 train_time:51130ms step_avg:60.87ms +step:841/2245 train_time:51192ms step_avg:60.87ms +step:842/2245 train_time:51251ms step_avg:60.87ms +step:843/2245 train_time:51313ms step_avg:60.87ms +step:844/2245 train_time:51372ms step_avg:60.87ms +step:845/2245 train_time:51434ms step_avg:60.87ms +step:846/2245 train_time:51494ms step_avg:60.87ms +step:847/2245 train_time:51557ms step_avg:60.87ms +step:848/2245 train_time:51617ms step_avg:60.87ms +step:849/2245 train_time:51680ms step_avg:60.87ms +step:850/2245 train_time:51739ms step_avg:60.87ms 
+step:851/2245 train_time:51803ms step_avg:60.87ms +step:852/2245 train_time:51863ms step_avg:60.87ms +step:853/2245 train_time:51926ms step_avg:60.87ms +step:854/2245 train_time:51986ms step_avg:60.87ms +step:855/2245 train_time:52048ms step_avg:60.87ms +step:856/2245 train_time:52108ms step_avg:60.87ms +step:857/2245 train_time:52169ms step_avg:60.87ms +step:858/2245 train_time:52229ms step_avg:60.87ms +step:859/2245 train_time:52291ms step_avg:60.87ms +step:860/2245 train_time:52351ms step_avg:60.87ms +step:861/2245 train_time:52413ms step_avg:60.87ms +step:862/2245 train_time:52472ms step_avg:60.87ms +step:863/2245 train_time:52535ms step_avg:60.87ms +step:864/2245 train_time:52595ms step_avg:60.87ms +step:865/2245 train_time:52658ms step_avg:60.88ms +step:866/2245 train_time:52718ms step_avg:60.88ms +step:867/2245 train_time:52781ms step_avg:60.88ms +step:868/2245 train_time:52842ms step_avg:60.88ms +step:869/2245 train_time:52904ms step_avg:60.88ms +step:870/2245 train_time:52964ms step_avg:60.88ms +step:871/2245 train_time:53027ms step_avg:60.88ms +step:872/2245 train_time:53087ms step_avg:60.88ms +step:873/2245 train_time:53149ms step_avg:60.88ms +step:874/2245 train_time:53208ms step_avg:60.88ms +step:875/2245 train_time:53270ms step_avg:60.88ms +step:876/2245 train_time:53330ms step_avg:60.88ms +step:877/2245 train_time:53391ms step_avg:60.88ms +step:878/2245 train_time:53451ms step_avg:60.88ms +step:879/2245 train_time:53513ms step_avg:60.88ms +step:880/2245 train_time:53572ms step_avg:60.88ms +step:881/2245 train_time:53635ms step_avg:60.88ms +step:882/2245 train_time:53695ms step_avg:60.88ms +step:883/2245 train_time:53758ms step_avg:60.88ms +step:884/2245 train_time:53818ms step_avg:60.88ms +step:885/2245 train_time:53881ms step_avg:60.88ms +step:886/2245 train_time:53942ms step_avg:60.88ms +step:887/2245 train_time:54006ms step_avg:60.89ms +step:888/2245 train_time:54066ms step_avg:60.89ms +step:889/2245 train_time:54129ms step_avg:60.89ms +step:890/2245 train_time:54188ms step_avg:60.89ms +step:891/2245 train_time:54250ms step_avg:60.89ms +step:892/2245 train_time:54310ms step_avg:60.89ms +step:893/2245 train_time:54372ms step_avg:60.89ms +step:894/2245 train_time:54431ms step_avg:60.89ms +step:895/2245 train_time:54493ms step_avg:60.89ms +step:896/2245 train_time:54553ms step_avg:60.88ms +step:897/2245 train_time:54615ms step_avg:60.89ms +step:898/2245 train_time:54675ms step_avg:60.88ms +step:899/2245 train_time:54737ms step_avg:60.89ms +step:900/2245 train_time:54798ms step_avg:60.89ms +step:901/2245 train_time:54860ms step_avg:60.89ms +step:902/2245 train_time:54921ms step_avg:60.89ms +step:903/2245 train_time:54984ms step_avg:60.89ms +step:904/2245 train_time:55044ms step_avg:60.89ms +step:905/2245 train_time:55107ms step_avg:60.89ms +step:906/2245 train_time:55167ms step_avg:60.89ms +step:907/2245 train_time:55230ms step_avg:60.89ms +step:908/2245 train_time:55289ms step_avg:60.89ms +step:909/2245 train_time:55351ms step_avg:60.89ms +step:910/2245 train_time:55410ms step_avg:60.89ms +step:911/2245 train_time:55472ms step_avg:60.89ms +step:912/2245 train_time:55532ms step_avg:60.89ms +step:913/2245 train_time:55594ms step_avg:60.89ms +step:914/2245 train_time:55653ms step_avg:60.89ms +step:915/2245 train_time:55716ms step_avg:60.89ms +step:916/2245 train_time:55776ms step_avg:60.89ms +step:917/2245 train_time:55840ms step_avg:60.89ms +step:918/2245 train_time:55899ms step_avg:60.89ms +step:919/2245 train_time:55962ms step_avg:60.89ms +step:920/2245 train_time:56024ms 
step_avg:60.90ms +step:921/2245 train_time:56087ms step_avg:60.90ms +step:922/2245 train_time:56147ms step_avg:60.90ms +step:923/2245 train_time:56208ms step_avg:60.90ms +step:924/2245 train_time:56269ms step_avg:60.90ms +step:925/2245 train_time:56332ms step_avg:60.90ms +step:926/2245 train_time:56392ms step_avg:60.90ms +step:927/2245 train_time:56453ms step_avg:60.90ms +step:928/2245 train_time:56513ms step_avg:60.90ms +step:929/2245 train_time:56575ms step_avg:60.90ms +step:930/2245 train_time:56634ms step_avg:60.90ms +step:931/2245 train_time:56697ms step_avg:60.90ms +step:932/2245 train_time:56757ms step_avg:60.90ms +step:933/2245 train_time:56820ms step_avg:60.90ms +step:934/2245 train_time:56880ms step_avg:60.90ms +step:935/2245 train_time:56942ms step_avg:60.90ms +step:936/2245 train_time:57002ms step_avg:60.90ms +step:937/2245 train_time:57066ms step_avg:60.90ms +step:938/2245 train_time:57126ms step_avg:60.90ms +step:939/2245 train_time:57188ms step_avg:60.90ms +step:940/2245 train_time:57248ms step_avg:60.90ms +step:941/2245 train_time:57311ms step_avg:60.90ms +step:942/2245 train_time:57371ms step_avg:60.90ms +step:943/2245 train_time:57433ms step_avg:60.90ms +step:944/2245 train_time:57492ms step_avg:60.90ms +step:945/2245 train_time:57554ms step_avg:60.90ms +step:946/2245 train_time:57614ms step_avg:60.90ms +step:947/2245 train_time:57676ms step_avg:60.90ms +step:948/2245 train_time:57736ms step_avg:60.90ms +step:949/2245 train_time:57799ms step_avg:60.90ms +step:950/2245 train_time:57858ms step_avg:60.90ms +step:951/2245 train_time:57920ms step_avg:60.90ms +step:952/2245 train_time:57980ms step_avg:60.90ms +step:953/2245 train_time:58045ms step_avg:60.91ms +step:954/2245 train_time:58105ms step_avg:60.91ms +step:955/2245 train_time:58167ms step_avg:60.91ms +step:956/2245 train_time:58228ms step_avg:60.91ms +step:957/2245 train_time:58289ms step_avg:60.91ms +step:958/2245 train_time:58349ms step_avg:60.91ms +step:959/2245 train_time:58411ms step_avg:60.91ms +step:960/2245 train_time:58471ms step_avg:60.91ms +step:961/2245 train_time:58533ms step_avg:60.91ms +step:962/2245 train_time:58593ms step_avg:60.91ms +step:963/2245 train_time:58655ms step_avg:60.91ms +step:964/2245 train_time:58714ms step_avg:60.91ms +step:965/2245 train_time:58777ms step_avg:60.91ms +step:966/2245 train_time:58837ms step_avg:60.91ms +step:967/2245 train_time:58900ms step_avg:60.91ms +step:968/2245 train_time:58960ms step_avg:60.91ms +step:969/2245 train_time:59023ms step_avg:60.91ms +step:970/2245 train_time:59084ms step_avg:60.91ms +step:971/2245 train_time:59147ms step_avg:60.91ms +step:972/2245 train_time:59207ms step_avg:60.91ms +step:973/2245 train_time:59269ms step_avg:60.91ms +step:974/2245 train_time:59329ms step_avg:60.91ms +step:975/2245 train_time:59392ms step_avg:60.92ms +step:976/2245 train_time:59451ms step_avg:60.91ms +step:977/2245 train_time:59513ms step_avg:60.91ms +step:978/2245 train_time:59572ms step_avg:60.91ms +step:979/2245 train_time:59634ms step_avg:60.91ms +step:980/2245 train_time:59693ms step_avg:60.91ms +step:981/2245 train_time:59757ms step_avg:60.91ms +step:982/2245 train_time:59817ms step_avg:60.91ms +step:983/2245 train_time:59879ms step_avg:60.91ms +step:984/2245 train_time:59939ms step_avg:60.91ms +step:985/2245 train_time:60002ms step_avg:60.92ms +step:986/2245 train_time:60063ms step_avg:60.92ms +step:987/2245 train_time:60126ms step_avg:60.92ms +step:988/2245 train_time:60186ms step_avg:60.92ms +step:989/2245 train_time:60248ms step_avg:60.92ms +step:990/2245 
train_time:60309ms step_avg:60.92ms +step:991/2245 train_time:60371ms step_avg:60.92ms +step:992/2245 train_time:60432ms step_avg:60.92ms +step:993/2245 train_time:60493ms step_avg:60.92ms +step:994/2245 train_time:60553ms step_avg:60.92ms +step:995/2245 train_time:60615ms step_avg:60.92ms +step:996/2245 train_time:60675ms step_avg:60.92ms +step:997/2245 train_time:60738ms step_avg:60.92ms +step:998/2245 train_time:60797ms step_avg:60.92ms +step:999/2245 train_time:60859ms step_avg:60.92ms +step:1000/2245 train_time:60920ms step_avg:60.92ms +step:1000/2245 val_loss:3.5950 train_time:60983ms step_avg:60.98ms +step:1001/2245 train_time:61002ms step_avg:60.94ms +step:1002/2245 train_time:61046ms step_avg:60.92ms +step:1003/2245 train_time:61110ms step_avg:60.93ms +step:1004/2245 train_time:61172ms step_avg:60.93ms +step:1005/2245 train_time:61234ms step_avg:60.93ms +step:1006/2245 train_time:61295ms step_avg:60.93ms +step:1007/2245 train_time:61357ms step_avg:60.93ms +step:1008/2245 train_time:61417ms step_avg:60.93ms +step:1009/2245 train_time:61479ms step_avg:60.93ms +step:1010/2245 train_time:61538ms step_avg:60.93ms +step:1011/2245 train_time:61600ms step_avg:60.93ms +step:1012/2245 train_time:61660ms step_avg:60.93ms +step:1013/2245 train_time:61721ms step_avg:60.93ms +step:1014/2245 train_time:61780ms step_avg:60.93ms +step:1015/2245 train_time:61841ms step_avg:60.93ms +step:1016/2245 train_time:61901ms step_avg:60.93ms +step:1017/2245 train_time:61965ms step_avg:60.93ms +step:1018/2245 train_time:62025ms step_avg:60.93ms +step:1019/2245 train_time:62089ms step_avg:60.93ms +step:1020/2245 train_time:62149ms step_avg:60.93ms +step:1021/2245 train_time:62212ms step_avg:60.93ms +step:1022/2245 train_time:62273ms step_avg:60.93ms +step:1023/2245 train_time:62335ms step_avg:60.93ms +step:1024/2245 train_time:62395ms step_avg:60.93ms +step:1025/2245 train_time:62458ms step_avg:60.93ms +step:1026/2245 train_time:62517ms step_avg:60.93ms +step:1027/2245 train_time:62579ms step_avg:60.93ms +step:1028/2245 train_time:62638ms step_avg:60.93ms +step:1029/2245 train_time:62700ms step_avg:60.93ms +step:1030/2245 train_time:62760ms step_avg:60.93ms +step:1031/2245 train_time:62821ms step_avg:60.93ms +step:1032/2245 train_time:62881ms step_avg:60.93ms +step:1033/2245 train_time:62944ms step_avg:60.93ms +step:1034/2245 train_time:63004ms step_avg:60.93ms +step:1035/2245 train_time:63067ms step_avg:60.93ms +step:1036/2245 train_time:63127ms step_avg:60.93ms +step:1037/2245 train_time:63190ms step_avg:60.93ms +step:1038/2245 train_time:63250ms step_avg:60.93ms +step:1039/2245 train_time:63312ms step_avg:60.94ms +step:1040/2245 train_time:63373ms step_avg:60.94ms +step:1041/2245 train_time:63436ms step_avg:60.94ms +step:1042/2245 train_time:63496ms step_avg:60.94ms +step:1043/2245 train_time:63558ms step_avg:60.94ms +step:1044/2245 train_time:63618ms step_avg:60.94ms +step:1045/2245 train_time:63680ms step_avg:60.94ms +step:1046/2245 train_time:63740ms step_avg:60.94ms +step:1047/2245 train_time:63802ms step_avg:60.94ms +step:1048/2245 train_time:63862ms step_avg:60.94ms +step:1049/2245 train_time:63924ms step_avg:60.94ms +step:1050/2245 train_time:63984ms step_avg:60.94ms +step:1051/2245 train_time:64046ms step_avg:60.94ms +step:1052/2245 train_time:64106ms step_avg:60.94ms +step:1053/2245 train_time:64169ms step_avg:60.94ms +step:1054/2245 train_time:64229ms step_avg:60.94ms +step:1055/2245 train_time:64292ms step_avg:60.94ms +step:1056/2245 train_time:64352ms step_avg:60.94ms +step:1057/2245 
train_time:64415ms step_avg:60.94ms +step:1058/2245 train_time:64474ms step_avg:60.94ms +step:1059/2245 train_time:64536ms step_avg:60.94ms +step:1060/2245 train_time:64596ms step_avg:60.94ms +step:1061/2245 train_time:64659ms step_avg:60.94ms +step:1062/2245 train_time:64718ms step_avg:60.94ms +step:1063/2245 train_time:64781ms step_avg:60.94ms +step:1064/2245 train_time:64840ms step_avg:60.94ms +step:1065/2245 train_time:64902ms step_avg:60.94ms +step:1066/2245 train_time:64963ms step_avg:60.94ms +step:1067/2245 train_time:65025ms step_avg:60.94ms +step:1068/2245 train_time:65085ms step_avg:60.94ms +step:1069/2245 train_time:65147ms step_avg:60.94ms +step:1070/2245 train_time:65207ms step_avg:60.94ms +step:1071/2245 train_time:65271ms step_avg:60.94ms +step:1072/2245 train_time:65331ms step_avg:60.94ms +step:1073/2245 train_time:65394ms step_avg:60.94ms +step:1074/2245 train_time:65454ms step_avg:60.94ms +step:1075/2245 train_time:65517ms step_avg:60.95ms +step:1076/2245 train_time:65577ms step_avg:60.95ms +step:1077/2245 train_time:65640ms step_avg:60.95ms +step:1078/2245 train_time:65699ms step_avg:60.95ms +step:1079/2245 train_time:65762ms step_avg:60.95ms +step:1080/2245 train_time:65821ms step_avg:60.95ms +step:1081/2245 train_time:65883ms step_avg:60.95ms +step:1082/2245 train_time:65943ms step_avg:60.95ms +step:1083/2245 train_time:66005ms step_avg:60.95ms +step:1084/2245 train_time:66065ms step_avg:60.95ms +step:1085/2245 train_time:66127ms step_avg:60.95ms +step:1086/2245 train_time:66188ms step_avg:60.95ms +step:1087/2245 train_time:66250ms step_avg:60.95ms +step:1088/2245 train_time:66310ms step_avg:60.95ms +step:1089/2245 train_time:66373ms step_avg:60.95ms +step:1090/2245 train_time:66433ms step_avg:60.95ms +step:1091/2245 train_time:66495ms step_avg:60.95ms +step:1092/2245 train_time:66556ms step_avg:60.95ms +step:1093/2245 train_time:66618ms step_avg:60.95ms +step:1094/2245 train_time:66679ms step_avg:60.95ms +step:1095/2245 train_time:66741ms step_avg:60.95ms +step:1096/2245 train_time:66801ms step_avg:60.95ms +step:1097/2245 train_time:66864ms step_avg:60.95ms +step:1098/2245 train_time:66924ms step_avg:60.95ms +step:1099/2245 train_time:66986ms step_avg:60.95ms +step:1100/2245 train_time:67045ms step_avg:60.95ms +step:1101/2245 train_time:67108ms step_avg:60.95ms +step:1102/2245 train_time:67168ms step_avg:60.95ms +step:1103/2245 train_time:67230ms step_avg:60.95ms +step:1104/2245 train_time:67290ms step_avg:60.95ms +step:1105/2245 train_time:67353ms step_avg:60.95ms +step:1106/2245 train_time:67413ms step_avg:60.95ms +step:1107/2245 train_time:67476ms step_avg:60.95ms +step:1108/2245 train_time:67536ms step_avg:60.95ms +step:1109/2245 train_time:67599ms step_avg:60.95ms +step:1110/2245 train_time:67660ms step_avg:60.95ms +step:1111/2245 train_time:67722ms step_avg:60.96ms +step:1112/2245 train_time:67782ms step_avg:60.96ms +step:1113/2245 train_time:67845ms step_avg:60.96ms +step:1114/2245 train_time:67905ms step_avg:60.96ms +step:1115/2245 train_time:67967ms step_avg:60.96ms +step:1116/2245 train_time:68027ms step_avg:60.96ms +step:1117/2245 train_time:68089ms step_avg:60.96ms +step:1118/2245 train_time:68149ms step_avg:60.96ms +step:1119/2245 train_time:68211ms step_avg:60.96ms +step:1120/2245 train_time:68271ms step_avg:60.96ms +step:1121/2245 train_time:68333ms step_avg:60.96ms +step:1122/2245 train_time:68394ms step_avg:60.96ms +step:1123/2245 train_time:68456ms step_avg:60.96ms +step:1124/2245 train_time:68517ms step_avg:60.96ms +step:1125/2245 train_time:68579ms 
step_avg:60.96ms +step:1126/2245 train_time:68639ms step_avg:60.96ms +step:1127/2245 train_time:68701ms step_avg:60.96ms +step:1128/2245 train_time:68761ms step_avg:60.96ms +step:1129/2245 train_time:68824ms step_avg:60.96ms +step:1130/2245 train_time:68884ms step_avg:60.96ms +step:1131/2245 train_time:68946ms step_avg:60.96ms +step:1132/2245 train_time:69005ms step_avg:60.96ms +step:1133/2245 train_time:69068ms step_avg:60.96ms +step:1134/2245 train_time:69128ms step_avg:60.96ms +step:1135/2245 train_time:69190ms step_avg:60.96ms +step:1136/2245 train_time:69250ms step_avg:60.96ms +step:1137/2245 train_time:69312ms step_avg:60.96ms +step:1138/2245 train_time:69372ms step_avg:60.96ms +step:1139/2245 train_time:69434ms step_avg:60.96ms +step:1140/2245 train_time:69495ms step_avg:60.96ms +step:1141/2245 train_time:69559ms step_avg:60.96ms +step:1142/2245 train_time:69619ms step_avg:60.96ms +step:1143/2245 train_time:69681ms step_avg:60.96ms +step:1144/2245 train_time:69741ms step_avg:60.96ms +step:1145/2245 train_time:69804ms step_avg:60.96ms +step:1146/2245 train_time:69864ms step_avg:60.96ms +step:1147/2245 train_time:69926ms step_avg:60.96ms +step:1148/2245 train_time:69986ms step_avg:60.96ms +step:1149/2245 train_time:70049ms step_avg:60.97ms +step:1150/2245 train_time:70109ms step_avg:60.96ms +step:1151/2245 train_time:70172ms step_avg:60.97ms +step:1152/2245 train_time:70231ms step_avg:60.96ms +step:1153/2245 train_time:70294ms step_avg:60.97ms +step:1154/2245 train_time:70353ms step_avg:60.96ms +step:1155/2245 train_time:70416ms step_avg:60.97ms +step:1156/2245 train_time:70476ms step_avg:60.97ms +step:1157/2245 train_time:70539ms step_avg:60.97ms +step:1158/2245 train_time:70598ms step_avg:60.97ms +step:1159/2245 train_time:70661ms step_avg:60.97ms +step:1160/2245 train_time:70720ms step_avg:60.97ms +step:1161/2245 train_time:70783ms step_avg:60.97ms +step:1162/2245 train_time:70843ms step_avg:60.97ms +step:1163/2245 train_time:70905ms step_avg:60.97ms +step:1164/2245 train_time:70966ms step_avg:60.97ms +step:1165/2245 train_time:71028ms step_avg:60.97ms +step:1166/2245 train_time:71088ms step_avg:60.97ms +step:1167/2245 train_time:71150ms step_avg:60.97ms +step:1168/2245 train_time:71210ms step_avg:60.97ms +step:1169/2245 train_time:71273ms step_avg:60.97ms +step:1170/2245 train_time:71333ms step_avg:60.97ms +step:1171/2245 train_time:71395ms step_avg:60.97ms +step:1172/2245 train_time:71456ms step_avg:60.97ms +step:1173/2245 train_time:71519ms step_avg:60.97ms +step:1174/2245 train_time:71579ms step_avg:60.97ms +step:1175/2245 train_time:71641ms step_avg:60.97ms +step:1176/2245 train_time:71701ms step_avg:60.97ms +step:1177/2245 train_time:71764ms step_avg:60.97ms +step:1178/2245 train_time:71824ms step_avg:60.97ms +step:1179/2245 train_time:71886ms step_avg:60.97ms +step:1180/2245 train_time:71947ms step_avg:60.97ms +step:1181/2245 train_time:72009ms step_avg:60.97ms +step:1182/2245 train_time:72069ms step_avg:60.97ms +step:1183/2245 train_time:72131ms step_avg:60.97ms +step:1184/2245 train_time:72191ms step_avg:60.97ms +step:1185/2245 train_time:72254ms step_avg:60.97ms +step:1186/2245 train_time:72314ms step_avg:60.97ms +step:1187/2245 train_time:72376ms step_avg:60.97ms +step:1188/2245 train_time:72437ms step_avg:60.97ms +step:1189/2245 train_time:72499ms step_avg:60.98ms +step:1190/2245 train_time:72559ms step_avg:60.97ms +step:1191/2245 train_time:72622ms step_avg:60.98ms +step:1192/2245 train_time:72681ms step_avg:60.97ms +step:1193/2245 train_time:72744ms step_avg:60.98ms 
+step:1194/2245 train_time:72804ms step_avg:60.97ms +step:1195/2245 train_time:72866ms step_avg:60.98ms +step:1196/2245 train_time:72927ms step_avg:60.98ms +step:1197/2245 train_time:72988ms step_avg:60.98ms +step:1198/2245 train_time:73048ms step_avg:60.97ms +step:1199/2245 train_time:73110ms step_avg:60.98ms +step:1200/2245 train_time:73170ms step_avg:60.97ms +step:1201/2245 train_time:73232ms step_avg:60.98ms +step:1202/2245 train_time:73292ms step_avg:60.98ms +step:1203/2245 train_time:73355ms step_avg:60.98ms +step:1204/2245 train_time:73414ms step_avg:60.98ms +step:1205/2245 train_time:73478ms step_avg:60.98ms +step:1206/2245 train_time:73538ms step_avg:60.98ms +step:1207/2245 train_time:73601ms step_avg:60.98ms +step:1208/2245 train_time:73661ms step_avg:60.98ms +step:1209/2245 train_time:73724ms step_avg:60.98ms +step:1210/2245 train_time:73784ms step_avg:60.98ms +step:1211/2245 train_time:73847ms step_avg:60.98ms +step:1212/2245 train_time:73906ms step_avg:60.98ms +step:1213/2245 train_time:73968ms step_avg:60.98ms +step:1214/2245 train_time:74028ms step_avg:60.98ms +step:1215/2245 train_time:74091ms step_avg:60.98ms +step:1216/2245 train_time:74151ms step_avg:60.98ms +step:1217/2245 train_time:74214ms step_avg:60.98ms +step:1218/2245 train_time:74274ms step_avg:60.98ms +step:1219/2245 train_time:74337ms step_avg:60.98ms +step:1220/2245 train_time:74397ms step_avg:60.98ms +step:1221/2245 train_time:74460ms step_avg:60.98ms +step:1222/2245 train_time:74519ms step_avg:60.98ms +step:1223/2245 train_time:74581ms step_avg:60.98ms +step:1224/2245 train_time:74642ms step_avg:60.98ms +step:1225/2245 train_time:74704ms step_avg:60.98ms +step:1226/2245 train_time:74765ms step_avg:60.98ms +step:1227/2245 train_time:74827ms step_avg:60.98ms +step:1228/2245 train_time:74886ms step_avg:60.98ms +step:1229/2245 train_time:74949ms step_avg:60.98ms +step:1230/2245 train_time:75008ms step_avg:60.98ms +step:1231/2245 train_time:75071ms step_avg:60.98ms +step:1232/2245 train_time:75131ms step_avg:60.98ms +step:1233/2245 train_time:75194ms step_avg:60.98ms +step:1234/2245 train_time:75255ms step_avg:60.98ms +step:1235/2245 train_time:75318ms step_avg:60.99ms +step:1236/2245 train_time:75378ms step_avg:60.99ms +step:1237/2245 train_time:75440ms step_avg:60.99ms +step:1238/2245 train_time:75499ms step_avg:60.99ms +step:1239/2245 train_time:75561ms step_avg:60.99ms +step:1240/2245 train_time:75621ms step_avg:60.98ms +step:1241/2245 train_time:75684ms step_avg:60.99ms +step:1242/2245 train_time:75744ms step_avg:60.99ms +step:1243/2245 train_time:75806ms step_avg:60.99ms +step:1244/2245 train_time:75866ms step_avg:60.99ms +step:1245/2245 train_time:75928ms step_avg:60.99ms +step:1246/2245 train_time:75988ms step_avg:60.99ms +step:1247/2245 train_time:76050ms step_avg:60.99ms +step:1248/2245 train_time:76110ms step_avg:60.99ms +step:1249/2245 train_time:76173ms step_avg:60.99ms +step:1250/2245 train_time:76233ms step_avg:60.99ms +step:1250/2245 val_loss:3.5231 train_time:76297ms step_avg:61.04ms +step:1251/2245 train_time:76315ms step_avg:61.00ms +step:1252/2245 train_time:76358ms step_avg:60.99ms +step:1253/2245 train_time:76424ms step_avg:60.99ms +step:1254/2245 train_time:76486ms step_avg:60.99ms +step:1255/2245 train_time:76547ms step_avg:60.99ms +step:1256/2245 train_time:76606ms step_avg:60.99ms +step:1257/2245 train_time:76668ms step_avg:60.99ms +step:1258/2245 train_time:76727ms step_avg:60.99ms +step:1259/2245 train_time:76789ms step_avg:60.99ms +step:1260/2245 train_time:76848ms step_avg:60.99ms 
+step:1261/2245 train_time:76909ms step_avg:60.99ms +step:1262/2245 train_time:76968ms step_avg:60.99ms +step:1263/2245 train_time:77030ms step_avg:60.99ms +step:1264/2245 train_time:77090ms step_avg:60.99ms +step:1265/2245 train_time:77152ms step_avg:60.99ms +step:1266/2245 train_time:77214ms step_avg:60.99ms +step:1267/2245 train_time:77278ms step_avg:60.99ms +step:1268/2245 train_time:77339ms step_avg:60.99ms +step:1269/2245 train_time:77402ms step_avg:60.99ms +step:1270/2245 train_time:77464ms step_avg:61.00ms +step:1271/2245 train_time:77526ms step_avg:61.00ms +step:1272/2245 train_time:77587ms step_avg:61.00ms +step:1273/2245 train_time:77648ms step_avg:61.00ms +step:1274/2245 train_time:77707ms step_avg:60.99ms +step:1275/2245 train_time:77769ms step_avg:61.00ms +step:1276/2245 train_time:77828ms step_avg:60.99ms +step:1277/2245 train_time:77890ms step_avg:60.99ms +step:1278/2245 train_time:77949ms step_avg:60.99ms +step:1279/2245 train_time:78010ms step_avg:60.99ms +step:1280/2245 train_time:78070ms step_avg:60.99ms +step:1281/2245 train_time:78132ms step_avg:60.99ms +step:1282/2245 train_time:78192ms step_avg:60.99ms +step:1283/2245 train_time:78255ms step_avg:60.99ms +step:1284/2245 train_time:78316ms step_avg:60.99ms +step:1285/2245 train_time:78379ms step_avg:61.00ms +step:1286/2245 train_time:78439ms step_avg:60.99ms +step:1287/2245 train_time:78502ms step_avg:61.00ms +step:1288/2245 train_time:78562ms step_avg:61.00ms +step:1289/2245 train_time:78625ms step_avg:61.00ms +step:1290/2245 train_time:78686ms step_avg:61.00ms +step:1291/2245 train_time:78748ms step_avg:61.00ms +step:1292/2245 train_time:78807ms step_avg:61.00ms +step:1293/2245 train_time:78869ms step_avg:61.00ms +step:1294/2245 train_time:78929ms step_avg:61.00ms +step:1295/2245 train_time:78990ms step_avg:61.00ms +step:1296/2245 train_time:79049ms step_avg:60.99ms +step:1297/2245 train_time:79111ms step_avg:61.00ms +step:1298/2245 train_time:79171ms step_avg:60.99ms +step:1299/2245 train_time:79233ms step_avg:61.00ms +step:1300/2245 train_time:79293ms step_avg:60.99ms +step:1301/2245 train_time:79356ms step_avg:61.00ms +step:1302/2245 train_time:79416ms step_avg:61.00ms +step:1303/2245 train_time:79479ms step_avg:61.00ms +step:1304/2245 train_time:79539ms step_avg:61.00ms +step:1305/2245 train_time:79602ms step_avg:61.00ms +step:1306/2245 train_time:79662ms step_avg:61.00ms +step:1307/2245 train_time:79725ms step_avg:61.00ms +step:1308/2245 train_time:79785ms step_avg:61.00ms +step:1309/2245 train_time:79847ms step_avg:61.00ms +step:1310/2245 train_time:79907ms step_avg:61.00ms +step:1311/2245 train_time:79969ms step_avg:61.00ms +step:1312/2245 train_time:80028ms step_avg:61.00ms +step:1313/2245 train_time:80091ms step_avg:61.00ms +step:1314/2245 train_time:80151ms step_avg:61.00ms +step:1315/2245 train_time:80212ms step_avg:61.00ms +step:1316/2245 train_time:80272ms step_avg:61.00ms +step:1317/2245 train_time:80335ms step_avg:61.00ms +step:1318/2245 train_time:80395ms step_avg:61.00ms +step:1319/2245 train_time:80458ms step_avg:61.00ms +step:1320/2245 train_time:80518ms step_avg:61.00ms +step:1321/2245 train_time:80581ms step_avg:61.00ms +step:1322/2245 train_time:80642ms step_avg:61.00ms +step:1323/2245 train_time:80704ms step_avg:61.00ms +step:1324/2245 train_time:80764ms step_avg:61.00ms +step:1325/2245 train_time:80827ms step_avg:61.00ms +step:1326/2245 train_time:80887ms step_avg:61.00ms +step:1327/2245 train_time:80949ms step_avg:61.00ms +step:1328/2245 train_time:81009ms step_avg:61.00ms +step:1329/2245 
train_time:81072ms step_avg:61.00ms +step:1330/2245 train_time:81131ms step_avg:61.00ms +step:1331/2245 train_time:81194ms step_avg:61.00ms +step:1332/2245 train_time:81253ms step_avg:61.00ms +step:1333/2245 train_time:81315ms step_avg:61.00ms +step:1334/2245 train_time:81376ms step_avg:61.00ms +step:1335/2245 train_time:81438ms step_avg:61.00ms +step:1336/2245 train_time:81498ms step_avg:61.00ms +step:1337/2245 train_time:81562ms step_avg:61.00ms +step:1338/2245 train_time:81622ms step_avg:61.00ms +step:1339/2245 train_time:81684ms step_avg:61.00ms +step:1340/2245 train_time:81745ms step_avg:61.00ms +step:1341/2245 train_time:81807ms step_avg:61.00ms +step:1342/2245 train_time:81867ms step_avg:61.00ms +step:1343/2245 train_time:81930ms step_avg:61.01ms +step:1344/2245 train_time:81990ms step_avg:61.00ms +step:1345/2245 train_time:82051ms step_avg:61.00ms +step:1346/2245 train_time:82111ms step_avg:61.00ms +step:1347/2245 train_time:82172ms step_avg:61.00ms +step:1348/2245 train_time:82232ms step_avg:61.00ms +step:1349/2245 train_time:82295ms step_avg:61.00ms +step:1350/2245 train_time:82354ms step_avg:61.00ms +step:1351/2245 train_time:82417ms step_avg:61.00ms +step:1352/2245 train_time:82477ms step_avg:61.00ms +step:1353/2245 train_time:82540ms step_avg:61.01ms +step:1354/2245 train_time:82600ms step_avg:61.00ms +step:1355/2245 train_time:82663ms step_avg:61.01ms +step:1356/2245 train_time:82722ms step_avg:61.00ms +step:1357/2245 train_time:82785ms step_avg:61.01ms +step:1358/2245 train_time:82845ms step_avg:61.00ms +step:1359/2245 train_time:82908ms step_avg:61.01ms +step:1360/2245 train_time:82968ms step_avg:61.01ms +step:1361/2245 train_time:83030ms step_avg:61.01ms +step:1362/2245 train_time:83092ms step_avg:61.01ms +step:1363/2245 train_time:83153ms step_avg:61.01ms +step:1364/2245 train_time:83213ms step_avg:61.01ms +step:1365/2245 train_time:83276ms step_avg:61.01ms +step:1366/2245 train_time:83335ms step_avg:61.01ms +step:1367/2245 train_time:83398ms step_avg:61.01ms +step:1368/2245 train_time:83457ms step_avg:61.01ms +step:1369/2245 train_time:83519ms step_avg:61.01ms +step:1370/2245 train_time:83579ms step_avg:61.01ms +step:1371/2245 train_time:83641ms step_avg:61.01ms +step:1372/2245 train_time:83702ms step_avg:61.01ms +step:1373/2245 train_time:83765ms step_avg:61.01ms +step:1374/2245 train_time:83826ms step_avg:61.01ms +step:1375/2245 train_time:83888ms step_avg:61.01ms +step:1376/2245 train_time:83948ms step_avg:61.01ms +step:1377/2245 train_time:84010ms step_avg:61.01ms +step:1378/2245 train_time:84070ms step_avg:61.01ms +step:1379/2245 train_time:84133ms step_avg:61.01ms +step:1380/2245 train_time:84193ms step_avg:61.01ms +step:1381/2245 train_time:84254ms step_avg:61.01ms +step:1382/2245 train_time:84314ms step_avg:61.01ms +step:1383/2245 train_time:84376ms step_avg:61.01ms +step:1384/2245 train_time:84437ms step_avg:61.01ms +step:1385/2245 train_time:84500ms step_avg:61.01ms +step:1386/2245 train_time:84560ms step_avg:61.01ms +step:1387/2245 train_time:84623ms step_avg:61.01ms +step:1388/2245 train_time:84683ms step_avg:61.01ms +step:1389/2245 train_time:84745ms step_avg:61.01ms +step:1390/2245 train_time:84805ms step_avg:61.01ms +step:1391/2245 train_time:84868ms step_avg:61.01ms +step:1392/2245 train_time:84927ms step_avg:61.01ms +step:1393/2245 train_time:84990ms step_avg:61.01ms +step:1394/2245 train_time:85050ms step_avg:61.01ms +step:1395/2245 train_time:85113ms step_avg:61.01ms +step:1396/2245 train_time:85172ms step_avg:61.01ms +step:1397/2245 train_time:85234ms 
step_avg:61.01ms +step:1398/2245 train_time:85294ms step_avg:61.01ms +step:1399/2245 train_time:85356ms step_avg:61.01ms +step:1400/2245 train_time:85416ms step_avg:61.01ms +step:1401/2245 train_time:85478ms step_avg:61.01ms +step:1402/2245 train_time:85538ms step_avg:61.01ms +step:1403/2245 train_time:85600ms step_avg:61.01ms +step:1404/2245 train_time:85660ms step_avg:61.01ms +step:1405/2245 train_time:85723ms step_avg:61.01ms +step:1406/2245 train_time:85784ms step_avg:61.01ms +step:1407/2245 train_time:85847ms step_avg:61.01ms +step:1408/2245 train_time:85907ms step_avg:61.01ms +step:1409/2245 train_time:85969ms step_avg:61.01ms +step:1410/2245 train_time:86029ms step_avg:61.01ms +step:1411/2245 train_time:86092ms step_avg:61.02ms +step:1412/2245 train_time:86152ms step_avg:61.01ms +step:1413/2245 train_time:86214ms step_avg:61.01ms +step:1414/2245 train_time:86273ms step_avg:61.01ms +step:1415/2245 train_time:86336ms step_avg:61.01ms +step:1416/2245 train_time:86395ms step_avg:61.01ms +step:1417/2245 train_time:86457ms step_avg:61.01ms +step:1418/2245 train_time:86517ms step_avg:61.01ms +step:1419/2245 train_time:86580ms step_avg:61.01ms +step:1420/2245 train_time:86640ms step_avg:61.01ms +step:1421/2245 train_time:86703ms step_avg:61.02ms +step:1422/2245 train_time:86763ms step_avg:61.01ms +step:1423/2245 train_time:86826ms step_avg:61.02ms +step:1424/2245 train_time:86886ms step_avg:61.02ms +step:1425/2245 train_time:86949ms step_avg:61.02ms +step:1426/2245 train_time:87009ms step_avg:61.02ms +step:1427/2245 train_time:87072ms step_avg:61.02ms +step:1428/2245 train_time:87133ms step_avg:61.02ms +step:1429/2245 train_time:87194ms step_avg:61.02ms +step:1430/2245 train_time:87254ms step_avg:61.02ms +step:1431/2245 train_time:87316ms step_avg:61.02ms +step:1432/2245 train_time:87376ms step_avg:61.02ms +step:1433/2245 train_time:87438ms step_avg:61.02ms +step:1434/2245 train_time:87498ms step_avg:61.02ms +step:1435/2245 train_time:87561ms step_avg:61.02ms +step:1436/2245 train_time:87621ms step_avg:61.02ms +step:1437/2245 train_time:87683ms step_avg:61.02ms +step:1438/2245 train_time:87743ms step_avg:61.02ms +step:1439/2245 train_time:87807ms step_avg:61.02ms +step:1440/2245 train_time:87867ms step_avg:61.02ms +step:1441/2245 train_time:87930ms step_avg:61.02ms +step:1442/2245 train_time:87991ms step_avg:61.02ms +step:1443/2245 train_time:88053ms step_avg:61.02ms +step:1444/2245 train_time:88113ms step_avg:61.02ms +step:1445/2245 train_time:88176ms step_avg:61.02ms +step:1446/2245 train_time:88236ms step_avg:61.02ms +step:1447/2245 train_time:88299ms step_avg:61.02ms +step:1448/2245 train_time:88358ms step_avg:61.02ms +step:1449/2245 train_time:88420ms step_avg:61.02ms +step:1450/2245 train_time:88480ms step_avg:61.02ms +step:1451/2245 train_time:88542ms step_avg:61.02ms +step:1452/2245 train_time:88602ms step_avg:61.02ms +step:1453/2245 train_time:88664ms step_avg:61.02ms +step:1454/2245 train_time:88724ms step_avg:61.02ms +step:1455/2245 train_time:88786ms step_avg:61.02ms +step:1456/2245 train_time:88847ms step_avg:61.02ms +step:1457/2245 train_time:88909ms step_avg:61.02ms +step:1458/2245 train_time:88969ms step_avg:61.02ms +step:1459/2245 train_time:89031ms step_avg:61.02ms +step:1460/2245 train_time:89092ms step_avg:61.02ms +step:1461/2245 train_time:89154ms step_avg:61.02ms +step:1462/2245 train_time:89213ms step_avg:61.02ms +step:1463/2245 train_time:89276ms step_avg:61.02ms +step:1464/2245 train_time:89335ms step_avg:61.02ms +step:1465/2245 train_time:89397ms step_avg:61.02ms 
+step:1466/2245 train_time:89457ms step_avg:61.02ms +step:1467/2245 train_time:89519ms step_avg:61.02ms +step:1468/2245 train_time:89579ms step_avg:61.02ms +step:1469/2245 train_time:89642ms step_avg:61.02ms +step:1470/2245 train_time:89702ms step_avg:61.02ms +step:1471/2245 train_time:89765ms step_avg:61.02ms +step:1472/2245 train_time:89826ms step_avg:61.02ms +step:1473/2245 train_time:89889ms step_avg:61.02ms +step:1474/2245 train_time:89949ms step_avg:61.02ms +step:1475/2245 train_time:90013ms step_avg:61.03ms +step:1476/2245 train_time:90074ms step_avg:61.03ms +step:1477/2245 train_time:90135ms step_avg:61.03ms +step:1478/2245 train_time:90196ms step_avg:61.03ms +step:1479/2245 train_time:90259ms step_avg:61.03ms +step:1480/2245 train_time:90319ms step_avg:61.03ms +step:1481/2245 train_time:90382ms step_avg:61.03ms +step:1482/2245 train_time:90442ms step_avg:61.03ms +step:1483/2245 train_time:90505ms step_avg:61.03ms +step:1484/2245 train_time:90565ms step_avg:61.03ms +step:1485/2245 train_time:90628ms step_avg:61.03ms +step:1486/2245 train_time:90688ms step_avg:61.03ms +step:1487/2245 train_time:90751ms step_avg:61.03ms +step:1488/2245 train_time:90813ms step_avg:61.03ms +step:1489/2245 train_time:90876ms step_avg:61.03ms +step:1490/2245 train_time:90936ms step_avg:61.03ms +step:1491/2245 train_time:91000ms step_avg:61.03ms +step:1492/2245 train_time:91060ms step_avg:61.03ms +step:1493/2245 train_time:91123ms step_avg:61.03ms +step:1494/2245 train_time:91185ms step_avg:61.03ms +step:1495/2245 train_time:91248ms step_avg:61.04ms +step:1496/2245 train_time:91308ms step_avg:61.03ms +step:1497/2245 train_time:91370ms step_avg:61.04ms +step:1498/2245 train_time:91431ms step_avg:61.04ms +step:1499/2245 train_time:91493ms step_avg:61.04ms +step:1500/2245 train_time:91553ms step_avg:61.04ms +step:1500/2245 val_loss:3.4424 train_time:91617ms step_avg:61.08ms +step:1501/2245 train_time:91635ms step_avg:61.05ms +step:1502/2245 train_time:91683ms step_avg:61.04ms +step:1503/2245 train_time:91744ms step_avg:61.04ms +step:1504/2245 train_time:91804ms step_avg:61.04ms +step:1505/2245 train_time:91867ms step_avg:61.04ms +step:1506/2245 train_time:91927ms step_avg:61.04ms +step:1507/2245 train_time:91989ms step_avg:61.04ms +step:1508/2245 train_time:92048ms step_avg:61.04ms +step:1509/2245 train_time:92110ms step_avg:61.04ms +step:1510/2245 train_time:92169ms step_avg:61.04ms +step:1511/2245 train_time:92231ms step_avg:61.04ms +step:1512/2245 train_time:92291ms step_avg:61.04ms +step:1513/2245 train_time:92353ms step_avg:61.04ms +step:1514/2245 train_time:92413ms step_avg:61.04ms +step:1515/2245 train_time:92476ms step_avg:61.04ms +step:1516/2245 train_time:92541ms step_avg:61.04ms +step:1517/2245 train_time:92608ms step_avg:61.05ms +step:1518/2245 train_time:92670ms step_avg:61.05ms +step:1519/2245 train_time:92733ms step_avg:61.05ms +step:1520/2245 train_time:92794ms step_avg:61.05ms +step:1521/2245 train_time:92857ms step_avg:61.05ms +step:1522/2245 train_time:92917ms step_avg:61.05ms +step:1523/2245 train_time:92979ms step_avg:61.05ms +step:1524/2245 train_time:93039ms step_avg:61.05ms +step:1525/2245 train_time:93102ms step_avg:61.05ms +step:1526/2245 train_time:93161ms step_avg:61.05ms +step:1527/2245 train_time:93224ms step_avg:61.05ms +step:1528/2245 train_time:93284ms step_avg:61.05ms +step:1529/2245 train_time:93346ms step_avg:61.05ms +step:1530/2245 train_time:93408ms step_avg:61.05ms +step:1531/2245 train_time:93472ms step_avg:61.05ms +step:1532/2245 train_time:93534ms step_avg:61.05ms 
+step:1533/2245 train_time:93598ms step_avg:61.06ms +step:1534/2245 train_time:93660ms step_avg:61.06ms +step:1535/2245 train_time:93723ms step_avg:61.06ms +step:1536/2245 train_time:93784ms step_avg:61.06ms +step:1537/2245 train_time:93848ms step_avg:61.06ms +step:1538/2245 train_time:93908ms step_avg:61.06ms +step:1539/2245 train_time:93970ms step_avg:61.06ms +step:1540/2245 train_time:94030ms step_avg:61.06ms +step:1541/2245 train_time:94092ms step_avg:61.06ms +step:1542/2245 train_time:94152ms step_avg:61.06ms +step:1543/2245 train_time:94214ms step_avg:61.06ms +step:1544/2245 train_time:94274ms step_avg:61.06ms +step:1545/2245 train_time:94336ms step_avg:61.06ms +step:1546/2245 train_time:94396ms step_avg:61.06ms +step:1547/2245 train_time:94459ms step_avg:61.06ms +step:1548/2245 train_time:94520ms step_avg:61.06ms +step:1549/2245 train_time:94584ms step_avg:61.06ms +step:1550/2245 train_time:94645ms step_avg:61.06ms +step:1551/2245 train_time:94709ms step_avg:61.06ms +step:1552/2245 train_time:94769ms step_avg:61.06ms +step:1553/2245 train_time:94832ms step_avg:61.06ms +step:1554/2245 train_time:94893ms step_avg:61.06ms +step:1555/2245 train_time:94956ms step_avg:61.06ms +step:1556/2245 train_time:95015ms step_avg:61.06ms +step:1557/2245 train_time:95078ms step_avg:61.06ms +step:1558/2245 train_time:95137ms step_avg:61.06ms +step:1559/2245 train_time:95200ms step_avg:61.06ms +step:1560/2245 train_time:95259ms step_avg:61.06ms +step:1561/2245 train_time:95323ms step_avg:61.07ms +step:1562/2245 train_time:95384ms step_avg:61.07ms +step:1563/2245 train_time:95446ms step_avg:61.07ms +step:1564/2245 train_time:95507ms step_avg:61.07ms +step:1565/2245 train_time:95571ms step_avg:61.07ms +step:1566/2245 train_time:95631ms step_avg:61.07ms +step:1567/2245 train_time:95695ms step_avg:61.07ms +step:1568/2245 train_time:95756ms step_avg:61.07ms +step:1569/2245 train_time:95819ms step_avg:61.07ms +step:1570/2245 train_time:95879ms step_avg:61.07ms +step:1571/2245 train_time:95943ms step_avg:61.07ms +step:1572/2245 train_time:96003ms step_avg:61.07ms +step:1573/2245 train_time:96066ms step_avg:61.07ms +step:1574/2245 train_time:96126ms step_avg:61.07ms +step:1575/2245 train_time:96190ms step_avg:61.07ms +step:1576/2245 train_time:96250ms step_avg:61.07ms +step:1577/2245 train_time:96312ms step_avg:61.07ms +step:1578/2245 train_time:96373ms step_avg:61.07ms +step:1579/2245 train_time:96435ms step_avg:61.07ms +step:1580/2245 train_time:96496ms step_avg:61.07ms +step:1581/2245 train_time:96559ms step_avg:61.07ms +step:1582/2245 train_time:96619ms step_avg:61.07ms +step:1583/2245 train_time:96683ms step_avg:61.08ms +step:1584/2245 train_time:96744ms step_avg:61.08ms +step:1585/2245 train_time:96808ms step_avg:61.08ms +step:1586/2245 train_time:96868ms step_avg:61.08ms +step:1587/2245 train_time:96931ms step_avg:61.08ms +step:1588/2245 train_time:96991ms step_avg:61.08ms +step:1589/2245 train_time:97053ms step_avg:61.08ms +step:1590/2245 train_time:97113ms step_avg:61.08ms +step:1591/2245 train_time:97176ms step_avg:61.08ms +step:1592/2245 train_time:97236ms step_avg:61.08ms +step:1593/2245 train_time:97299ms step_avg:61.08ms +step:1594/2245 train_time:97360ms step_avg:61.08ms +step:1595/2245 train_time:97423ms step_avg:61.08ms +step:1596/2245 train_time:97484ms step_avg:61.08ms +step:1597/2245 train_time:97548ms step_avg:61.08ms +step:1598/2245 train_time:97608ms step_avg:61.08ms +step:1599/2245 train_time:97671ms step_avg:61.08ms +step:1600/2245 train_time:97731ms step_avg:61.08ms +step:1601/2245 
train_time:97794ms step_avg:61.08ms +step:1602/2245 train_time:97854ms step_avg:61.08ms +step:1603/2245 train_time:97916ms step_avg:61.08ms +step:1604/2245 train_time:97976ms step_avg:61.08ms +step:1605/2245 train_time:98039ms step_avg:61.08ms +step:1606/2245 train_time:98099ms step_avg:61.08ms +step:1607/2245 train_time:98161ms step_avg:61.08ms +step:1608/2245 train_time:98221ms step_avg:61.08ms +step:1609/2245 train_time:98284ms step_avg:61.08ms +step:1610/2245 train_time:98344ms step_avg:61.08ms +step:1611/2245 train_time:98407ms step_avg:61.08ms +step:1612/2245 train_time:98467ms step_avg:61.08ms +step:1613/2245 train_time:98531ms step_avg:61.09ms +step:1614/2245 train_time:98592ms step_avg:61.09ms +step:1615/2245 train_time:98654ms step_avg:61.09ms +step:1616/2245 train_time:98715ms step_avg:61.09ms +step:1617/2245 train_time:98779ms step_avg:61.09ms +step:1618/2245 train_time:98840ms step_avg:61.09ms +step:1619/2245 train_time:98903ms step_avg:61.09ms +step:1620/2245 train_time:98964ms step_avg:61.09ms +step:1621/2245 train_time:99026ms step_avg:61.09ms +step:1622/2245 train_time:99087ms step_avg:61.09ms +step:1623/2245 train_time:99150ms step_avg:61.09ms +step:1624/2245 train_time:99210ms step_avg:61.09ms +step:1625/2245 train_time:99272ms step_avg:61.09ms +step:1626/2245 train_time:99333ms step_avg:61.09ms +step:1627/2245 train_time:99395ms step_avg:61.09ms +step:1628/2245 train_time:99456ms step_avg:61.09ms +step:1629/2245 train_time:99519ms step_avg:61.09ms +step:1630/2245 train_time:99579ms step_avg:61.09ms +step:1631/2245 train_time:99641ms step_avg:61.09ms +step:1632/2245 train_time:99702ms step_avg:61.09ms +step:1633/2245 train_time:99766ms step_avg:61.09ms +step:1634/2245 train_time:99827ms step_avg:61.09ms +step:1635/2245 train_time:99890ms step_avg:61.09ms +step:1636/2245 train_time:99950ms step_avg:61.09ms +step:1637/2245 train_time:100013ms step_avg:61.10ms +step:1638/2245 train_time:100073ms step_avg:61.09ms +step:1639/2245 train_time:100135ms step_avg:61.10ms +step:1640/2245 train_time:100196ms step_avg:61.10ms +step:1641/2245 train_time:100259ms step_avg:61.10ms +step:1642/2245 train_time:100319ms step_avg:61.10ms +step:1643/2245 train_time:100382ms step_avg:61.10ms +step:1644/2245 train_time:100443ms step_avg:61.10ms +step:1645/2245 train_time:100505ms step_avg:61.10ms +step:1646/2245 train_time:100565ms step_avg:61.10ms +step:1647/2245 train_time:100629ms step_avg:61.10ms +step:1648/2245 train_time:100690ms step_avg:61.10ms +step:1649/2245 train_time:100753ms step_avg:61.10ms +step:1650/2245 train_time:100813ms step_avg:61.10ms +step:1651/2245 train_time:100875ms step_avg:61.10ms +step:1652/2245 train_time:100935ms step_avg:61.10ms +step:1653/2245 train_time:100998ms step_avg:61.10ms +step:1654/2245 train_time:101058ms step_avg:61.10ms +step:1655/2245 train_time:101121ms step_avg:61.10ms +step:1656/2245 train_time:101181ms step_avg:61.10ms +step:1657/2245 train_time:101244ms step_avg:61.10ms +step:1658/2245 train_time:101306ms step_avg:61.10ms +step:1659/2245 train_time:101369ms step_avg:61.10ms +step:1660/2245 train_time:101429ms step_avg:61.10ms +step:1661/2245 train_time:101492ms step_avg:61.10ms +step:1662/2245 train_time:101551ms step_avg:61.10ms +step:1663/2245 train_time:101614ms step_avg:61.10ms +step:1664/2245 train_time:101674ms step_avg:61.10ms +step:1665/2245 train_time:101738ms step_avg:61.10ms +step:1666/2245 train_time:101798ms step_avg:61.10ms +step:1667/2245 train_time:101861ms step_avg:61.10ms +step:1668/2245 train_time:101922ms step_avg:61.10ms 
+step:1669/2245 train_time:101986ms step_avg:61.11ms +step:1670/2245 train_time:102046ms step_avg:61.11ms +step:1671/2245 train_time:102109ms step_avg:61.11ms +step:1672/2245 train_time:102170ms step_avg:61.11ms +step:1673/2245 train_time:102233ms step_avg:61.11ms +step:1674/2245 train_time:102293ms step_avg:61.11ms +step:1675/2245 train_time:102356ms step_avg:61.11ms +step:1676/2245 train_time:102416ms step_avg:61.11ms +step:1677/2245 train_time:102478ms step_avg:61.11ms +step:1678/2245 train_time:102538ms step_avg:61.11ms +step:1679/2245 train_time:102601ms step_avg:61.11ms +step:1680/2245 train_time:102662ms step_avg:61.11ms +step:1681/2245 train_time:102726ms step_avg:61.11ms +step:1682/2245 train_time:102787ms step_avg:61.11ms +step:1683/2245 train_time:102850ms step_avg:61.11ms +step:1684/2245 train_time:102910ms step_avg:61.11ms +step:1685/2245 train_time:102973ms step_avg:61.11ms +step:1686/2245 train_time:103033ms step_avg:61.11ms +step:1687/2245 train_time:103096ms step_avg:61.11ms +step:1688/2245 train_time:103156ms step_avg:61.11ms +step:1689/2245 train_time:103219ms step_avg:61.11ms +step:1690/2245 train_time:103279ms step_avg:61.11ms +step:1691/2245 train_time:103342ms step_avg:61.11ms +step:1692/2245 train_time:103403ms step_avg:61.11ms +step:1693/2245 train_time:103466ms step_avg:61.11ms +step:1694/2245 train_time:103527ms step_avg:61.11ms +step:1695/2245 train_time:103590ms step_avg:61.12ms +step:1696/2245 train_time:103650ms step_avg:61.11ms +step:1697/2245 train_time:103713ms step_avg:61.12ms +step:1698/2245 train_time:103774ms step_avg:61.12ms +step:1699/2245 train_time:103837ms step_avg:61.12ms +step:1700/2245 train_time:103898ms step_avg:61.12ms +step:1701/2245 train_time:103961ms step_avg:61.12ms +step:1702/2245 train_time:104021ms step_avg:61.12ms +step:1703/2245 train_time:104085ms step_avg:61.12ms +step:1704/2245 train_time:104146ms step_avg:61.12ms +step:1705/2245 train_time:104209ms step_avg:61.12ms +step:1706/2245 train_time:104269ms step_avg:61.12ms +step:1707/2245 train_time:104332ms step_avg:61.12ms +step:1708/2245 train_time:104393ms step_avg:61.12ms +step:1709/2245 train_time:104456ms step_avg:61.12ms +step:1710/2245 train_time:104516ms step_avg:61.12ms +step:1711/2245 train_time:104579ms step_avg:61.12ms +step:1712/2245 train_time:104639ms step_avg:61.12ms +step:1713/2245 train_time:104702ms step_avg:61.12ms +step:1714/2245 train_time:104763ms step_avg:61.12ms +step:1715/2245 train_time:104827ms step_avg:61.12ms +step:1716/2245 train_time:104888ms step_avg:61.12ms +step:1717/2245 train_time:104950ms step_avg:61.12ms +step:1718/2245 train_time:105010ms step_avg:61.12ms +step:1719/2245 train_time:105073ms step_avg:61.12ms +step:1720/2245 train_time:105133ms step_avg:61.12ms +step:1721/2245 train_time:105195ms step_avg:61.12ms +step:1722/2245 train_time:105256ms step_avg:61.12ms +step:1723/2245 train_time:105318ms step_avg:61.12ms +step:1724/2245 train_time:105379ms step_avg:61.12ms +step:1725/2245 train_time:105442ms step_avg:61.13ms +step:1726/2245 train_time:105503ms step_avg:61.13ms +step:1727/2245 train_time:105567ms step_avg:61.13ms +step:1728/2245 train_time:105627ms step_avg:61.13ms +step:1729/2245 train_time:105691ms step_avg:61.13ms +step:1730/2245 train_time:105750ms step_avg:61.13ms +step:1731/2245 train_time:105813ms step_avg:61.13ms +step:1732/2245 train_time:105873ms step_avg:61.13ms +step:1733/2245 train_time:105936ms step_avg:61.13ms +step:1734/2245 train_time:105997ms step_avg:61.13ms +step:1735/2245 train_time:106059ms step_avg:61.13ms 
+step:1736/2245 train_time:106120ms step_avg:61.13ms +step:1737/2245 train_time:106183ms step_avg:61.13ms +step:1738/2245 train_time:106243ms step_avg:61.13ms +step:1739/2245 train_time:106306ms step_avg:61.13ms +step:1740/2245 train_time:106367ms step_avg:61.13ms +step:1741/2245 train_time:106430ms step_avg:61.13ms +step:1742/2245 train_time:106490ms step_avg:61.13ms +step:1743/2245 train_time:106553ms step_avg:61.13ms +step:1744/2245 train_time:106613ms step_avg:61.13ms +step:1745/2245 train_time:106676ms step_avg:61.13ms +step:1746/2245 train_time:106737ms step_avg:61.13ms +step:1747/2245 train_time:106800ms step_avg:61.13ms +step:1748/2245 train_time:106860ms step_avg:61.13ms +step:1749/2245 train_time:106924ms step_avg:61.13ms +step:1750/2245 train_time:106986ms step_avg:61.13ms +step:1750/2245 val_loss:3.3796 train_time:107050ms step_avg:61.17ms +step:1751/2245 train_time:107069ms step_avg:61.15ms +step:1752/2245 train_time:107112ms step_avg:61.14ms +step:1753/2245 train_time:107179ms step_avg:61.14ms +step:1754/2245 train_time:107244ms step_avg:61.14ms +step:1755/2245 train_time:107305ms step_avg:61.14ms +step:1756/2245 train_time:107366ms step_avg:61.14ms +step:1757/2245 train_time:107428ms step_avg:61.14ms +step:1758/2245 train_time:107487ms step_avg:61.14ms +step:1759/2245 train_time:107549ms step_avg:61.14ms +step:1760/2245 train_time:107610ms step_avg:61.14ms +step:1761/2245 train_time:107673ms step_avg:61.14ms +step:1762/2245 train_time:107733ms step_avg:61.14ms +step:1763/2245 train_time:107795ms step_avg:61.14ms +step:1764/2245 train_time:107856ms step_avg:61.14ms +step:1765/2245 train_time:107918ms step_avg:61.14ms +step:1766/2245 train_time:107978ms step_avg:61.14ms +step:1767/2245 train_time:108042ms step_avg:61.14ms +step:1768/2245 train_time:108104ms step_avg:61.14ms +step:1769/2245 train_time:108170ms step_avg:61.15ms +step:1770/2245 train_time:108232ms step_avg:61.15ms +step:1771/2245 train_time:108296ms step_avg:61.15ms +step:1772/2245 train_time:108356ms step_avg:61.15ms +step:1773/2245 train_time:108418ms step_avg:61.15ms +step:1774/2245 train_time:108478ms step_avg:61.15ms +step:1775/2245 train_time:108541ms step_avg:61.15ms +step:1776/2245 train_time:108601ms step_avg:61.15ms +step:1777/2245 train_time:108664ms step_avg:61.15ms +step:1778/2245 train_time:108725ms step_avg:61.15ms +step:1779/2245 train_time:108787ms step_avg:61.15ms +step:1780/2245 train_time:108848ms step_avg:61.15ms +step:1781/2245 train_time:108911ms step_avg:61.15ms +step:1782/2245 train_time:108972ms step_avg:61.15ms +step:1783/2245 train_time:109036ms step_avg:61.15ms +step:1784/2245 train_time:109097ms step_avg:61.15ms +step:1785/2245 train_time:109160ms step_avg:61.15ms +step:1786/2245 train_time:109222ms step_avg:61.15ms +step:1787/2245 train_time:109285ms step_avg:61.16ms +step:1788/2245 train_time:109346ms step_avg:61.16ms +step:1789/2245 train_time:109409ms step_avg:61.16ms +step:1790/2245 train_time:109469ms step_avg:61.16ms +step:1791/2245 train_time:109532ms step_avg:61.16ms +step:1792/2245 train_time:109592ms step_avg:61.16ms +step:1793/2245 train_time:109655ms step_avg:61.16ms +step:1794/2245 train_time:109715ms step_avg:61.16ms +step:1795/2245 train_time:109776ms step_avg:61.16ms +step:1796/2245 train_time:109836ms step_avg:61.16ms +step:1797/2245 train_time:109898ms step_avg:61.16ms +step:1798/2245 train_time:109959ms step_avg:61.16ms +step:1799/2245 train_time:110023ms step_avg:61.16ms +step:1800/2245 train_time:110083ms step_avg:61.16ms +step:1801/2245 train_time:110146ms 
step_avg:61.16ms +step:1802/2245 train_time:110207ms step_avg:61.16ms +step:1803/2245 train_time:110271ms step_avg:61.16ms +step:1804/2245 train_time:110332ms step_avg:61.16ms +step:1805/2245 train_time:110395ms step_avg:61.16ms +step:1806/2245 train_time:110456ms step_avg:61.16ms +step:1807/2245 train_time:110518ms step_avg:61.16ms +step:1808/2245 train_time:110578ms step_avg:61.16ms +step:1809/2245 train_time:110641ms step_avg:61.16ms +step:1810/2245 train_time:110701ms step_avg:61.16ms +step:1811/2245 train_time:110763ms step_avg:61.16ms +step:1812/2245 train_time:110823ms step_avg:61.16ms +step:1813/2245 train_time:110885ms step_avg:61.16ms +step:1814/2245 train_time:110946ms step_avg:61.16ms +step:1815/2245 train_time:111009ms step_avg:61.16ms +step:1816/2245 train_time:111070ms step_avg:61.16ms +step:1817/2245 train_time:111133ms step_avg:61.16ms +step:1818/2245 train_time:111194ms step_avg:61.16ms +step:1819/2245 train_time:111257ms step_avg:61.16ms +step:1820/2245 train_time:111318ms step_avg:61.16ms +step:1821/2245 train_time:111382ms step_avg:61.17ms +step:1822/2245 train_time:111442ms step_avg:61.16ms +step:1823/2245 train_time:111505ms step_avg:61.17ms +step:1824/2245 train_time:111565ms step_avg:61.17ms +step:1825/2245 train_time:111628ms step_avg:61.17ms +step:1826/2245 train_time:111689ms step_avg:61.17ms +step:1827/2245 train_time:111752ms step_avg:61.17ms +step:1828/2245 train_time:111813ms step_avg:61.17ms +step:1829/2245 train_time:111876ms step_avg:61.17ms +step:1830/2245 train_time:111936ms step_avg:61.17ms +step:1831/2245 train_time:111998ms step_avg:61.17ms +step:1832/2245 train_time:112058ms step_avg:61.17ms +step:1833/2245 train_time:112122ms step_avg:61.17ms +step:1834/2245 train_time:112182ms step_avg:61.17ms +step:1835/2245 train_time:112245ms step_avg:61.17ms +step:1836/2245 train_time:112306ms step_avg:61.17ms +step:1837/2245 train_time:112369ms step_avg:61.17ms +step:1838/2245 train_time:112430ms step_avg:61.17ms +step:1839/2245 train_time:112492ms step_avg:61.17ms +step:1840/2245 train_time:112552ms step_avg:61.17ms +step:1841/2245 train_time:112615ms step_avg:61.17ms +step:1842/2245 train_time:112676ms step_avg:61.17ms +step:1843/2245 train_time:112739ms step_avg:61.17ms +step:1844/2245 train_time:112799ms step_avg:61.17ms +step:1845/2245 train_time:112862ms step_avg:61.17ms +step:1846/2245 train_time:112923ms step_avg:61.17ms +step:1847/2245 train_time:112985ms step_avg:61.17ms +step:1848/2245 train_time:113045ms step_avg:61.17ms +step:1849/2245 train_time:113107ms step_avg:61.17ms +step:1850/2245 train_time:113168ms step_avg:61.17ms +step:1851/2245 train_time:113232ms step_avg:61.17ms +step:1852/2245 train_time:113292ms step_avg:61.17ms +step:1853/2245 train_time:113356ms step_avg:61.17ms +step:1854/2245 train_time:113417ms step_avg:61.17ms +step:1855/2245 train_time:113479ms step_avg:61.17ms +step:1856/2245 train_time:113540ms step_avg:61.17ms +step:1857/2245 train_time:113602ms step_avg:61.18ms +step:1858/2245 train_time:113663ms step_avg:61.17ms +step:1859/2245 train_time:113726ms step_avg:61.18ms +step:1860/2245 train_time:113786ms step_avg:61.18ms +step:1861/2245 train_time:113850ms step_avg:61.18ms +step:1862/2245 train_time:113911ms step_avg:61.18ms +step:1863/2245 train_time:113974ms step_avg:61.18ms +step:1864/2245 train_time:114035ms step_avg:61.18ms +step:1865/2245 train_time:114097ms step_avg:61.18ms +step:1866/2245 train_time:114157ms step_avg:61.18ms +step:1867/2245 train_time:114220ms step_avg:61.18ms +step:1868/2245 train_time:114281ms 
step_avg:61.18ms +step:1869/2245 train_time:114344ms step_avg:61.18ms +step:1870/2245 train_time:114404ms step_avg:61.18ms +step:1871/2245 train_time:114467ms step_avg:61.18ms +step:1872/2245 train_time:114528ms step_avg:61.18ms +step:1873/2245 train_time:114591ms step_avg:61.18ms +step:1874/2245 train_time:114651ms step_avg:61.18ms +step:1875/2245 train_time:114714ms step_avg:61.18ms +step:1876/2245 train_time:114775ms step_avg:61.18ms +step:1877/2245 train_time:114837ms step_avg:61.18ms +step:1878/2245 train_time:114897ms step_avg:61.18ms +step:1879/2245 train_time:114960ms step_avg:61.18ms +step:1880/2245 train_time:115020ms step_avg:61.18ms +step:1881/2245 train_time:115083ms step_avg:61.18ms +step:1882/2245 train_time:115143ms step_avg:61.18ms +step:1883/2245 train_time:115206ms step_avg:61.18ms +step:1884/2245 train_time:115266ms step_avg:61.18ms +step:1885/2245 train_time:115329ms step_avg:61.18ms +step:1886/2245 train_time:115390ms step_avg:61.18ms +step:1887/2245 train_time:115453ms step_avg:61.18ms +step:1888/2245 train_time:115514ms step_avg:61.18ms +step:1889/2245 train_time:115577ms step_avg:61.18ms +step:1890/2245 train_time:115637ms step_avg:61.18ms +step:1891/2245 train_time:115700ms step_avg:61.18ms +step:1892/2245 train_time:115760ms step_avg:61.18ms +step:1893/2245 train_time:115824ms step_avg:61.19ms +step:1894/2245 train_time:115884ms step_avg:61.19ms +step:1895/2245 train_time:115948ms step_avg:61.19ms +step:1896/2245 train_time:116008ms step_avg:61.19ms +step:1897/2245 train_time:116071ms step_avg:61.19ms +step:1898/2245 train_time:116132ms step_avg:61.19ms +step:1899/2245 train_time:116195ms step_avg:61.19ms +step:1900/2245 train_time:116255ms step_avg:61.19ms +step:1901/2245 train_time:116318ms step_avg:61.19ms +step:1902/2245 train_time:116378ms step_avg:61.19ms +step:1903/2245 train_time:116440ms step_avg:61.19ms +step:1904/2245 train_time:116501ms step_avg:61.19ms +step:1905/2245 train_time:116564ms step_avg:61.19ms +step:1906/2245 train_time:116625ms step_avg:61.19ms +step:1907/2245 train_time:116688ms step_avg:61.19ms +step:1908/2245 train_time:116749ms step_avg:61.19ms +step:1909/2245 train_time:116812ms step_avg:61.19ms +step:1910/2245 train_time:116873ms step_avg:61.19ms +step:1911/2245 train_time:116936ms step_avg:61.19ms +step:1912/2245 train_time:116996ms step_avg:61.19ms +step:1913/2245 train_time:117059ms step_avg:61.19ms +step:1914/2245 train_time:117119ms step_avg:61.19ms +step:1915/2245 train_time:117182ms step_avg:61.19ms +step:1916/2245 train_time:117242ms step_avg:61.19ms +step:1917/2245 train_time:117305ms step_avg:61.19ms +step:1918/2245 train_time:117366ms step_avg:61.19ms +step:1919/2245 train_time:117428ms step_avg:61.19ms +step:1920/2245 train_time:117489ms step_avg:61.19ms +step:1921/2245 train_time:117553ms step_avg:61.19ms +step:1922/2245 train_time:117614ms step_avg:61.19ms +step:1923/2245 train_time:117677ms step_avg:61.19ms +step:1924/2245 train_time:117737ms step_avg:61.19ms +step:1925/2245 train_time:117799ms step_avg:61.19ms +step:1926/2245 train_time:117859ms step_avg:61.19ms +step:1927/2245 train_time:117922ms step_avg:61.19ms +step:1928/2245 train_time:117983ms step_avg:61.19ms +step:1929/2245 train_time:118046ms step_avg:61.20ms +step:1930/2245 train_time:118106ms step_avg:61.19ms +step:1931/2245 train_time:118169ms step_avg:61.20ms +step:1932/2245 train_time:118230ms step_avg:61.20ms +step:1933/2245 train_time:118292ms step_avg:61.20ms +step:1934/2245 train_time:118353ms step_avg:61.20ms +step:1935/2245 train_time:118416ms 
step_avg:61.20ms +step:1936/2245 train_time:118476ms step_avg:61.20ms +step:1937/2245 train_time:118539ms step_avg:61.20ms +step:1938/2245 train_time:118599ms step_avg:61.20ms +step:1939/2245 train_time:118662ms step_avg:61.20ms +step:1940/2245 train_time:118723ms step_avg:61.20ms +step:1941/2245 train_time:118786ms step_avg:61.20ms +step:1942/2245 train_time:118847ms step_avg:61.20ms +step:1943/2245 train_time:118910ms step_avg:61.20ms +step:1944/2245 train_time:118970ms step_avg:61.20ms +step:1945/2245 train_time:119033ms step_avg:61.20ms +step:1946/2245 train_time:119094ms step_avg:61.20ms +step:1947/2245 train_time:119157ms step_avg:61.20ms +step:1948/2245 train_time:119218ms step_avg:61.20ms +step:1949/2245 train_time:119280ms step_avg:61.20ms +step:1950/2245 train_time:119340ms step_avg:61.20ms +step:1951/2245 train_time:119403ms step_avg:61.20ms +step:1952/2245 train_time:119463ms step_avg:61.20ms +step:1953/2245 train_time:119526ms step_avg:61.20ms +step:1954/2245 train_time:119586ms step_avg:61.20ms +step:1955/2245 train_time:119649ms step_avg:61.20ms +step:1956/2245 train_time:119710ms step_avg:61.20ms +step:1957/2245 train_time:119774ms step_avg:61.20ms +step:1958/2245 train_time:119834ms step_avg:61.20ms +step:1959/2245 train_time:119897ms step_avg:61.20ms +step:1960/2245 train_time:119957ms step_avg:61.20ms +step:1961/2245 train_time:120019ms step_avg:61.20ms +step:1962/2245 train_time:120079ms step_avg:61.20ms +step:1963/2245 train_time:120142ms step_avg:61.20ms +step:1964/2245 train_time:120202ms step_avg:61.20ms +step:1965/2245 train_time:120265ms step_avg:61.20ms +step:1966/2245 train_time:120326ms step_avg:61.20ms +step:1967/2245 train_time:120390ms step_avg:61.20ms +step:1968/2245 train_time:120451ms step_avg:61.20ms +step:1969/2245 train_time:120514ms step_avg:61.21ms +step:1970/2245 train_time:120575ms step_avg:61.21ms +step:1971/2245 train_time:120637ms step_avg:61.21ms +step:1972/2245 train_time:120697ms step_avg:61.21ms +step:1973/2245 train_time:120761ms step_avg:61.21ms +step:1974/2245 train_time:120822ms step_avg:61.21ms +step:1975/2245 train_time:120884ms step_avg:61.21ms +step:1976/2245 train_time:120945ms step_avg:61.21ms +step:1977/2245 train_time:121008ms step_avg:61.21ms +step:1978/2245 train_time:121069ms step_avg:61.21ms +step:1979/2245 train_time:121132ms step_avg:61.21ms +step:1980/2245 train_time:121193ms step_avg:61.21ms +step:1981/2245 train_time:121256ms step_avg:61.21ms +step:1982/2245 train_time:121316ms step_avg:61.21ms +step:1983/2245 train_time:121378ms step_avg:61.21ms +step:1984/2245 train_time:121438ms step_avg:61.21ms +step:1985/2245 train_time:121501ms step_avg:61.21ms +step:1986/2245 train_time:121561ms step_avg:61.21ms +step:1987/2245 train_time:121624ms step_avg:61.21ms +step:1988/2245 train_time:121684ms step_avg:61.21ms +step:1989/2245 train_time:121747ms step_avg:61.21ms +step:1990/2245 train_time:121807ms step_avg:61.21ms +step:1991/2245 train_time:121871ms step_avg:61.21ms +step:1992/2245 train_time:121932ms step_avg:61.21ms +step:1993/2245 train_time:121994ms step_avg:61.21ms +step:1994/2245 train_time:122055ms step_avg:61.21ms +step:1995/2245 train_time:122117ms step_avg:61.21ms +step:1996/2245 train_time:122177ms step_avg:61.21ms +step:1997/2245 train_time:122240ms step_avg:61.21ms +step:1998/2245 train_time:122300ms step_avg:61.21ms +step:1999/2245 train_time:122362ms step_avg:61.21ms +step:2000/2245 train_time:122422ms step_avg:61.21ms +step:2000/2245 val_loss:3.3248 train_time:122486ms step_avg:61.24ms +step:2001/2245 
train_time:122507ms step_avg:61.22ms +step:2002/2245 train_time:122550ms step_avg:61.21ms +step:2003/2245 train_time:122615ms step_avg:61.22ms +step:2004/2245 train_time:122678ms step_avg:61.22ms +step:2005/2245 train_time:122742ms step_avg:61.22ms +step:2006/2245 train_time:122803ms step_avg:61.22ms +step:2007/2245 train_time:122865ms step_avg:61.22ms +step:2008/2245 train_time:122924ms step_avg:61.22ms +step:2009/2245 train_time:122986ms step_avg:61.22ms +step:2010/2245 train_time:123046ms step_avg:61.22ms +step:2011/2245 train_time:123108ms step_avg:61.22ms +step:2012/2245 train_time:123168ms step_avg:61.22ms +step:2013/2245 train_time:123230ms step_avg:61.22ms +step:2014/2245 train_time:123290ms step_avg:61.22ms +step:2015/2245 train_time:123352ms step_avg:61.22ms +step:2016/2245 train_time:123412ms step_avg:61.22ms +step:2017/2245 train_time:123475ms step_avg:61.22ms +step:2018/2245 train_time:123537ms step_avg:61.22ms +step:2019/2245 train_time:123602ms step_avg:61.22ms +step:2020/2245 train_time:123664ms step_avg:61.22ms +step:2021/2245 train_time:123728ms step_avg:61.22ms +step:2022/2245 train_time:123788ms step_avg:61.22ms +step:2023/2245 train_time:123851ms step_avg:61.22ms +step:2024/2245 train_time:123911ms step_avg:61.22ms +step:2025/2245 train_time:123974ms step_avg:61.22ms +step:2026/2245 train_time:124034ms step_avg:61.22ms +step:2027/2245 train_time:124097ms step_avg:61.22ms +step:2028/2245 train_time:124157ms step_avg:61.22ms +step:2029/2245 train_time:124219ms step_avg:61.22ms +step:2030/2245 train_time:124279ms step_avg:61.22ms +step:2031/2245 train_time:124342ms step_avg:61.22ms +step:2032/2245 train_time:124403ms step_avg:61.22ms +step:2033/2245 train_time:124466ms step_avg:61.22ms +step:2034/2245 train_time:124527ms step_avg:61.22ms +step:2035/2245 train_time:124590ms step_avg:61.22ms +step:2036/2245 train_time:124651ms step_avg:61.22ms +step:2037/2245 train_time:124715ms step_avg:61.22ms +step:2038/2245 train_time:124776ms step_avg:61.22ms +step:2039/2245 train_time:124840ms step_avg:61.23ms +step:2040/2245 train_time:124901ms step_avg:61.23ms +step:2041/2245 train_time:124964ms step_avg:61.23ms +step:2042/2245 train_time:125025ms step_avg:61.23ms +step:2043/2245 train_time:125087ms step_avg:61.23ms +step:2044/2245 train_time:125148ms step_avg:61.23ms +step:2045/2245 train_time:125209ms step_avg:61.23ms +step:2046/2245 train_time:125269ms step_avg:61.23ms +step:2047/2245 train_time:125332ms step_avg:61.23ms +step:2048/2245 train_time:125392ms step_avg:61.23ms +step:2049/2245 train_time:125455ms step_avg:61.23ms +step:2050/2245 train_time:125516ms step_avg:61.23ms +step:2051/2245 train_time:125579ms step_avg:61.23ms +step:2052/2245 train_time:125641ms step_avg:61.23ms +step:2053/2245 train_time:125705ms step_avg:61.23ms +step:2054/2245 train_time:125766ms step_avg:61.23ms +step:2055/2245 train_time:125829ms step_avg:61.23ms +step:2056/2245 train_time:125889ms step_avg:61.23ms +step:2057/2245 train_time:125952ms step_avg:61.23ms +step:2058/2245 train_time:126013ms step_avg:61.23ms +step:2059/2245 train_time:126075ms step_avg:61.23ms +step:2060/2245 train_time:126135ms step_avg:61.23ms +step:2061/2245 train_time:126198ms step_avg:61.23ms +step:2062/2245 train_time:126259ms step_avg:61.23ms +step:2063/2245 train_time:126323ms step_avg:61.23ms +step:2064/2245 train_time:126383ms step_avg:61.23ms +step:2065/2245 train_time:126447ms step_avg:61.23ms +step:2066/2245 train_time:126507ms step_avg:61.23ms +step:2067/2245 train_time:126570ms step_avg:61.23ms +step:2068/2245 
train_time:126630ms step_avg:61.23ms +step:2069/2245 train_time:126693ms step_avg:61.23ms +step:2070/2245 train_time:126754ms step_avg:61.23ms +step:2071/2245 train_time:126817ms step_avg:61.23ms +step:2072/2245 train_time:126879ms step_avg:61.23ms +step:2073/2245 train_time:126943ms step_avg:61.24ms +step:2074/2245 train_time:127004ms step_avg:61.24ms +step:2075/2245 train_time:127067ms step_avg:61.24ms +step:2076/2245 train_time:127126ms step_avg:61.24ms +step:2077/2245 train_time:127188ms step_avg:61.24ms +step:2078/2245 train_time:127249ms step_avg:61.24ms +step:2079/2245 train_time:127311ms step_avg:61.24ms +step:2080/2245 train_time:127371ms step_avg:61.24ms +step:2081/2245 train_time:127434ms step_avg:61.24ms +step:2082/2245 train_time:127494ms step_avg:61.24ms +step:2083/2245 train_time:127558ms step_avg:61.24ms +step:2084/2245 train_time:127618ms step_avg:61.24ms +step:2085/2245 train_time:127683ms step_avg:61.24ms +step:2086/2245 train_time:127745ms step_avg:61.24ms +step:2087/2245 train_time:127807ms step_avg:61.24ms +step:2088/2245 train_time:127867ms step_avg:61.24ms +step:2089/2245 train_time:127930ms step_avg:61.24ms +step:2090/2245 train_time:127991ms step_avg:61.24ms +step:2091/2245 train_time:128053ms step_avg:61.24ms +step:2092/2245 train_time:128113ms step_avg:61.24ms +step:2093/2245 train_time:128177ms step_avg:61.24ms +step:2094/2245 train_time:128238ms step_avg:61.24ms +step:2095/2245 train_time:128300ms step_avg:61.24ms +step:2096/2245 train_time:128360ms step_avg:61.24ms +step:2097/2245 train_time:128423ms step_avg:61.24ms +step:2098/2245 train_time:128483ms step_avg:61.24ms +step:2099/2245 train_time:128547ms step_avg:61.24ms +step:2100/2245 train_time:128607ms step_avg:61.24ms +step:2101/2245 train_time:128670ms step_avg:61.24ms +step:2102/2245 train_time:128731ms step_avg:61.24ms +step:2103/2245 train_time:128794ms step_avg:61.24ms +step:2104/2245 train_time:128855ms step_avg:61.24ms +step:2105/2245 train_time:128919ms step_avg:61.24ms +step:2106/2245 train_time:128980ms step_avg:61.24ms +step:2107/2245 train_time:129043ms step_avg:61.25ms +step:2108/2245 train_time:129104ms step_avg:61.24ms +step:2109/2245 train_time:129167ms step_avg:61.25ms +step:2110/2245 train_time:129227ms step_avg:61.25ms +step:2111/2245 train_time:129289ms step_avg:61.25ms +step:2112/2245 train_time:129349ms step_avg:61.24ms +step:2113/2245 train_time:129412ms step_avg:61.25ms +step:2114/2245 train_time:129472ms step_avg:61.25ms +step:2115/2245 train_time:129534ms step_avg:61.25ms +step:2116/2245 train_time:129595ms step_avg:61.25ms +step:2117/2245 train_time:129659ms step_avg:61.25ms +step:2118/2245 train_time:129719ms step_avg:61.25ms +step:2119/2245 train_time:129783ms step_avg:61.25ms +step:2120/2245 train_time:129845ms step_avg:61.25ms +step:2121/2245 train_time:129907ms step_avg:61.25ms +step:2122/2245 train_time:129968ms step_avg:61.25ms +step:2123/2245 train_time:130031ms step_avg:61.25ms +step:2124/2245 train_time:130091ms step_avg:61.25ms +step:2125/2245 train_time:130155ms step_avg:61.25ms +step:2126/2245 train_time:130214ms step_avg:61.25ms +step:2127/2245 train_time:130277ms step_avg:61.25ms +step:2128/2245 train_time:130338ms step_avg:61.25ms +step:2129/2245 train_time:130401ms step_avg:61.25ms +step:2130/2245 train_time:130462ms step_avg:61.25ms +step:2131/2245 train_time:130525ms step_avg:61.25ms +step:2132/2245 train_time:130586ms step_avg:61.25ms +step:2133/2245 train_time:130649ms step_avg:61.25ms +step:2134/2245 train_time:130709ms step_avg:61.25ms +step:2135/2245 
train_time:130772ms step_avg:61.25ms +step:2136/2245 train_time:130832ms step_avg:61.25ms +step:2137/2245 train_time:130895ms step_avg:61.25ms +step:2138/2245 train_time:130956ms step_avg:61.25ms +step:2139/2245 train_time:131019ms step_avg:61.25ms +step:2140/2245 train_time:131080ms step_avg:61.25ms +step:2141/2245 train_time:131143ms step_avg:61.25ms +step:2142/2245 train_time:131203ms step_avg:61.25ms +step:2143/2245 train_time:131266ms step_avg:61.25ms +step:2144/2245 train_time:131326ms step_avg:61.25ms +step:2145/2245 train_time:131388ms step_avg:61.25ms +step:2146/2245 train_time:131449ms step_avg:61.25ms +step:2147/2245 train_time:131512ms step_avg:61.25ms +step:2148/2245 train_time:131572ms step_avg:61.25ms +step:2149/2245 train_time:131635ms step_avg:61.25ms +step:2150/2245 train_time:131695ms step_avg:61.25ms +step:2151/2245 train_time:131760ms step_avg:61.26ms +step:2152/2245 train_time:131821ms step_avg:61.26ms +step:2153/2245 train_time:131885ms step_avg:61.26ms +step:2154/2245 train_time:131945ms step_avg:61.26ms +step:2155/2245 train_time:132008ms step_avg:61.26ms +step:2156/2245 train_time:132068ms step_avg:61.26ms +step:2157/2245 train_time:132132ms step_avg:61.26ms +step:2158/2245 train_time:132192ms step_avg:61.26ms +step:2159/2245 train_time:132256ms step_avg:61.26ms +step:2160/2245 train_time:132317ms step_avg:61.26ms +step:2161/2245 train_time:132380ms step_avg:61.26ms +step:2162/2245 train_time:132441ms step_avg:61.26ms +step:2163/2245 train_time:132504ms step_avg:61.26ms +step:2164/2245 train_time:132564ms step_avg:61.26ms +step:2165/2245 train_time:132627ms step_avg:61.26ms +step:2166/2245 train_time:132687ms step_avg:61.26ms +step:2167/2245 train_time:132750ms step_avg:61.26ms +step:2168/2245 train_time:132811ms step_avg:61.26ms +step:2169/2245 train_time:132873ms step_avg:61.26ms +step:2170/2245 train_time:132934ms step_avg:61.26ms +step:2171/2245 train_time:132997ms step_avg:61.26ms +step:2172/2245 train_time:133058ms step_avg:61.26ms +step:2173/2245 train_time:133123ms step_avg:61.26ms +step:2174/2245 train_time:133184ms step_avg:61.26ms +step:2175/2245 train_time:133247ms step_avg:61.26ms +step:2176/2245 train_time:133307ms step_avg:61.26ms +step:2177/2245 train_time:133369ms step_avg:61.26ms +step:2178/2245 train_time:133429ms step_avg:61.26ms +step:2179/2245 train_time:133492ms step_avg:61.26ms +step:2180/2245 train_time:133553ms step_avg:61.26ms +step:2181/2245 train_time:133615ms step_avg:61.26ms +step:2182/2245 train_time:133676ms step_avg:61.26ms +step:2183/2245 train_time:133739ms step_avg:61.26ms +step:2184/2245 train_time:133800ms step_avg:61.26ms +step:2185/2245 train_time:133864ms step_avg:61.26ms +step:2186/2245 train_time:133924ms step_avg:61.26ms +step:2187/2245 train_time:133988ms step_avg:61.27ms +step:2188/2245 train_time:134048ms step_avg:61.27ms +step:2189/2245 train_time:134111ms step_avg:61.27ms +step:2190/2245 train_time:134172ms step_avg:61.27ms +step:2191/2245 train_time:134234ms step_avg:61.27ms +step:2192/2245 train_time:134295ms step_avg:61.27ms +step:2193/2245 train_time:134358ms step_avg:61.27ms +step:2194/2245 train_time:134418ms step_avg:61.27ms +step:2195/2245 train_time:134481ms step_avg:61.27ms +step:2196/2245 train_time:134542ms step_avg:61.27ms +step:2197/2245 train_time:134605ms step_avg:61.27ms +step:2198/2245 train_time:134665ms step_avg:61.27ms +step:2199/2245 train_time:134728ms step_avg:61.27ms +step:2200/2245 train_time:134789ms step_avg:61.27ms +step:2201/2245 train_time:134852ms step_avg:61.27ms +step:2202/2245 
train_time:134912ms step_avg:61.27ms
+step:2203/2245 train_time:134975ms step_avg:61.27ms
+step:2204/2245 train_time:135036ms step_avg:61.27ms
+step:2205/2245 train_time:135100ms step_avg:61.27ms
+step:2206/2245 train_time:135161ms step_avg:61.27ms
+step:2207/2245 train_time:135224ms step_avg:61.27ms
+step:2208/2245 train_time:135285ms step_avg:61.27ms
+step:2209/2245 train_time:135348ms step_avg:61.27ms
+step:2210/2245 train_time:135408ms step_avg:61.27ms
+step:2211/2245 train_time:135471ms step_avg:61.27ms
+step:2212/2245 train_time:135531ms step_avg:61.27ms
+step:2213/2245 train_time:135594ms step_avg:61.27ms
+step:2214/2245 train_time:135655ms step_avg:61.27ms
+step:2215/2245 train_time:135718ms step_avg:61.27ms
+step:2216/2245 train_time:135779ms step_avg:61.27ms
+step:2217/2245 train_time:135843ms step_avg:61.27ms
+step:2218/2245 train_time:135904ms step_avg:61.27ms
+step:2219/2245 train_time:135967ms step_avg:61.27ms
+step:2220/2245 train_time:136027ms step_avg:61.27ms
+step:2221/2245 train_time:136090ms step_avg:61.27ms
+step:2222/2245 train_time:136151ms step_avg:61.27ms
+step:2223/2245 train_time:136213ms step_avg:61.27ms
+step:2224/2245 train_time:136273ms step_avg:61.27ms
+step:2225/2245 train_time:136337ms step_avg:61.27ms
+step:2226/2245 train_time:136398ms step_avg:61.27ms
+step:2227/2245 train_time:136461ms step_avg:61.28ms
+step:2228/2245 train_time:136523ms step_avg:61.28ms
+step:2229/2245 train_time:136586ms step_avg:61.28ms
+step:2230/2245 train_time:136647ms step_avg:61.28ms
+step:2231/2245 train_time:136709ms step_avg:61.28ms
+step:2232/2245 train_time:136770ms step_avg:61.28ms
+step:2233/2245 train_time:136833ms step_avg:61.28ms
+step:2234/2245 train_time:136893ms step_avg:61.28ms
+step:2235/2245 train_time:136957ms step_avg:61.28ms
+step:2236/2245 train_time:137017ms step_avg:61.28ms
+step:2237/2245 train_time:137081ms step_avg:61.28ms
+step:2238/2245 train_time:137144ms step_avg:61.28ms
+step:2239/2245 train_time:137207ms step_avg:61.28ms
+step:2240/2245 train_time:137268ms step_avg:61.28ms
+step:2241/2245 train_time:137331ms step_avg:61.28ms
+step:2242/2245 train_time:137391ms step_avg:61.28ms
+step:2243/2245 train_time:137453ms step_avg:61.28ms
+step:2244/2245 train_time:137514ms step_avg:61.28ms
+step:2245/2245 train_time:137578ms step_avg:61.28ms
+step:2245/2245 val_loss:3.2794 train_time:137639ms step_avg:61.31ms
+peak memory allocated: 29249 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-11-10_CautiousWD/README.md b/records/track_1_short/2025-11-10_CautiousWD/README.md
new file mode 100644
index 000000000..23eacbca0
--- /dev/null
+++ b/records/track_1_short/2025-11-10_CautiousWD/README.md
@@ -0,0 +1,65 @@
+# Cautious Weight Decay
+
+This record implements [Cautious Weight Decay](https://arxiv.org/abs/2510.12402).
+
+## Timing and Validation
+
+This record shortens training by 40 steps, with a slight increase in step time.
+
+This PR:
+
+```
+import scipy.stats
+import torch
+
+losses = [3.2784, 3.2771, 3.2777, 3.2790, 3.2794, 3.2813, 3.2772, 3.2772, 3.2785, 3.2783]
+times = [137.582, 137.753, 137.636, 137.507, 137.639, 137.708, 137.722, 137.677, 137.456, 137.705]
+
+print("p=%.4f" % scipy.stats.ttest_1samp(losses, 3.28, alternative="less").pvalue)
+# p=0.0018
+
+print("losses:", torch.std_mean(torch.tensor(losses)))
+# losses: (std=0.0013, mean=3.2784)
+
+print("time:", torch.std_mean(torch.tensor(times)))
+# time: (std=0.0970, mean=137.6385)
+```
+
+Previous PR (timed on same machine):
+
+```
+import scipy.stats
+import torch
+
+times = [139.813, 139.832, 139.877, 139.839, 139.939]
+
+print("time:", torch.std_mean(torch.tensor(times)))
+# time: (std=0.0499, mean=139.8600)
+```
+
+These timings show an improvement of ~2.22 seconds.
+
+Thank you to Prime Intellect for sponsoring my research.
+
+## "Cautious" weight decay
+
+I found that weight decay leads to stable training dynamics, but performance seems to suffer. I stumbled upon the paper [Cautious Weight Decay](https://arxiv.org/pdf/2510.12402), which proposes applying weight decay only to the parameters that are growing in magnitude, and this proved to be effective.
+
+Based on a suggestion from @classiclarryd, I kept weight decay on a schedule. After trying various combinations, I found that using the same schedule as the learning rate works well, so I kept the previous calculation of `effective_weight_decay = learning_rate * weight_decay`. Putting weight decay on a schedule improves CWD by a further 10-15 steps.
+
+The choice of `wd=1.2` is well tuned. In practice, it corresponds to a starting `effective_weight_decay = 1.2 x 0.03 = 0.036`.
+
+Cautious weight decay might be better called "masked decoupled weight decay". While it should be an unbiased estimator, I noticed that this weight decay has a very different training dynamic than the baseline:
+
+*(figure: val-loss, validation loss of CWD vs. the baseline)*
+
+In particular, CWD has higher validation loss for the majority of the training run. There is an inflection point when the learning rate decreases, and CWD only "catches up" to the baseline in the final steps of training. I observed this dynamic irrespective of whether WD is placed on a schedule.
+
+Parameters under CWD have a mean square magnitude less than 20% of that under the baseline. I found this pattern consistently for both MLP and ATTN parameters.
+
+I found that the condition number after CWD is virtually identical to the condition number after NorMuon:
+
+*(figure: cond-numbers, condition numbers after CWD vs. after NorMuon)*
+
+I believe this PR opens the door for rich future work, including tuning the WD schedule and CWD for Adam.
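+
+For reference, here is a minimal sketch of the masked update as it appears in this record's `NorMuon.step`, with the batching, sharding, and per-parameter `lr_mul`/`wd_mul` multipliers stripped out. The standalone function and tensor names are illustrative stand-ins, not the actual optimizer code:
+
+```
+import torch
+
+def cautious_decay_step(param: torch.Tensor, update: torch.Tensor, lr: float, wd: float):
+    # Weight decay follows the LR schedule: effective_weight_decay = lr * wd.
+    eff_wd = lr * wd
+    # Decay only the coordinates where the raw update and the parameter agree
+    # in sign; this is the same (update * param) >= 0 mask used in NorMuon.step.
+    mask = (update * param) >= 0
+    update = update + eff_wd * mask.to(update.dtype) * param
+    param.sub_(lr * update)
+```
+
+With `wd=1.2` and the initial `lr=0.03`, `eff_wd` starts at the `0.036` quoted above; unmasked coordinates receive no decay at all.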
diff --git a/records/track_1_short/2025-11-10_CautiousWD/a33fa276-234b-4c9c-9b78-43d85c411e8d.txt b/records/track_1_short/2025-11-10_CautiousWD/a33fa276-234b-4c9c-9b78-43d85c411e8d.txt new file mode 100644 index 000000000..da56e6d57 --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/a33fa276-234b-4c9c-9b78-43d85c411e8d.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = 
torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + 
(offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + 
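+    # (at this point accumulator holds this block of A @ A.T; the two lines
+    # below form C = alpha * A @ A.T + beta * A blockwise, fusing the scale
+    # and add into the matmul kernel rather than a separate elementwise pass)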
accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. 
To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor.
+    - small 1D parameters handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+        1. reduce scatter smear_gate (1 param, 7 padding params)
+        2. reduce scatter attn_gate (10 params, 6 padding params)
+        3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+        4. reduce scatter attn/mlp round 2 (16 mlp params)
+        5. wait on step 1, then compute update of 1 and schedule all gather
+        6. wait on step 2, then compute update of 2 and schedule all gather
+        7. wait on step 3, then compute update of 3 and schedule all gather
+           GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+           GPUs that receive params of type attn reshape before computing update
+        8. wait on 4, then compute update of 4 and schedule all gather
+        9. wait for each all gather to complete and update params
+      Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and dist.get_world_size()==8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4,hdim,dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+            group["param_wd"] = ref_param.new_tensor(
+                [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + 
num_params]]
+            ).view(-1, 1, 1)
+
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
+            if num_params == 0:
+                v_chunk = updated_grads
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+
+            # "Cautious" weight decay (https://arxiv.org/abs/2510.12402)
+            mask = (v_chunk * param_chunk) >= 0
+            v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype))
+
+            param_chunk.addcmul_(v_chunk, -eff_lr)
+
+            updated_params[:num_params].copy_(param_chunk)
+            if num_params < chunk_size:
+                updated_params[num_params:].zero_()
+
+            stacked_params = torch.empty(
+                (padded_num_params, *param_shape),
+                dtype=updated_params.dtype,
+                device=updated_params.device,
+            )
+
+            gather_future = dist.all_gather_into_tensor(
+                stacked_params, updated_params, async_op=True
+            ).get_future()
+
+            all_gather_infos.append(
+                {
+                    "gather_future": gather_future,
+                    "stacked_params": stacked_params,
+                    "orig_params": params,
+                }
+            )
+
+        # Final pass: wait for all_gather to complete and copy results back into original parameter tensors.
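+        # Each wait() below blocks until its all_gather has landed, so the
+        # non_blocking copies that follow read fully materialized tensors.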
+ for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + 
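+        # overrides nn.Linear's default Kaiming init: every CastedLinear here
+        # (lm_head, attn_gate, smear_gate) starts from zero weights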
with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) + self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse 
gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. 
+ + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. + use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. 
+ self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else "mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper 
for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd
+    def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False):
+        # Precompute BOS positions once per shard
+        self.tokens=tokens
+        self.size = tokens.numel()
+        self.quickload = quickload
+        if quickload:
+            # only scan first 4 million tokens, then kick off async thread to scan rest
+            self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+            self.thread = None
+            self.ready = threading.Event()
+            self.start()
+        else:
+            self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.i = 0
+        self.world_size = world_size
+        self.batch_iter = 0
+
+    def _load(self):
+        self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy()
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+            self.bos_idx = self.bos_idx_async
+
+    def next_batch(self, num_tokens_local: int, max_seq_len: int):
+        # if quickload was used, repoint to the full dataset after 5 batches
+        if self.quickload and self.batch_iter==5:
+            self.get()
+        n = len(self.bos_idx)
+        starts = [[] for _ in range(self.world_size)]
+        ends = [[] for _ in range(self.world_size)]
+
+        idx = self.i
+        for r in range(self.world_size):
+            cur_len = 0
+            while cur_len <= num_tokens_local:
+                if idx >= n:
+                    raise StopIteration(f"Insufficient BOS ahead of index {idx}; hit tail of shard.")
+                cur = self.bos_idx[idx]
+                starts[r].append(cur)
+                end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size,
+                          cur + max_seq_len,
+                          cur + num_tokens_local - cur_len + 1)
+                ends[r].append(end)
+                cur_len += end - cur
+                idx += 1
+
+            assert cur_len == num_tokens_local + 1
+        self.i = idx
+        self.batch_iter+=1
+        return starts, ends
+
+class DataPreloader:
+    # Helper for asynchronously loading next shard and indexing bos tokens
+    def __init__(self, file_iter, world_size: int = 1):
+        self.file_iter = file_iter
+        self.world_size = world_size
+        self.thread = None
+        self.data = None
+        self.ready = threading.Event()
+
+    def _load(self):
+        tokens = _load_data_shard(next(self.file_iter))
+        self.data = (tokens, BOSFinder(tokens, self.world_size))
+        self.ready.set()
+
+    def start(self):
+        self.ready.clear()
+        self.thread = threading.Thread(target=self._load)
+        self.thread.start()
+
+    def get(self):
+        if self.thread:
+            self.ready.wait()
+            self.thread.join()
+        return self.data
+
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = 
DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 
0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+# learning rate schedule: flat, then a linear decay toward a 0.1 floor, then flat
+def get_lr(step: int):
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
+    return lr
+
+def get_ws(step: int):
+    # the short window is always half the long window;
+    # "extension" steps past the schedule use the larger final window size
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set muon momentum based on step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only step the Muon params;
+    # on odd steps, step all params
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
+
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph; warm up each one with its Yarn attn_scale
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long // 2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # the rotary buffer is not stored in the state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # Muon momentum buffers are not in the state_dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#       Training and validation        #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 21:40:03 2025
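+
+# A minimal standalone sketch of how the three schedules above interact per step,
+# assuming num_iterations=2245 and cooldown_frac=0.45 (illustrative values only;
+# the real ones come from args). It reproduces the get_lr decay, the Muon momentum
+# warmup/cooldown, and the even/odd alternation between Muon-only and full steps.
+def _sketch_schedules(num_iterations=2245, cooldown_frac=0.45,
+                      warmup=300, cooldown=50, m_min=0.85, m_max=0.95):
+    cd_start = num_iterations - cooldown
+    for step in (0, 150, 300, 1500, 2200, 2244):
+        x = min(0.9999, step / num_iterations)
+        lr = 1.0
+        if x >= 1 - cooldown_frac:  # linear decay from 1.0 toward the 0.1 floor
+            w = (1 - x) / cooldown_frac
+            lr = w * 1.0 + (1 - w) * 0.1
+        if step < warmup:      # Muon momentum warms up linearly from 0.85 to 0.95...
+            m = m_min + (step / warmup) * (m_max - m_min)
+        elif step > cd_start:  # ...and cools back toward 0.85 over the last 50 steps
+            m = m_max - ((step - cd_start) / cooldown) * (m_max - m_min)
+        else:
+            m = m_max
+        adam_too = step % 2 == 1  # the Adam-side params are only stepped on odd steps
+        print(f"step={step:4d} lr_mult={lr:.3f} muon_momentum={m:.3f} adam_steps_too={adam_too}")
+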
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.6     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA H100 80GB HBM3          On  |   00000000:19:00.0 Off |                    0 |
+| N/A   42C    P0            133W /  700W |    5858MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   1  NVIDIA H100 80GB HBM3          On  |   00000000:3B:00.0 Off |                    0 |
+| N/A   35C    P0            122W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   2  NVIDIA H100 80GB HBM3          On  |   00000000:4C:00.0 Off |                    0 |
+| N/A   34C    P0            118W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   3  NVIDIA H100 80GB HBM3          On  |   00000000:5D:00.0 Off |                    0 |
+| N/A   39C    P0            123W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   4  NVIDIA H100 80GB HBM3          On  |   00000000:9B:00.0 Off |                    0 |
+| N/A   41C    P0            131W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   5  NVIDIA H100 80GB HBM3          On  |   00000000:BB:00.0 Off |                    0 |
+| N/A   34C    P0            123W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   6  NVIDIA H100 80GB HBM3          On  |   00000000:CB:00.0 Off |                    0 |
+| N/A   40C    P0            123W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+|   7  NVIDIA H100 80GB HBM3          On  |   00000000:DB:00.0 Off |                    0 |
+| N/A   34C    P0            118W /  700W |    1520MiB /  81559MiB |      0%      Default |
+|                                         |                        |             Disabled |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
++-----------------------------------------------------------------------------------------+
+
+====================================================================================================
+step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms
+step:1/2245 train_time:119ms step_avg:118.84ms
+step:2/2245 train_time:141ms step_avg:70.36ms
+step:3/2245 train_time:179ms step_avg:59.52ms
+step:4/2245 train_time:235ms step_avg:58.75ms
+step:5/2245 train_time:295ms step_avg:58.95ms
+step:6/2245 train_time:353ms step_avg:58.87ms
+step:7/2245 train_time:414ms step_avg:59.12ms
+step:8/2245 train_time:472ms step_avg:59.05ms
+step:9/2245 train_time:534ms step_avg:59.28ms
+step:10/2245 train_time:593ms step_avg:59.27ms
+step:11/2245 train_time:654ms step_avg:59.43ms
+step:12/2245 train_time:712ms step_avg:59.36ms
+step:13/2245 train_time:774ms step_avg:59.50ms
+step:14/2245 train_time:832ms step_avg:59.45ms
+step:15/2245 train_time:894ms step_avg:59.58ms +step:16/2245 train_time:953ms step_avg:59.54ms +step:17/2245 train_time:1017ms step_avg:59.83ms +step:18/2245 train_time:1080ms step_avg:59.99ms +step:19/2245 train_time:1144ms step_avg:60.21ms +step:20/2245 train_time:1204ms step_avg:60.20ms +step:21/2245 train_time:1266ms step_avg:60.29ms +step:22/2245 train_time:1326ms step_avg:60.28ms +step:23/2245 train_time:1388ms step_avg:60.35ms +step:24/2245 train_time:1447ms step_avg:60.29ms +step:25/2245 train_time:1509ms step_avg:60.35ms +step:26/2245 train_time:1568ms step_avg:60.29ms +step:27/2245 train_time:1629ms step_avg:60.33ms +step:28/2245 train_time:1689ms step_avg:60.31ms +step:29/2245 train_time:1751ms step_avg:60.37ms +step:30/2245 train_time:1810ms step_avg:60.32ms +step:31/2245 train_time:1871ms step_avg:60.35ms +step:32/2245 train_time:1930ms step_avg:60.30ms +step:33/2245 train_time:1993ms step_avg:60.40ms +step:34/2245 train_time:2053ms step_avg:60.37ms +step:35/2245 train_time:2115ms step_avg:60.42ms +step:36/2245 train_time:2174ms step_avg:60.40ms +step:37/2245 train_time:2237ms step_avg:60.47ms +step:38/2245 train_time:2298ms step_avg:60.47ms +step:39/2245 train_time:2360ms step_avg:60.51ms +step:40/2245 train_time:2419ms step_avg:60.48ms +step:41/2245 train_time:2481ms step_avg:60.52ms +step:42/2245 train_time:2541ms step_avg:60.50ms +step:43/2245 train_time:2603ms step_avg:60.53ms +step:44/2245 train_time:2662ms step_avg:60.50ms +step:45/2245 train_time:2724ms step_avg:60.54ms +step:46/2245 train_time:2785ms step_avg:60.54ms +step:47/2245 train_time:2847ms step_avg:60.58ms +step:48/2245 train_time:2907ms step_avg:60.56ms +step:49/2245 train_time:2969ms step_avg:60.59ms +step:50/2245 train_time:3028ms step_avg:60.57ms +step:51/2245 train_time:3090ms step_avg:60.59ms +step:52/2245 train_time:3149ms step_avg:60.56ms +step:53/2245 train_time:3211ms step_avg:60.59ms +step:54/2245 train_time:3271ms step_avg:60.57ms +step:55/2245 train_time:3332ms step_avg:60.59ms +step:56/2245 train_time:3392ms step_avg:60.56ms +step:57/2245 train_time:3454ms step_avg:60.59ms +step:58/2245 train_time:3513ms step_avg:60.57ms +step:59/2245 train_time:3575ms step_avg:60.60ms +step:60/2245 train_time:3635ms step_avg:60.58ms +step:61/2245 train_time:3697ms step_avg:60.60ms +step:62/2245 train_time:3756ms step_avg:60.59ms +step:63/2245 train_time:3820ms step_avg:60.63ms +step:64/2245 train_time:3879ms step_avg:60.61ms +step:65/2245 train_time:3941ms step_avg:60.63ms +step:66/2245 train_time:4001ms step_avg:60.62ms +step:67/2245 train_time:4063ms step_avg:60.64ms +step:68/2245 train_time:4123ms step_avg:60.63ms +step:69/2245 train_time:4185ms step_avg:60.65ms +step:70/2245 train_time:4244ms step_avg:60.63ms +step:71/2245 train_time:4306ms step_avg:60.65ms +step:72/2245 train_time:4366ms step_avg:60.64ms +step:73/2245 train_time:4428ms step_avg:60.65ms +step:74/2245 train_time:4488ms step_avg:60.64ms +step:75/2245 train_time:4550ms step_avg:60.67ms +step:76/2245 train_time:4609ms step_avg:60.64ms +step:77/2245 train_time:4671ms step_avg:60.66ms +step:78/2245 train_time:4730ms step_avg:60.64ms +step:79/2245 train_time:4793ms step_avg:60.67ms +step:80/2245 train_time:4852ms step_avg:60.66ms +step:81/2245 train_time:4914ms step_avg:60.66ms +step:82/2245 train_time:4973ms step_avg:60.65ms +step:83/2245 train_time:5035ms step_avg:60.66ms +step:84/2245 train_time:5094ms step_avg:60.64ms +step:85/2245 train_time:5156ms step_avg:60.65ms +step:86/2245 train_time:5215ms step_avg:60.64ms +step:87/2245 
train_time:5277ms step_avg:60.66ms +step:88/2245 train_time:5337ms step_avg:60.64ms +step:89/2245 train_time:5398ms step_avg:60.66ms +step:90/2245 train_time:5457ms step_avg:60.64ms +step:91/2245 train_time:5519ms step_avg:60.65ms +step:92/2245 train_time:5579ms step_avg:60.64ms +step:93/2245 train_time:5641ms step_avg:60.65ms +step:94/2245 train_time:5700ms step_avg:60.64ms +step:95/2245 train_time:5762ms step_avg:60.65ms +step:96/2245 train_time:5822ms step_avg:60.64ms +step:97/2245 train_time:5884ms step_avg:60.66ms +step:98/2245 train_time:5944ms step_avg:60.65ms +step:99/2245 train_time:6006ms step_avg:60.67ms +step:100/2245 train_time:6065ms step_avg:60.65ms +step:101/2245 train_time:6127ms step_avg:60.66ms +step:102/2245 train_time:6188ms step_avg:60.67ms +step:103/2245 train_time:6248ms step_avg:60.66ms +step:104/2245 train_time:6307ms step_avg:60.65ms +step:105/2245 train_time:6370ms step_avg:60.67ms +step:106/2245 train_time:6429ms step_avg:60.65ms +step:107/2245 train_time:6491ms step_avg:60.66ms +step:108/2245 train_time:6550ms step_avg:60.65ms +step:109/2245 train_time:6612ms step_avg:60.66ms +step:110/2245 train_time:6671ms step_avg:60.65ms +step:111/2245 train_time:6732ms step_avg:60.65ms +step:112/2245 train_time:6792ms step_avg:60.64ms +step:113/2245 train_time:6853ms step_avg:60.65ms +step:114/2245 train_time:6912ms step_avg:60.63ms +step:115/2245 train_time:6973ms step_avg:60.64ms +step:116/2245 train_time:7032ms step_avg:60.62ms +step:117/2245 train_time:7094ms step_avg:60.63ms +step:118/2245 train_time:7153ms step_avg:60.62ms +step:119/2245 train_time:7214ms step_avg:60.62ms +step:120/2245 train_time:7274ms step_avg:60.61ms +step:121/2245 train_time:7335ms step_avg:60.62ms +step:122/2245 train_time:7395ms step_avg:60.62ms +step:123/2245 train_time:7457ms step_avg:60.63ms +step:124/2245 train_time:7516ms step_avg:60.61ms +step:125/2245 train_time:7578ms step_avg:60.62ms +step:126/2245 train_time:7637ms step_avg:60.61ms +step:127/2245 train_time:7698ms step_avg:60.61ms +step:128/2245 train_time:7757ms step_avg:60.60ms +step:129/2245 train_time:7819ms step_avg:60.61ms +step:130/2245 train_time:7877ms step_avg:60.60ms +step:131/2245 train_time:7940ms step_avg:60.61ms +step:132/2245 train_time:7999ms step_avg:60.60ms +step:133/2245 train_time:8061ms step_avg:60.61ms +step:134/2245 train_time:8120ms step_avg:60.60ms +step:135/2245 train_time:8181ms step_avg:60.60ms +step:136/2245 train_time:8241ms step_avg:60.59ms +step:137/2245 train_time:8303ms step_avg:60.60ms +step:138/2245 train_time:8363ms step_avg:60.60ms +step:139/2245 train_time:8426ms step_avg:60.62ms +step:140/2245 train_time:8485ms step_avg:60.61ms +step:141/2245 train_time:8547ms step_avg:60.62ms +step:142/2245 train_time:8606ms step_avg:60.61ms +step:143/2245 train_time:8669ms step_avg:60.62ms +step:144/2245 train_time:8728ms step_avg:60.61ms +step:145/2245 train_time:8789ms step_avg:60.61ms +step:146/2245 train_time:8848ms step_avg:60.60ms +step:147/2245 train_time:8909ms step_avg:60.61ms +step:148/2245 train_time:8968ms step_avg:60.60ms +step:149/2245 train_time:9030ms step_avg:60.60ms +step:150/2245 train_time:9089ms step_avg:60.59ms +step:151/2245 train_time:9151ms step_avg:60.60ms +step:152/2245 train_time:9210ms step_avg:60.59ms +step:153/2245 train_time:9272ms step_avg:60.60ms +step:154/2245 train_time:9330ms step_avg:60.59ms +step:155/2245 train_time:9392ms step_avg:60.59ms +step:156/2245 train_time:9451ms step_avg:60.58ms +step:157/2245 train_time:9512ms step_avg:60.59ms +step:158/2245 
train_time:9571ms step_avg:60.58ms +step:159/2245 train_time:9633ms step_avg:60.58ms +step:160/2245 train_time:9692ms step_avg:60.58ms +step:161/2245 train_time:9754ms step_avg:60.58ms +step:162/2245 train_time:9813ms step_avg:60.57ms +step:163/2245 train_time:9875ms step_avg:60.58ms +step:164/2245 train_time:9934ms step_avg:60.57ms +step:165/2245 train_time:9995ms step_avg:60.58ms +step:166/2245 train_time:10054ms step_avg:60.57ms +step:167/2245 train_time:10115ms step_avg:60.57ms +step:168/2245 train_time:10174ms step_avg:60.56ms +step:169/2245 train_time:10235ms step_avg:60.56ms +step:170/2245 train_time:10294ms step_avg:60.55ms +step:171/2245 train_time:10355ms step_avg:60.56ms +step:172/2245 train_time:10414ms step_avg:60.54ms +step:173/2245 train_time:10475ms step_avg:60.55ms +step:174/2245 train_time:10534ms step_avg:60.54ms +step:175/2245 train_time:10596ms step_avg:60.55ms +step:176/2245 train_time:10654ms step_avg:60.54ms +step:177/2245 train_time:10716ms step_avg:60.54ms +step:178/2245 train_time:10776ms step_avg:60.54ms +step:179/2245 train_time:10838ms step_avg:60.55ms +step:180/2245 train_time:10897ms step_avg:60.54ms +step:181/2245 train_time:10959ms step_avg:60.54ms +step:182/2245 train_time:11018ms step_avg:60.54ms +step:183/2245 train_time:11079ms step_avg:60.54ms +step:184/2245 train_time:11138ms step_avg:60.53ms +step:185/2245 train_time:11199ms step_avg:60.54ms +step:186/2245 train_time:11258ms step_avg:60.53ms +step:187/2245 train_time:11319ms step_avg:60.53ms +step:188/2245 train_time:11378ms step_avg:60.52ms +step:189/2245 train_time:11440ms step_avg:60.53ms +step:190/2245 train_time:11499ms step_avg:60.52ms +step:191/2245 train_time:11562ms step_avg:60.53ms +step:192/2245 train_time:11621ms step_avg:60.53ms +step:193/2245 train_time:11683ms step_avg:60.53ms +step:194/2245 train_time:11742ms step_avg:60.53ms +step:195/2245 train_time:11804ms step_avg:60.53ms +step:196/2245 train_time:11863ms step_avg:60.53ms +step:197/2245 train_time:11925ms step_avg:60.53ms +step:198/2245 train_time:11984ms step_avg:60.52ms +step:199/2245 train_time:12045ms step_avg:60.53ms +step:200/2245 train_time:12104ms step_avg:60.52ms +step:201/2245 train_time:12166ms step_avg:60.52ms +step:202/2245 train_time:12224ms step_avg:60.52ms +step:203/2245 train_time:12286ms step_avg:60.52ms +step:204/2245 train_time:12345ms step_avg:60.52ms +step:205/2245 train_time:12407ms step_avg:60.52ms +step:206/2245 train_time:12466ms step_avg:60.52ms +step:207/2245 train_time:12528ms step_avg:60.52ms +step:208/2245 train_time:12587ms step_avg:60.52ms +step:209/2245 train_time:12649ms step_avg:60.52ms +step:210/2245 train_time:12708ms step_avg:60.51ms +step:211/2245 train_time:12769ms step_avg:60.52ms +step:212/2245 train_time:12827ms step_avg:60.51ms +step:213/2245 train_time:12889ms step_avg:60.51ms +step:214/2245 train_time:12948ms step_avg:60.50ms +step:215/2245 train_time:13009ms step_avg:60.51ms +step:216/2245 train_time:13068ms step_avg:60.50ms +step:217/2245 train_time:13129ms step_avg:60.50ms +step:218/2245 train_time:13188ms step_avg:60.49ms +step:219/2245 train_time:13249ms step_avg:60.50ms +step:220/2245 train_time:13308ms step_avg:60.49ms +step:221/2245 train_time:13370ms step_avg:60.50ms +step:222/2245 train_time:13429ms step_avg:60.49ms +step:223/2245 train_time:13491ms step_avg:60.50ms +step:224/2245 train_time:13550ms step_avg:60.49ms +step:225/2245 train_time:13612ms step_avg:60.50ms +step:226/2245 train_time:13671ms step_avg:60.49ms +step:227/2245 train_time:13732ms step_avg:60.49ms 
+step:228/2245 train_time:13791ms step_avg:60.49ms +step:229/2245 train_time:13852ms step_avg:60.49ms +step:230/2245 train_time:13911ms step_avg:60.48ms +step:231/2245 train_time:13972ms step_avg:60.49ms +step:232/2245 train_time:14031ms step_avg:60.48ms +step:233/2245 train_time:14093ms step_avg:60.48ms +step:234/2245 train_time:14152ms step_avg:60.48ms +step:235/2245 train_time:14214ms step_avg:60.48ms +step:236/2245 train_time:14273ms step_avg:60.48ms +step:237/2245 train_time:14335ms step_avg:60.49ms +step:238/2245 train_time:14394ms step_avg:60.48ms +step:239/2245 train_time:14456ms step_avg:60.49ms +step:240/2245 train_time:14515ms step_avg:60.48ms +step:241/2245 train_time:14576ms step_avg:60.48ms +step:242/2245 train_time:14635ms step_avg:60.48ms +step:243/2245 train_time:14698ms step_avg:60.48ms +step:244/2245 train_time:14756ms step_avg:60.48ms +step:245/2245 train_time:14818ms step_avg:60.48ms +step:246/2245 train_time:14876ms step_avg:60.47ms +step:247/2245 train_time:14937ms step_avg:60.47ms +step:248/2245 train_time:14996ms step_avg:60.47ms +step:249/2245 train_time:15058ms step_avg:60.47ms +step:250/2245 train_time:15117ms step_avg:60.47ms +step:250/2245 val_loss:4.0728 train_time:15180ms step_avg:60.72ms +step:251/2245 train_time:15199ms step_avg:60.55ms +step:252/2245 train_time:15241ms step_avg:60.48ms +step:253/2245 train_time:15309ms step_avg:60.51ms +step:254/2245 train_time:15373ms step_avg:60.52ms +step:255/2245 train_time:15436ms step_avg:60.53ms +step:256/2245 train_time:15495ms step_avg:60.53ms +step:257/2245 train_time:15555ms step_avg:60.53ms +step:258/2245 train_time:15614ms step_avg:60.52ms +step:259/2245 train_time:15675ms step_avg:60.52ms +step:260/2245 train_time:15734ms step_avg:60.52ms +step:261/2245 train_time:15795ms step_avg:60.52ms +step:262/2245 train_time:15853ms step_avg:60.51ms +step:263/2245 train_time:15913ms step_avg:60.51ms +step:264/2245 train_time:15972ms step_avg:60.50ms +step:265/2245 train_time:16032ms step_avg:60.50ms +step:266/2245 train_time:16091ms step_avg:60.49ms +step:267/2245 train_time:16153ms step_avg:60.50ms +step:268/2245 train_time:16213ms step_avg:60.50ms +step:269/2245 train_time:16277ms step_avg:60.51ms +step:270/2245 train_time:16338ms step_avg:60.51ms +step:271/2245 train_time:16401ms step_avg:60.52ms +step:272/2245 train_time:16460ms step_avg:60.52ms +step:273/2245 train_time:16522ms step_avg:60.52ms +step:274/2245 train_time:16581ms step_avg:60.51ms +step:275/2245 train_time:16642ms step_avg:60.52ms +step:276/2245 train_time:16701ms step_avg:60.51ms +step:277/2245 train_time:16762ms step_avg:60.51ms +step:278/2245 train_time:16821ms step_avg:60.51ms +step:279/2245 train_time:16882ms step_avg:60.51ms +step:280/2245 train_time:16941ms step_avg:60.50ms +step:281/2245 train_time:17002ms step_avg:60.51ms +step:282/2245 train_time:17061ms step_avg:60.50ms +step:283/2245 train_time:17122ms step_avg:60.50ms +step:284/2245 train_time:17182ms step_avg:60.50ms +step:285/2245 train_time:17243ms step_avg:60.50ms +step:286/2245 train_time:17303ms step_avg:60.50ms +step:287/2245 train_time:17365ms step_avg:60.51ms +step:288/2245 train_time:17425ms step_avg:60.50ms +step:289/2245 train_time:17487ms step_avg:60.51ms +step:290/2245 train_time:17547ms step_avg:60.51ms +step:291/2245 train_time:17608ms step_avg:60.51ms +step:292/2245 train_time:17667ms step_avg:60.50ms +step:293/2245 train_time:17728ms step_avg:60.51ms +step:294/2245 train_time:17787ms step_avg:60.50ms +step:295/2245 train_time:17848ms step_avg:60.50ms +step:296/2245 
train_time:17906ms step_avg:60.49ms +step:297/2245 train_time:17968ms step_avg:60.50ms +step:298/2245 train_time:18028ms step_avg:60.50ms +step:299/2245 train_time:18089ms step_avg:60.50ms +step:300/2245 train_time:18147ms step_avg:60.49ms +step:301/2245 train_time:18209ms step_avg:60.50ms +step:302/2245 train_time:18270ms step_avg:60.50ms +step:303/2245 train_time:18332ms step_avg:60.50ms +step:304/2245 train_time:18392ms step_avg:60.50ms +step:305/2245 train_time:18454ms step_avg:60.50ms +step:306/2245 train_time:18513ms step_avg:60.50ms +step:307/2245 train_time:18575ms step_avg:60.50ms +step:308/2245 train_time:18634ms step_avg:60.50ms +step:309/2245 train_time:18695ms step_avg:60.50ms +step:310/2245 train_time:18754ms step_avg:60.50ms +step:311/2245 train_time:18815ms step_avg:60.50ms +step:312/2245 train_time:18873ms step_avg:60.49ms +step:313/2245 train_time:18934ms step_avg:60.49ms +step:314/2245 train_time:18993ms step_avg:60.49ms +step:315/2245 train_time:19054ms step_avg:60.49ms +step:316/2245 train_time:19112ms step_avg:60.48ms +step:317/2245 train_time:19174ms step_avg:60.48ms +step:318/2245 train_time:19233ms step_avg:60.48ms +step:319/2245 train_time:19295ms step_avg:60.49ms +step:320/2245 train_time:19354ms step_avg:60.48ms +step:321/2245 train_time:19415ms step_avg:60.48ms +step:322/2245 train_time:19474ms step_avg:60.48ms +step:323/2245 train_time:19536ms step_avg:60.48ms +step:324/2245 train_time:19596ms step_avg:60.48ms +step:325/2245 train_time:19657ms step_avg:60.48ms +step:326/2245 train_time:19715ms step_avg:60.48ms +step:327/2245 train_time:19777ms step_avg:60.48ms +step:328/2245 train_time:19835ms step_avg:60.47ms +step:329/2245 train_time:19897ms step_avg:60.48ms +step:330/2245 train_time:19956ms step_avg:60.47ms +step:331/2245 train_time:20017ms step_avg:60.48ms +step:332/2245 train_time:20076ms step_avg:60.47ms +step:333/2245 train_time:20137ms step_avg:60.47ms +step:334/2245 train_time:20196ms step_avg:60.47ms +step:335/2245 train_time:20258ms step_avg:60.47ms +step:336/2245 train_time:20317ms step_avg:60.47ms +step:337/2245 train_time:20378ms step_avg:60.47ms +step:338/2245 train_time:20437ms step_avg:60.47ms +step:339/2245 train_time:20499ms step_avg:60.47ms +step:340/2245 train_time:20558ms step_avg:60.46ms +step:341/2245 train_time:20619ms step_avg:60.47ms +step:342/2245 train_time:20678ms step_avg:60.46ms +step:343/2245 train_time:20739ms step_avg:60.46ms +step:344/2245 train_time:20798ms step_avg:60.46ms +step:345/2245 train_time:20860ms step_avg:60.46ms +step:346/2245 train_time:20918ms step_avg:60.46ms +step:347/2245 train_time:20980ms step_avg:60.46ms +step:348/2245 train_time:21039ms step_avg:60.46ms +step:349/2245 train_time:21101ms step_avg:60.46ms +step:350/2245 train_time:21159ms step_avg:60.45ms +step:351/2245 train_time:21220ms step_avg:60.46ms +step:352/2245 train_time:21279ms step_avg:60.45ms +step:353/2245 train_time:21341ms step_avg:60.46ms +step:354/2245 train_time:21400ms step_avg:60.45ms +step:355/2245 train_time:21462ms step_avg:60.45ms +step:356/2245 train_time:21521ms step_avg:60.45ms +step:357/2245 train_time:21582ms step_avg:60.45ms +step:358/2245 train_time:21641ms step_avg:60.45ms +step:359/2245 train_time:21703ms step_avg:60.45ms +step:360/2245 train_time:21761ms step_avg:60.45ms +step:361/2245 train_time:21823ms step_avg:60.45ms +step:362/2245 train_time:21881ms step_avg:60.45ms +step:363/2245 train_time:21942ms step_avg:60.45ms +step:364/2245 train_time:22001ms step_avg:60.44ms +step:365/2245 train_time:22062ms step_avg:60.44ms 
+step:366/2245 train_time:22121ms step_avg:60.44ms +step:367/2245 train_time:22183ms step_avg:60.44ms +step:368/2245 train_time:22241ms step_avg:60.44ms +step:369/2245 train_time:22303ms step_avg:60.44ms +step:370/2245 train_time:22362ms step_avg:60.44ms +step:371/2245 train_time:22424ms step_avg:60.44ms +step:372/2245 train_time:22482ms step_avg:60.44ms +step:373/2245 train_time:22544ms step_avg:60.44ms +step:374/2245 train_time:22603ms step_avg:60.44ms +step:375/2245 train_time:22665ms step_avg:60.44ms +step:376/2245 train_time:22723ms step_avg:60.43ms +step:377/2245 train_time:22785ms step_avg:60.44ms +step:378/2245 train_time:22844ms step_avg:60.43ms +step:379/2245 train_time:22905ms step_avg:60.44ms +step:380/2245 train_time:22964ms step_avg:60.43ms +step:381/2245 train_time:23026ms step_avg:60.43ms +step:382/2245 train_time:23085ms step_avg:60.43ms +step:383/2245 train_time:23147ms step_avg:60.44ms +step:384/2245 train_time:23206ms step_avg:60.43ms +step:385/2245 train_time:23268ms step_avg:60.44ms +step:386/2245 train_time:23329ms step_avg:60.44ms +step:387/2245 train_time:23391ms step_avg:60.44ms +step:388/2245 train_time:23450ms step_avg:60.44ms +step:389/2245 train_time:23512ms step_avg:60.44ms +step:390/2245 train_time:23571ms step_avg:60.44ms +step:391/2245 train_time:23633ms step_avg:60.44ms +step:392/2245 train_time:23692ms step_avg:60.44ms +step:393/2245 train_time:23754ms step_avg:60.44ms +step:394/2245 train_time:23813ms step_avg:60.44ms +step:395/2245 train_time:23875ms step_avg:60.44ms +step:396/2245 train_time:23933ms step_avg:60.44ms +step:397/2245 train_time:23996ms step_avg:60.44ms +step:398/2245 train_time:24055ms step_avg:60.44ms +step:399/2245 train_time:24117ms step_avg:60.44ms +step:400/2245 train_time:24175ms step_avg:60.44ms +step:401/2245 train_time:24236ms step_avg:60.44ms +step:402/2245 train_time:24296ms step_avg:60.44ms +step:403/2245 train_time:24357ms step_avg:60.44ms +step:404/2245 train_time:24416ms step_avg:60.44ms +step:405/2245 train_time:24477ms step_avg:60.44ms +step:406/2245 train_time:24536ms step_avg:60.43ms +step:407/2245 train_time:24598ms step_avg:60.44ms +step:408/2245 train_time:24656ms step_avg:60.43ms +step:409/2245 train_time:24718ms step_avg:60.44ms +step:410/2245 train_time:24777ms step_avg:60.43ms +step:411/2245 train_time:24838ms step_avg:60.43ms +step:412/2245 train_time:24897ms step_avg:60.43ms +step:413/2245 train_time:24959ms step_avg:60.43ms +step:414/2245 train_time:25018ms step_avg:60.43ms +step:415/2245 train_time:25079ms step_avg:60.43ms +step:416/2245 train_time:25137ms step_avg:60.43ms +step:417/2245 train_time:25199ms step_avg:60.43ms +step:418/2245 train_time:25258ms step_avg:60.43ms +step:419/2245 train_time:25320ms step_avg:60.43ms +step:420/2245 train_time:25379ms step_avg:60.43ms +step:421/2245 train_time:25440ms step_avg:60.43ms +step:422/2245 train_time:25499ms step_avg:60.42ms +step:423/2245 train_time:25561ms step_avg:60.43ms +step:424/2245 train_time:25619ms step_avg:60.42ms +step:425/2245 train_time:25681ms step_avg:60.43ms +step:426/2245 train_time:25740ms step_avg:60.42ms +step:427/2245 train_time:25802ms step_avg:60.43ms +step:428/2245 train_time:25860ms step_avg:60.42ms +step:429/2245 train_time:25922ms step_avg:60.42ms +step:430/2245 train_time:25981ms step_avg:60.42ms +step:431/2245 train_time:26042ms step_avg:60.42ms +step:432/2245 train_time:26101ms step_avg:60.42ms +step:433/2245 train_time:26163ms step_avg:60.42ms +step:434/2245 train_time:26221ms step_avg:60.42ms +step:435/2245 train_time:26283ms 
step_avg:60.42ms +step:436/2245 train_time:26342ms step_avg:60.42ms +step:437/2245 train_time:26404ms step_avg:60.42ms +step:438/2245 train_time:26463ms step_avg:60.42ms +step:439/2245 train_time:26524ms step_avg:60.42ms +step:440/2245 train_time:26583ms step_avg:60.42ms +step:441/2245 train_time:26646ms step_avg:60.42ms +step:442/2245 train_time:26705ms step_avg:60.42ms +step:443/2245 train_time:26766ms step_avg:60.42ms +step:444/2245 train_time:26826ms step_avg:60.42ms +step:445/2245 train_time:26887ms step_avg:60.42ms +step:446/2245 train_time:26946ms step_avg:60.42ms +step:447/2245 train_time:27008ms step_avg:60.42ms +step:448/2245 train_time:27067ms step_avg:60.42ms +step:449/2245 train_time:27128ms step_avg:60.42ms +step:450/2245 train_time:27188ms step_avg:60.42ms +step:451/2245 train_time:27250ms step_avg:60.42ms +step:452/2245 train_time:27309ms step_avg:60.42ms +step:453/2245 train_time:27371ms step_avg:60.42ms +step:454/2245 train_time:27431ms step_avg:60.42ms +step:455/2245 train_time:27492ms step_avg:60.42ms +step:456/2245 train_time:27552ms step_avg:60.42ms +step:457/2245 train_time:27613ms step_avg:60.42ms +step:458/2245 train_time:27673ms step_avg:60.42ms +step:459/2245 train_time:27735ms step_avg:60.42ms +step:460/2245 train_time:27794ms step_avg:60.42ms +step:461/2245 train_time:27856ms step_avg:60.42ms +step:462/2245 train_time:27915ms step_avg:60.42ms +step:463/2245 train_time:27977ms step_avg:60.43ms +step:464/2245 train_time:28036ms step_avg:60.42ms +step:465/2245 train_time:28097ms step_avg:60.42ms +step:466/2245 train_time:28155ms step_avg:60.42ms +step:467/2245 train_time:28216ms step_avg:60.42ms +step:468/2245 train_time:28275ms step_avg:60.42ms +step:469/2245 train_time:28336ms step_avg:60.42ms +step:470/2245 train_time:28395ms step_avg:60.42ms +step:471/2245 train_time:28457ms step_avg:60.42ms +step:472/2245 train_time:28515ms step_avg:60.41ms +step:473/2245 train_time:28577ms step_avg:60.42ms +step:474/2245 train_time:28636ms step_avg:60.41ms +step:475/2245 train_time:28697ms step_avg:60.41ms +step:476/2245 train_time:28756ms step_avg:60.41ms +step:477/2245 train_time:28817ms step_avg:60.41ms +step:478/2245 train_time:28876ms step_avg:60.41ms +step:479/2245 train_time:28937ms step_avg:60.41ms +step:480/2245 train_time:28996ms step_avg:60.41ms +step:481/2245 train_time:29058ms step_avg:60.41ms +step:482/2245 train_time:29116ms step_avg:60.41ms +step:483/2245 train_time:29178ms step_avg:60.41ms +step:484/2245 train_time:29237ms step_avg:60.41ms +step:485/2245 train_time:29298ms step_avg:60.41ms +step:486/2245 train_time:29357ms step_avg:60.40ms +step:487/2245 train_time:29418ms step_avg:60.41ms +step:488/2245 train_time:29477ms step_avg:60.40ms +step:489/2245 train_time:29539ms step_avg:60.41ms +step:490/2245 train_time:29598ms step_avg:60.40ms +step:491/2245 train_time:29660ms step_avg:60.41ms +step:492/2245 train_time:29719ms step_avg:60.40ms +step:493/2245 train_time:29781ms step_avg:60.41ms +step:494/2245 train_time:29841ms step_avg:60.41ms +step:495/2245 train_time:29902ms step_avg:60.41ms +step:496/2245 train_time:29961ms step_avg:60.40ms +step:497/2245 train_time:30022ms step_avg:60.41ms +step:498/2245 train_time:30081ms step_avg:60.40ms +step:499/2245 train_time:30143ms step_avg:60.41ms +step:500/2245 train_time:30202ms step_avg:60.40ms +step:500/2245 val_loss:3.8244 train_time:30264ms step_avg:60.53ms +step:501/2245 train_time:30283ms step_avg:60.44ms +step:502/2245 train_time:30325ms step_avg:60.41ms +step:503/2245 train_time:30389ms step_avg:60.42ms 
+step:504/2245 train_time:30450ms step_avg:60.42ms +step:505/2245 train_time:30511ms step_avg:60.42ms +step:506/2245 train_time:30571ms step_avg:60.42ms +step:507/2245 train_time:30633ms step_avg:60.42ms +step:508/2245 train_time:30691ms step_avg:60.42ms +step:509/2245 train_time:30753ms step_avg:60.42ms +step:510/2245 train_time:30812ms step_avg:60.42ms +step:511/2245 train_time:30873ms step_avg:60.42ms +step:512/2245 train_time:30932ms step_avg:60.41ms +step:513/2245 train_time:30993ms step_avg:60.41ms +step:514/2245 train_time:31052ms step_avg:60.41ms +step:515/2245 train_time:31113ms step_avg:60.41ms +step:516/2245 train_time:31172ms step_avg:60.41ms +step:517/2245 train_time:31235ms step_avg:60.42ms +step:518/2245 train_time:31296ms step_avg:60.42ms +step:519/2245 train_time:31360ms step_avg:60.42ms +step:520/2245 train_time:31420ms step_avg:60.42ms +step:521/2245 train_time:31482ms step_avg:60.43ms +step:522/2245 train_time:31540ms step_avg:60.42ms +step:523/2245 train_time:31602ms step_avg:60.42ms +step:524/2245 train_time:31661ms step_avg:60.42ms +step:525/2245 train_time:31722ms step_avg:60.42ms +step:526/2245 train_time:31781ms step_avg:60.42ms +step:527/2245 train_time:31842ms step_avg:60.42ms +step:528/2245 train_time:31900ms step_avg:60.42ms +step:529/2245 train_time:31962ms step_avg:60.42ms +step:530/2245 train_time:32021ms step_avg:60.42ms +step:531/2245 train_time:32082ms step_avg:60.42ms +step:532/2245 train_time:32142ms step_avg:60.42ms +step:533/2245 train_time:32203ms step_avg:60.42ms +step:534/2245 train_time:32262ms step_avg:60.42ms +step:535/2245 train_time:32324ms step_avg:60.42ms +step:536/2245 train_time:32384ms step_avg:60.42ms +step:537/2245 train_time:32446ms step_avg:60.42ms +step:538/2245 train_time:32506ms step_avg:60.42ms +step:539/2245 train_time:32567ms step_avg:60.42ms +step:540/2245 train_time:32626ms step_avg:60.42ms +step:541/2245 train_time:32688ms step_avg:60.42ms +step:542/2245 train_time:32746ms step_avg:60.42ms +step:543/2245 train_time:32807ms step_avg:60.42ms +step:544/2245 train_time:32866ms step_avg:60.42ms +step:545/2245 train_time:32928ms step_avg:60.42ms +step:546/2245 train_time:32986ms step_avg:60.41ms +step:547/2245 train_time:33050ms step_avg:60.42ms +step:548/2245 train_time:33107ms step_avg:60.41ms +step:549/2245 train_time:33168ms step_avg:60.42ms +step:550/2245 train_time:33228ms step_avg:60.41ms +step:551/2245 train_time:33290ms step_avg:60.42ms +step:552/2245 train_time:33349ms step_avg:60.41ms +step:553/2245 train_time:33411ms step_avg:60.42ms +step:554/2245 train_time:33470ms step_avg:60.41ms +step:555/2245 train_time:33531ms step_avg:60.42ms +step:556/2245 train_time:33591ms step_avg:60.42ms +step:557/2245 train_time:33653ms step_avg:60.42ms +step:558/2245 train_time:33712ms step_avg:60.42ms +step:559/2245 train_time:33774ms step_avg:60.42ms +step:560/2245 train_time:33833ms step_avg:60.42ms +step:561/2245 train_time:33895ms step_avg:60.42ms +step:562/2245 train_time:33955ms step_avg:60.42ms +step:563/2245 train_time:34017ms step_avg:60.42ms +step:564/2245 train_time:34077ms step_avg:60.42ms +step:565/2245 train_time:34139ms step_avg:60.42ms +step:566/2245 train_time:34199ms step_avg:60.42ms +step:567/2245 train_time:34261ms step_avg:60.42ms +step:568/2245 train_time:34320ms step_avg:60.42ms +step:569/2245 train_time:34382ms step_avg:60.43ms +step:570/2245 train_time:34441ms step_avg:60.42ms +step:571/2245 train_time:34502ms step_avg:60.42ms +step:572/2245 train_time:34562ms step_avg:60.42ms +step:573/2245 train_time:34624ms 
step_avg:60.43ms +step:574/2245 train_time:34683ms step_avg:60.42ms +step:575/2245 train_time:34744ms step_avg:60.42ms +step:576/2245 train_time:34803ms step_avg:60.42ms +step:577/2245 train_time:34864ms step_avg:60.42ms +step:578/2245 train_time:34923ms step_avg:60.42ms +step:579/2245 train_time:34984ms step_avg:60.42ms +step:580/2245 train_time:35043ms step_avg:60.42ms +step:581/2245 train_time:35104ms step_avg:60.42ms +step:582/2245 train_time:35162ms step_avg:60.42ms +step:583/2245 train_time:35224ms step_avg:60.42ms +step:584/2245 train_time:35283ms step_avg:60.42ms +step:585/2245 train_time:35344ms step_avg:60.42ms +step:586/2245 train_time:35403ms step_avg:60.41ms +step:587/2245 train_time:35464ms step_avg:60.42ms +step:588/2245 train_time:35523ms step_avg:60.41ms +step:589/2245 train_time:35584ms step_avg:60.42ms +step:590/2245 train_time:35644ms step_avg:60.41ms +step:591/2245 train_time:35705ms step_avg:60.41ms +step:592/2245 train_time:35764ms step_avg:60.41ms +step:593/2245 train_time:35825ms step_avg:60.41ms +step:594/2245 train_time:35884ms step_avg:60.41ms +step:595/2245 train_time:35945ms step_avg:60.41ms +step:596/2245 train_time:36004ms step_avg:60.41ms +step:597/2245 train_time:36065ms step_avg:60.41ms +step:598/2245 train_time:36124ms step_avg:60.41ms +step:599/2245 train_time:36185ms step_avg:60.41ms +step:600/2245 train_time:36244ms step_avg:60.41ms +step:601/2245 train_time:36306ms step_avg:60.41ms +step:602/2245 train_time:36365ms step_avg:60.41ms +step:603/2245 train_time:36427ms step_avg:60.41ms +step:604/2245 train_time:36486ms step_avg:60.41ms +step:605/2245 train_time:36547ms step_avg:60.41ms +step:606/2245 train_time:36606ms step_avg:60.41ms +step:607/2245 train_time:36668ms step_avg:60.41ms +step:608/2245 train_time:36727ms step_avg:60.41ms +step:609/2245 train_time:36788ms step_avg:60.41ms +step:610/2245 train_time:36847ms step_avg:60.40ms +step:611/2245 train_time:36908ms step_avg:60.41ms +step:612/2245 train_time:36967ms step_avg:60.40ms +step:613/2245 train_time:37028ms step_avg:60.40ms +step:614/2245 train_time:37087ms step_avg:60.40ms +step:615/2245 train_time:37149ms step_avg:60.40ms +step:616/2245 train_time:37208ms step_avg:60.40ms +step:617/2245 train_time:37269ms step_avg:60.40ms +step:618/2245 train_time:37328ms step_avg:60.40ms +step:619/2245 train_time:37391ms step_avg:60.40ms +step:620/2245 train_time:37449ms step_avg:60.40ms +step:621/2245 train_time:37511ms step_avg:60.40ms +step:622/2245 train_time:37570ms step_avg:60.40ms +step:623/2245 train_time:37631ms step_avg:60.40ms +step:624/2245 train_time:37690ms step_avg:60.40ms +step:625/2245 train_time:37753ms step_avg:60.40ms +step:626/2245 train_time:37813ms step_avg:60.40ms +step:627/2245 train_time:37874ms step_avg:60.40ms +step:628/2245 train_time:37933ms step_avg:60.40ms +step:629/2245 train_time:37995ms step_avg:60.41ms +step:630/2245 train_time:38055ms step_avg:60.40ms +step:631/2245 train_time:38117ms step_avg:60.41ms +step:632/2245 train_time:38176ms step_avg:60.41ms +step:633/2245 train_time:38238ms step_avg:60.41ms +step:634/2245 train_time:38298ms step_avg:60.41ms +step:635/2245 train_time:38360ms step_avg:60.41ms +step:636/2245 train_time:38419ms step_avg:60.41ms +step:637/2245 train_time:38480ms step_avg:60.41ms +step:638/2245 train_time:38540ms step_avg:60.41ms +step:639/2245 train_time:38601ms step_avg:60.41ms +step:640/2245 train_time:38659ms step_avg:60.41ms +step:641/2245 train_time:38721ms step_avg:60.41ms +step:642/2245 train_time:38780ms step_avg:60.40ms +step:643/2245 
train_time:38841ms step_avg:60.41ms +step:644/2245 train_time:38900ms step_avg:60.40ms +step:645/2245 train_time:38962ms step_avg:60.41ms +step:646/2245 train_time:39021ms step_avg:60.40ms +step:647/2245 train_time:39083ms step_avg:60.41ms +step:648/2245 train_time:39142ms step_avg:60.40ms +step:649/2245 train_time:39203ms step_avg:60.41ms +step:650/2245 train_time:39262ms step_avg:60.40ms +step:651/2245 train_time:39324ms step_avg:60.41ms +step:652/2245 train_time:39383ms step_avg:60.40ms +step:653/2245 train_time:39444ms step_avg:60.40ms +step:654/2245 train_time:39503ms step_avg:60.40ms +step:655/2245 train_time:39564ms step_avg:60.40ms +step:656/2245 train_time:39623ms step_avg:60.40ms +step:657/2245 train_time:39684ms step_avg:60.40ms +step:658/2245 train_time:39744ms step_avg:60.40ms +step:659/2245 train_time:39805ms step_avg:60.40ms +step:660/2245 train_time:39864ms step_avg:60.40ms +step:661/2245 train_time:39925ms step_avg:60.40ms +step:662/2245 train_time:39984ms step_avg:60.40ms +step:663/2245 train_time:40046ms step_avg:60.40ms +step:664/2245 train_time:40105ms step_avg:60.40ms +step:665/2245 train_time:40166ms step_avg:60.40ms +step:666/2245 train_time:40225ms step_avg:60.40ms +step:667/2245 train_time:40286ms step_avg:60.40ms +step:668/2245 train_time:40345ms step_avg:60.40ms +step:669/2245 train_time:40407ms step_avg:60.40ms +step:670/2245 train_time:40466ms step_avg:60.40ms +step:671/2245 train_time:40527ms step_avg:60.40ms +step:672/2245 train_time:40585ms step_avg:60.40ms +step:673/2245 train_time:40647ms step_avg:60.40ms +step:674/2245 train_time:40707ms step_avg:60.40ms +step:675/2245 train_time:40768ms step_avg:60.40ms +step:676/2245 train_time:40827ms step_avg:60.40ms +step:677/2245 train_time:40889ms step_avg:60.40ms +step:678/2245 train_time:40948ms step_avg:60.40ms +step:679/2245 train_time:41009ms step_avg:60.40ms +step:680/2245 train_time:41069ms step_avg:60.40ms +step:681/2245 train_time:41130ms step_avg:60.40ms +step:682/2245 train_time:41190ms step_avg:60.40ms +step:683/2245 train_time:41251ms step_avg:60.40ms +step:684/2245 train_time:41310ms step_avg:60.40ms +step:685/2245 train_time:41372ms step_avg:60.40ms +step:686/2245 train_time:41431ms step_avg:60.39ms +step:687/2245 train_time:41492ms step_avg:60.40ms +step:688/2245 train_time:41551ms step_avg:60.39ms +step:689/2245 train_time:41613ms step_avg:60.40ms +step:690/2245 train_time:41673ms step_avg:60.40ms +step:691/2245 train_time:41734ms step_avg:60.40ms +step:692/2245 train_time:41794ms step_avg:60.40ms +step:693/2245 train_time:41855ms step_avg:60.40ms +step:694/2245 train_time:41915ms step_avg:60.40ms +step:695/2245 train_time:41977ms step_avg:60.40ms +step:696/2245 train_time:42036ms step_avg:60.40ms +step:697/2245 train_time:42098ms step_avg:60.40ms +step:698/2245 train_time:42158ms step_avg:60.40ms +step:699/2245 train_time:42220ms step_avg:60.40ms +step:700/2245 train_time:42279ms step_avg:60.40ms +step:701/2245 train_time:42341ms step_avg:60.40ms +step:702/2245 train_time:42400ms step_avg:60.40ms +step:703/2245 train_time:42461ms step_avg:60.40ms +step:704/2245 train_time:42520ms step_avg:60.40ms +step:705/2245 train_time:42582ms step_avg:60.40ms +step:706/2245 train_time:42641ms step_avg:60.40ms +step:707/2245 train_time:42703ms step_avg:60.40ms +step:708/2245 train_time:42762ms step_avg:60.40ms +step:709/2245 train_time:42823ms step_avg:60.40ms +step:710/2245 train_time:42882ms step_avg:60.40ms +step:711/2245 train_time:42944ms step_avg:60.40ms +step:712/2245 train_time:43003ms step_avg:60.40ms 
+step:713/2245 train_time:43063ms step_avg:60.40ms +step:714/2245 train_time:43122ms step_avg:60.40ms +step:715/2245 train_time:43183ms step_avg:60.40ms +step:716/2245 train_time:43242ms step_avg:60.39ms +step:717/2245 train_time:43303ms step_avg:60.39ms +step:718/2245 train_time:43362ms step_avg:60.39ms +step:719/2245 train_time:43423ms step_avg:60.39ms +step:720/2245 train_time:43867ms step_avg:60.93ms +step:721/2245 train_time:43926ms step_avg:60.92ms +step:722/2245 train_time:43985ms step_avg:60.92ms +step:723/2245 train_time:44046ms step_avg:60.92ms +step:724/2245 train_time:44104ms step_avg:60.92ms +step:725/2245 train_time:44164ms step_avg:60.92ms +step:726/2245 train_time:44222ms step_avg:60.91ms +step:727/2245 train_time:44283ms step_avg:60.91ms +step:728/2245 train_time:44341ms step_avg:60.91ms +step:729/2245 train_time:44402ms step_avg:60.91ms +step:730/2245 train_time:44460ms step_avg:60.90ms +step:731/2245 train_time:44520ms step_avg:60.90ms +step:732/2245 train_time:44579ms step_avg:60.90ms +step:733/2245 train_time:44639ms step_avg:60.90ms +step:734/2245 train_time:44698ms step_avg:60.90ms +step:735/2245 train_time:44762ms step_avg:60.90ms +step:736/2245 train_time:44826ms step_avg:60.90ms +step:737/2245 train_time:44890ms step_avg:60.91ms +step:738/2245 train_time:44950ms step_avg:60.91ms +step:739/2245 train_time:45014ms step_avg:60.91ms +step:740/2245 train_time:45073ms step_avg:60.91ms +step:741/2245 train_time:45135ms step_avg:60.91ms +step:742/2245 train_time:45195ms step_avg:60.91ms +step:743/2245 train_time:45257ms step_avg:60.91ms +step:744/2245 train_time:45317ms step_avg:60.91ms +step:745/2245 train_time:45379ms step_avg:60.91ms +step:746/2245 train_time:45438ms step_avg:60.91ms +step:747/2245 train_time:45499ms step_avg:60.91ms +step:748/2245 train_time:45558ms step_avg:60.91ms +step:749/2245 train_time:45620ms step_avg:60.91ms +step:750/2245 train_time:45680ms step_avg:60.91ms +step:750/2245 val_loss:3.6693 train_time:45744ms step_avg:60.99ms +step:751/2245 train_time:45764ms step_avg:60.94ms +step:752/2245 train_time:45806ms step_avg:60.91ms +step:753/2245 train_time:45868ms step_avg:60.91ms +step:754/2245 train_time:45929ms step_avg:60.91ms +step:755/2245 train_time:45993ms step_avg:60.92ms +step:756/2245 train_time:46052ms step_avg:60.91ms +step:757/2245 train_time:46113ms step_avg:60.92ms +step:758/2245 train_time:46173ms step_avg:60.91ms +step:759/2245 train_time:46234ms step_avg:60.91ms +step:760/2245 train_time:46293ms step_avg:60.91ms +step:761/2245 train_time:46354ms step_avg:60.91ms +step:762/2245 train_time:46413ms step_avg:60.91ms +step:763/2245 train_time:46474ms step_avg:60.91ms +step:764/2245 train_time:46533ms step_avg:60.91ms +step:765/2245 train_time:46595ms step_avg:60.91ms +step:766/2245 train_time:46661ms step_avg:60.92ms +step:767/2245 train_time:46728ms step_avg:60.92ms +step:768/2245 train_time:46790ms step_avg:60.93ms +step:769/2245 train_time:46853ms step_avg:60.93ms +step:770/2245 train_time:46914ms step_avg:60.93ms +step:771/2245 train_time:46976ms step_avg:60.93ms +step:772/2245 train_time:47035ms step_avg:60.93ms +step:773/2245 train_time:47097ms step_avg:60.93ms +step:774/2245 train_time:47157ms step_avg:60.93ms +step:775/2245 train_time:47218ms step_avg:60.93ms +step:776/2245 train_time:47278ms step_avg:60.92ms +step:777/2245 train_time:47339ms step_avg:60.92ms +step:778/2245 train_time:47398ms step_avg:60.92ms +step:779/2245 train_time:47459ms step_avg:60.92ms +step:780/2245 train_time:47518ms step_avg:60.92ms +step:781/2245 
train_time:47581ms step_avg:60.92ms +step:782/2245 train_time:47642ms step_avg:60.92ms +step:783/2245 train_time:47707ms step_avg:60.93ms +step:784/2245 train_time:47768ms step_avg:60.93ms +step:785/2245 train_time:47831ms step_avg:60.93ms +step:786/2245 train_time:47892ms step_avg:60.93ms +step:787/2245 train_time:47955ms step_avg:60.93ms +step:788/2245 train_time:48015ms step_avg:60.93ms +step:789/2245 train_time:48077ms step_avg:60.93ms +step:790/2245 train_time:48137ms step_avg:60.93ms +step:791/2245 train_time:48199ms step_avg:60.93ms +step:792/2245 train_time:48258ms step_avg:60.93ms +step:793/2245 train_time:48320ms step_avg:60.93ms +step:794/2245 train_time:48379ms step_avg:60.93ms +step:795/2245 train_time:48440ms step_avg:60.93ms +step:796/2245 train_time:48500ms step_avg:60.93ms +step:797/2245 train_time:48562ms step_avg:60.93ms +step:798/2245 train_time:48622ms step_avg:60.93ms +step:799/2245 train_time:48684ms step_avg:60.93ms +step:800/2245 train_time:48744ms step_avg:60.93ms +step:801/2245 train_time:48806ms step_avg:60.93ms +step:802/2245 train_time:48867ms step_avg:60.93ms +step:803/2245 train_time:48929ms step_avg:60.93ms +step:804/2245 train_time:48990ms step_avg:60.93ms +step:805/2245 train_time:49052ms step_avg:60.93ms +step:806/2245 train_time:49112ms step_avg:60.93ms +step:807/2245 train_time:49175ms step_avg:60.94ms +step:808/2245 train_time:49235ms step_avg:60.93ms +step:809/2245 train_time:49297ms step_avg:60.94ms +step:810/2245 train_time:49357ms step_avg:60.93ms +step:811/2245 train_time:49419ms step_avg:60.94ms +step:812/2245 train_time:49478ms step_avg:60.93ms +step:813/2245 train_time:49541ms step_avg:60.94ms +step:814/2245 train_time:49600ms step_avg:60.93ms +step:815/2245 train_time:49663ms step_avg:60.94ms +step:816/2245 train_time:49722ms step_avg:60.93ms +step:817/2245 train_time:49785ms step_avg:60.94ms +step:818/2245 train_time:49845ms step_avg:60.93ms +step:819/2245 train_time:49907ms step_avg:60.94ms +step:820/2245 train_time:49967ms step_avg:60.94ms +step:821/2245 train_time:50029ms step_avg:60.94ms +step:822/2245 train_time:50089ms step_avg:60.94ms +step:823/2245 train_time:50152ms step_avg:60.94ms +step:824/2245 train_time:50212ms step_avg:60.94ms +step:825/2245 train_time:50274ms step_avg:60.94ms +step:826/2245 train_time:50334ms step_avg:60.94ms +step:827/2245 train_time:50396ms step_avg:60.94ms +step:828/2245 train_time:50456ms step_avg:60.94ms +step:829/2245 train_time:50518ms step_avg:60.94ms +step:830/2245 train_time:50579ms step_avg:60.94ms +step:831/2245 train_time:50641ms step_avg:60.94ms +step:832/2245 train_time:50701ms step_avg:60.94ms +step:833/2245 train_time:50763ms step_avg:60.94ms +step:834/2245 train_time:50822ms step_avg:60.94ms +step:835/2245 train_time:50885ms step_avg:60.94ms +step:836/2245 train_time:50945ms step_avg:60.94ms +step:837/2245 train_time:51007ms step_avg:60.94ms +step:838/2245 train_time:51067ms step_avg:60.94ms +step:839/2245 train_time:51129ms step_avg:60.94ms +step:840/2245 train_time:51189ms step_avg:60.94ms +step:841/2245 train_time:51252ms step_avg:60.94ms +step:842/2245 train_time:51312ms step_avg:60.94ms +step:843/2245 train_time:51374ms step_avg:60.94ms +step:844/2245 train_time:51434ms step_avg:60.94ms +step:845/2245 train_time:51496ms step_avg:60.94ms +step:846/2245 train_time:51557ms step_avg:60.94ms +step:847/2245 train_time:51620ms step_avg:60.94ms +step:848/2245 train_time:51681ms step_avg:60.94ms +step:849/2245 train_time:51743ms step_avg:60.95ms +step:850/2245 train_time:51803ms step_avg:60.94ms 
+step:851/2245 train_time:51865ms step_avg:60.95ms +step:852/2245 train_time:51924ms step_avg:60.94ms +step:853/2245 train_time:51987ms step_avg:60.95ms +step:854/2245 train_time:52047ms step_avg:60.94ms +step:855/2245 train_time:52108ms step_avg:60.95ms +step:856/2245 train_time:52168ms step_avg:60.94ms +step:857/2245 train_time:52231ms step_avg:60.95ms +step:858/2245 train_time:52291ms step_avg:60.95ms +step:859/2245 train_time:52354ms step_avg:60.95ms +step:860/2245 train_time:52414ms step_avg:60.95ms +step:861/2245 train_time:52476ms step_avg:60.95ms +step:862/2245 train_time:52537ms step_avg:60.95ms +step:863/2245 train_time:52600ms step_avg:60.95ms +step:864/2245 train_time:52659ms step_avg:60.95ms +step:865/2245 train_time:52722ms step_avg:60.95ms +step:866/2245 train_time:52782ms step_avg:60.95ms +step:867/2245 train_time:52844ms step_avg:60.95ms +step:868/2245 train_time:52904ms step_avg:60.95ms +step:869/2245 train_time:52966ms step_avg:60.95ms +step:870/2245 train_time:53026ms step_avg:60.95ms +step:871/2245 train_time:53088ms step_avg:60.95ms +step:872/2245 train_time:53148ms step_avg:60.95ms +step:873/2245 train_time:53211ms step_avg:60.95ms +step:874/2245 train_time:53271ms step_avg:60.95ms +step:875/2245 train_time:53333ms step_avg:60.95ms +step:876/2245 train_time:53392ms step_avg:60.95ms +step:877/2245 train_time:53455ms step_avg:60.95ms +step:878/2245 train_time:53516ms step_avg:60.95ms +step:879/2245 train_time:53578ms step_avg:60.95ms +step:880/2245 train_time:53639ms step_avg:60.95ms +step:881/2245 train_time:53702ms step_avg:60.96ms +step:882/2245 train_time:53762ms step_avg:60.95ms +step:883/2245 train_time:53825ms step_avg:60.96ms +step:884/2245 train_time:53885ms step_avg:60.96ms +step:885/2245 train_time:53947ms step_avg:60.96ms +step:886/2245 train_time:54007ms step_avg:60.96ms +step:887/2245 train_time:54069ms step_avg:60.96ms +step:888/2245 train_time:54129ms step_avg:60.96ms +step:889/2245 train_time:54191ms step_avg:60.96ms +step:890/2245 train_time:54251ms step_avg:60.96ms +step:891/2245 train_time:54314ms step_avg:60.96ms +step:892/2245 train_time:54374ms step_avg:60.96ms +step:893/2245 train_time:54436ms step_avg:60.96ms +step:894/2245 train_time:54496ms step_avg:60.96ms +step:895/2245 train_time:54559ms step_avg:60.96ms +step:896/2245 train_time:54620ms step_avg:60.96ms +step:897/2245 train_time:54682ms step_avg:60.96ms +step:898/2245 train_time:54742ms step_avg:60.96ms +step:899/2245 train_time:54804ms step_avg:60.96ms +step:900/2245 train_time:54864ms step_avg:60.96ms +step:901/2245 train_time:54926ms step_avg:60.96ms +step:902/2245 train_time:54986ms step_avg:60.96ms +step:903/2245 train_time:55047ms step_avg:60.96ms +step:904/2245 train_time:55107ms step_avg:60.96ms +step:905/2245 train_time:55169ms step_avg:60.96ms +step:906/2245 train_time:55229ms step_avg:60.96ms +step:907/2245 train_time:55291ms step_avg:60.96ms +step:908/2245 train_time:55350ms step_avg:60.96ms +step:909/2245 train_time:55413ms step_avg:60.96ms +step:910/2245 train_time:55474ms step_avg:60.96ms +step:911/2245 train_time:55537ms step_avg:60.96ms +step:912/2245 train_time:55598ms step_avg:60.96ms +step:913/2245 train_time:55660ms step_avg:60.96ms +step:914/2245 train_time:55720ms step_avg:60.96ms +step:915/2245 train_time:55783ms step_avg:60.96ms +step:916/2245 train_time:55842ms step_avg:60.96ms +step:917/2245 train_time:55904ms step_avg:60.96ms +step:918/2245 train_time:55964ms step_avg:60.96ms +step:919/2245 train_time:56026ms step_avg:60.96ms +step:920/2245 train_time:56086ms 
step_avg:60.96ms +step:921/2245 train_time:56148ms step_avg:60.96ms +step:922/2245 train_time:56207ms step_avg:60.96ms +step:923/2245 train_time:56270ms step_avg:60.96ms +step:924/2245 train_time:56330ms step_avg:60.96ms +step:925/2245 train_time:56393ms step_avg:60.97ms +step:926/2245 train_time:56453ms step_avg:60.96ms +step:927/2245 train_time:56515ms step_avg:60.97ms +step:928/2245 train_time:56576ms step_avg:60.97ms +step:929/2245 train_time:56638ms step_avg:60.97ms +step:930/2245 train_time:56699ms step_avg:60.97ms +step:931/2245 train_time:56762ms step_avg:60.97ms +step:932/2245 train_time:56821ms step_avg:60.97ms +step:933/2245 train_time:56883ms step_avg:60.97ms +step:934/2245 train_time:56943ms step_avg:60.97ms +step:935/2245 train_time:57005ms step_avg:60.97ms +step:936/2245 train_time:57065ms step_avg:60.97ms +step:937/2245 train_time:57126ms step_avg:60.97ms +step:938/2245 train_time:57186ms step_avg:60.97ms +step:939/2245 train_time:57248ms step_avg:60.97ms +step:940/2245 train_time:57307ms step_avg:60.97ms +step:941/2245 train_time:57370ms step_avg:60.97ms +step:942/2245 train_time:57430ms step_avg:60.97ms +step:943/2245 train_time:57493ms step_avg:60.97ms +step:944/2245 train_time:57553ms step_avg:60.97ms +step:945/2245 train_time:57616ms step_avg:60.97ms +step:946/2245 train_time:57678ms step_avg:60.97ms +step:947/2245 train_time:57741ms step_avg:60.97ms +step:948/2245 train_time:57801ms step_avg:60.97ms +step:949/2245 train_time:57863ms step_avg:60.97ms +step:950/2245 train_time:57923ms step_avg:60.97ms +step:951/2245 train_time:57985ms step_avg:60.97ms +step:952/2245 train_time:58045ms step_avg:60.97ms +step:953/2245 train_time:58107ms step_avg:60.97ms +step:954/2245 train_time:58167ms step_avg:60.97ms +step:955/2245 train_time:58228ms step_avg:60.97ms +step:956/2245 train_time:58288ms step_avg:60.97ms +step:957/2245 train_time:58351ms step_avg:60.97ms +step:958/2245 train_time:58411ms step_avg:60.97ms +step:959/2245 train_time:58473ms step_avg:60.97ms +step:960/2245 train_time:58534ms step_avg:60.97ms +step:961/2245 train_time:58596ms step_avg:60.97ms +step:962/2245 train_time:58657ms step_avg:60.97ms +step:963/2245 train_time:58720ms step_avg:60.98ms +step:964/2245 train_time:58780ms step_avg:60.98ms +step:965/2245 train_time:58843ms step_avg:60.98ms +step:966/2245 train_time:58903ms step_avg:60.98ms +step:967/2245 train_time:58965ms step_avg:60.98ms +step:968/2245 train_time:59024ms step_avg:60.97ms +step:969/2245 train_time:59086ms step_avg:60.98ms +step:970/2245 train_time:59145ms step_avg:60.97ms +step:971/2245 train_time:59207ms step_avg:60.98ms +step:972/2245 train_time:59266ms step_avg:60.97ms +step:973/2245 train_time:59329ms step_avg:60.98ms +step:974/2245 train_time:59389ms step_avg:60.97ms +step:975/2245 train_time:59452ms step_avg:60.98ms +step:976/2245 train_time:59512ms step_avg:60.98ms +step:977/2245 train_time:59575ms step_avg:60.98ms +step:978/2245 train_time:59636ms step_avg:60.98ms +step:979/2245 train_time:59698ms step_avg:60.98ms +step:980/2245 train_time:59759ms step_avg:60.98ms +step:981/2245 train_time:59822ms step_avg:60.98ms +step:982/2245 train_time:59882ms step_avg:60.98ms +step:983/2245 train_time:59944ms step_avg:60.98ms +step:984/2245 train_time:60003ms step_avg:60.98ms +step:985/2245 train_time:60065ms step_avg:60.98ms +step:986/2245 train_time:60124ms step_avg:60.98ms +step:987/2245 train_time:60186ms step_avg:60.98ms +step:988/2245 train_time:60246ms step_avg:60.98ms +step:989/2245 train_time:60309ms step_avg:60.98ms +step:990/2245 
train_time:60368ms step_avg:60.98ms +step:991/2245 train_time:60431ms step_avg:60.98ms +step:992/2245 train_time:60491ms step_avg:60.98ms +step:993/2245 train_time:60554ms step_avg:60.98ms +step:994/2245 train_time:60614ms step_avg:60.98ms +step:995/2245 train_time:60677ms step_avg:60.98ms +step:996/2245 train_time:60737ms step_avg:60.98ms +step:997/2245 train_time:60800ms step_avg:60.98ms +step:998/2245 train_time:60860ms step_avg:60.98ms +step:999/2245 train_time:60922ms step_avg:60.98ms +step:1000/2245 train_time:60981ms step_avg:60.98ms +step:1000/2245 val_loss:3.5947 train_time:61044ms step_avg:61.04ms +step:1001/2245 train_time:61064ms step_avg:61.00ms +step:1002/2245 train_time:61107ms step_avg:60.98ms +step:1003/2245 train_time:61174ms step_avg:60.99ms +step:1004/2245 train_time:61237ms step_avg:60.99ms +step:1005/2245 train_time:61299ms step_avg:60.99ms +step:1006/2245 train_time:61358ms step_avg:60.99ms +step:1007/2245 train_time:61420ms step_avg:60.99ms +step:1008/2245 train_time:61479ms step_avg:60.99ms +step:1009/2245 train_time:61541ms step_avg:60.99ms +step:1010/2245 train_time:61600ms step_avg:60.99ms +step:1011/2245 train_time:61662ms step_avg:60.99ms +step:1012/2245 train_time:61721ms step_avg:60.99ms +step:1013/2245 train_time:61782ms step_avg:60.99ms +step:1014/2245 train_time:61842ms step_avg:60.99ms +step:1015/2245 train_time:61904ms step_avg:60.99ms +step:1016/2245 train_time:61965ms step_avg:60.99ms +step:1017/2245 train_time:62030ms step_avg:60.99ms +step:1018/2245 train_time:62092ms step_avg:60.99ms +step:1019/2245 train_time:62156ms step_avg:61.00ms +step:1020/2245 train_time:62217ms step_avg:61.00ms +step:1021/2245 train_time:62279ms step_avg:61.00ms +step:1022/2245 train_time:62338ms step_avg:61.00ms +step:1023/2245 train_time:62400ms step_avg:61.00ms +step:1024/2245 train_time:62459ms step_avg:61.00ms +step:1025/2245 train_time:62521ms step_avg:61.00ms +step:1026/2245 train_time:62580ms step_avg:60.99ms +step:1027/2245 train_time:62642ms step_avg:60.99ms +step:1028/2245 train_time:62701ms step_avg:60.99ms +step:1029/2245 train_time:62763ms step_avg:60.99ms +step:1030/2245 train_time:62823ms step_avg:60.99ms +step:1031/2245 train_time:62885ms step_avg:60.99ms +step:1032/2245 train_time:62945ms step_avg:60.99ms +step:1033/2245 train_time:63009ms step_avg:61.00ms +step:1034/2245 train_time:63070ms step_avg:61.00ms +step:1035/2245 train_time:63134ms step_avg:61.00ms +step:1036/2245 train_time:63195ms step_avg:61.00ms +step:1037/2245 train_time:63258ms step_avg:61.00ms +step:1038/2245 train_time:63318ms step_avg:61.00ms +step:1039/2245 train_time:63380ms step_avg:61.00ms +step:1040/2245 train_time:63440ms step_avg:61.00ms +step:1041/2245 train_time:63502ms step_avg:61.00ms +step:1042/2245 train_time:63562ms step_avg:61.00ms +step:1043/2245 train_time:63623ms step_avg:61.00ms +step:1044/2245 train_time:63683ms step_avg:61.00ms +step:1045/2245 train_time:63745ms step_avg:61.00ms +step:1046/2245 train_time:63805ms step_avg:61.00ms +step:1047/2245 train_time:63867ms step_avg:61.00ms +step:1048/2245 train_time:63928ms step_avg:61.00ms +step:1049/2245 train_time:63991ms step_avg:61.00ms +step:1050/2245 train_time:64052ms step_avg:61.00ms +step:1051/2245 train_time:64116ms step_avg:61.00ms +step:1052/2245 train_time:64176ms step_avg:61.00ms +step:1053/2245 train_time:64238ms step_avg:61.00ms +step:1054/2245 train_time:64298ms step_avg:61.00ms +step:1055/2245 train_time:64361ms step_avg:61.01ms +step:1056/2245 train_time:64420ms step_avg:61.00ms +step:1057/2245 
train_time:64482ms step_avg:61.01ms +step:1058/2245 train_time:64542ms step_avg:61.00ms +step:1059/2245 train_time:64604ms step_avg:61.00ms +step:1060/2245 train_time:64663ms step_avg:61.00ms +step:1061/2245 train_time:64725ms step_avg:61.00ms +step:1062/2245 train_time:64785ms step_avg:61.00ms +step:1063/2245 train_time:64848ms step_avg:61.00ms +step:1064/2245 train_time:64909ms step_avg:61.00ms +step:1065/2245 train_time:64972ms step_avg:61.01ms +step:1066/2245 train_time:65032ms step_avg:61.01ms +step:1067/2245 train_time:65095ms step_avg:61.01ms +step:1068/2245 train_time:65156ms step_avg:61.01ms +step:1069/2245 train_time:65219ms step_avg:61.01ms +step:1070/2245 train_time:65278ms step_avg:61.01ms +step:1071/2245 train_time:65341ms step_avg:61.01ms +step:1072/2245 train_time:65400ms step_avg:61.01ms +step:1073/2245 train_time:65463ms step_avg:61.01ms +step:1074/2245 train_time:65523ms step_avg:61.01ms +step:1075/2245 train_time:65585ms step_avg:61.01ms +step:1076/2245 train_time:65645ms step_avg:61.01ms +step:1077/2245 train_time:65707ms step_avg:61.01ms +step:1078/2245 train_time:65767ms step_avg:61.01ms +step:1079/2245 train_time:65829ms step_avg:61.01ms +step:1080/2245 train_time:65889ms step_avg:61.01ms +step:1081/2245 train_time:65951ms step_avg:61.01ms +step:1082/2245 train_time:66012ms step_avg:61.01ms +step:1083/2245 train_time:66074ms step_avg:61.01ms +step:1084/2245 train_time:66134ms step_avg:61.01ms +step:1085/2245 train_time:66196ms step_avg:61.01ms +step:1086/2245 train_time:66256ms step_avg:61.01ms +step:1087/2245 train_time:66318ms step_avg:61.01ms +step:1088/2245 train_time:66378ms step_avg:61.01ms +step:1089/2245 train_time:66440ms step_avg:61.01ms +step:1090/2245 train_time:66500ms step_avg:61.01ms +step:1091/2245 train_time:66562ms step_avg:61.01ms +step:1092/2245 train_time:66622ms step_avg:61.01ms +step:1093/2245 train_time:66684ms step_avg:61.01ms +step:1094/2245 train_time:66744ms step_avg:61.01ms +step:1095/2245 train_time:66807ms step_avg:61.01ms +step:1096/2245 train_time:66868ms step_avg:61.01ms +step:1097/2245 train_time:66931ms step_avg:61.01ms +step:1098/2245 train_time:66991ms step_avg:61.01ms +step:1099/2245 train_time:67053ms step_avg:61.01ms +step:1100/2245 train_time:67113ms step_avg:61.01ms +step:1101/2245 train_time:67176ms step_avg:61.01ms +step:1102/2245 train_time:67236ms step_avg:61.01ms +step:1103/2245 train_time:67298ms step_avg:61.01ms +step:1104/2245 train_time:67357ms step_avg:61.01ms +step:1105/2245 train_time:67419ms step_avg:61.01ms +step:1106/2245 train_time:67479ms step_avg:61.01ms +step:1107/2245 train_time:67541ms step_avg:61.01ms +step:1108/2245 train_time:67600ms step_avg:61.01ms +step:1109/2245 train_time:67663ms step_avg:61.01ms +step:1110/2245 train_time:67723ms step_avg:61.01ms +step:1111/2245 train_time:67785ms step_avg:61.01ms +step:1112/2245 train_time:67846ms step_avg:61.01ms +step:1113/2245 train_time:67910ms step_avg:61.01ms +step:1114/2245 train_time:67969ms step_avg:61.01ms +step:1115/2245 train_time:68032ms step_avg:61.02ms +step:1116/2245 train_time:68092ms step_avg:61.01ms +step:1117/2245 train_time:68154ms step_avg:61.01ms +step:1118/2245 train_time:68214ms step_avg:61.01ms +step:1119/2245 train_time:68276ms step_avg:61.02ms +step:1120/2245 train_time:68336ms step_avg:61.01ms +step:1121/2245 train_time:68398ms step_avg:61.01ms +step:1122/2245 train_time:68458ms step_avg:61.01ms +step:1123/2245 train_time:68520ms step_avg:61.02ms +step:1124/2245 train_time:68581ms step_avg:61.01ms +step:1125/2245 train_time:68643ms 
step_avg:61.02ms +step:1126/2245 train_time:68703ms step_avg:61.02ms +step:1127/2245 train_time:68766ms step_avg:61.02ms +step:1128/2245 train_time:68826ms step_avg:61.02ms +step:1129/2245 train_time:68889ms step_avg:61.02ms +step:1130/2245 train_time:68949ms step_avg:61.02ms +step:1131/2245 train_time:69013ms step_avg:61.02ms +step:1132/2245 train_time:69072ms step_avg:61.02ms +step:1133/2245 train_time:69134ms step_avg:61.02ms +step:1134/2245 train_time:69194ms step_avg:61.02ms +step:1135/2245 train_time:69256ms step_avg:61.02ms +step:1136/2245 train_time:69315ms step_avg:61.02ms +step:1137/2245 train_time:69378ms step_avg:61.02ms +step:1138/2245 train_time:69437ms step_avg:61.02ms +step:1139/2245 train_time:69499ms step_avg:61.02ms +step:1140/2245 train_time:69559ms step_avg:61.02ms +step:1141/2245 train_time:69622ms step_avg:61.02ms +step:1142/2245 train_time:69682ms step_avg:61.02ms +step:1143/2245 train_time:69744ms step_avg:61.02ms +step:1144/2245 train_time:69806ms step_avg:61.02ms +step:1145/2245 train_time:69869ms step_avg:61.02ms +step:1146/2245 train_time:69929ms step_avg:61.02ms +step:1147/2245 train_time:69991ms step_avg:61.02ms +step:1148/2245 train_time:70051ms step_avg:61.02ms +step:1149/2245 train_time:70114ms step_avg:61.02ms +step:1150/2245 train_time:70174ms step_avg:61.02ms +step:1151/2245 train_time:70236ms step_avg:61.02ms +step:1152/2245 train_time:70296ms step_avg:61.02ms +step:1153/2245 train_time:70357ms step_avg:61.02ms +step:1154/2245 train_time:70417ms step_avg:61.02ms +step:1155/2245 train_time:70479ms step_avg:61.02ms +step:1156/2245 train_time:70539ms step_avg:61.02ms +step:1157/2245 train_time:70600ms step_avg:61.02ms +step:1158/2245 train_time:70660ms step_avg:61.02ms +step:1159/2245 train_time:70723ms step_avg:61.02ms +step:1160/2245 train_time:70783ms step_avg:61.02ms +step:1161/2245 train_time:70845ms step_avg:61.02ms +step:1162/2245 train_time:70906ms step_avg:61.02ms +step:1163/2245 train_time:70970ms step_avg:61.02ms +step:1164/2245 train_time:71030ms step_avg:61.02ms +step:1165/2245 train_time:71092ms step_avg:61.02ms +step:1166/2245 train_time:71152ms step_avg:61.02ms +step:1167/2245 train_time:71215ms step_avg:61.02ms +step:1168/2245 train_time:71275ms step_avg:61.02ms +step:1169/2245 train_time:71337ms step_avg:61.02ms +step:1170/2245 train_time:71397ms step_avg:61.02ms +step:1171/2245 train_time:71459ms step_avg:61.02ms +step:1172/2245 train_time:71519ms step_avg:61.02ms +step:1173/2245 train_time:71581ms step_avg:61.02ms +step:1174/2245 train_time:71641ms step_avg:61.02ms +step:1175/2245 train_time:71703ms step_avg:61.02ms +step:1176/2245 train_time:71764ms step_avg:61.02ms +step:1177/2245 train_time:71826ms step_avg:61.02ms +step:1178/2245 train_time:71886ms step_avg:61.02ms +step:1179/2245 train_time:71949ms step_avg:61.03ms +step:1180/2245 train_time:72009ms step_avg:61.02ms +step:1181/2245 train_time:72072ms step_avg:61.03ms +step:1182/2245 train_time:72131ms step_avg:61.02ms +step:1183/2245 train_time:72194ms step_avg:61.03ms +step:1184/2245 train_time:72253ms step_avg:61.02ms +step:1185/2245 train_time:72315ms step_avg:61.03ms +step:1186/2245 train_time:72375ms step_avg:61.02ms +step:1187/2245 train_time:72437ms step_avg:61.03ms +step:1188/2245 train_time:72497ms step_avg:61.02ms +step:1189/2245 train_time:72559ms step_avg:61.03ms +step:1190/2245 train_time:72619ms step_avg:61.02ms +step:1191/2245 train_time:72682ms step_avg:61.03ms +step:1192/2245 train_time:72742ms step_avg:61.02ms +step:1193/2245 train_time:72804ms step_avg:61.03ms 
+step:1194/2245 train_time:72864ms step_avg:61.03ms +step:1195/2245 train_time:72927ms step_avg:61.03ms +step:1196/2245 train_time:72987ms step_avg:61.03ms +step:1197/2245 train_time:73050ms step_avg:61.03ms +step:1198/2245 train_time:73110ms step_avg:61.03ms +step:1199/2245 train_time:73173ms step_avg:61.03ms +step:1200/2245 train_time:73233ms step_avg:61.03ms +step:1201/2245 train_time:73295ms step_avg:61.03ms +step:1202/2245 train_time:73355ms step_avg:61.03ms +step:1203/2245 train_time:73417ms step_avg:61.03ms +step:1204/2245 train_time:73477ms step_avg:61.03ms +step:1205/2245 train_time:73538ms step_avg:61.03ms +step:1206/2245 train_time:73598ms step_avg:61.03ms +step:1207/2245 train_time:73662ms step_avg:61.03ms +step:1208/2245 train_time:73722ms step_avg:61.03ms +step:1209/2245 train_time:73784ms step_avg:61.03ms +step:1210/2245 train_time:73844ms step_avg:61.03ms +step:1211/2245 train_time:73907ms step_avg:61.03ms +step:1212/2245 train_time:73968ms step_avg:61.03ms +step:1213/2245 train_time:74031ms step_avg:61.03ms +step:1214/2245 train_time:74091ms step_avg:61.03ms +step:1215/2245 train_time:74153ms step_avg:61.03ms +step:1216/2245 train_time:74213ms step_avg:61.03ms +step:1217/2245 train_time:74275ms step_avg:61.03ms +step:1218/2245 train_time:74335ms step_avg:61.03ms +step:1219/2245 train_time:74397ms step_avg:61.03ms +step:1220/2245 train_time:74457ms step_avg:61.03ms +step:1221/2245 train_time:74519ms step_avg:61.03ms +step:1222/2245 train_time:74579ms step_avg:61.03ms +step:1223/2245 train_time:74642ms step_avg:61.03ms +step:1224/2245 train_time:74702ms step_avg:61.03ms +step:1225/2245 train_time:74764ms step_avg:61.03ms +step:1226/2245 train_time:74824ms step_avg:61.03ms +step:1227/2245 train_time:74887ms step_avg:61.03ms +step:1228/2245 train_time:74947ms step_avg:61.03ms +step:1229/2245 train_time:75011ms step_avg:61.03ms +step:1230/2245 train_time:75071ms step_avg:61.03ms +step:1231/2245 train_time:75134ms step_avg:61.03ms +step:1232/2245 train_time:75194ms step_avg:61.03ms +step:1233/2245 train_time:75255ms step_avg:61.03ms +step:1234/2245 train_time:75316ms step_avg:61.03ms +step:1235/2245 train_time:75378ms step_avg:61.03ms +step:1236/2245 train_time:75438ms step_avg:61.03ms +step:1237/2245 train_time:75500ms step_avg:61.03ms +step:1238/2245 train_time:75559ms step_avg:61.03ms +step:1239/2245 train_time:75622ms step_avg:61.04ms +step:1240/2245 train_time:75682ms step_avg:61.03ms +step:1241/2245 train_time:75744ms step_avg:61.03ms +step:1242/2245 train_time:75804ms step_avg:61.03ms +step:1243/2245 train_time:75866ms step_avg:61.03ms +step:1244/2245 train_time:75927ms step_avg:61.03ms +step:1245/2245 train_time:75990ms step_avg:61.04ms +step:1246/2245 train_time:76050ms step_avg:61.04ms +step:1247/2245 train_time:76113ms step_avg:61.04ms +step:1248/2245 train_time:76173ms step_avg:61.04ms +step:1249/2245 train_time:76235ms step_avg:61.04ms +step:1250/2245 train_time:76295ms step_avg:61.04ms +step:1250/2245 val_loss:3.5259 train_time:76358ms step_avg:61.09ms +step:1251/2245 train_time:76376ms step_avg:61.05ms +step:1252/2245 train_time:76419ms step_avg:61.04ms +step:1253/2245 train_time:76487ms step_avg:61.04ms +step:1254/2245 train_time:76552ms step_avg:61.05ms +step:1255/2245 train_time:76614ms step_avg:61.05ms +step:1256/2245 train_time:76674ms step_avg:61.05ms +step:1257/2245 train_time:76736ms step_avg:61.05ms +step:1258/2245 train_time:76795ms step_avg:61.05ms +step:1259/2245 train_time:76856ms step_avg:61.05ms +step:1260/2245 train_time:76916ms step_avg:61.04ms 
+step:1261/2245 train_time:76977ms step_avg:61.04ms +step:1262/2245 train_time:77036ms step_avg:61.04ms +step:1263/2245 train_time:77098ms step_avg:61.04ms +step:1264/2245 train_time:77158ms step_avg:61.04ms +step:1265/2245 train_time:77220ms step_avg:61.04ms +step:1266/2245 train_time:77280ms step_avg:61.04ms +step:1267/2245 train_time:77344ms step_avg:61.04ms +step:1268/2245 train_time:77404ms step_avg:61.04ms +step:1269/2245 train_time:77468ms step_avg:61.05ms +step:1270/2245 train_time:77528ms step_avg:61.05ms +step:1271/2245 train_time:77591ms step_avg:61.05ms +step:1272/2245 train_time:77652ms step_avg:61.05ms +step:1273/2245 train_time:77714ms step_avg:61.05ms +step:1274/2245 train_time:77774ms step_avg:61.05ms +step:1275/2245 train_time:77836ms step_avg:61.05ms +step:1276/2245 train_time:77895ms step_avg:61.05ms +step:1277/2245 train_time:77957ms step_avg:61.05ms +step:1278/2245 train_time:78016ms step_avg:61.05ms +step:1279/2245 train_time:78078ms step_avg:61.05ms +step:1280/2245 train_time:78138ms step_avg:61.05ms +step:1281/2245 train_time:78200ms step_avg:61.05ms +step:1282/2245 train_time:78261ms step_avg:61.05ms +step:1283/2245 train_time:78325ms step_avg:61.05ms +step:1284/2245 train_time:78385ms step_avg:61.05ms +step:1285/2245 train_time:78448ms step_avg:61.05ms +step:1286/2245 train_time:78509ms step_avg:61.05ms +step:1287/2245 train_time:78572ms step_avg:61.05ms +step:1288/2245 train_time:78632ms step_avg:61.05ms +step:1289/2245 train_time:78695ms step_avg:61.05ms +step:1290/2245 train_time:78755ms step_avg:61.05ms +step:1291/2245 train_time:78817ms step_avg:61.05ms +step:1292/2245 train_time:78876ms step_avg:61.05ms +step:1293/2245 train_time:78938ms step_avg:61.05ms +step:1294/2245 train_time:78998ms step_avg:61.05ms +step:1295/2245 train_time:79060ms step_avg:61.05ms +step:1296/2245 train_time:79120ms step_avg:61.05ms +step:1297/2245 train_time:79182ms step_avg:61.05ms +step:1298/2245 train_time:79243ms step_avg:61.05ms +step:1299/2245 train_time:79305ms step_avg:61.05ms +step:1300/2245 train_time:79366ms step_avg:61.05ms +step:1301/2245 train_time:79429ms step_avg:61.05ms +step:1302/2245 train_time:79489ms step_avg:61.05ms +step:1303/2245 train_time:79552ms step_avg:61.05ms +step:1304/2245 train_time:79611ms step_avg:61.05ms +step:1305/2245 train_time:79674ms step_avg:61.05ms +step:1306/2245 train_time:79735ms step_avg:61.05ms +step:1307/2245 train_time:79797ms step_avg:61.05ms +step:1308/2245 train_time:79857ms step_avg:61.05ms +step:1309/2245 train_time:79919ms step_avg:61.05ms +step:1310/2245 train_time:79979ms step_avg:61.05ms +step:1311/2245 train_time:80041ms step_avg:61.05ms +step:1312/2245 train_time:80101ms step_avg:61.05ms +step:1313/2245 train_time:80163ms step_avg:61.05ms +step:1314/2245 train_time:80223ms step_avg:61.05ms +step:1315/2245 train_time:80286ms step_avg:61.05ms +step:1316/2245 train_time:80346ms step_avg:61.05ms +step:1317/2245 train_time:80408ms step_avg:61.05ms +step:1318/2245 train_time:80468ms step_avg:61.05ms +step:1319/2245 train_time:80530ms step_avg:61.05ms +step:1320/2245 train_time:80590ms step_avg:61.05ms +step:1321/2245 train_time:80653ms step_avg:61.05ms +step:1322/2245 train_time:80713ms step_avg:61.05ms +step:1323/2245 train_time:80776ms step_avg:61.05ms +step:1324/2245 train_time:80836ms step_avg:61.05ms +step:1325/2245 train_time:80898ms step_avg:61.06ms +step:1326/2245 train_time:80958ms step_avg:61.05ms +step:1327/2245 train_time:81020ms step_avg:61.06ms +step:1328/2245 train_time:81081ms step_avg:61.06ms +step:1329/2245 
train_time:81144ms step_avg:61.06ms +step:1330/2245 train_time:81203ms step_avg:61.06ms +step:1331/2245 train_time:81266ms step_avg:61.06ms +step:1332/2245 train_time:81326ms step_avg:61.06ms +step:1333/2245 train_time:81389ms step_avg:61.06ms +step:1334/2245 train_time:81449ms step_avg:61.06ms +step:1335/2245 train_time:81510ms step_avg:61.06ms +step:1336/2245 train_time:81570ms step_avg:61.06ms +step:1337/2245 train_time:81632ms step_avg:61.06ms +step:1338/2245 train_time:81692ms step_avg:61.06ms +step:1339/2245 train_time:81754ms step_avg:61.06ms +step:1340/2245 train_time:81813ms step_avg:61.05ms +step:1341/2245 train_time:81876ms step_avg:61.06ms +step:1342/2245 train_time:81935ms step_avg:61.05ms +step:1343/2245 train_time:81998ms step_avg:61.06ms +step:1344/2245 train_time:82058ms step_avg:61.05ms +step:1345/2245 train_time:82120ms step_avg:61.06ms +step:1346/2245 train_time:82181ms step_avg:61.06ms +step:1347/2245 train_time:82244ms step_avg:61.06ms +step:1348/2245 train_time:82304ms step_avg:61.06ms +step:1349/2245 train_time:82366ms step_avg:61.06ms +step:1350/2245 train_time:82426ms step_avg:61.06ms +step:1351/2245 train_time:82488ms step_avg:61.06ms +step:1352/2245 train_time:82548ms step_avg:61.06ms +step:1353/2245 train_time:82611ms step_avg:61.06ms +step:1354/2245 train_time:82671ms step_avg:61.06ms +step:1355/2245 train_time:82733ms step_avg:61.06ms +step:1356/2245 train_time:82793ms step_avg:61.06ms +step:1357/2245 train_time:82855ms step_avg:61.06ms +step:1358/2245 train_time:82914ms step_avg:61.06ms +step:1359/2245 train_time:82977ms step_avg:61.06ms +step:1360/2245 train_time:83037ms step_avg:61.06ms +step:1361/2245 train_time:83099ms step_avg:61.06ms +step:1362/2245 train_time:83159ms step_avg:61.06ms +step:1363/2245 train_time:83222ms step_avg:61.06ms +step:1364/2245 train_time:83282ms step_avg:61.06ms +step:1365/2245 train_time:83345ms step_avg:61.06ms +step:1366/2245 train_time:83405ms step_avg:61.06ms +step:1367/2245 train_time:83467ms step_avg:61.06ms +step:1368/2245 train_time:83527ms step_avg:61.06ms +step:1369/2245 train_time:83590ms step_avg:61.06ms +step:1370/2245 train_time:83650ms step_avg:61.06ms +step:1371/2245 train_time:83712ms step_avg:61.06ms +step:1372/2245 train_time:83771ms step_avg:61.06ms +step:1373/2245 train_time:83834ms step_avg:61.06ms +step:1374/2245 train_time:83894ms step_avg:61.06ms +step:1375/2245 train_time:83957ms step_avg:61.06ms +step:1376/2245 train_time:84017ms step_avg:61.06ms +step:1377/2245 train_time:84079ms step_avg:61.06ms +step:1378/2245 train_time:84139ms step_avg:61.06ms +step:1379/2245 train_time:84202ms step_avg:61.06ms +step:1380/2245 train_time:84262ms step_avg:61.06ms +step:1381/2245 train_time:84325ms step_avg:61.06ms +step:1382/2245 train_time:84385ms step_avg:61.06ms +step:1383/2245 train_time:84448ms step_avg:61.06ms +step:1384/2245 train_time:84508ms step_avg:61.06ms +step:1385/2245 train_time:84570ms step_avg:61.06ms +step:1386/2245 train_time:84630ms step_avg:61.06ms +step:1387/2245 train_time:84693ms step_avg:61.06ms +step:1388/2245 train_time:84752ms step_avg:61.06ms +step:1389/2245 train_time:84814ms step_avg:61.06ms +step:1390/2245 train_time:84874ms step_avg:61.06ms +step:1391/2245 train_time:84937ms step_avg:61.06ms +step:1392/2245 train_time:84996ms step_avg:61.06ms +step:1393/2245 train_time:85059ms step_avg:61.06ms +step:1394/2245 train_time:85119ms step_avg:61.06ms +step:1395/2245 train_time:85182ms step_avg:61.06ms +step:1396/2245 train_time:85243ms step_avg:61.06ms +step:1397/2245 train_time:85305ms 
step_avg:61.06ms +step:1398/2245 train_time:85365ms step_avg:61.06ms +step:1399/2245 train_time:85429ms step_avg:61.06ms +step:1400/2245 train_time:85490ms step_avg:61.06ms +step:1401/2245 train_time:85552ms step_avg:61.06ms +step:1402/2245 train_time:85611ms step_avg:61.06ms +step:1403/2245 train_time:85673ms step_avg:61.06ms +step:1404/2245 train_time:85733ms step_avg:61.06ms +step:1405/2245 train_time:85796ms step_avg:61.06ms +step:1406/2245 train_time:85855ms step_avg:61.06ms +step:1407/2245 train_time:85917ms step_avg:61.06ms +step:1408/2245 train_time:85977ms step_avg:61.06ms +step:1409/2245 train_time:86039ms step_avg:61.06ms +step:1410/2245 train_time:86099ms step_avg:61.06ms +step:1411/2245 train_time:86161ms step_avg:61.06ms +step:1412/2245 train_time:86222ms step_avg:61.06ms +step:1413/2245 train_time:86285ms step_avg:61.07ms +step:1414/2245 train_time:86346ms step_avg:61.06ms +step:1415/2245 train_time:86408ms step_avg:61.07ms +step:1416/2245 train_time:86468ms step_avg:61.06ms +step:1417/2245 train_time:86529ms step_avg:61.07ms +step:1418/2245 train_time:86589ms step_avg:61.06ms +step:1419/2245 train_time:86652ms step_avg:61.07ms +step:1420/2245 train_time:86712ms step_avg:61.06ms +step:1421/2245 train_time:86774ms step_avg:61.07ms +step:1422/2245 train_time:86833ms step_avg:61.06ms +step:1423/2245 train_time:86896ms step_avg:61.07ms +step:1424/2245 train_time:86957ms step_avg:61.06ms +step:1425/2245 train_time:87019ms step_avg:61.07ms +step:1426/2245 train_time:87078ms step_avg:61.06ms +step:1427/2245 train_time:87141ms step_avg:61.07ms +step:1428/2245 train_time:87202ms step_avg:61.07ms +step:1429/2245 train_time:87265ms step_avg:61.07ms +step:1430/2245 train_time:87324ms step_avg:61.07ms +step:1431/2245 train_time:87387ms step_avg:61.07ms +step:1432/2245 train_time:87448ms step_avg:61.07ms +step:1433/2245 train_time:87510ms step_avg:61.07ms +step:1434/2245 train_time:87570ms step_avg:61.07ms +step:1435/2245 train_time:87632ms step_avg:61.07ms +step:1436/2245 train_time:87692ms step_avg:61.07ms +step:1437/2245 train_time:87754ms step_avg:61.07ms +step:1438/2245 train_time:87814ms step_avg:61.07ms +step:1439/2245 train_time:87876ms step_avg:61.07ms +step:1440/2245 train_time:87936ms step_avg:61.07ms +step:1441/2245 train_time:87998ms step_avg:61.07ms +step:1442/2245 train_time:88059ms step_avg:61.07ms +step:1443/2245 train_time:88121ms step_avg:61.07ms +step:1444/2245 train_time:88181ms step_avg:61.07ms +step:1445/2245 train_time:88245ms step_avg:61.07ms +step:1446/2245 train_time:88305ms step_avg:61.07ms +step:1447/2245 train_time:88367ms step_avg:61.07ms +step:1448/2245 train_time:88427ms step_avg:61.07ms +step:1449/2245 train_time:88489ms step_avg:61.07ms +step:1450/2245 train_time:88548ms step_avg:61.07ms +step:1451/2245 train_time:88610ms step_avg:61.07ms +step:1452/2245 train_time:88670ms step_avg:61.07ms +step:1453/2245 train_time:88732ms step_avg:61.07ms +step:1454/2245 train_time:88792ms step_avg:61.07ms +step:1455/2245 train_time:88855ms step_avg:61.07ms +step:1456/2245 train_time:88915ms step_avg:61.07ms +step:1457/2245 train_time:88977ms step_avg:61.07ms +step:1458/2245 train_time:89037ms step_avg:61.07ms +step:1459/2245 train_time:89100ms step_avg:61.07ms +step:1460/2245 train_time:89160ms step_avg:61.07ms +step:1461/2245 train_time:89222ms step_avg:61.07ms +step:1462/2245 train_time:89282ms step_avg:61.07ms +step:1463/2245 train_time:89345ms step_avg:61.07ms +step:1464/2245 train_time:89405ms step_avg:61.07ms +step:1465/2245 train_time:89468ms step_avg:61.07ms 
+step:1466/2245 train_time:89527ms step_avg:61.07ms +step:1467/2245 train_time:89590ms step_avg:61.07ms +step:1468/2245 train_time:89650ms step_avg:61.07ms +step:1469/2245 train_time:89712ms step_avg:61.07ms +step:1470/2245 train_time:89772ms step_avg:61.07ms +step:1471/2245 train_time:89834ms step_avg:61.07ms +step:1472/2245 train_time:89895ms step_avg:61.07ms +step:1473/2245 train_time:89958ms step_avg:61.07ms +step:1474/2245 train_time:90018ms step_avg:61.07ms +step:1475/2245 train_time:90080ms step_avg:61.07ms +step:1476/2245 train_time:90142ms step_avg:61.07ms +step:1477/2245 train_time:90205ms step_avg:61.07ms +step:1478/2245 train_time:90266ms step_avg:61.07ms +step:1479/2245 train_time:90329ms step_avg:61.07ms +step:1480/2245 train_time:90389ms step_avg:61.07ms +step:1481/2245 train_time:90452ms step_avg:61.07ms +step:1482/2245 train_time:90512ms step_avg:61.07ms +step:1483/2245 train_time:90576ms step_avg:61.08ms +step:1484/2245 train_time:90636ms step_avg:61.08ms +step:1485/2245 train_time:90699ms step_avg:61.08ms +step:1486/2245 train_time:90759ms step_avg:61.08ms +step:1487/2245 train_time:90823ms step_avg:61.08ms +step:1488/2245 train_time:90883ms step_avg:61.08ms +step:1489/2245 train_time:90946ms step_avg:61.08ms +step:1490/2245 train_time:91006ms step_avg:61.08ms +step:1491/2245 train_time:91069ms step_avg:61.08ms +step:1492/2245 train_time:91130ms step_avg:61.08ms +step:1493/2245 train_time:91193ms step_avg:61.08ms +step:1494/2245 train_time:91254ms step_avg:61.08ms +step:1495/2245 train_time:91317ms step_avg:61.08ms +step:1496/2245 train_time:91378ms step_avg:61.08ms +step:1497/2245 train_time:91442ms step_avg:61.08ms +step:1498/2245 train_time:91502ms step_avg:61.08ms +step:1499/2245 train_time:91565ms step_avg:61.08ms +step:1500/2245 train_time:91626ms step_avg:61.08ms +step:1500/2245 val_loss:3.4449 train_time:91689ms step_avg:61.13ms +step:1501/2245 train_time:91709ms step_avg:61.10ms +step:1502/2245 train_time:91749ms step_avg:61.08ms +step:1503/2245 train_time:91812ms step_avg:61.09ms +step:1504/2245 train_time:91873ms step_avg:61.09ms +step:1505/2245 train_time:91937ms step_avg:61.09ms +step:1506/2245 train_time:91997ms step_avg:61.09ms +step:1507/2245 train_time:92059ms step_avg:61.09ms +step:1508/2245 train_time:92118ms step_avg:61.09ms +step:1509/2245 train_time:92180ms step_avg:61.09ms +step:1510/2245 train_time:92240ms step_avg:61.09ms +step:1511/2245 train_time:92302ms step_avg:61.09ms +step:1512/2245 train_time:92361ms step_avg:61.09ms +step:1513/2245 train_time:92424ms step_avg:61.09ms +step:1514/2245 train_time:92484ms step_avg:61.09ms +step:1515/2245 train_time:92547ms step_avg:61.09ms +step:1516/2245 train_time:92609ms step_avg:61.09ms +step:1517/2245 train_time:92674ms step_avg:61.09ms +step:1518/2245 train_time:92735ms step_avg:61.09ms +step:1519/2245 train_time:92799ms step_avg:61.09ms +step:1520/2245 train_time:92860ms step_avg:61.09ms +step:1521/2245 train_time:92923ms step_avg:61.09ms +step:1522/2245 train_time:92984ms step_avg:61.09ms +step:1523/2245 train_time:93047ms step_avg:61.09ms +step:1524/2245 train_time:93108ms step_avg:61.09ms +step:1525/2245 train_time:93171ms step_avg:61.10ms +step:1526/2245 train_time:93230ms step_avg:61.09ms +step:1527/2245 train_time:93293ms step_avg:61.10ms +step:1528/2245 train_time:93353ms step_avg:61.09ms +step:1529/2245 train_time:93416ms step_avg:61.10ms +step:1530/2245 train_time:93476ms step_avg:61.10ms +step:1531/2245 train_time:93538ms step_avg:61.10ms +step:1532/2245 train_time:93599ms step_avg:61.10ms 
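For readers skimming these records: train_time is cumulative wall-clock milliseconds, step_avg is simply train_time divided by the step index, and val_loss is reported every 250 steps plus once at the final step. A quick standalone check against the step-1500 record above (an illustrative snippet, not part of the logged file):

# Recompute step_avg from the step-1500 record (illustrative, not part of the log).
line = "step:1500/2245 val_loss:3.4449 train_time:91689ms step_avg:61.13ms"
fields = dict(part.split(":") for part in line.split())
step = int(fields["step"].split("/")[0])
train_ms = int(fields["train_time"].removesuffix("ms"))
print(f"{train_ms / step:.2f}ms")  # -> 61.13ms, matching the logged step_avg
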
+step:1533/2245 train_time:93663ms step_avg:61.10ms +step:1534/2245 train_time:93724ms step_avg:61.10ms +step:1535/2245 train_time:93787ms step_avg:61.10ms +step:1536/2245 train_time:93849ms step_avg:61.10ms +step:1537/2245 train_time:93912ms step_avg:61.10ms +step:1538/2245 train_time:93972ms step_avg:61.10ms +step:1539/2245 train_time:94035ms step_avg:61.10ms +step:1540/2245 train_time:94096ms step_avg:61.10ms +step:1541/2245 train_time:94158ms step_avg:61.10ms +step:1542/2245 train_time:94218ms step_avg:61.10ms +step:1543/2245 train_time:94280ms step_avg:61.10ms +step:1544/2245 train_time:94340ms step_avg:61.10ms +step:1545/2245 train_time:94403ms step_avg:61.10ms +step:1546/2245 train_time:94463ms step_avg:61.10ms +step:1547/2245 train_time:94526ms step_avg:61.10ms +step:1548/2245 train_time:94587ms step_avg:61.10ms +step:1549/2245 train_time:94650ms step_avg:61.10ms +step:1550/2245 train_time:94710ms step_avg:61.10ms +step:1551/2245 train_time:94774ms step_avg:61.11ms +step:1552/2245 train_time:94834ms step_avg:61.10ms +step:1553/2245 train_time:94898ms step_avg:61.11ms +step:1554/2245 train_time:94958ms step_avg:61.11ms +step:1555/2245 train_time:95022ms step_avg:61.11ms +step:1556/2245 train_time:95083ms step_avg:61.11ms +step:1557/2245 train_time:95145ms step_avg:61.11ms +step:1558/2245 train_time:95206ms step_avg:61.11ms +step:1559/2245 train_time:95269ms step_avg:61.11ms +step:1560/2245 train_time:95329ms step_avg:61.11ms +step:1561/2245 train_time:95392ms step_avg:61.11ms +step:1562/2245 train_time:95452ms step_avg:61.11ms +step:1563/2245 train_time:95515ms step_avg:61.11ms +step:1564/2245 train_time:95575ms step_avg:61.11ms +step:1565/2245 train_time:95638ms step_avg:61.11ms +step:1566/2245 train_time:95698ms step_avg:61.11ms +step:1567/2245 train_time:95761ms step_avg:61.11ms +step:1568/2245 train_time:95821ms step_avg:61.11ms +step:1569/2245 train_time:95885ms step_avg:61.11ms +step:1570/2245 train_time:95946ms step_avg:61.11ms +step:1571/2245 train_time:96010ms step_avg:61.11ms +step:1572/2245 train_time:96071ms step_avg:61.11ms +step:1573/2245 train_time:96134ms step_avg:61.11ms +step:1574/2245 train_time:96194ms step_avg:61.11ms +step:1575/2245 train_time:96257ms step_avg:61.12ms +step:1576/2245 train_time:96317ms step_avg:61.12ms +step:1577/2245 train_time:96380ms step_avg:61.12ms +step:1578/2245 train_time:96440ms step_avg:61.12ms +step:1579/2245 train_time:96503ms step_avg:61.12ms +step:1580/2245 train_time:96564ms step_avg:61.12ms +step:1581/2245 train_time:96627ms step_avg:61.12ms +step:1582/2245 train_time:96688ms step_avg:61.12ms +step:1583/2245 train_time:96751ms step_avg:61.12ms +step:1584/2245 train_time:96812ms step_avg:61.12ms +step:1585/2245 train_time:96874ms step_avg:61.12ms +step:1586/2245 train_time:96934ms step_avg:61.12ms +step:1587/2245 train_time:96998ms step_avg:61.12ms +step:1588/2245 train_time:97058ms step_avg:61.12ms +step:1589/2245 train_time:97120ms step_avg:61.12ms +step:1590/2245 train_time:97180ms step_avg:61.12ms +step:1591/2245 train_time:97243ms step_avg:61.12ms +step:1592/2245 train_time:97304ms step_avg:61.12ms +step:1593/2245 train_time:97368ms step_avg:61.12ms +step:1594/2245 train_time:97429ms step_avg:61.12ms +step:1595/2245 train_time:97492ms step_avg:61.12ms +step:1596/2245 train_time:97552ms step_avg:61.12ms +step:1597/2245 train_time:97615ms step_avg:61.12ms +step:1598/2245 train_time:97675ms step_avg:61.12ms +step:1599/2245 train_time:97738ms step_avg:61.12ms +step:1600/2245 train_time:97799ms step_avg:61.12ms +step:1601/2245 
train_time:97861ms step_avg:61.13ms +step:1602/2245 train_time:97922ms step_avg:61.12ms +step:1603/2245 train_time:97985ms step_avg:61.13ms +step:1604/2245 train_time:98047ms step_avg:61.13ms +step:1605/2245 train_time:98109ms step_avg:61.13ms +step:1606/2245 train_time:98170ms step_avg:61.13ms +step:1607/2245 train_time:98233ms step_avg:61.13ms +step:1608/2245 train_time:98293ms step_avg:61.13ms +step:1609/2245 train_time:98356ms step_avg:61.13ms +step:1610/2245 train_time:98416ms step_avg:61.13ms +step:1611/2245 train_time:98479ms step_avg:61.13ms +step:1612/2245 train_time:98539ms step_avg:61.13ms +step:1613/2245 train_time:98602ms step_avg:61.13ms +step:1614/2245 train_time:98662ms step_avg:61.13ms +step:1615/2245 train_time:98725ms step_avg:61.13ms +step:1616/2245 train_time:98785ms step_avg:61.13ms +step:1617/2245 train_time:98848ms step_avg:61.13ms +step:1618/2245 train_time:98909ms step_avg:61.13ms +step:1619/2245 train_time:98972ms step_avg:61.13ms +step:1620/2245 train_time:99033ms step_avg:61.13ms +step:1621/2245 train_time:99095ms step_avg:61.13ms +step:1622/2245 train_time:99155ms step_avg:61.13ms +step:1623/2245 train_time:99218ms step_avg:61.13ms +step:1624/2245 train_time:99279ms step_avg:61.13ms +step:1625/2245 train_time:99341ms step_avg:61.13ms +step:1626/2245 train_time:99402ms step_avg:61.13ms +step:1627/2245 train_time:99465ms step_avg:61.13ms +step:1628/2245 train_time:99526ms step_avg:61.13ms +step:1629/2245 train_time:99590ms step_avg:61.14ms +step:1630/2245 train_time:99650ms step_avg:61.14ms +step:1631/2245 train_time:99713ms step_avg:61.14ms +step:1632/2245 train_time:99773ms step_avg:61.14ms +step:1633/2245 train_time:99835ms step_avg:61.14ms +step:1634/2245 train_time:99895ms step_avg:61.14ms +step:1635/2245 train_time:99958ms step_avg:61.14ms +step:1636/2245 train_time:100018ms step_avg:61.14ms +step:1637/2245 train_time:100081ms step_avg:61.14ms +step:1638/2245 train_time:100141ms step_avg:61.14ms +step:1639/2245 train_time:100204ms step_avg:61.14ms +step:1640/2245 train_time:100265ms step_avg:61.14ms +step:1641/2245 train_time:100328ms step_avg:61.14ms +step:1642/2245 train_time:100389ms step_avg:61.14ms +step:1643/2245 train_time:100452ms step_avg:61.14ms +step:1644/2245 train_time:100513ms step_avg:61.14ms +step:1645/2245 train_time:100575ms step_avg:61.14ms +step:1646/2245 train_time:100635ms step_avg:61.14ms +step:1647/2245 train_time:100698ms step_avg:61.14ms +step:1648/2245 train_time:100758ms step_avg:61.14ms +step:1649/2245 train_time:100821ms step_avg:61.14ms +step:1650/2245 train_time:100881ms step_avg:61.14ms +step:1651/2245 train_time:100945ms step_avg:61.14ms +step:1652/2245 train_time:101006ms step_avg:61.14ms +step:1653/2245 train_time:101069ms step_avg:61.14ms +step:1654/2245 train_time:101130ms step_avg:61.14ms +step:1655/2245 train_time:101193ms step_avg:61.14ms +step:1656/2245 train_time:101253ms step_avg:61.14ms +step:1657/2245 train_time:101315ms step_avg:61.14ms +step:1658/2245 train_time:101376ms step_avg:61.14ms +step:1659/2245 train_time:101438ms step_avg:61.14ms +step:1660/2245 train_time:101499ms step_avg:61.14ms +step:1661/2245 train_time:101561ms step_avg:61.14ms +step:1662/2245 train_time:101622ms step_avg:61.14ms +step:1663/2245 train_time:101684ms step_avg:61.15ms +step:1664/2245 train_time:101745ms step_avg:61.14ms +step:1665/2245 train_time:101808ms step_avg:61.15ms +step:1666/2245 train_time:101868ms step_avg:61.15ms +step:1667/2245 train_time:101932ms step_avg:61.15ms +step:1668/2245 train_time:101993ms step_avg:61.15ms 
+step:1669/2245 train_time:102055ms step_avg:61.15ms +step:1670/2245 train_time:102115ms step_avg:61.15ms +step:1671/2245 train_time:102178ms step_avg:61.15ms +step:1672/2245 train_time:102238ms step_avg:61.15ms +step:1673/2245 train_time:102301ms step_avg:61.15ms +step:1674/2245 train_time:102361ms step_avg:61.15ms +step:1675/2245 train_time:102425ms step_avg:61.15ms +step:1676/2245 train_time:102485ms step_avg:61.15ms +step:1677/2245 train_time:102548ms step_avg:61.15ms +step:1678/2245 train_time:102609ms step_avg:61.15ms +step:1679/2245 train_time:102672ms step_avg:61.15ms +step:1680/2245 train_time:102732ms step_avg:61.15ms +step:1681/2245 train_time:102795ms step_avg:61.15ms +step:1682/2245 train_time:102856ms step_avg:61.15ms +step:1683/2245 train_time:102919ms step_avg:61.15ms +step:1684/2245 train_time:102980ms step_avg:61.15ms +step:1685/2245 train_time:103044ms step_avg:61.15ms +step:1686/2245 train_time:103104ms step_avg:61.15ms +step:1687/2245 train_time:103168ms step_avg:61.15ms +step:1688/2245 train_time:103230ms step_avg:61.15ms +step:1689/2245 train_time:103293ms step_avg:61.16ms +step:1690/2245 train_time:103353ms step_avg:61.16ms +step:1691/2245 train_time:103416ms step_avg:61.16ms +step:1692/2245 train_time:103476ms step_avg:61.16ms +step:1693/2245 train_time:103539ms step_avg:61.16ms +step:1694/2245 train_time:103599ms step_avg:61.16ms +step:1695/2245 train_time:103663ms step_avg:61.16ms +step:1696/2245 train_time:103723ms step_avg:61.16ms +step:1697/2245 train_time:103786ms step_avg:61.16ms +step:1698/2245 train_time:103846ms step_avg:61.16ms +step:1699/2245 train_time:103911ms step_avg:61.16ms +step:1700/2245 train_time:103972ms step_avg:61.16ms +step:1701/2245 train_time:104035ms step_avg:61.16ms +step:1702/2245 train_time:104096ms step_avg:61.16ms +step:1703/2245 train_time:104159ms step_avg:61.16ms +step:1704/2245 train_time:104219ms step_avg:61.16ms +step:1705/2245 train_time:104282ms step_avg:61.16ms +step:1706/2245 train_time:104344ms step_avg:61.16ms +step:1707/2245 train_time:104407ms step_avg:61.16ms +step:1708/2245 train_time:104469ms step_avg:61.16ms +step:1709/2245 train_time:104533ms step_avg:61.17ms +step:1710/2245 train_time:104593ms step_avg:61.17ms +step:1711/2245 train_time:104656ms step_avg:61.17ms +step:1712/2245 train_time:104716ms step_avg:61.17ms +step:1713/2245 train_time:104779ms step_avg:61.17ms +step:1714/2245 train_time:104839ms step_avg:61.17ms +step:1715/2245 train_time:104903ms step_avg:61.17ms +step:1716/2245 train_time:104963ms step_avg:61.17ms +step:1717/2245 train_time:105026ms step_avg:61.17ms +step:1718/2245 train_time:105087ms step_avg:61.17ms +step:1719/2245 train_time:105150ms step_avg:61.17ms +step:1720/2245 train_time:105211ms step_avg:61.17ms +step:1721/2245 train_time:105274ms step_avg:61.17ms +step:1722/2245 train_time:105334ms step_avg:61.17ms +step:1723/2245 train_time:105397ms step_avg:61.17ms +step:1724/2245 train_time:105457ms step_avg:61.17ms +step:1725/2245 train_time:105520ms step_avg:61.17ms +step:1726/2245 train_time:105580ms step_avg:61.17ms +step:1727/2245 train_time:105643ms step_avg:61.17ms +step:1728/2245 train_time:105703ms step_avg:61.17ms +step:1729/2245 train_time:105766ms step_avg:61.17ms +step:1730/2245 train_time:105827ms step_avg:61.17ms +step:1731/2245 train_time:105890ms step_avg:61.17ms +step:1732/2245 train_time:105950ms step_avg:61.17ms +step:1733/2245 train_time:106012ms step_avg:61.17ms +step:1734/2245 train_time:106073ms step_avg:61.17ms +step:1735/2245 train_time:106135ms step_avg:61.17ms 
+step:1736/2245 train_time:106195ms step_avg:61.17ms +step:1737/2245 train_time:106258ms step_avg:61.17ms +step:1738/2245 train_time:106318ms step_avg:61.17ms +step:1739/2245 train_time:106381ms step_avg:61.17ms +step:1740/2245 train_time:106442ms step_avg:61.17ms +step:1741/2245 train_time:106506ms step_avg:61.17ms +step:1742/2245 train_time:106567ms step_avg:61.18ms +step:1743/2245 train_time:106629ms step_avg:61.18ms +step:1744/2245 train_time:106690ms step_avg:61.18ms +step:1745/2245 train_time:106753ms step_avg:61.18ms +step:1746/2245 train_time:106813ms step_avg:61.18ms +step:1747/2245 train_time:106876ms step_avg:61.18ms +step:1748/2245 train_time:106937ms step_avg:61.18ms +step:1749/2245 train_time:107000ms step_avg:61.18ms +step:1750/2245 train_time:107060ms step_avg:61.18ms +step:1750/2245 val_loss:3.3807 train_time:107124ms step_avg:61.21ms +step:1751/2245 train_time:107143ms step_avg:61.19ms +step:1752/2245 train_time:107187ms step_avg:61.18ms +step:1753/2245 train_time:107253ms step_avg:61.18ms +step:1754/2245 train_time:107315ms step_avg:61.18ms +step:1755/2245 train_time:107378ms step_avg:61.18ms +step:1756/2245 train_time:107438ms step_avg:61.18ms +step:1757/2245 train_time:107500ms step_avg:61.18ms +step:1758/2245 train_time:107560ms step_avg:61.18ms +step:1759/2245 train_time:107622ms step_avg:61.18ms +step:1760/2245 train_time:107682ms step_avg:61.18ms +step:1761/2245 train_time:107744ms step_avg:61.18ms +step:1762/2245 train_time:107805ms step_avg:61.18ms +step:1763/2245 train_time:107867ms step_avg:61.18ms +step:1764/2245 train_time:107927ms step_avg:61.18ms +step:1765/2245 train_time:107990ms step_avg:61.18ms +step:1766/2245 train_time:108051ms step_avg:61.18ms +step:1767/2245 train_time:108114ms step_avg:61.19ms +step:1768/2245 train_time:108176ms step_avg:61.19ms +step:1769/2245 train_time:108240ms step_avg:61.19ms +step:1770/2245 train_time:108302ms step_avg:61.19ms +step:1771/2245 train_time:108365ms step_avg:61.19ms +step:1772/2245 train_time:108425ms step_avg:61.19ms +step:1773/2245 train_time:108489ms step_avg:61.19ms +step:1774/2245 train_time:108549ms step_avg:61.19ms +step:1775/2245 train_time:108612ms step_avg:61.19ms +step:1776/2245 train_time:108672ms step_avg:61.19ms +step:1777/2245 train_time:108735ms step_avg:61.19ms +step:1778/2245 train_time:108795ms step_avg:61.19ms +step:1779/2245 train_time:108858ms step_avg:61.19ms +step:1780/2245 train_time:108917ms step_avg:61.19ms +step:1781/2245 train_time:108979ms step_avg:61.19ms +step:1782/2245 train_time:109040ms step_avg:61.19ms +step:1783/2245 train_time:109103ms step_avg:61.19ms +step:1784/2245 train_time:109164ms step_avg:61.19ms +step:1785/2245 train_time:109228ms step_avg:61.19ms +step:1786/2245 train_time:109290ms step_avg:61.19ms +step:1787/2245 train_time:109353ms step_avg:61.19ms +step:1788/2245 train_time:109414ms step_avg:61.19ms +step:1789/2245 train_time:109476ms step_avg:61.19ms +step:1790/2245 train_time:109536ms step_avg:61.19ms +step:1791/2245 train_time:109598ms step_avg:61.19ms +step:1792/2245 train_time:109658ms step_avg:61.19ms +step:1793/2245 train_time:109721ms step_avg:61.19ms +step:1794/2245 train_time:109781ms step_avg:61.19ms +step:1795/2245 train_time:109843ms step_avg:61.19ms +step:1796/2245 train_time:109904ms step_avg:61.19ms +step:1797/2245 train_time:109966ms step_avg:61.19ms +step:1798/2245 train_time:110027ms step_avg:61.19ms +step:1799/2245 train_time:110090ms step_avg:61.20ms +step:1800/2245 train_time:110152ms step_avg:61.20ms +step:1801/2245 train_time:110215ms 
step_avg:61.20ms +step:1802/2245 train_time:110276ms step_avg:61.20ms +step:1803/2245 train_time:110339ms step_avg:61.20ms +step:1804/2245 train_time:110400ms step_avg:61.20ms +step:1805/2245 train_time:110463ms step_avg:61.20ms +step:1806/2245 train_time:110523ms step_avg:61.20ms +step:1807/2245 train_time:110586ms step_avg:61.20ms +step:1808/2245 train_time:110646ms step_avg:61.20ms +step:1809/2245 train_time:110709ms step_avg:61.20ms +step:1810/2245 train_time:110769ms step_avg:61.20ms +step:1811/2245 train_time:110832ms step_avg:61.20ms +step:1812/2245 train_time:110893ms step_avg:61.20ms +step:1813/2245 train_time:110955ms step_avg:61.20ms +step:1814/2245 train_time:111015ms step_avg:61.20ms +step:1815/2245 train_time:111078ms step_avg:61.20ms +step:1816/2245 train_time:111139ms step_avg:61.20ms +step:1817/2245 train_time:111202ms step_avg:61.20ms +step:1818/2245 train_time:111262ms step_avg:61.20ms +step:1819/2245 train_time:111326ms step_avg:61.20ms +step:1820/2245 train_time:111387ms step_avg:61.20ms +step:1821/2245 train_time:111451ms step_avg:61.20ms +step:1822/2245 train_time:111513ms step_avg:61.20ms +step:1823/2245 train_time:111576ms step_avg:61.20ms +step:1824/2245 train_time:111636ms step_avg:61.20ms +step:1825/2245 train_time:111699ms step_avg:61.20ms +step:1826/2245 train_time:111758ms step_avg:61.20ms +step:1827/2245 train_time:111821ms step_avg:61.20ms +step:1828/2245 train_time:111882ms step_avg:61.20ms +step:1829/2245 train_time:111944ms step_avg:61.21ms +step:1830/2245 train_time:112004ms step_avg:61.20ms +step:1831/2245 train_time:112067ms step_avg:61.21ms +step:1832/2245 train_time:112128ms step_avg:61.21ms +step:1833/2245 train_time:112191ms step_avg:61.21ms +step:1834/2245 train_time:112252ms step_avg:61.21ms +step:1835/2245 train_time:112316ms step_avg:61.21ms +step:1836/2245 train_time:112377ms step_avg:61.21ms +step:1837/2245 train_time:112439ms step_avg:61.21ms +step:1838/2245 train_time:112500ms step_avg:61.21ms +step:1839/2245 train_time:112562ms step_avg:61.21ms +step:1840/2245 train_time:112623ms step_avg:61.21ms +step:1841/2245 train_time:112686ms step_avg:61.21ms +step:1842/2245 train_time:112746ms step_avg:61.21ms +step:1843/2245 train_time:112810ms step_avg:61.21ms +step:1844/2245 train_time:112870ms step_avg:61.21ms +step:1845/2245 train_time:112933ms step_avg:61.21ms +step:1846/2245 train_time:112994ms step_avg:61.21ms +step:1847/2245 train_time:113056ms step_avg:61.21ms +step:1848/2245 train_time:113116ms step_avg:61.21ms +step:1849/2245 train_time:113179ms step_avg:61.21ms +step:1850/2245 train_time:113240ms step_avg:61.21ms +step:1851/2245 train_time:113304ms step_avg:61.21ms +step:1852/2245 train_time:113364ms step_avg:61.21ms +step:1853/2245 train_time:113427ms step_avg:61.21ms +step:1854/2245 train_time:113488ms step_avg:61.21ms +step:1855/2245 train_time:113551ms step_avg:61.21ms +step:1856/2245 train_time:113611ms step_avg:61.21ms +step:1857/2245 train_time:113674ms step_avg:61.21ms +step:1858/2245 train_time:113734ms step_avg:61.21ms +step:1859/2245 train_time:113797ms step_avg:61.21ms +step:1860/2245 train_time:113856ms step_avg:61.21ms +step:1861/2245 train_time:113919ms step_avg:61.21ms +step:1862/2245 train_time:113979ms step_avg:61.21ms +step:1863/2245 train_time:114041ms step_avg:61.21ms +step:1864/2245 train_time:114102ms step_avg:61.21ms +step:1865/2245 train_time:114165ms step_avg:61.21ms +step:1866/2245 train_time:114225ms step_avg:61.21ms +step:1867/2245 train_time:114288ms step_avg:61.21ms +step:1868/2245 train_time:114349ms 
step_avg:61.21ms +step:1869/2245 train_time:114412ms step_avg:61.22ms +step:1870/2245 train_time:114474ms step_avg:61.22ms +step:1871/2245 train_time:114537ms step_avg:61.22ms +step:1872/2245 train_time:114598ms step_avg:61.22ms +step:1873/2245 train_time:114661ms step_avg:61.22ms +step:1874/2245 train_time:114721ms step_avg:61.22ms +step:1875/2245 train_time:114783ms step_avg:61.22ms +step:1876/2245 train_time:114844ms step_avg:61.22ms +step:1877/2245 train_time:114908ms step_avg:61.22ms +step:1878/2245 train_time:114968ms step_avg:61.22ms +step:1879/2245 train_time:115031ms step_avg:61.22ms +step:1880/2245 train_time:115093ms step_avg:61.22ms +step:1881/2245 train_time:115156ms step_avg:61.22ms +step:1882/2245 train_time:115216ms step_avg:61.22ms +step:1883/2245 train_time:115279ms step_avg:61.22ms +step:1884/2245 train_time:115339ms step_avg:61.22ms +step:1885/2245 train_time:115401ms step_avg:61.22ms +step:1886/2245 train_time:115461ms step_avg:61.22ms +step:1887/2245 train_time:115523ms step_avg:61.22ms +step:1888/2245 train_time:115584ms step_avg:61.22ms +step:1889/2245 train_time:115647ms step_avg:61.22ms +step:1890/2245 train_time:115708ms step_avg:61.22ms +step:1891/2245 train_time:115771ms step_avg:61.22ms +step:1892/2245 train_time:115833ms step_avg:61.22ms +step:1893/2245 train_time:115897ms step_avg:61.22ms +step:1894/2245 train_time:115956ms step_avg:61.22ms +step:1895/2245 train_time:116019ms step_avg:61.22ms +step:1896/2245 train_time:116080ms step_avg:61.22ms +step:1897/2245 train_time:116142ms step_avg:61.22ms +step:1898/2245 train_time:116203ms step_avg:61.22ms +step:1899/2245 train_time:116265ms step_avg:61.22ms +step:1900/2245 train_time:116326ms step_avg:61.22ms +step:1901/2245 train_time:116389ms step_avg:61.23ms +step:1902/2245 train_time:116450ms step_avg:61.22ms +step:1903/2245 train_time:116513ms step_avg:61.23ms +step:1904/2245 train_time:116573ms step_avg:61.23ms +step:1905/2245 train_time:116636ms step_avg:61.23ms +step:1906/2245 train_time:116696ms step_avg:61.23ms +step:1907/2245 train_time:116759ms step_avg:61.23ms +step:1908/2245 train_time:116819ms step_avg:61.23ms +step:1909/2245 train_time:116881ms step_avg:61.23ms +step:1910/2245 train_time:116942ms step_avg:61.23ms +step:1911/2245 train_time:117005ms step_avg:61.23ms +step:1912/2245 train_time:117066ms step_avg:61.23ms +step:1913/2245 train_time:117129ms step_avg:61.23ms +step:1914/2245 train_time:117190ms step_avg:61.23ms +step:1915/2245 train_time:117253ms step_avg:61.23ms +step:1916/2245 train_time:117314ms step_avg:61.23ms +step:1917/2245 train_time:117377ms step_avg:61.23ms +step:1918/2245 train_time:117437ms step_avg:61.23ms +step:1919/2245 train_time:117499ms step_avg:61.23ms +step:1920/2245 train_time:117560ms step_avg:61.23ms +step:1921/2245 train_time:117622ms step_avg:61.23ms +step:1922/2245 train_time:117682ms step_avg:61.23ms +step:1923/2245 train_time:117746ms step_avg:61.23ms +step:1924/2245 train_time:117806ms step_avg:61.23ms +step:1925/2245 train_time:117869ms step_avg:61.23ms +step:1926/2245 train_time:117929ms step_avg:61.23ms +step:1927/2245 train_time:117993ms step_avg:61.23ms +step:1928/2245 train_time:118054ms step_avg:61.23ms +step:1929/2245 train_time:118117ms step_avg:61.23ms +step:1930/2245 train_time:118178ms step_avg:61.23ms +step:1931/2245 train_time:118240ms step_avg:61.23ms +step:1932/2245 train_time:118301ms step_avg:61.23ms +step:1933/2245 train_time:118363ms step_avg:61.23ms +step:1934/2245 train_time:118423ms step_avg:61.23ms +step:1935/2245 train_time:118486ms 
step_avg:61.23ms +step:1936/2245 train_time:118546ms step_avg:61.23ms +step:1937/2245 train_time:118609ms step_avg:61.23ms +step:1938/2245 train_time:118670ms step_avg:61.23ms +step:1939/2245 train_time:118732ms step_avg:61.23ms +step:1940/2245 train_time:118793ms step_avg:61.23ms +step:1941/2245 train_time:118857ms step_avg:61.23ms +step:1942/2245 train_time:118916ms step_avg:61.23ms +step:1943/2245 train_time:118979ms step_avg:61.23ms +step:1944/2245 train_time:119039ms step_avg:61.23ms +step:1945/2245 train_time:119102ms step_avg:61.24ms +step:1946/2245 train_time:119163ms step_avg:61.23ms +step:1947/2245 train_time:119225ms step_avg:61.24ms +step:1948/2245 train_time:119287ms step_avg:61.24ms +step:1949/2245 train_time:119350ms step_avg:61.24ms +step:1950/2245 train_time:119410ms step_avg:61.24ms +step:1951/2245 train_time:119472ms step_avg:61.24ms +step:1952/2245 train_time:119533ms step_avg:61.24ms +step:1953/2245 train_time:119596ms step_avg:61.24ms +step:1954/2245 train_time:119657ms step_avg:61.24ms +step:1955/2245 train_time:119719ms step_avg:61.24ms +step:1956/2245 train_time:119779ms step_avg:61.24ms +step:1957/2245 train_time:119842ms step_avg:61.24ms +step:1958/2245 train_time:119903ms step_avg:61.24ms +step:1959/2245 train_time:119965ms step_avg:61.24ms +step:1960/2245 train_time:120026ms step_avg:61.24ms +step:1961/2245 train_time:120088ms step_avg:61.24ms +step:1962/2245 train_time:120149ms step_avg:61.24ms +step:1963/2245 train_time:120212ms step_avg:61.24ms +step:1964/2245 train_time:120273ms step_avg:61.24ms +step:1965/2245 train_time:120336ms step_avg:61.24ms +step:1966/2245 train_time:120397ms step_avg:61.24ms +step:1967/2245 train_time:120460ms step_avg:61.24ms +step:1968/2245 train_time:120520ms step_avg:61.24ms +step:1969/2245 train_time:120582ms step_avg:61.24ms +step:1970/2245 train_time:120643ms step_avg:61.24ms +step:1971/2245 train_time:120705ms step_avg:61.24ms +step:1972/2245 train_time:120766ms step_avg:61.24ms +step:1973/2245 train_time:120829ms step_avg:61.24ms +step:1974/2245 train_time:120890ms step_avg:61.24ms +step:1975/2245 train_time:120953ms step_avg:61.24ms +step:1976/2245 train_time:121014ms step_avg:61.24ms +step:1977/2245 train_time:121077ms step_avg:61.24ms +step:1978/2245 train_time:121137ms step_avg:61.24ms +step:1979/2245 train_time:121200ms step_avg:61.24ms +step:1980/2245 train_time:121261ms step_avg:61.24ms +step:1981/2245 train_time:121323ms step_avg:61.24ms +step:1982/2245 train_time:121384ms step_avg:61.24ms +step:1983/2245 train_time:121447ms step_avg:61.24ms +step:1984/2245 train_time:121508ms step_avg:61.24ms +step:1985/2245 train_time:121571ms step_avg:61.24ms +step:1986/2245 train_time:121632ms step_avg:61.24ms +step:1987/2245 train_time:121694ms step_avg:61.25ms +step:1988/2245 train_time:121755ms step_avg:61.24ms +step:1989/2245 train_time:121818ms step_avg:61.25ms +step:1990/2245 train_time:121878ms step_avg:61.25ms +step:1991/2245 train_time:121941ms step_avg:61.25ms +step:1992/2245 train_time:122001ms step_avg:61.25ms +step:1993/2245 train_time:122064ms step_avg:61.25ms +step:1994/2245 train_time:122124ms step_avg:61.25ms +step:1995/2245 train_time:122187ms step_avg:61.25ms +step:1996/2245 train_time:122248ms step_avg:61.25ms +step:1997/2245 train_time:122311ms step_avg:61.25ms +step:1998/2245 train_time:122371ms step_avg:61.25ms +step:1999/2245 train_time:122435ms step_avg:61.25ms +step:2000/2245 train_time:122496ms step_avg:61.25ms +step:2000/2245 val_loss:3.3265 train_time:122559ms step_avg:61.28ms +step:2001/2245 
train_time:122577ms step_avg:61.26ms +step:2002/2245 train_time:122624ms step_avg:61.25ms +step:2003/2245 train_time:122690ms step_avg:61.25ms +step:2004/2245 train_time:122751ms step_avg:61.25ms +step:2005/2245 train_time:122814ms step_avg:61.25ms +step:2006/2245 train_time:122874ms step_avg:61.25ms +step:2007/2245 train_time:122936ms step_avg:61.25ms +step:2008/2245 train_time:122996ms step_avg:61.25ms +step:2009/2245 train_time:123058ms step_avg:61.25ms +step:2010/2245 train_time:123117ms step_avg:61.25ms +step:2011/2245 train_time:123180ms step_avg:61.25ms +step:2012/2245 train_time:123240ms step_avg:61.25ms +step:2013/2245 train_time:123302ms step_avg:61.25ms +step:2014/2245 train_time:123362ms step_avg:61.25ms +step:2015/2245 train_time:123424ms step_avg:61.25ms +step:2016/2245 train_time:123485ms step_avg:61.25ms +step:2017/2245 train_time:123549ms step_avg:61.25ms +step:2018/2245 train_time:123611ms step_avg:61.25ms +step:2019/2245 train_time:123675ms step_avg:61.26ms +step:2020/2245 train_time:123737ms step_avg:61.26ms +step:2021/2245 train_time:123800ms step_avg:61.26ms +step:2022/2245 train_time:123861ms step_avg:61.26ms +step:2023/2245 train_time:123924ms step_avg:61.26ms +step:2024/2245 train_time:123983ms step_avg:61.26ms +step:2025/2245 train_time:124046ms step_avg:61.26ms +step:2026/2245 train_time:124106ms step_avg:61.26ms +step:2027/2245 train_time:124168ms step_avg:61.26ms +step:2028/2245 train_time:124228ms step_avg:61.26ms +step:2029/2245 train_time:124290ms step_avg:61.26ms +step:2030/2245 train_time:124350ms step_avg:61.26ms +step:2031/2245 train_time:124414ms step_avg:61.26ms +step:2032/2245 train_time:124474ms step_avg:61.26ms +step:2033/2245 train_time:124538ms step_avg:61.26ms +step:2034/2245 train_time:124600ms step_avg:61.26ms +step:2035/2245 train_time:124665ms step_avg:61.26ms +step:2036/2245 train_time:124726ms step_avg:61.26ms +step:2037/2245 train_time:124789ms step_avg:61.26ms +step:2038/2245 train_time:124849ms step_avg:61.26ms +step:2039/2245 train_time:124912ms step_avg:61.26ms +step:2040/2245 train_time:124972ms step_avg:61.26ms +step:2041/2245 train_time:125035ms step_avg:61.26ms +step:2042/2245 train_time:125095ms step_avg:61.26ms +step:2043/2245 train_time:125159ms step_avg:61.26ms +step:2044/2245 train_time:125220ms step_avg:61.26ms +step:2045/2245 train_time:125282ms step_avg:61.26ms +step:2046/2245 train_time:125343ms step_avg:61.26ms +step:2047/2245 train_time:125406ms step_avg:61.26ms +step:2048/2245 train_time:125466ms step_avg:61.26ms +step:2049/2245 train_time:125530ms step_avg:61.26ms +step:2050/2245 train_time:125590ms step_avg:61.26ms +step:2051/2245 train_time:125654ms step_avg:61.26ms +step:2052/2245 train_time:125715ms step_avg:61.26ms +step:2053/2245 train_time:125779ms step_avg:61.27ms +step:2054/2245 train_time:125840ms step_avg:61.27ms +step:2055/2245 train_time:125902ms step_avg:61.27ms +step:2056/2245 train_time:125963ms step_avg:61.27ms +step:2057/2245 train_time:126026ms step_avg:61.27ms +step:2058/2245 train_time:126085ms step_avg:61.27ms +step:2059/2245 train_time:126148ms step_avg:61.27ms +step:2060/2245 train_time:126208ms step_avg:61.27ms +step:2061/2245 train_time:126271ms step_avg:61.27ms +step:2062/2245 train_time:126331ms step_avg:61.27ms +step:2063/2245 train_time:126394ms step_avg:61.27ms +step:2064/2245 train_time:126456ms step_avg:61.27ms +step:2065/2245 train_time:126519ms step_avg:61.27ms +step:2066/2245 train_time:126581ms step_avg:61.27ms +step:2067/2245 train_time:126644ms step_avg:61.27ms +step:2068/2245 
train_time:126704ms step_avg:61.27ms +step:2069/2245 train_time:126767ms step_avg:61.27ms +step:2070/2245 train_time:126828ms step_avg:61.27ms +step:2071/2245 train_time:126890ms step_avg:61.27ms +step:2072/2245 train_time:126951ms step_avg:61.27ms +step:2073/2245 train_time:127013ms step_avg:61.27ms +step:2074/2245 train_time:127074ms step_avg:61.27ms +step:2075/2245 train_time:127137ms step_avg:61.27ms +step:2076/2245 train_time:127198ms step_avg:61.27ms +step:2077/2245 train_time:127261ms step_avg:61.27ms +step:2078/2245 train_time:127321ms step_avg:61.27ms +step:2079/2245 train_time:127383ms step_avg:61.27ms +step:2080/2245 train_time:127444ms step_avg:61.27ms +step:2081/2245 train_time:127506ms step_avg:61.27ms +step:2082/2245 train_time:127567ms step_avg:61.27ms +step:2083/2245 train_time:127629ms step_avg:61.27ms +step:2084/2245 train_time:127690ms step_avg:61.27ms +step:2085/2245 train_time:127753ms step_avg:61.27ms +step:2086/2245 train_time:127814ms step_avg:61.27ms +step:2087/2245 train_time:127876ms step_avg:61.27ms +step:2088/2245 train_time:127937ms step_avg:61.27ms +step:2089/2245 train_time:128000ms step_avg:61.27ms +step:2090/2245 train_time:128061ms step_avg:61.27ms +step:2091/2245 train_time:128124ms step_avg:61.27ms +step:2092/2245 train_time:128184ms step_avg:61.27ms +step:2093/2245 train_time:128247ms step_avg:61.27ms +step:2094/2245 train_time:128307ms step_avg:61.27ms +step:2095/2245 train_time:128370ms step_avg:61.27ms +step:2096/2245 train_time:128430ms step_avg:61.27ms +step:2097/2245 train_time:128493ms step_avg:61.27ms +step:2098/2245 train_time:128554ms step_avg:61.27ms +step:2099/2245 train_time:128618ms step_avg:61.28ms +step:2100/2245 train_time:128679ms step_avg:61.28ms +step:2101/2245 train_time:128743ms step_avg:61.28ms +step:2102/2245 train_time:128803ms step_avg:61.28ms +step:2103/2245 train_time:128866ms step_avg:61.28ms +step:2104/2245 train_time:128927ms step_avg:61.28ms +step:2105/2245 train_time:128989ms step_avg:61.28ms +step:2106/2245 train_time:129050ms step_avg:61.28ms +step:2107/2245 train_time:129113ms step_avg:61.28ms +step:2108/2245 train_time:129174ms step_avg:61.28ms +step:2109/2245 train_time:129236ms step_avg:61.28ms +step:2110/2245 train_time:129297ms step_avg:61.28ms +step:2111/2245 train_time:129361ms step_avg:61.28ms +step:2112/2245 train_time:129422ms step_avg:61.28ms +step:2113/2245 train_time:129484ms step_avg:61.28ms +step:2114/2245 train_time:129545ms step_avg:61.28ms +step:2115/2245 train_time:129608ms step_avg:61.28ms +step:2116/2245 train_time:129669ms step_avg:61.28ms +step:2117/2245 train_time:129732ms step_avg:61.28ms +step:2118/2245 train_time:129793ms step_avg:61.28ms +step:2119/2245 train_time:129856ms step_avg:61.28ms +step:2120/2245 train_time:129917ms step_avg:61.28ms +step:2121/2245 train_time:129979ms step_avg:61.28ms +step:2122/2245 train_time:130040ms step_avg:61.28ms +step:2123/2245 train_time:130103ms step_avg:61.28ms +step:2124/2245 train_time:130163ms step_avg:61.28ms +step:2125/2245 train_time:130226ms step_avg:61.28ms +step:2126/2245 train_time:130286ms step_avg:61.28ms +step:2127/2245 train_time:130349ms step_avg:61.28ms +step:2128/2245 train_time:130409ms step_avg:61.28ms +step:2129/2245 train_time:130472ms step_avg:61.28ms +step:2130/2245 train_time:130533ms step_avg:61.28ms +step:2131/2245 train_time:130596ms step_avg:61.28ms +step:2132/2245 train_time:130658ms step_avg:61.28ms +step:2133/2245 train_time:130722ms step_avg:61.29ms +step:2134/2245 train_time:130782ms step_avg:61.28ms +step:2135/2245 
train_time:130845ms step_avg:61.29ms +step:2136/2245 train_time:130905ms step_avg:61.29ms +step:2137/2245 train_time:130968ms step_avg:61.29ms +step:2138/2245 train_time:131028ms step_avg:61.29ms +step:2139/2245 train_time:131091ms step_avg:61.29ms +step:2140/2245 train_time:131152ms step_avg:61.29ms +step:2141/2245 train_time:131215ms step_avg:61.29ms +step:2142/2245 train_time:131275ms step_avg:61.29ms +step:2143/2245 train_time:131339ms step_avg:61.29ms +step:2144/2245 train_time:131400ms step_avg:61.29ms +step:2145/2245 train_time:131463ms step_avg:61.29ms +step:2146/2245 train_time:131524ms step_avg:61.29ms +step:2147/2245 train_time:131587ms step_avg:61.29ms +step:2148/2245 train_time:131648ms step_avg:61.29ms +step:2149/2245 train_time:131711ms step_avg:61.29ms +step:2150/2245 train_time:131771ms step_avg:61.29ms +step:2151/2245 train_time:131834ms step_avg:61.29ms +step:2152/2245 train_time:131895ms step_avg:61.29ms +step:2153/2245 train_time:131959ms step_avg:61.29ms +step:2154/2245 train_time:132020ms step_avg:61.29ms +step:2155/2245 train_time:132083ms step_avg:61.29ms +step:2156/2245 train_time:132144ms step_avg:61.29ms +step:2157/2245 train_time:132207ms step_avg:61.29ms +step:2158/2245 train_time:132267ms step_avg:61.29ms +step:2159/2245 train_time:132330ms step_avg:61.29ms +step:2160/2245 train_time:132390ms step_avg:61.29ms +step:2161/2245 train_time:132454ms step_avg:61.29ms +step:2162/2245 train_time:132514ms step_avg:61.29ms +step:2163/2245 train_time:132577ms step_avg:61.29ms +step:2164/2245 train_time:132638ms step_avg:61.29ms +step:2165/2245 train_time:132701ms step_avg:61.29ms +step:2166/2245 train_time:132762ms step_avg:61.29ms +step:2167/2245 train_time:132825ms step_avg:61.29ms +step:2168/2245 train_time:132885ms step_avg:61.29ms +step:2169/2245 train_time:132948ms step_avg:61.29ms +step:2170/2245 train_time:133008ms step_avg:61.29ms +step:2171/2245 train_time:133071ms step_avg:61.29ms +step:2172/2245 train_time:133131ms step_avg:61.29ms +step:2173/2245 train_time:133194ms step_avg:61.29ms +step:2174/2245 train_time:133255ms step_avg:61.29ms +step:2175/2245 train_time:133319ms step_avg:61.30ms +step:2176/2245 train_time:133380ms step_avg:61.30ms +step:2177/2245 train_time:133443ms step_avg:61.30ms +step:2178/2245 train_time:133503ms step_avg:61.30ms +step:2179/2245 train_time:133566ms step_avg:61.30ms +step:2180/2245 train_time:133627ms step_avg:61.30ms +step:2181/2245 train_time:133689ms step_avg:61.30ms +step:2182/2245 train_time:133750ms step_avg:61.30ms +step:2183/2245 train_time:133813ms step_avg:61.30ms +step:2184/2245 train_time:133873ms step_avg:61.30ms +step:2185/2245 train_time:133937ms step_avg:61.30ms +step:2186/2245 train_time:133997ms step_avg:61.30ms +step:2187/2245 train_time:134061ms step_avg:61.30ms +step:2188/2245 train_time:134122ms step_avg:61.30ms +step:2189/2245 train_time:134185ms step_avg:61.30ms +step:2190/2245 train_time:134245ms step_avg:61.30ms +step:2191/2245 train_time:134308ms step_avg:61.30ms +step:2192/2245 train_time:134368ms step_avg:61.30ms +step:2193/2245 train_time:134431ms step_avg:61.30ms +step:2194/2245 train_time:134491ms step_avg:61.30ms +step:2195/2245 train_time:134553ms step_avg:61.30ms +step:2196/2245 train_time:134614ms step_avg:61.30ms +step:2197/2245 train_time:134676ms step_avg:61.30ms +step:2198/2245 train_time:134737ms step_avg:61.30ms +step:2199/2245 train_time:134801ms step_avg:61.30ms +step:2200/2245 train_time:134862ms step_avg:61.30ms +step:2201/2245 train_time:134925ms step_avg:61.30ms +step:2202/2245 
train_time:134985ms step_avg:61.30ms +step:2203/2245 train_time:135048ms step_avg:61.30ms +step:2204/2245 train_time:135108ms step_avg:61.30ms +step:2205/2245 train_time:135171ms step_avg:61.30ms +step:2206/2245 train_time:135232ms step_avg:61.30ms +step:2207/2245 train_time:135295ms step_avg:61.30ms +step:2208/2245 train_time:135356ms step_avg:61.30ms +step:2209/2245 train_time:135419ms step_avg:61.30ms +step:2210/2245 train_time:135480ms step_avg:61.30ms +step:2211/2245 train_time:135543ms step_avg:61.30ms +step:2212/2245 train_time:135604ms step_avg:61.30ms +step:2213/2245 train_time:135666ms step_avg:61.30ms +step:2214/2245 train_time:135727ms step_avg:61.30ms +step:2215/2245 train_time:135790ms step_avg:61.30ms +step:2216/2245 train_time:135851ms step_avg:61.30ms +step:2217/2245 train_time:135915ms step_avg:61.31ms +step:2218/2245 train_time:135977ms step_avg:61.31ms +step:2219/2245 train_time:136039ms step_avg:61.31ms +step:2220/2245 train_time:136101ms step_avg:61.31ms +step:2221/2245 train_time:136164ms step_avg:61.31ms +step:2222/2245 train_time:136224ms step_avg:61.31ms +step:2223/2245 train_time:136287ms step_avg:61.31ms +step:2224/2245 train_time:136347ms step_avg:61.31ms +step:2225/2245 train_time:136410ms step_avg:61.31ms +step:2226/2245 train_time:136470ms step_avg:61.31ms +step:2227/2245 train_time:136535ms step_avg:61.31ms +step:2228/2245 train_time:136595ms step_avg:61.31ms +step:2229/2245 train_time:136658ms step_avg:61.31ms +step:2230/2245 train_time:136720ms step_avg:61.31ms +step:2231/2245 train_time:136783ms step_avg:61.31ms +step:2232/2245 train_time:136844ms step_avg:61.31ms +step:2233/2245 train_time:136906ms step_avg:61.31ms +step:2234/2245 train_time:136967ms step_avg:61.31ms +step:2235/2245 train_time:137029ms step_avg:61.31ms +step:2236/2245 train_time:137090ms step_avg:61.31ms +step:2237/2245 train_time:137153ms step_avg:61.31ms +step:2238/2245 train_time:137213ms step_avg:61.31ms +step:2239/2245 train_time:137276ms step_avg:61.31ms +step:2240/2245 train_time:137336ms step_avg:61.31ms +step:2241/2245 train_time:137399ms step_avg:61.31ms +step:2242/2245 train_time:137461ms step_avg:61.31ms +step:2243/2245 train_time:137524ms step_avg:61.31ms +step:2244/2245 train_time:137585ms step_avg:61.31ms +step:2245/2245 train_time:137647ms step_avg:61.31ms +step:2245/2245 val_loss:3.2813 train_time:137708ms step_avg:61.34ms +peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/aeaf2a6d-2a2e-4414-bc48-293946e087fc.txt b/records/track_1_short/2025-11-10_CautiousWD/aeaf2a6d-2a2e-4414-bc48-293946e087fc.txt new file mode 100644 index 000000000..133685283 --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/aeaf2a6d-2a2e-4414-bc48-293946e087fc.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 
30min +import triton +import triton.language as tl +from kernels import get_kernel +from torch import Tensor, nn + +dynamo.config.recompile_limit = 64 + +# ----------------------------------------------------------------------------- +# Custom operators: FP8 matmul by @YouJiacheng + + +@torch.library.custom_op("nanogpt::mm", mutates_args=()) +def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: + @torch.compile + def impl(x: Tensor, w: Tensor): + assert x.is_contiguous() and w.is_contiguous() + x_f8 = x.div(x_s).to(torch.float8_e4m3fn) + w_f8 = w.div(w_s).to(torch.float8_e4m3fn) + out = torch._scaled_mm( + x_f8, + w_f8.T, + out_dtype=torch.bfloat16, + scale_a=x.new_tensor(x_s, dtype=torch.float32), + scale_b=x.new_tensor(w_s, dtype=torch.float32), + use_fast_accum=True, + ) + return out, x_f8, w_f8 + + return impl(x, w) + +@mm_op.register_fake +def _(x: Tensor, w: Tensor, *_): + assert x.ndim == w.ndim == 2 + assert x.shape[1] == w.shape[1] + assert x.device == w.device + assert x.is_contiguous() and w.is_contiguous() + return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) + +@torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) +def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: + @torch.compile + def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): + assert grad.is_contiguous() + x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) + w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) + grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) + grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) + grad_x = torch._scaled_mm( + grad_f8, + w_f8.T.contiguous().T, + out_dtype=torch.bfloat16, + scale_a=grad_inv_s, + scale_b=w_inv_s, + use_fast_accum=False, + ) + # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) + grad_w = torch._scaled_mm( + x_f8.T.contiguous(), + grad_f8.T.contiguous().T, + out_dtype=torch.float32, + scale_a=x_inv_s, + scale_b=grad_inv_s, + use_fast_accum=False, + ).T + return grad_x, grad_w + + return impl(g, x_f8, w_f8) + +@mm_backward_op.register_fake +def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): + return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) + +def backward(ctx, grad_out: Tensor, *_): + x_f8, w_f8 = ctx.saved_tensors + x_s, w_s, grad_s = ctx.scales + grad_x, grad_w = torch.ops.nanogpt.mm_backward( + grad_out, x_f8, w_f8, x_s, w_s, grad_s + ) + return grad_x, grad_w, None, None, None + +def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output): + *_, x_s, w_s, grad_s = inputs + _, x_f8, w_f8 = output + ctx.save_for_backward(x_f8, w_f8) + ctx.scales = x_s, w_s, grad_s + ctx.set_materialize_grads(False) + +mm_op.register_autograd(backward, setup_context=setup_context) + +# ----------------------------------------------------------------------------- +# Triton kernel for symmetric matrix multiplication by @byronxu99 + +def _get_autotune_configs(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": bm, + "BLOCK_SIZE_N": bn, + "BLOCK_SIZE_K": bk, + "GROUP_SIZE_M": 8, + "LOWER_UPPER": 1, + }, + num_stages=stages, + num_warps=warps, + ) + for bm in [64, 128] + for bn in [64, 128, 256] + for bk in [64, 128] + for stages, warps in [(3, 4), (3, 8), (4, 4)] + if bm // bn <= 2 and bn // bm <= 2 + ] + +@triton.jit +def _pid_to_block( + pid, + M, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # Split output matrix into blocks of 
size (BLOCK_SIZE_M, BLOCK_SIZE_N) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(M, BLOCK_SIZE_N) + + # Map PID to a single matrix in batch + batch_idx = pid // (num_pid_m * num_pid_n) + pid = pid % (num_pid_m * num_pid_n) + + # Map PID to 2D grid of blocks + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M) + + m_idx = pid_m * BLOCK_SIZE_M + n_idx = pid_n * BLOCK_SIZE_N + return batch_idx, m_idx, n_idx + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def XXT_kernel( + A_ptr, C_ptr, + M, K, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def XXT(A: torch.Tensor, out: torch.Tensor): + """ + Launch Triton kernel to compute C = A @ A.T + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert out.size(-2) == M, "Output matrix has incorrect shape" + assert out.size(-1) == M, "Output matrix has incorrect shape" + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), 
+ c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if 
out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + + Differences from standard Muon: + - Newton-Schulz is replaced with Polar Express for the orthogonalization step + - NorMuon adds a low-rank variance estimator similar to Adafactor. + - small 1D parameters handled here instead of in Adam + - Cautious weight decay, a gated version of decoupled weight decay + - Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. The model architecture has been customized to enable + (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn. + The scheduling is: + 1. reduce scatter smear_gate (1 param, 7 padding params) + 2. reduce scatter attn_gate (10 params, 6 padding params) + 3. 
reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params) + 4. reduce scatter attn/mlp round 2 (16 mlp params) + 5. wait on step 1, then compute update of 1 and schedule all gather + 6. wait on step 2, then compute update of 2 and schedule all gather + 7. wait on step 3, then compute update of 3 and schedule all gather + GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP] + GPUs that receive params of type attn reshape before computing update + 8. wait on step 4, then compute update of 4 and schedule all gather + 9. wait for each all gather to complete and update params + Empirically, leading with small params provides an additional 0.2s improvement. + """ + def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True): + defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2) + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + # custom sizing requires 8 GPUs + if custom_sizing and dist.get_world_size()==8: + param_groups = self.generate_custom_param_groups(params) + else: + param_groups = self.generate_standard_param_groups(params) + super().__init__(param_groups, defaults) + + def reset(self): + # expose a reset for clearing buffers + for group in self.param_groups: + group["momentum_buffer"].zero_() + group["second_momentum_buffer"].zero_() + + def generate_standard_param_groups(self, params): + """ + Use this method if running on fewer than 8 GPUs or when experimenting with additional attn or mlp modules. + Creates one param group per module. + """ + groups = defaultdict(list) + for param in params: + groups[param.label].append(param) + + param_groups = [] + for module_name, group_params in groups.items(): + chunk_size = (len(group_params) + self.world_size - 1) // self.world_size + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + + return param_groups + + def generate_custom_param_groups(self, params): + """ + Implementation requires that a single GPU does not receive both attn + and mlp params when a param group is split across GPUs. + """ + module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp'] + params_list = list(params) + params_list.sort(key=lambda x: module_group_order.index(x.label)) + + idx = 0 + group_sizes = [1, 10, 16, 16] + assert len(params_list) == sum(group_sizes) + param_groups = [] + for size in group_sizes: + chunk_size = (size + self.world_size - 1) // self.world_size + group_params = params_list[idx: idx + size] + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) + idx += size + + return param_groups + + @torch.no_grad() + def step(self): + # Efficient systems-wise implementation of step developed by @YouJiacheng, + # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad, + # @ryanyang0, @vagrawal, and @varunneal. 
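+ # The three-phase pattern below is a ZeRO-style sharded step per param group: + # (1) reduce_scatter the stacked gradients so each rank owns one averaged chunk, + # (2) compute the Muon update only for that local chunk, and + # (3) all_gather the updated chunks back into the full parameters. + # A minimal standalone sketch of the pattern (hypothetical local_update; assumes an + # initialized process group and a stack whose first dim divides by world_size): + # chunk = torch.empty_like(stacked[: stacked.size(0) // world_size]) + # dist.reduce_scatter_tensor(chunk, stacked, op=dist.ReduceOp.AVG) # shard + average grads + # chunk = local_update(chunk) # optimizer math on the local shard only + # dist.all_gather_into_tensor(stacked, chunk) # rebuild the full tensor on every rank + # Launching the collectives with async_op=True, as below, lets the next group's + # reduce_scatter overlap with this group's update computation.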
+ rank = dist.get_rank() + group_infos = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + if not params: + continue + + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + stacked_grads = torch.empty( + (padded_num_params, *params[0].shape), + dtype=params[0].dtype, + device=params[0].device + ) + for i, p in enumerate(params): + stacked_grads[i].copy_(p.grad, non_blocking=True) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + grad_chunk = torch.empty_like(stacked_grads[:chunk_size]) + + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future)) + + all_gather_infos = [] + # Second pass: wait for gradients, compute updates for the local shard of parameters, + # and launch all async all_gather operations. + for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() + + params = group["params"] + grad_chunk = info["grad_chunk"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * self.world_size + + start_idx = rank * chunk_size + module_idx = start_idx if start_idx < len(params) else 0 + + num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank + + if "momentum_buffer" not in group: + group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params]) + momentum_buffer = group["momentum_buffer"] + # Apply momentum update to the persistent momentum buffer in-place + momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"]) + updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"]) + + grad_shape = updated_grads.shape + if params[module_idx].label == 'attn': + # Reshape attn params from [hdim, dim*4] to [4,hdim,dim] + for p in params[module_idx:module_idx + num_params]: + assert p.label == 'attn' + updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4) + ref_param = params[module_idx] + param_shape = ref_param.shape + + if "second_momentum_buffer" not in group: + group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1]) + if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :]) + ) + second_momentum_buffer = group["second_momentum_buffer"] + + if "param_lr" not in group: + group["param_lr"] = ( + max(1., param_shape[-2] / param_shape[-1]) ** 0.5 + * ref_param.new_tensor( + [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + ) + + group["param_wd"] = ref_param.new_tensor( + [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]] + ).view(-1, 1, 1) + + # Determine effective LR and WD + eff_lr = group["lr"] * group["param_lr"] + eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"] + + # Compute zeropower for the entire chunk in a single, batched call. 
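+ # polar_express(G) approximates the orthogonal polar factor U @ V.T of the SVD + # G = U @ S @ V.T: it keeps the singular directions of the update while pushing + # every singular value toward 1, which is the orthogonalization at the heart of Muon. + # Illustrative property check (a sketch, not part of the original run): + # Q = polar_express(torch.randn(64, 256, device="cuda")) + # torch.linalg.svdvals(Q.float()) # all 64 singular values land close to 1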
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
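+ # the sin table is refreshed the same way below; the attn_scale update that follows + # applies an attention temperature of 0.2 * log(new/old) + 1, a tuned variant of + # YaRN's mscale = 0.1 * ln(s) + 1 (arXiv:2309.00071)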
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context-based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences require B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.0 and blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip the MLP block of the first layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
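+ # FP8 lm_head note: 448 is the largest finite value of torch.float8_e4m3fn, so x_s, + # w_s and grad_s below are chosen to keep the casted activations, weights and + # gradients inside the FP8 dynamic range (gradients are cast to float8_e5m2 in the + # backward op defined above).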
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
int, grad_accum_steps: int = 1, align_to_bos: bool = True): + # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world size" + num_tokens = num_tokens // grad_accum_steps + + files = [Path(file) for file in sorted(glob.glob(filename_pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {filename_pattern}") + + file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training + tokens = _load_data_shard(next(file_iter)) + if align_to_bos: + finder = BOSFinder(tokens, world_size=world_size, quickload=True) + preloader = DataPreloader(file_iter, world_size) + preloader.start() + else: + pos = 0 # for unaligned case + + while True: + num_tokens_local = num_tokens // world_size + max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400 + + if align_to_bos: + try: + seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len) + start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank]) + except StopIteration: + # This shard is exhausted, load the next one in the next loop iteration. + tokens, finder = preloader.get() + preloader.start() + continue + + buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)]) + _inputs = buf[:-1] + _targets = buf[1:] + end_idxs[-1] -= 1 # last document was too long to account for _targets offset + cum_lengths = (end_idxs - start_idxs).cumsum(0) + + else: + if pos + num_tokens + 1 >= len(tokens): # should not occur for val data + tokens, pos = _load_data_shard(next(file_iter)), 0 + + pos_local = pos + rank * num_tokens_local + buf = tokens[pos_local: pos_local + num_tokens_local + 1] + _inputs = buf[:-1].view(num_tokens_local, ) + _targets = buf[1:].view(num_tokens_local, ) + + cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0] + pos += num_tokens + + + _cum_lengths = torch.full((max_num_docs,), num_tokens_local) + _cum_lengths[0] = 0 + _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths + + new_params = yield ( + _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True), + _targets.to(device="cuda", dtype=torch.int64, non_blocking=True), + _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True) + ) + + if new_params is not None: + # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send() + new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params + assert new_num_tokens % (world_size * grad_accum_steps) == 0, "Num tokens must be divisible by world size" + num_tokens = new_num_tokens + max_seq_len = new_max_seq_len + grad_accum_steps = new_grad_accum_steps + + +# ----------------------------------------------------------------------------- +# int main + +@dataclass +class Hyperparameters: + # data + train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on + val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on + val_tokens: int = 10485760 # how many tokens of validation data? 
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +# learning rate schedule: flat, then linear decay, then flat +def get_lr(step: int): + x = min(0.9999, step / args.num_scheduled_iterations) + assert 0 <= x < 1 + lr = 1.0 + if x >= 1 - args.cooldown_frac: + w = (1 - x) / args.cooldown_frac + lr = w * 1.0 + (1 - w) * 0.1 + return lr + +def get_ws(step: int): + # set short window size to half of long window size + # Higher ws on "extension" steps + if step >= args.num_scheduled_iterations: + return args.ws_final // 2, args.ws_final + x = step / args.num_scheduled_iterations + assert 0 <= x < 1 + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +ws_schedule = list(args.ws_schedule) + [args.ws_final] +ws_long = ws_schedule[0] +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = ws_schedule[0] + else: + new_ws_long = ws_schedule[ws_idx] + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt 
in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # muon momentum buffers not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Mon Nov 10 21:36:01 2025 
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 41C P0 130W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 33C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 41C P0 131W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:119ms step_avg:119.44ms +step:2/2245 train_time:141ms step_avg:70.39ms +step:3/2245 train_time:179ms step_avg:59.71ms +step:4/2245 train_time:236ms step_avg:58.91ms +step:5/2245 train_time:295ms step_avg:58.99ms +step:6/2245 train_time:353ms step_avg:58.89ms +step:7/2245 train_time:415ms step_avg:59.23ms +step:8/2245 train_time:473ms step_avg:59.16ms +step:9/2245 train_time:534ms step_avg:59.36ms +step:10/2245 train_time:593ms step_avg:59.32ms +step:11/2245 train_time:654ms step_avg:59.49ms +step:12/2245 train_time:713ms step_avg:59.43ms +step:13/2245 train_time:775ms step_avg:59.58ms +step:14/2245 train_time:834ms step_avg:59.55ms 
+step:15/2245 train_time:895ms step_avg:59.66ms +step:16/2245 train_time:954ms step_avg:59.63ms +step:17/2245 train_time:1019ms step_avg:59.93ms +step:18/2245 train_time:1082ms step_avg:60.13ms +step:19/2245 train_time:1148ms step_avg:60.42ms +step:20/2245 train_time:1209ms step_avg:60.46ms +step:21/2245 train_time:1271ms step_avg:60.54ms +step:22/2245 train_time:1331ms step_avg:60.49ms +step:23/2245 train_time:1392ms step_avg:60.52ms +step:24/2245 train_time:1451ms step_avg:60.46ms +step:25/2245 train_time:1512ms step_avg:60.49ms +step:26/2245 train_time:1572ms step_avg:60.45ms +step:27/2245 train_time:1633ms step_avg:60.48ms +step:28/2245 train_time:1692ms step_avg:60.44ms +step:29/2245 train_time:1753ms step_avg:60.44ms +step:30/2245 train_time:1812ms step_avg:60.41ms +step:31/2245 train_time:1874ms step_avg:60.44ms +step:32/2245 train_time:1934ms step_avg:60.44ms +step:33/2245 train_time:1998ms step_avg:60.53ms +step:34/2245 train_time:2059ms step_avg:60.55ms +step:35/2245 train_time:2123ms step_avg:60.65ms +step:36/2245 train_time:2183ms step_avg:60.63ms +step:37/2245 train_time:2245ms step_avg:60.69ms +step:38/2245 train_time:2305ms step_avg:60.65ms +step:39/2245 train_time:2367ms step_avg:60.68ms +step:40/2245 train_time:2426ms step_avg:60.64ms +step:41/2245 train_time:2487ms step_avg:60.67ms +step:42/2245 train_time:2547ms step_avg:60.64ms +step:43/2245 train_time:2609ms step_avg:60.67ms +step:44/2245 train_time:2668ms step_avg:60.64ms +step:45/2245 train_time:2729ms step_avg:60.65ms +step:46/2245 train_time:2789ms step_avg:60.62ms +step:47/2245 train_time:2850ms step_avg:60.64ms +step:48/2245 train_time:2910ms step_avg:60.63ms +step:49/2245 train_time:2974ms step_avg:60.69ms +step:50/2245 train_time:3035ms step_avg:60.70ms +step:51/2245 train_time:3097ms step_avg:60.73ms +step:52/2245 train_time:3158ms step_avg:60.73ms +step:53/2245 train_time:3220ms step_avg:60.75ms +step:54/2245 train_time:3279ms step_avg:60.73ms +step:55/2245 train_time:3341ms step_avg:60.74ms +step:56/2245 train_time:3400ms step_avg:60.71ms +step:57/2245 train_time:3461ms step_avg:60.73ms +step:58/2245 train_time:3521ms step_avg:60.70ms +step:59/2245 train_time:3582ms step_avg:60.72ms +step:60/2245 train_time:3642ms step_avg:60.69ms +step:61/2245 train_time:3704ms step_avg:60.71ms +step:62/2245 train_time:3763ms step_avg:60.70ms +step:63/2245 train_time:3826ms step_avg:60.73ms +step:64/2245 train_time:3886ms step_avg:60.71ms +step:65/2245 train_time:3948ms step_avg:60.74ms +step:66/2245 train_time:4008ms step_avg:60.72ms +step:67/2245 train_time:4070ms step_avg:60.75ms +step:68/2245 train_time:4131ms step_avg:60.75ms +step:69/2245 train_time:4194ms step_avg:60.78ms +step:70/2245 train_time:4257ms step_avg:60.81ms +step:71/2245 train_time:4316ms step_avg:60.79ms +step:72/2245 train_time:4376ms step_avg:60.77ms +step:73/2245 train_time:4438ms step_avg:60.80ms +step:74/2245 train_time:4498ms step_avg:60.78ms +step:75/2245 train_time:4560ms step_avg:60.80ms +step:76/2245 train_time:4620ms step_avg:60.78ms +step:77/2245 train_time:4681ms step_avg:60.79ms +step:78/2245 train_time:4739ms step_avg:60.76ms +step:79/2245 train_time:4802ms step_avg:60.78ms +step:80/2245 train_time:4861ms step_avg:60.76ms +step:81/2245 train_time:4922ms step_avg:60.77ms +step:82/2245 train_time:4982ms step_avg:60.75ms +step:83/2245 train_time:5045ms step_avg:60.78ms +step:84/2245 train_time:5104ms step_avg:60.76ms +step:85/2245 train_time:5166ms step_avg:60.77ms +step:86/2245 train_time:5226ms step_avg:60.76ms +step:87/2245 
train_time:5288ms step_avg:60.78ms +step:88/2245 train_time:5347ms step_avg:60.77ms +step:89/2245 train_time:5409ms step_avg:60.78ms +step:90/2245 train_time:5470ms step_avg:60.77ms +step:91/2245 train_time:5532ms step_avg:60.79ms +step:92/2245 train_time:5592ms step_avg:60.78ms +step:93/2245 train_time:5654ms step_avg:60.80ms +step:94/2245 train_time:5713ms step_avg:60.78ms +step:95/2245 train_time:5776ms step_avg:60.80ms +step:96/2245 train_time:5835ms step_avg:60.79ms +step:97/2245 train_time:5897ms step_avg:60.80ms +step:98/2245 train_time:5956ms step_avg:60.78ms +step:99/2245 train_time:6018ms step_avg:60.79ms +step:100/2245 train_time:6077ms step_avg:60.77ms +step:101/2245 train_time:6139ms step_avg:60.78ms +step:102/2245 train_time:6198ms step_avg:60.77ms +step:103/2245 train_time:6261ms step_avg:60.78ms +step:104/2245 train_time:6320ms step_avg:60.77ms +step:105/2245 train_time:6381ms step_avg:60.77ms +step:106/2245 train_time:6441ms step_avg:60.76ms +step:107/2245 train_time:6502ms step_avg:60.77ms +step:108/2245 train_time:6562ms step_avg:60.76ms +step:109/2245 train_time:6624ms step_avg:60.77ms +step:110/2245 train_time:6683ms step_avg:60.76ms +step:111/2245 train_time:6745ms step_avg:60.77ms +step:112/2245 train_time:6805ms step_avg:60.76ms +step:113/2245 train_time:6867ms step_avg:60.77ms +step:114/2245 train_time:6926ms step_avg:60.76ms +step:115/2245 train_time:6988ms step_avg:60.77ms +step:116/2245 train_time:7048ms step_avg:60.76ms +step:117/2245 train_time:7109ms step_avg:60.76ms +step:118/2245 train_time:7169ms step_avg:60.76ms +step:119/2245 train_time:7232ms step_avg:60.77ms +step:120/2245 train_time:7292ms step_avg:60.76ms +step:121/2245 train_time:7354ms step_avg:60.77ms +step:122/2245 train_time:7413ms step_avg:60.76ms +step:123/2245 train_time:7475ms step_avg:60.77ms +step:124/2245 train_time:7535ms step_avg:60.76ms +step:125/2245 train_time:7596ms step_avg:60.77ms +step:126/2245 train_time:7655ms step_avg:60.76ms +step:127/2245 train_time:7717ms step_avg:60.76ms +step:128/2245 train_time:7776ms step_avg:60.75ms +step:129/2245 train_time:7838ms step_avg:60.76ms +step:130/2245 train_time:7897ms step_avg:60.75ms +step:131/2245 train_time:7959ms step_avg:60.76ms +step:132/2245 train_time:8017ms step_avg:60.74ms +step:133/2245 train_time:8079ms step_avg:60.74ms +step:134/2245 train_time:8139ms step_avg:60.74ms +step:135/2245 train_time:8201ms step_avg:60.75ms +step:136/2245 train_time:8260ms step_avg:60.73ms +step:137/2245 train_time:8321ms step_avg:60.74ms +step:138/2245 train_time:8381ms step_avg:60.73ms +step:139/2245 train_time:8442ms step_avg:60.74ms +step:140/2245 train_time:8501ms step_avg:60.72ms +step:141/2245 train_time:8563ms step_avg:60.73ms +step:142/2245 train_time:8622ms step_avg:60.72ms +step:143/2245 train_time:8684ms step_avg:60.72ms +step:144/2245 train_time:8743ms step_avg:60.72ms +step:145/2245 train_time:8805ms step_avg:60.72ms +step:146/2245 train_time:8864ms step_avg:60.71ms +step:147/2245 train_time:8925ms step_avg:60.71ms +step:148/2245 train_time:8984ms step_avg:60.70ms +step:149/2245 train_time:9045ms step_avg:60.71ms +step:150/2245 train_time:9104ms step_avg:60.69ms +step:151/2245 train_time:9166ms step_avg:60.70ms +step:152/2245 train_time:9225ms step_avg:60.69ms +step:153/2245 train_time:9286ms step_avg:60.69ms +step:154/2245 train_time:9346ms step_avg:60.69ms +step:155/2245 train_time:9408ms step_avg:60.69ms +step:156/2245 train_time:9467ms step_avg:60.69ms +step:157/2245 train_time:9529ms step_avg:60.69ms +step:158/2245 
train_time:9589ms step_avg:60.69ms +step:159/2245 train_time:9651ms step_avg:60.70ms +step:160/2245 train_time:9710ms step_avg:60.69ms +step:161/2245 train_time:9772ms step_avg:60.70ms +step:162/2245 train_time:9832ms step_avg:60.69ms +step:163/2245 train_time:9894ms step_avg:60.70ms +step:164/2245 train_time:9954ms step_avg:60.69ms +step:165/2245 train_time:10015ms step_avg:60.70ms +step:166/2245 train_time:10074ms step_avg:60.69ms +step:167/2245 train_time:10136ms step_avg:60.69ms +step:168/2245 train_time:10195ms step_avg:60.68ms +step:169/2245 train_time:10256ms step_avg:60.69ms +step:170/2245 train_time:10315ms step_avg:60.67ms +step:171/2245 train_time:10376ms step_avg:60.68ms +step:172/2245 train_time:10436ms step_avg:60.67ms +step:173/2245 train_time:10497ms step_avg:60.68ms +step:174/2245 train_time:10558ms step_avg:60.68ms +step:175/2245 train_time:10619ms step_avg:60.68ms +step:176/2245 train_time:10678ms step_avg:60.67ms +step:177/2245 train_time:10739ms step_avg:60.67ms +step:178/2245 train_time:10798ms step_avg:60.66ms +step:179/2245 train_time:10859ms step_avg:60.66ms +step:180/2245 train_time:10918ms step_avg:60.66ms +step:181/2245 train_time:10980ms step_avg:60.66ms +step:182/2245 train_time:11039ms step_avg:60.65ms +step:183/2245 train_time:11100ms step_avg:60.66ms +step:184/2245 train_time:11159ms step_avg:60.65ms +step:185/2245 train_time:11221ms step_avg:60.65ms +step:186/2245 train_time:11280ms step_avg:60.64ms +step:187/2245 train_time:11341ms step_avg:60.65ms +step:188/2245 train_time:11400ms step_avg:60.64ms +step:189/2245 train_time:11461ms step_avg:60.64ms +step:190/2245 train_time:11520ms step_avg:60.63ms +step:191/2245 train_time:11582ms step_avg:60.64ms +step:192/2245 train_time:11641ms step_avg:60.63ms +step:193/2245 train_time:11703ms step_avg:60.64ms +step:194/2245 train_time:11761ms step_avg:60.62ms +step:195/2245 train_time:11822ms step_avg:60.63ms +step:196/2245 train_time:11881ms step_avg:60.62ms +step:197/2245 train_time:11943ms step_avg:60.62ms +step:198/2245 train_time:12001ms step_avg:60.61ms +step:199/2245 train_time:12063ms step_avg:60.62ms +step:200/2245 train_time:12122ms step_avg:60.61ms +step:201/2245 train_time:12184ms step_avg:60.61ms +step:202/2245 train_time:12243ms step_avg:60.61ms +step:203/2245 train_time:12304ms step_avg:60.61ms +step:204/2245 train_time:12363ms step_avg:60.60ms +step:205/2245 train_time:12424ms step_avg:60.61ms +step:206/2245 train_time:12483ms step_avg:60.60ms +step:207/2245 train_time:12545ms step_avg:60.60ms +step:208/2245 train_time:12604ms step_avg:60.59ms +step:209/2245 train_time:12665ms step_avg:60.60ms +step:210/2245 train_time:12724ms step_avg:60.59ms +step:211/2245 train_time:12785ms step_avg:60.59ms +step:212/2245 train_time:12844ms step_avg:60.58ms +step:213/2245 train_time:12905ms step_avg:60.59ms +step:214/2245 train_time:12964ms step_avg:60.58ms +step:215/2245 train_time:13025ms step_avg:60.58ms +step:216/2245 train_time:13085ms step_avg:60.58ms +step:217/2245 train_time:13146ms step_avg:60.58ms +step:218/2245 train_time:13205ms step_avg:60.57ms +step:219/2245 train_time:13267ms step_avg:60.58ms +step:220/2245 train_time:13326ms step_avg:60.57ms +step:221/2245 train_time:13388ms step_avg:60.58ms +step:222/2245 train_time:13448ms step_avg:60.57ms +step:223/2245 train_time:13509ms step_avg:60.58ms +step:224/2245 train_time:13568ms step_avg:60.57ms +step:225/2245 train_time:13630ms step_avg:60.58ms +step:226/2245 train_time:13690ms step_avg:60.57ms +step:227/2245 train_time:13751ms step_avg:60.58ms 
+step:228/2245 train_time:13811ms step_avg:60.58ms +step:229/2245 train_time:13873ms step_avg:60.58ms +step:230/2245 train_time:13933ms step_avg:60.58ms +step:231/2245 train_time:13995ms step_avg:60.58ms +step:232/2245 train_time:14054ms step_avg:60.58ms +step:233/2245 train_time:14116ms step_avg:60.58ms +step:234/2245 train_time:14175ms step_avg:60.58ms +step:235/2245 train_time:14237ms step_avg:60.58ms +step:236/2245 train_time:14296ms step_avg:60.58ms +step:237/2245 train_time:14357ms step_avg:60.58ms +step:238/2245 train_time:14416ms step_avg:60.57ms +step:239/2245 train_time:14477ms step_avg:60.57ms +step:240/2245 train_time:14536ms step_avg:60.57ms +step:241/2245 train_time:14599ms step_avg:60.58ms +step:242/2245 train_time:14657ms step_avg:60.57ms +step:243/2245 train_time:14719ms step_avg:60.57ms +step:244/2245 train_time:14778ms step_avg:60.56ms +step:245/2245 train_time:14839ms step_avg:60.57ms +step:246/2245 train_time:14898ms step_avg:60.56ms +step:247/2245 train_time:14959ms step_avg:60.56ms +step:248/2245 train_time:15018ms step_avg:60.56ms +step:249/2245 train_time:15080ms step_avg:60.56ms +step:250/2245 train_time:15139ms step_avg:60.56ms +step:250/2245 val_loss:4.0758 train_time:15201ms step_avg:60.80ms +step:251/2245 train_time:15220ms step_avg:60.64ms +step:252/2245 train_time:15262ms step_avg:60.56ms +step:253/2245 train_time:15329ms step_avg:60.59ms +step:254/2245 train_time:15393ms step_avg:60.60ms +step:255/2245 train_time:15454ms step_avg:60.61ms +step:256/2245 train_time:15514ms step_avg:60.60ms +step:257/2245 train_time:15575ms step_avg:60.60ms +step:258/2245 train_time:15633ms step_avg:60.59ms +step:259/2245 train_time:15695ms step_avg:60.60ms +step:260/2245 train_time:15753ms step_avg:60.59ms +step:261/2245 train_time:15813ms step_avg:60.59ms +step:262/2245 train_time:15871ms step_avg:60.58ms +step:263/2245 train_time:15932ms step_avg:60.58ms +step:264/2245 train_time:15990ms step_avg:60.57ms +step:265/2245 train_time:16050ms step_avg:60.57ms +step:266/2245 train_time:16108ms step_avg:60.56ms +step:267/2245 train_time:16169ms step_avg:60.56ms +step:268/2245 train_time:16228ms step_avg:60.55ms +step:269/2245 train_time:16291ms step_avg:60.56ms +step:270/2245 train_time:16351ms step_avg:60.56ms +step:271/2245 train_time:16414ms step_avg:60.57ms +step:272/2245 train_time:16473ms step_avg:60.56ms +step:273/2245 train_time:16535ms step_avg:60.57ms +step:274/2245 train_time:16593ms step_avg:60.56ms +step:275/2245 train_time:16655ms step_avg:60.56ms +step:276/2245 train_time:16713ms step_avg:60.56ms +step:277/2245 train_time:16774ms step_avg:60.56ms +step:278/2245 train_time:16833ms step_avg:60.55ms +step:279/2245 train_time:16893ms step_avg:60.55ms +step:280/2245 train_time:16952ms step_avg:60.54ms +step:281/2245 train_time:17012ms step_avg:60.54ms +step:282/2245 train_time:17071ms step_avg:60.54ms +step:283/2245 train_time:17133ms step_avg:60.54ms +step:284/2245 train_time:17192ms step_avg:60.54ms +step:285/2245 train_time:17255ms step_avg:60.54ms +step:286/2245 train_time:17314ms step_avg:60.54ms +step:287/2245 train_time:17377ms step_avg:60.55ms +step:288/2245 train_time:17436ms step_avg:60.54ms +step:289/2245 train_time:17498ms step_avg:60.55ms +step:290/2245 train_time:17556ms step_avg:60.54ms +step:291/2245 train_time:17617ms step_avg:60.54ms +step:292/2245 train_time:17676ms step_avg:60.53ms +step:293/2245 train_time:17737ms step_avg:60.54ms +step:294/2245 train_time:17796ms step_avg:60.53ms +step:295/2245 train_time:17856ms step_avg:60.53ms +step:296/2245 
train_time:17915ms step_avg:60.52ms +step:297/2245 train_time:17976ms step_avg:60.53ms +step:298/2245 train_time:18035ms step_avg:60.52ms +step:299/2245 train_time:18096ms step_avg:60.52ms +step:300/2245 train_time:18155ms step_avg:60.52ms +step:301/2245 train_time:18216ms step_avg:60.52ms +step:302/2245 train_time:18276ms step_avg:60.52ms +step:303/2245 train_time:18338ms step_avg:60.52ms +step:304/2245 train_time:18397ms step_avg:60.52ms +step:305/2245 train_time:18458ms step_avg:60.52ms +step:306/2245 train_time:18516ms step_avg:60.51ms +step:307/2245 train_time:18578ms step_avg:60.51ms +step:308/2245 train_time:18637ms step_avg:60.51ms +step:309/2245 train_time:18698ms step_avg:60.51ms +step:310/2245 train_time:18757ms step_avg:60.51ms +step:311/2245 train_time:18818ms step_avg:60.51ms +step:312/2245 train_time:18877ms step_avg:60.50ms +step:313/2245 train_time:18938ms step_avg:60.51ms +step:314/2245 train_time:18998ms step_avg:60.50ms +step:315/2245 train_time:19059ms step_avg:60.50ms +step:316/2245 train_time:19118ms step_avg:60.50ms +step:317/2245 train_time:19179ms step_avg:60.50ms +step:318/2245 train_time:19239ms step_avg:60.50ms +step:319/2245 train_time:19300ms step_avg:60.50ms +step:320/2245 train_time:19360ms step_avg:60.50ms +step:321/2245 train_time:19422ms step_avg:60.50ms +step:322/2245 train_time:19481ms step_avg:60.50ms +step:323/2245 train_time:19542ms step_avg:60.50ms +step:324/2245 train_time:19601ms step_avg:60.50ms +step:325/2245 train_time:19664ms step_avg:60.50ms +step:326/2245 train_time:19723ms step_avg:60.50ms +step:327/2245 train_time:19784ms step_avg:60.50ms +step:328/2245 train_time:19844ms step_avg:60.50ms +step:329/2245 train_time:19905ms step_avg:60.50ms +step:330/2245 train_time:19964ms step_avg:60.50ms +step:331/2245 train_time:20026ms step_avg:60.50ms +step:332/2245 train_time:20086ms step_avg:60.50ms +step:333/2245 train_time:20147ms step_avg:60.50ms +step:334/2245 train_time:20207ms step_avg:60.50ms +step:335/2245 train_time:20267ms step_avg:60.50ms +step:336/2245 train_time:20327ms step_avg:60.50ms +step:337/2245 train_time:20388ms step_avg:60.50ms +step:338/2245 train_time:20447ms step_avg:60.49ms +step:339/2245 train_time:20508ms step_avg:60.50ms +step:340/2245 train_time:20568ms step_avg:60.49ms +step:341/2245 train_time:20629ms step_avg:60.50ms +step:342/2245 train_time:20688ms step_avg:60.49ms +step:343/2245 train_time:20750ms step_avg:60.49ms +step:344/2245 train_time:20808ms step_avg:60.49ms +step:345/2245 train_time:20869ms step_avg:60.49ms +step:346/2245 train_time:20928ms step_avg:60.49ms +step:347/2245 train_time:20990ms step_avg:60.49ms +step:348/2245 train_time:21049ms step_avg:60.49ms +step:349/2245 train_time:21110ms step_avg:60.49ms +step:350/2245 train_time:21169ms step_avg:60.48ms +step:351/2245 train_time:21231ms step_avg:60.49ms +step:352/2245 train_time:21290ms step_avg:60.48ms +step:353/2245 train_time:21351ms step_avg:60.49ms +step:354/2245 train_time:21410ms step_avg:60.48ms +step:355/2245 train_time:21471ms step_avg:60.48ms +step:356/2245 train_time:21529ms step_avg:60.48ms +step:357/2245 train_time:21591ms step_avg:60.48ms +step:358/2245 train_time:21650ms step_avg:60.47ms +step:359/2245 train_time:21711ms step_avg:60.48ms +step:360/2245 train_time:21770ms step_avg:60.47ms +step:361/2245 train_time:21831ms step_avg:60.47ms +step:362/2245 train_time:21890ms step_avg:60.47ms +step:363/2245 train_time:21952ms step_avg:60.47ms +step:364/2245 train_time:22011ms step_avg:60.47ms +step:365/2245 train_time:22073ms step_avg:60.47ms 
+step:366/2245 train_time:22132ms step_avg:60.47ms +step:367/2245 train_time:22193ms step_avg:60.47ms +step:368/2245 train_time:22252ms step_avg:60.47ms +step:369/2245 train_time:22313ms step_avg:60.47ms +step:370/2245 train_time:22372ms step_avg:60.46ms +step:371/2245 train_time:22433ms step_avg:60.47ms +step:372/2245 train_time:22492ms step_avg:60.46ms +step:373/2245 train_time:22554ms step_avg:60.47ms +step:374/2245 train_time:22613ms step_avg:60.46ms +step:375/2245 train_time:22674ms step_avg:60.46ms +step:376/2245 train_time:22733ms step_avg:60.46ms +step:377/2245 train_time:22794ms step_avg:60.46ms +step:378/2245 train_time:22853ms step_avg:60.46ms +step:379/2245 train_time:22915ms step_avg:60.46ms +step:380/2245 train_time:22974ms step_avg:60.46ms +step:381/2245 train_time:23035ms step_avg:60.46ms +step:382/2245 train_time:23094ms step_avg:60.46ms +step:383/2245 train_time:23155ms step_avg:60.46ms +step:384/2245 train_time:23214ms step_avg:60.45ms +step:385/2245 train_time:23275ms step_avg:60.46ms +step:386/2245 train_time:23334ms step_avg:60.45ms +step:387/2245 train_time:23396ms step_avg:60.45ms +step:388/2245 train_time:23454ms step_avg:60.45ms +step:389/2245 train_time:23516ms step_avg:60.45ms +step:390/2245 train_time:23574ms step_avg:60.45ms +step:391/2245 train_time:23636ms step_avg:60.45ms +step:392/2245 train_time:23695ms step_avg:60.45ms +step:393/2245 train_time:23756ms step_avg:60.45ms +step:394/2245 train_time:23816ms step_avg:60.45ms +step:395/2245 train_time:23877ms step_avg:60.45ms +step:396/2245 train_time:23936ms step_avg:60.44ms +step:397/2245 train_time:23998ms step_avg:60.45ms +step:398/2245 train_time:24057ms step_avg:60.44ms +step:399/2245 train_time:24118ms step_avg:60.45ms +step:400/2245 train_time:24177ms step_avg:60.44ms +step:401/2245 train_time:24238ms step_avg:60.44ms +step:402/2245 train_time:24297ms step_avg:60.44ms +step:403/2245 train_time:24359ms step_avg:60.44ms +step:404/2245 train_time:24418ms step_avg:60.44ms +step:405/2245 train_time:24480ms step_avg:60.44ms +step:406/2245 train_time:24539ms step_avg:60.44ms +step:407/2245 train_time:24600ms step_avg:60.44ms +step:408/2245 train_time:24659ms step_avg:60.44ms +step:409/2245 train_time:24721ms step_avg:60.44ms +step:410/2245 train_time:24780ms step_avg:60.44ms +step:411/2245 train_time:24841ms step_avg:60.44ms +step:412/2245 train_time:24900ms step_avg:60.44ms +step:413/2245 train_time:24961ms step_avg:60.44ms +step:414/2245 train_time:25021ms step_avg:60.44ms +step:415/2245 train_time:25082ms step_avg:60.44ms +step:416/2245 train_time:25141ms step_avg:60.44ms +step:417/2245 train_time:25202ms step_avg:60.44ms +step:418/2245 train_time:25261ms step_avg:60.43ms +step:419/2245 train_time:25323ms step_avg:60.44ms +step:420/2245 train_time:25383ms step_avg:60.44ms +step:421/2245 train_time:25445ms step_avg:60.44ms +step:422/2245 train_time:25504ms step_avg:60.44ms +step:423/2245 train_time:25565ms step_avg:60.44ms +step:424/2245 train_time:25625ms step_avg:60.44ms +step:425/2245 train_time:25686ms step_avg:60.44ms +step:426/2245 train_time:25745ms step_avg:60.44ms +step:427/2245 train_time:25807ms step_avg:60.44ms +step:428/2245 train_time:25865ms step_avg:60.43ms +step:429/2245 train_time:25927ms step_avg:60.44ms +step:430/2245 train_time:25987ms step_avg:60.43ms +step:431/2245 train_time:26048ms step_avg:60.44ms +step:432/2245 train_time:26107ms step_avg:60.43ms +step:433/2245 train_time:26168ms step_avg:60.43ms +step:434/2245 train_time:26227ms step_avg:60.43ms +step:435/2245 train_time:26288ms 
step_avg:60.43ms +step:436/2245 train_time:26347ms step_avg:60.43ms +step:437/2245 train_time:26409ms step_avg:60.43ms +step:438/2245 train_time:26468ms step_avg:60.43ms +step:439/2245 train_time:26531ms step_avg:60.44ms +step:440/2245 train_time:26590ms step_avg:60.43ms +step:441/2245 train_time:26651ms step_avg:60.43ms +step:442/2245 train_time:26710ms step_avg:60.43ms +step:443/2245 train_time:26771ms step_avg:60.43ms +step:444/2245 train_time:26829ms step_avg:60.43ms +step:445/2245 train_time:26891ms step_avg:60.43ms +step:446/2245 train_time:26950ms step_avg:60.43ms +step:447/2245 train_time:27012ms step_avg:60.43ms +step:448/2245 train_time:27070ms step_avg:60.43ms +step:449/2245 train_time:27133ms step_avg:60.43ms +step:450/2245 train_time:27191ms step_avg:60.42ms +step:451/2245 train_time:27253ms step_avg:60.43ms +step:452/2245 train_time:27311ms step_avg:60.42ms +step:453/2245 train_time:27373ms step_avg:60.43ms +step:454/2245 train_time:27432ms step_avg:60.42ms +step:455/2245 train_time:27494ms step_avg:60.43ms +step:456/2245 train_time:27553ms step_avg:60.42ms +step:457/2245 train_time:27614ms step_avg:60.42ms +step:458/2245 train_time:27673ms step_avg:60.42ms +step:459/2245 train_time:27734ms step_avg:60.42ms +step:460/2245 train_time:27793ms step_avg:60.42ms +step:461/2245 train_time:27854ms step_avg:60.42ms +step:462/2245 train_time:27913ms step_avg:60.42ms +step:463/2245 train_time:27974ms step_avg:60.42ms +step:464/2245 train_time:28033ms step_avg:60.42ms +step:465/2245 train_time:28094ms step_avg:60.42ms +step:466/2245 train_time:28153ms step_avg:60.41ms +step:467/2245 train_time:28215ms step_avg:60.42ms +step:468/2245 train_time:28274ms step_avg:60.41ms +step:469/2245 train_time:28336ms step_avg:60.42ms +step:470/2245 train_time:28395ms step_avg:60.41ms +step:471/2245 train_time:28456ms step_avg:60.42ms +step:472/2245 train_time:28516ms step_avg:60.41ms +step:473/2245 train_time:28577ms step_avg:60.42ms +step:474/2245 train_time:28636ms step_avg:60.41ms +step:475/2245 train_time:28699ms step_avg:60.42ms +step:476/2245 train_time:28758ms step_avg:60.42ms +step:477/2245 train_time:28819ms step_avg:60.42ms +step:478/2245 train_time:28878ms step_avg:60.41ms +step:479/2245 train_time:28940ms step_avg:60.42ms +step:480/2245 train_time:28999ms step_avg:60.41ms +step:481/2245 train_time:29060ms step_avg:60.42ms +step:482/2245 train_time:29119ms step_avg:60.41ms +step:483/2245 train_time:29181ms step_avg:60.42ms +step:484/2245 train_time:29241ms step_avg:60.41ms +step:485/2245 train_time:29302ms step_avg:60.42ms +step:486/2245 train_time:29361ms step_avg:60.41ms +step:487/2245 train_time:29423ms step_avg:60.42ms +step:488/2245 train_time:29483ms step_avg:60.42ms +step:489/2245 train_time:29545ms step_avg:60.42ms +step:490/2245 train_time:29605ms step_avg:60.42ms +step:491/2245 train_time:29667ms step_avg:60.42ms +step:492/2245 train_time:29727ms step_avg:60.42ms +step:493/2245 train_time:29788ms step_avg:60.42ms +step:494/2245 train_time:29847ms step_avg:60.42ms +step:495/2245 train_time:29908ms step_avg:60.42ms +step:496/2245 train_time:29968ms step_avg:60.42ms +step:497/2245 train_time:30029ms step_avg:60.42ms +step:498/2245 train_time:30088ms step_avg:60.42ms +step:499/2245 train_time:30150ms step_avg:60.42ms +step:500/2245 train_time:30209ms step_avg:60.42ms +step:500/2245 val_loss:3.8189 train_time:30272ms step_avg:60.54ms +step:501/2245 train_time:30292ms step_avg:60.46ms +step:502/2245 train_time:30333ms step_avg:60.42ms +step:503/2245 train_time:30398ms step_avg:60.43ms 
+step:504/2245 train_time:30458ms step_avg:60.43ms +step:505/2245 train_time:30520ms step_avg:60.44ms +step:506/2245 train_time:30580ms step_avg:60.43ms +step:507/2245 train_time:30640ms step_avg:60.43ms +step:508/2245 train_time:30699ms step_avg:60.43ms +step:509/2245 train_time:30761ms step_avg:60.43ms +step:510/2245 train_time:30819ms step_avg:60.43ms +step:511/2245 train_time:30880ms step_avg:60.43ms +step:512/2245 train_time:30939ms step_avg:60.43ms +step:513/2245 train_time:30999ms step_avg:60.43ms +step:514/2245 train_time:31058ms step_avg:60.42ms +step:515/2245 train_time:31118ms step_avg:60.42ms +step:516/2245 train_time:31177ms step_avg:60.42ms +step:517/2245 train_time:31241ms step_avg:60.43ms +step:518/2245 train_time:31302ms step_avg:60.43ms +step:519/2245 train_time:31365ms step_avg:60.43ms +step:520/2245 train_time:31425ms step_avg:60.43ms +step:521/2245 train_time:31487ms step_avg:60.44ms +step:522/2245 train_time:31548ms step_avg:60.44ms +step:523/2245 train_time:31609ms step_avg:60.44ms +step:524/2245 train_time:31668ms step_avg:60.44ms +step:525/2245 train_time:31729ms step_avg:60.44ms +step:526/2245 train_time:31788ms step_avg:60.43ms +step:527/2245 train_time:31850ms step_avg:60.44ms +step:528/2245 train_time:31908ms step_avg:60.43ms +step:529/2245 train_time:31969ms step_avg:60.43ms +step:530/2245 train_time:32028ms step_avg:60.43ms +step:531/2245 train_time:32089ms step_avg:60.43ms +step:532/2245 train_time:32148ms step_avg:60.43ms +step:533/2245 train_time:32211ms step_avg:60.43ms +step:534/2245 train_time:32270ms step_avg:60.43ms +step:535/2245 train_time:32333ms step_avg:60.44ms +step:536/2245 train_time:32392ms step_avg:60.43ms +step:537/2245 train_time:32453ms step_avg:60.43ms +step:538/2245 train_time:32512ms step_avg:60.43ms +step:539/2245 train_time:32573ms step_avg:60.43ms +step:540/2245 train_time:32633ms step_avg:60.43ms +step:541/2245 train_time:32693ms step_avg:60.43ms +step:542/2245 train_time:32752ms step_avg:60.43ms +step:543/2245 train_time:32813ms step_avg:60.43ms +step:544/2245 train_time:32871ms step_avg:60.43ms +step:545/2245 train_time:32933ms step_avg:60.43ms +step:546/2245 train_time:32992ms step_avg:60.42ms +step:547/2245 train_time:33053ms step_avg:60.43ms +step:548/2245 train_time:33112ms step_avg:60.42ms +step:549/2245 train_time:33174ms step_avg:60.43ms +step:550/2245 train_time:33233ms step_avg:60.42ms +step:551/2245 train_time:33295ms step_avg:60.43ms +step:552/2245 train_time:33354ms step_avg:60.42ms +step:553/2245 train_time:33415ms step_avg:60.42ms +step:554/2245 train_time:33474ms step_avg:60.42ms +step:555/2245 train_time:33535ms step_avg:60.42ms +step:556/2245 train_time:33594ms step_avg:60.42ms +step:557/2245 train_time:33656ms step_avg:60.42ms +step:558/2245 train_time:33715ms step_avg:60.42ms +step:559/2245 train_time:33776ms step_avg:60.42ms +step:560/2245 train_time:33835ms step_avg:60.42ms +step:561/2245 train_time:33896ms step_avg:60.42ms +step:562/2245 train_time:33957ms step_avg:60.42ms +step:563/2245 train_time:34017ms step_avg:60.42ms +step:564/2245 train_time:34076ms step_avg:60.42ms +step:565/2245 train_time:34138ms step_avg:60.42ms +step:566/2245 train_time:34197ms step_avg:60.42ms +step:567/2245 train_time:34258ms step_avg:60.42ms +step:568/2245 train_time:34317ms step_avg:60.42ms +step:569/2245 train_time:34379ms step_avg:60.42ms +step:570/2245 train_time:34438ms step_avg:60.42ms +step:571/2245 train_time:34499ms step_avg:60.42ms +step:572/2245 train_time:34558ms step_avg:60.42ms +step:573/2245 train_time:34619ms 
step_avg:60.42ms +step:574/2245 train_time:34679ms step_avg:60.42ms +step:575/2245 train_time:34740ms step_avg:60.42ms +step:576/2245 train_time:34799ms step_avg:60.42ms +step:577/2245 train_time:34860ms step_avg:60.42ms +step:578/2245 train_time:34919ms step_avg:60.41ms +step:579/2245 train_time:34981ms step_avg:60.42ms +step:580/2245 train_time:35040ms step_avg:60.41ms +step:581/2245 train_time:35102ms step_avg:60.42ms +step:582/2245 train_time:35162ms step_avg:60.42ms +step:583/2245 train_time:35224ms step_avg:60.42ms +step:584/2245 train_time:35283ms step_avg:60.42ms +step:585/2245 train_time:35345ms step_avg:60.42ms +step:586/2245 train_time:35404ms step_avg:60.42ms +step:587/2245 train_time:35467ms step_avg:60.42ms +step:588/2245 train_time:35527ms step_avg:60.42ms +step:589/2245 train_time:35589ms step_avg:60.42ms +step:590/2245 train_time:35648ms step_avg:60.42ms +step:591/2245 train_time:35709ms step_avg:60.42ms +step:592/2245 train_time:35769ms step_avg:60.42ms +step:593/2245 train_time:35830ms step_avg:60.42ms +step:594/2245 train_time:35889ms step_avg:60.42ms +step:595/2245 train_time:35950ms step_avg:60.42ms +step:596/2245 train_time:36010ms step_avg:60.42ms +step:597/2245 train_time:36071ms step_avg:60.42ms +step:598/2245 train_time:36130ms step_avg:60.42ms +step:599/2245 train_time:36192ms step_avg:60.42ms +step:600/2245 train_time:36250ms step_avg:60.42ms +step:601/2245 train_time:36312ms step_avg:60.42ms +step:602/2245 train_time:36371ms step_avg:60.42ms +step:603/2245 train_time:36432ms step_avg:60.42ms +step:604/2245 train_time:36491ms step_avg:60.42ms +step:605/2245 train_time:36552ms step_avg:60.42ms +step:606/2245 train_time:36611ms step_avg:60.41ms +step:607/2245 train_time:36673ms step_avg:60.42ms +step:608/2245 train_time:36731ms step_avg:60.41ms +step:609/2245 train_time:36793ms step_avg:60.42ms +step:610/2245 train_time:36852ms step_avg:60.41ms +step:611/2245 train_time:36914ms step_avg:60.42ms +step:612/2245 train_time:36973ms step_avg:60.41ms +step:613/2245 train_time:37034ms step_avg:60.42ms +step:614/2245 train_time:37094ms step_avg:60.41ms +step:615/2245 train_time:37156ms step_avg:60.42ms +step:616/2245 train_time:37215ms step_avg:60.41ms +step:617/2245 train_time:37277ms step_avg:60.42ms +step:618/2245 train_time:37336ms step_avg:60.41ms +step:619/2245 train_time:37398ms step_avg:60.42ms +step:620/2245 train_time:37457ms step_avg:60.41ms +step:621/2245 train_time:37518ms step_avg:60.42ms +step:622/2245 train_time:37578ms step_avg:60.41ms +step:623/2245 train_time:37639ms step_avg:60.42ms +step:624/2245 train_time:37698ms step_avg:60.41ms +step:625/2245 train_time:37760ms step_avg:60.42ms +step:626/2245 train_time:37820ms step_avg:60.41ms +step:627/2245 train_time:37881ms step_avg:60.42ms +step:628/2245 train_time:37941ms step_avg:60.42ms +step:629/2245 train_time:38002ms step_avg:60.42ms +step:630/2245 train_time:38061ms step_avg:60.41ms +step:631/2245 train_time:38123ms step_avg:60.42ms +step:632/2245 train_time:38183ms step_avg:60.42ms +step:633/2245 train_time:38244ms step_avg:60.42ms +step:634/2245 train_time:38304ms step_avg:60.42ms +step:635/2245 train_time:38366ms step_avg:60.42ms +step:636/2245 train_time:38426ms step_avg:60.42ms +step:637/2245 train_time:38487ms step_avg:60.42ms +step:638/2245 train_time:38546ms step_avg:60.42ms +step:639/2245 train_time:38607ms step_avg:60.42ms +step:640/2245 train_time:38668ms step_avg:60.42ms +step:641/2245 train_time:38729ms step_avg:60.42ms +step:642/2245 train_time:38788ms step_avg:60.42ms +step:643/2245 
train_time:38850ms step_avg:60.42ms +step:644/2245 train_time:38909ms step_avg:60.42ms +step:645/2245 train_time:38970ms step_avg:60.42ms +step:646/2245 train_time:39030ms step_avg:60.42ms +step:647/2245 train_time:39093ms step_avg:60.42ms +step:648/2245 train_time:39152ms step_avg:60.42ms +step:649/2245 train_time:39213ms step_avg:60.42ms +step:650/2245 train_time:39271ms step_avg:60.42ms +step:651/2245 train_time:39333ms step_avg:60.42ms +step:652/2245 train_time:39392ms step_avg:60.42ms +step:653/2245 train_time:39453ms step_avg:60.42ms +step:654/2245 train_time:39511ms step_avg:60.41ms +step:655/2245 train_time:39573ms step_avg:60.42ms +step:656/2245 train_time:39631ms step_avg:60.41ms +step:657/2245 train_time:39693ms step_avg:60.41ms +step:658/2245 train_time:39751ms step_avg:60.41ms +step:659/2245 train_time:39813ms step_avg:60.41ms +step:660/2245 train_time:39871ms step_avg:60.41ms +step:661/2245 train_time:39933ms step_avg:60.41ms +step:662/2245 train_time:39992ms step_avg:60.41ms +step:663/2245 train_time:40054ms step_avg:60.41ms +step:664/2245 train_time:40112ms step_avg:60.41ms +step:665/2245 train_time:40175ms step_avg:60.41ms +step:666/2245 train_time:40233ms step_avg:60.41ms +step:667/2245 train_time:40295ms step_avg:60.41ms +step:668/2245 train_time:40354ms step_avg:60.41ms +step:669/2245 train_time:40415ms step_avg:60.41ms +step:670/2245 train_time:40474ms step_avg:60.41ms +step:671/2245 train_time:40535ms step_avg:60.41ms +step:672/2245 train_time:40594ms step_avg:60.41ms +step:673/2245 train_time:40656ms step_avg:60.41ms +step:674/2245 train_time:40715ms step_avg:60.41ms +step:675/2245 train_time:40776ms step_avg:60.41ms +step:676/2245 train_time:40836ms step_avg:60.41ms +step:677/2245 train_time:40898ms step_avg:60.41ms +step:678/2245 train_time:40957ms step_avg:60.41ms +step:679/2245 train_time:41019ms step_avg:60.41ms +step:680/2245 train_time:41078ms step_avg:60.41ms +step:681/2245 train_time:41140ms step_avg:60.41ms +step:682/2245 train_time:41199ms step_avg:60.41ms +step:683/2245 train_time:41260ms step_avg:60.41ms +step:684/2245 train_time:41320ms step_avg:60.41ms +step:685/2245 train_time:41381ms step_avg:60.41ms +step:686/2245 train_time:41441ms step_avg:60.41ms +step:687/2245 train_time:41502ms step_avg:60.41ms +step:688/2245 train_time:41561ms step_avg:60.41ms +step:689/2245 train_time:41623ms step_avg:60.41ms +step:690/2245 train_time:41683ms step_avg:60.41ms +step:691/2245 train_time:41744ms step_avg:60.41ms +step:692/2245 train_time:41804ms step_avg:60.41ms +step:693/2245 train_time:41866ms step_avg:60.41ms +step:694/2245 train_time:41926ms step_avg:60.41ms +step:695/2245 train_time:41987ms step_avg:60.41ms +step:696/2245 train_time:42047ms step_avg:60.41ms +step:697/2245 train_time:42108ms step_avg:60.41ms +step:698/2245 train_time:42168ms step_avg:60.41ms +step:699/2245 train_time:42230ms step_avg:60.42ms +step:700/2245 train_time:42289ms step_avg:60.41ms +step:701/2245 train_time:42350ms step_avg:60.41ms +step:702/2245 train_time:42410ms step_avg:60.41ms +step:703/2245 train_time:42471ms step_avg:60.41ms +step:704/2245 train_time:42531ms step_avg:60.41ms +step:705/2245 train_time:42593ms step_avg:60.42ms +step:706/2245 train_time:42652ms step_avg:60.41ms +step:707/2245 train_time:42713ms step_avg:60.42ms +step:708/2245 train_time:42772ms step_avg:60.41ms +step:709/2245 train_time:42834ms step_avg:60.41ms +step:710/2245 train_time:42892ms step_avg:60.41ms +step:711/2245 train_time:42954ms step_avg:60.41ms +step:712/2245 train_time:43012ms step_avg:60.41ms 
+step:713/2245 train_time:43073ms step_avg:60.41ms +step:714/2245 train_time:43133ms step_avg:60.41ms +step:715/2245 train_time:43194ms step_avg:60.41ms +step:716/2245 train_time:43253ms step_avg:60.41ms +step:717/2245 train_time:43315ms step_avg:60.41ms +step:718/2245 train_time:43373ms step_avg:60.41ms +step:719/2245 train_time:43435ms step_avg:60.41ms +step:720/2245 train_time:43494ms step_avg:60.41ms +step:721/2245 train_time:43558ms step_avg:60.41ms +step:722/2245 train_time:44001ms step_avg:60.94ms +step:723/2245 train_time:44060ms step_avg:60.94ms +step:724/2245 train_time:44118ms step_avg:60.94ms +step:725/2245 train_time:44179ms step_avg:60.94ms +step:726/2245 train_time:44237ms step_avg:60.93ms +step:727/2245 train_time:44298ms step_avg:60.93ms +step:728/2245 train_time:44356ms step_avg:60.93ms +step:729/2245 train_time:44417ms step_avg:60.93ms +step:730/2245 train_time:44475ms step_avg:60.92ms +step:731/2245 train_time:44535ms step_avg:60.92ms +step:732/2245 train_time:44594ms step_avg:60.92ms +step:733/2245 train_time:44654ms step_avg:60.92ms +step:734/2245 train_time:44713ms step_avg:60.92ms +step:735/2245 train_time:44773ms step_avg:60.92ms +step:736/2245 train_time:44834ms step_avg:60.92ms +step:737/2245 train_time:44904ms step_avg:60.93ms +step:738/2245 train_time:44966ms step_avg:60.93ms +step:739/2245 train_time:45029ms step_avg:60.93ms +step:740/2245 train_time:45089ms step_avg:60.93ms +step:741/2245 train_time:45152ms step_avg:60.93ms +step:742/2245 train_time:45211ms step_avg:60.93ms +step:743/2245 train_time:45274ms step_avg:60.93ms +step:744/2245 train_time:45333ms step_avg:60.93ms +step:745/2245 train_time:45394ms step_avg:60.93ms +step:746/2245 train_time:45453ms step_avg:60.93ms +step:747/2245 train_time:45515ms step_avg:60.93ms +step:748/2245 train_time:45574ms step_avg:60.93ms +step:749/2245 train_time:45635ms step_avg:60.93ms +step:750/2245 train_time:45694ms step_avg:60.93ms +step:750/2245 val_loss:3.6675 train_time:45756ms step_avg:61.01ms +step:751/2245 train_time:45778ms step_avg:60.96ms +step:752/2245 train_time:45819ms step_avg:60.93ms +step:753/2245 train_time:45880ms step_avg:60.93ms +step:754/2245 train_time:45940ms step_avg:60.93ms +step:755/2245 train_time:46004ms step_avg:60.93ms +step:756/2245 train_time:46064ms step_avg:60.93ms +step:757/2245 train_time:46126ms step_avg:60.93ms +step:758/2245 train_time:46185ms step_avg:60.93ms +step:759/2245 train_time:46246ms step_avg:60.93ms +step:760/2245 train_time:46305ms step_avg:60.93ms +step:761/2245 train_time:46366ms step_avg:60.93ms +step:762/2245 train_time:46425ms step_avg:60.93ms +step:763/2245 train_time:46487ms step_avg:60.93ms +step:764/2245 train_time:46546ms step_avg:60.92ms +step:765/2245 train_time:46608ms step_avg:60.93ms +step:766/2245 train_time:46672ms step_avg:60.93ms +step:767/2245 train_time:46739ms step_avg:60.94ms +step:768/2245 train_time:46801ms step_avg:60.94ms +step:769/2245 train_time:46863ms step_avg:60.94ms +step:770/2245 train_time:46923ms step_avg:60.94ms +step:771/2245 train_time:46985ms step_avg:60.94ms +step:772/2245 train_time:47045ms step_avg:60.94ms +step:773/2245 train_time:47107ms step_avg:60.94ms +step:774/2245 train_time:47166ms step_avg:60.94ms +step:775/2245 train_time:47228ms step_avg:60.94ms +step:776/2245 train_time:47288ms step_avg:60.94ms +step:777/2245 train_time:47349ms step_avg:60.94ms +step:778/2245 train_time:47409ms step_avg:60.94ms +step:779/2245 train_time:47471ms step_avg:60.94ms +step:780/2245 train_time:47533ms step_avg:60.94ms +step:781/2245 
train_time:47594ms step_avg:60.94ms +step:782/2245 train_time:47656ms step_avg:60.94ms +step:783/2245 train_time:47720ms step_avg:60.94ms +step:784/2245 train_time:47781ms step_avg:60.94ms +step:785/2245 train_time:47843ms step_avg:60.95ms +step:786/2245 train_time:47904ms step_avg:60.95ms +step:787/2245 train_time:47966ms step_avg:60.95ms +step:788/2245 train_time:48025ms step_avg:60.95ms +step:789/2245 train_time:48087ms step_avg:60.95ms +step:790/2245 train_time:48146ms step_avg:60.94ms +step:791/2245 train_time:48208ms step_avg:60.95ms +step:792/2245 train_time:48268ms step_avg:60.94ms +step:793/2245 train_time:48330ms step_avg:60.95ms +step:794/2245 train_time:48389ms step_avg:60.94ms +step:795/2245 train_time:48451ms step_avg:60.94ms +step:796/2245 train_time:48511ms step_avg:60.94ms +step:797/2245 train_time:48574ms step_avg:60.95ms +step:798/2245 train_time:48634ms step_avg:60.94ms +step:799/2245 train_time:48698ms step_avg:60.95ms +step:800/2245 train_time:48759ms step_avg:60.95ms +step:801/2245 train_time:48822ms step_avg:60.95ms +step:802/2245 train_time:48882ms step_avg:60.95ms +step:803/2245 train_time:48944ms step_avg:60.95ms +step:804/2245 train_time:49003ms step_avg:60.95ms +step:805/2245 train_time:49065ms step_avg:60.95ms +step:806/2245 train_time:49125ms step_avg:60.95ms +step:807/2245 train_time:49187ms step_avg:60.95ms +step:808/2245 train_time:49246ms step_avg:60.95ms +step:809/2245 train_time:49309ms step_avg:60.95ms +step:810/2245 train_time:49368ms step_avg:60.95ms +step:811/2245 train_time:49430ms step_avg:60.95ms +step:812/2245 train_time:49491ms step_avg:60.95ms +step:813/2245 train_time:49553ms step_avg:60.95ms +step:814/2245 train_time:49613ms step_avg:60.95ms +step:815/2245 train_time:49676ms step_avg:60.95ms +step:816/2245 train_time:49737ms step_avg:60.95ms +step:817/2245 train_time:49800ms step_avg:60.96ms +step:818/2245 train_time:49860ms step_avg:60.95ms +step:819/2245 train_time:49922ms step_avg:60.96ms +step:820/2245 train_time:49982ms step_avg:60.95ms +step:821/2245 train_time:50044ms step_avg:60.96ms +step:822/2245 train_time:50104ms step_avg:60.95ms +step:823/2245 train_time:50166ms step_avg:60.96ms +step:824/2245 train_time:50225ms step_avg:60.95ms +step:825/2245 train_time:50287ms step_avg:60.95ms +step:826/2245 train_time:50347ms step_avg:60.95ms +step:827/2245 train_time:50410ms step_avg:60.96ms +step:828/2245 train_time:50470ms step_avg:60.95ms +step:829/2245 train_time:50532ms step_avg:60.96ms +step:830/2245 train_time:50593ms step_avg:60.96ms +step:831/2245 train_time:50657ms step_avg:60.96ms +step:832/2245 train_time:50717ms step_avg:60.96ms +step:833/2245 train_time:50779ms step_avg:60.96ms +step:834/2245 train_time:50839ms step_avg:60.96ms +step:835/2245 train_time:50902ms step_avg:60.96ms +step:836/2245 train_time:50961ms step_avg:60.96ms +step:837/2245 train_time:51023ms step_avg:60.96ms +step:838/2245 train_time:51083ms step_avg:60.96ms +step:839/2245 train_time:51146ms step_avg:60.96ms +step:840/2245 train_time:51205ms step_avg:60.96ms +step:841/2245 train_time:51267ms step_avg:60.96ms +step:842/2245 train_time:51326ms step_avg:60.96ms +step:843/2245 train_time:51388ms step_avg:60.96ms +step:844/2245 train_time:51448ms step_avg:60.96ms +step:845/2245 train_time:51510ms step_avg:60.96ms +step:846/2245 train_time:51570ms step_avg:60.96ms +step:847/2245 train_time:51634ms step_avg:60.96ms +step:848/2245 train_time:51695ms step_avg:60.96ms +step:849/2245 train_time:51758ms step_avg:60.96ms +step:850/2245 train_time:51818ms step_avg:60.96ms 
+step:851/2245 train_time:51880ms step_avg:60.96ms +step:852/2245 train_time:51940ms step_avg:60.96ms +step:853/2245 train_time:52002ms step_avg:60.96ms +step:854/2245 train_time:52062ms step_avg:60.96ms +step:855/2245 train_time:52124ms step_avg:60.96ms +step:856/2245 train_time:52183ms step_avg:60.96ms +step:857/2245 train_time:52245ms step_avg:60.96ms +step:858/2245 train_time:52304ms step_avg:60.96ms +step:859/2245 train_time:52367ms step_avg:60.96ms +step:860/2245 train_time:52426ms step_avg:60.96ms +step:861/2245 train_time:52489ms step_avg:60.96ms +step:862/2245 train_time:52549ms step_avg:60.96ms +step:863/2245 train_time:52612ms step_avg:60.96ms +step:864/2245 train_time:52672ms step_avg:60.96ms +step:865/2245 train_time:52735ms step_avg:60.97ms +step:866/2245 train_time:52796ms step_avg:60.96ms +step:867/2245 train_time:52857ms step_avg:60.97ms +step:868/2245 train_time:52917ms step_avg:60.96ms +step:869/2245 train_time:52979ms step_avg:60.97ms +step:870/2245 train_time:53040ms step_avg:60.97ms +step:871/2245 train_time:53102ms step_avg:60.97ms +step:872/2245 train_time:53162ms step_avg:60.97ms +step:873/2245 train_time:53224ms step_avg:60.97ms +step:874/2245 train_time:53283ms step_avg:60.96ms +step:875/2245 train_time:53345ms step_avg:60.97ms +step:876/2245 train_time:53405ms step_avg:60.96ms +step:877/2245 train_time:53468ms step_avg:60.97ms +step:878/2245 train_time:53530ms step_avg:60.97ms +step:879/2245 train_time:53591ms step_avg:60.97ms +step:880/2245 train_time:53652ms step_avg:60.97ms +step:881/2245 train_time:53714ms step_avg:60.97ms +step:882/2245 train_time:53775ms step_avg:60.97ms +step:883/2245 train_time:53838ms step_avg:60.97ms +step:884/2245 train_time:53898ms step_avg:60.97ms +step:885/2245 train_time:53960ms step_avg:60.97ms +step:886/2245 train_time:54020ms step_avg:60.97ms +step:887/2245 train_time:54082ms step_avg:60.97ms +step:888/2245 train_time:54142ms step_avg:60.97ms +step:889/2245 train_time:54204ms step_avg:60.97ms +step:890/2245 train_time:54263ms step_avg:60.97ms +step:891/2245 train_time:54328ms step_avg:60.97ms +step:892/2245 train_time:54385ms step_avg:60.97ms +step:893/2245 train_time:54448ms step_avg:60.97ms +step:894/2245 train_time:54508ms step_avg:60.97ms +step:895/2245 train_time:54571ms step_avg:60.97ms +step:896/2245 train_time:54632ms step_avg:60.97ms +step:897/2245 train_time:54694ms step_avg:60.97ms +step:898/2245 train_time:54755ms step_avg:60.97ms +step:899/2245 train_time:54818ms step_avg:60.98ms +step:900/2245 train_time:54878ms step_avg:60.98ms +step:901/2245 train_time:54940ms step_avg:60.98ms +step:902/2245 train_time:55000ms step_avg:60.98ms +step:903/2245 train_time:55062ms step_avg:60.98ms +step:904/2245 train_time:55122ms step_avg:60.98ms +step:905/2245 train_time:55184ms step_avg:60.98ms +step:906/2245 train_time:55243ms step_avg:60.98ms +step:907/2245 train_time:55305ms step_avg:60.98ms +step:908/2245 train_time:55365ms step_avg:60.97ms +step:909/2245 train_time:55428ms step_avg:60.98ms +step:910/2245 train_time:55488ms step_avg:60.98ms +step:911/2245 train_time:55550ms step_avg:60.98ms +step:912/2245 train_time:55611ms step_avg:60.98ms +step:913/2245 train_time:55673ms step_avg:60.98ms +step:914/2245 train_time:55734ms step_avg:60.98ms +step:915/2245 train_time:55797ms step_avg:60.98ms +step:916/2245 train_time:55857ms step_avg:60.98ms +step:917/2245 train_time:55919ms step_avg:60.98ms +step:918/2245 train_time:55979ms step_avg:60.98ms +step:919/2245 train_time:56041ms step_avg:60.98ms +step:920/2245 train_time:56101ms 
step_avg:60.98ms +step:921/2245 train_time:56162ms step_avg:60.98ms +step:922/2245 train_time:56222ms step_avg:60.98ms +step:923/2245 train_time:56284ms step_avg:60.98ms +step:924/2245 train_time:56344ms step_avg:60.98ms +step:925/2245 train_time:56406ms step_avg:60.98ms +step:926/2245 train_time:56467ms step_avg:60.98ms +step:927/2245 train_time:56530ms step_avg:60.98ms +step:928/2245 train_time:56589ms step_avg:60.98ms +step:929/2245 train_time:56652ms step_avg:60.98ms +step:930/2245 train_time:56712ms step_avg:60.98ms +step:931/2245 train_time:56775ms step_avg:60.98ms +step:932/2245 train_time:56837ms step_avg:60.98ms +step:933/2245 train_time:56898ms step_avg:60.98ms +step:934/2245 train_time:56958ms step_avg:60.98ms +step:935/2245 train_time:57021ms step_avg:60.99ms +step:936/2245 train_time:57081ms step_avg:60.98ms +step:937/2245 train_time:57144ms step_avg:60.99ms +step:938/2245 train_time:57204ms step_avg:60.98ms +step:939/2245 train_time:57265ms step_avg:60.99ms +step:940/2245 train_time:57325ms step_avg:60.98ms +step:941/2245 train_time:57386ms step_avg:60.98ms +step:942/2245 train_time:57446ms step_avg:60.98ms +step:943/2245 train_time:57508ms step_avg:60.98ms +step:944/2245 train_time:57568ms step_avg:60.98ms +step:945/2245 train_time:57631ms step_avg:60.99ms +step:946/2245 train_time:57692ms step_avg:60.98ms +step:947/2245 train_time:57755ms step_avg:60.99ms +step:948/2245 train_time:57816ms step_avg:60.99ms +step:949/2245 train_time:57878ms step_avg:60.99ms +step:950/2245 train_time:57939ms step_avg:60.99ms +step:951/2245 train_time:58002ms step_avg:60.99ms +step:952/2245 train_time:58062ms step_avg:60.99ms +step:953/2245 train_time:58124ms step_avg:60.99ms +step:954/2245 train_time:58184ms step_avg:60.99ms +step:955/2245 train_time:58246ms step_avg:60.99ms +step:956/2245 train_time:58306ms step_avg:60.99ms +step:957/2245 train_time:58368ms step_avg:60.99ms +step:958/2245 train_time:58428ms step_avg:60.99ms +step:959/2245 train_time:58490ms step_avg:60.99ms +step:960/2245 train_time:58550ms step_avg:60.99ms +step:961/2245 train_time:58612ms step_avg:60.99ms +step:962/2245 train_time:58672ms step_avg:60.99ms +step:963/2245 train_time:58735ms step_avg:60.99ms +step:964/2245 train_time:58796ms step_avg:60.99ms +step:965/2245 train_time:58859ms step_avg:60.99ms +step:966/2245 train_time:58919ms step_avg:60.99ms +step:967/2245 train_time:58981ms step_avg:60.99ms +step:968/2245 train_time:59041ms step_avg:60.99ms +step:969/2245 train_time:59102ms step_avg:60.99ms +step:970/2245 train_time:59162ms step_avg:60.99ms +step:971/2245 train_time:59225ms step_avg:60.99ms +step:972/2245 train_time:59285ms step_avg:60.99ms +step:973/2245 train_time:59346ms step_avg:60.99ms +step:974/2245 train_time:59406ms step_avg:60.99ms +step:975/2245 train_time:59468ms step_avg:60.99ms +step:976/2245 train_time:59528ms step_avg:60.99ms +step:977/2245 train_time:59590ms step_avg:60.99ms +step:978/2245 train_time:59650ms step_avg:60.99ms +step:979/2245 train_time:59713ms step_avg:60.99ms +step:980/2245 train_time:59774ms step_avg:60.99ms +step:981/2245 train_time:59837ms step_avg:61.00ms +step:982/2245 train_time:59897ms step_avg:61.00ms +step:983/2245 train_time:59960ms step_avg:61.00ms +step:984/2245 train_time:60021ms step_avg:61.00ms +step:985/2245 train_time:60082ms step_avg:61.00ms +step:986/2245 train_time:60142ms step_avg:61.00ms +step:987/2245 train_time:60204ms step_avg:61.00ms +step:988/2245 train_time:60264ms step_avg:61.00ms +step:989/2245 train_time:60326ms step_avg:61.00ms +step:990/2245 
train_time:60385ms step_avg:61.00ms +step:991/2245 train_time:60447ms step_avg:61.00ms +step:992/2245 train_time:60507ms step_avg:61.00ms +step:993/2245 train_time:60570ms step_avg:61.00ms +step:994/2245 train_time:60630ms step_avg:61.00ms +step:995/2245 train_time:60693ms step_avg:61.00ms +step:996/2245 train_time:60753ms step_avg:61.00ms +step:997/2245 train_time:60816ms step_avg:61.00ms +step:998/2245 train_time:60877ms step_avg:61.00ms +step:999/2245 train_time:60939ms step_avg:61.00ms +step:1000/2245 train_time:61000ms step_avg:61.00ms +step:1000/2245 val_loss:3.5894 train_time:61064ms step_avg:61.06ms +step:1001/2245 train_time:61085ms step_avg:61.02ms +step:1002/2245 train_time:61128ms step_avg:61.01ms +step:1003/2245 train_time:61194ms step_avg:61.01ms +step:1004/2245 train_time:61254ms step_avg:61.01ms +step:1005/2245 train_time:61315ms step_avg:61.01ms +step:1006/2245 train_time:61375ms step_avg:61.01ms +step:1007/2245 train_time:61436ms step_avg:61.01ms +step:1008/2245 train_time:61495ms step_avg:61.01ms +step:1009/2245 train_time:61557ms step_avg:61.01ms +step:1010/2245 train_time:61616ms step_avg:61.01ms +step:1011/2245 train_time:61678ms step_avg:61.01ms +step:1012/2245 train_time:61737ms step_avg:61.00ms +step:1013/2245 train_time:61799ms step_avg:61.01ms +step:1014/2245 train_time:61859ms step_avg:61.01ms +step:1015/2245 train_time:61922ms step_avg:61.01ms +step:1016/2245 train_time:61982ms step_avg:61.01ms +step:1017/2245 train_time:62047ms step_avg:61.01ms +step:1018/2245 train_time:62109ms step_avg:61.01ms +step:1019/2245 train_time:62173ms step_avg:61.01ms +step:1020/2245 train_time:62234ms step_avg:61.01ms +step:1021/2245 train_time:62296ms step_avg:61.02ms +step:1022/2245 train_time:62356ms step_avg:61.01ms +step:1023/2245 train_time:62418ms step_avg:61.01ms +step:1024/2245 train_time:62478ms step_avg:61.01ms +step:1025/2245 train_time:62540ms step_avg:61.01ms +step:1026/2245 train_time:62600ms step_avg:61.01ms +step:1027/2245 train_time:62662ms step_avg:61.01ms +step:1028/2245 train_time:62722ms step_avg:61.01ms +step:1029/2245 train_time:62784ms step_avg:61.01ms +step:1030/2245 train_time:62843ms step_avg:61.01ms +step:1031/2245 train_time:62906ms step_avg:61.01ms +step:1032/2245 train_time:62968ms step_avg:61.02ms +step:1033/2245 train_time:63030ms step_avg:61.02ms +step:1034/2245 train_time:63090ms step_avg:61.02ms +step:1035/2245 train_time:63154ms step_avg:61.02ms +step:1036/2245 train_time:63214ms step_avg:61.02ms +step:1037/2245 train_time:63277ms step_avg:61.02ms +step:1038/2245 train_time:63338ms step_avg:61.02ms +step:1039/2245 train_time:63400ms step_avg:61.02ms +step:1040/2245 train_time:63460ms step_avg:61.02ms +step:1041/2245 train_time:63522ms step_avg:61.02ms +step:1042/2245 train_time:63582ms step_avg:61.02ms +step:1043/2245 train_time:63644ms step_avg:61.02ms +step:1044/2245 train_time:63703ms step_avg:61.02ms +step:1045/2245 train_time:63765ms step_avg:61.02ms +step:1046/2245 train_time:63825ms step_avg:61.02ms +step:1047/2245 train_time:63887ms step_avg:61.02ms +step:1048/2245 train_time:63948ms step_avg:61.02ms +step:1049/2245 train_time:64011ms step_avg:61.02ms +step:1050/2245 train_time:64071ms step_avg:61.02ms +step:1051/2245 train_time:64135ms step_avg:61.02ms +step:1052/2245 train_time:64195ms step_avg:61.02ms +step:1053/2245 train_time:64258ms step_avg:61.02ms +step:1054/2245 train_time:64318ms step_avg:61.02ms +step:1055/2245 train_time:64380ms step_avg:61.02ms +step:1056/2245 train_time:64439ms step_avg:61.02ms +step:1057/2245 
train_time:64501ms step_avg:61.02ms +step:1058/2245 train_time:64561ms step_avg:61.02ms +step:1059/2245 train_time:64623ms step_avg:61.02ms +step:1060/2245 train_time:64684ms step_avg:61.02ms +step:1061/2245 train_time:64745ms step_avg:61.02ms +step:1062/2245 train_time:64805ms step_avg:61.02ms +step:1063/2245 train_time:64868ms step_avg:61.02ms +step:1064/2245 train_time:64928ms step_avg:61.02ms +step:1065/2245 train_time:64990ms step_avg:61.02ms +step:1066/2245 train_time:65050ms step_avg:61.02ms +step:1067/2245 train_time:65113ms step_avg:61.02ms +step:1068/2245 train_time:65173ms step_avg:61.02ms +step:1069/2245 train_time:65235ms step_avg:61.02ms +step:1070/2245 train_time:65294ms step_avg:61.02ms +step:1071/2245 train_time:65360ms step_avg:61.03ms +step:1072/2245 train_time:65416ms step_avg:61.02ms +step:1073/2245 train_time:65479ms step_avg:61.02ms +step:1074/2245 train_time:65538ms step_avg:61.02ms +step:1075/2245 train_time:65601ms step_avg:61.02ms +step:1076/2245 train_time:65661ms step_avg:61.02ms +step:1077/2245 train_time:65723ms step_avg:61.02ms +step:1078/2245 train_time:65784ms step_avg:61.02ms +step:1079/2245 train_time:65846ms step_avg:61.03ms +step:1080/2245 train_time:65907ms step_avg:61.02ms +step:1081/2245 train_time:65970ms step_avg:61.03ms +step:1082/2245 train_time:66031ms step_avg:61.03ms +step:1083/2245 train_time:66093ms step_avg:61.03ms +step:1084/2245 train_time:66153ms step_avg:61.03ms +step:1085/2245 train_time:66215ms step_avg:61.03ms +step:1086/2245 train_time:66275ms step_avg:61.03ms +step:1087/2245 train_time:66337ms step_avg:61.03ms +step:1088/2245 train_time:66397ms step_avg:61.03ms +step:1089/2245 train_time:66460ms step_avg:61.03ms +step:1090/2245 train_time:66519ms step_avg:61.03ms +step:1091/2245 train_time:66582ms step_avg:61.03ms +step:1092/2245 train_time:66642ms step_avg:61.03ms +step:1093/2245 train_time:66704ms step_avg:61.03ms +step:1094/2245 train_time:66764ms step_avg:61.03ms +step:1095/2245 train_time:66826ms step_avg:61.03ms +step:1096/2245 train_time:66887ms step_avg:61.03ms +step:1097/2245 train_time:66950ms step_avg:61.03ms +step:1098/2245 train_time:67010ms step_avg:61.03ms +step:1099/2245 train_time:67072ms step_avg:61.03ms +step:1100/2245 train_time:67133ms step_avg:61.03ms +step:1101/2245 train_time:67195ms step_avg:61.03ms +step:1102/2245 train_time:67255ms step_avg:61.03ms +step:1103/2245 train_time:67317ms step_avg:61.03ms +step:1104/2245 train_time:67377ms step_avg:61.03ms +step:1105/2245 train_time:67440ms step_avg:61.03ms +step:1106/2245 train_time:67499ms step_avg:61.03ms +step:1107/2245 train_time:67562ms step_avg:61.03ms +step:1108/2245 train_time:67622ms step_avg:61.03ms +step:1109/2245 train_time:67685ms step_avg:61.03ms +step:1110/2245 train_time:67745ms step_avg:61.03ms +step:1111/2245 train_time:67807ms step_avg:61.03ms +step:1112/2245 train_time:67867ms step_avg:61.03ms +step:1113/2245 train_time:67930ms step_avg:61.03ms +step:1114/2245 train_time:67990ms step_avg:61.03ms +step:1115/2245 train_time:68052ms step_avg:61.03ms +step:1116/2245 train_time:68112ms step_avg:61.03ms +step:1117/2245 train_time:68174ms step_avg:61.03ms +step:1118/2245 train_time:68234ms step_avg:61.03ms +step:1119/2245 train_time:68296ms step_avg:61.03ms +step:1120/2245 train_time:68356ms step_avg:61.03ms +step:1121/2245 train_time:68418ms step_avg:61.03ms +step:1122/2245 train_time:68478ms step_avg:61.03ms +step:1123/2245 train_time:68540ms step_avg:61.03ms +step:1124/2245 train_time:68600ms step_avg:61.03ms +step:1125/2245 train_time:68662ms 
step_avg:61.03ms +step:1126/2245 train_time:68723ms step_avg:61.03ms +step:1127/2245 train_time:68786ms step_avg:61.03ms +step:1128/2245 train_time:68846ms step_avg:61.03ms +step:1129/2245 train_time:68910ms step_avg:61.04ms +step:1130/2245 train_time:68969ms step_avg:61.03ms +step:1131/2245 train_time:69032ms step_avg:61.04ms +step:1132/2245 train_time:69091ms step_avg:61.03ms +step:1133/2245 train_time:69154ms step_avg:61.04ms +step:1134/2245 train_time:69214ms step_avg:61.04ms +step:1135/2245 train_time:69276ms step_avg:61.04ms +step:1136/2245 train_time:69336ms step_avg:61.04ms +step:1137/2245 train_time:69398ms step_avg:61.04ms +step:1138/2245 train_time:69457ms step_avg:61.03ms +step:1139/2245 train_time:69519ms step_avg:61.04ms +step:1140/2245 train_time:69579ms step_avg:61.03ms +step:1141/2245 train_time:69642ms step_avg:61.04ms +step:1142/2245 train_time:69702ms step_avg:61.04ms +step:1143/2245 train_time:69765ms step_avg:61.04ms +step:1144/2245 train_time:69826ms step_avg:61.04ms +step:1145/2245 train_time:69890ms step_avg:61.04ms +step:1146/2245 train_time:69949ms step_avg:61.04ms +step:1147/2245 train_time:70012ms step_avg:61.04ms +step:1148/2245 train_time:70071ms step_avg:61.04ms +step:1149/2245 train_time:70134ms step_avg:61.04ms +step:1150/2245 train_time:70194ms step_avg:61.04ms +step:1151/2245 train_time:70256ms step_avg:61.04ms +step:1152/2245 train_time:70316ms step_avg:61.04ms +step:1153/2245 train_time:70378ms step_avg:61.04ms +step:1154/2245 train_time:70437ms step_avg:61.04ms +step:1155/2245 train_time:70500ms step_avg:61.04ms +step:1156/2245 train_time:70560ms step_avg:61.04ms +step:1157/2245 train_time:70622ms step_avg:61.04ms +step:1158/2245 train_time:70682ms step_avg:61.04ms +step:1159/2245 train_time:70744ms step_avg:61.04ms +step:1160/2245 train_time:70804ms step_avg:61.04ms +step:1161/2245 train_time:70868ms step_avg:61.04ms +step:1162/2245 train_time:70928ms step_avg:61.04ms +step:1163/2245 train_time:70990ms step_avg:61.04ms +step:1164/2245 train_time:71050ms step_avg:61.04ms +step:1165/2245 train_time:71112ms step_avg:61.04ms +step:1166/2245 train_time:71172ms step_avg:61.04ms +step:1167/2245 train_time:71235ms step_avg:61.04ms +step:1168/2245 train_time:71295ms step_avg:61.04ms +step:1169/2245 train_time:71357ms step_avg:61.04ms +step:1170/2245 train_time:71417ms step_avg:61.04ms +step:1171/2245 train_time:71480ms step_avg:61.04ms +step:1172/2245 train_time:71540ms step_avg:61.04ms +step:1173/2245 train_time:71602ms step_avg:61.04ms +step:1174/2245 train_time:71662ms step_avg:61.04ms +step:1175/2245 train_time:71725ms step_avg:61.04ms +step:1176/2245 train_time:71786ms step_avg:61.04ms +step:1177/2245 train_time:71850ms step_avg:61.04ms +step:1178/2245 train_time:71910ms step_avg:61.04ms +step:1179/2245 train_time:71972ms step_avg:61.04ms +step:1180/2245 train_time:72032ms step_avg:61.04ms +step:1181/2245 train_time:72095ms step_avg:61.05ms +step:1182/2245 train_time:72155ms step_avg:61.04ms +step:1183/2245 train_time:72216ms step_avg:61.05ms +step:1184/2245 train_time:72276ms step_avg:61.04ms +step:1185/2245 train_time:72339ms step_avg:61.05ms +step:1186/2245 train_time:72399ms step_avg:61.04ms +step:1187/2245 train_time:72461ms step_avg:61.05ms +step:1188/2245 train_time:72521ms step_avg:61.04ms +step:1189/2245 train_time:72583ms step_avg:61.05ms +step:1190/2245 train_time:72642ms step_avg:61.04ms +step:1191/2245 train_time:72705ms step_avg:61.05ms +step:1192/2245 train_time:72764ms step_avg:61.04ms +step:1193/2245 train_time:72828ms step_avg:61.05ms 
+step:1194/2245 train_time:72887ms step_avg:61.04ms +step:1195/2245 train_time:72950ms step_avg:61.05ms +step:1196/2245 train_time:73009ms step_avg:61.04ms +step:1197/2245 train_time:73073ms step_avg:61.05ms +step:1198/2245 train_time:73133ms step_avg:61.05ms +step:1199/2245 train_time:73195ms step_avg:61.05ms +step:1200/2245 train_time:73254ms step_avg:61.05ms +step:1201/2245 train_time:73316ms step_avg:61.05ms +step:1202/2245 train_time:73377ms step_avg:61.05ms +step:1203/2245 train_time:73438ms step_avg:61.05ms +step:1204/2245 train_time:73499ms step_avg:61.05ms +step:1205/2245 train_time:73561ms step_avg:61.05ms +step:1206/2245 train_time:73620ms step_avg:61.05ms +step:1207/2245 train_time:73683ms step_avg:61.05ms +step:1208/2245 train_time:73743ms step_avg:61.05ms +step:1209/2245 train_time:73806ms step_avg:61.05ms +step:1210/2245 train_time:73867ms step_avg:61.05ms +step:1211/2245 train_time:73930ms step_avg:61.05ms +step:1212/2245 train_time:73989ms step_avg:61.05ms +step:1213/2245 train_time:74052ms step_avg:61.05ms +step:1214/2245 train_time:74112ms step_avg:61.05ms +step:1215/2245 train_time:74175ms step_avg:61.05ms +step:1216/2245 train_time:74235ms step_avg:61.05ms +step:1217/2245 train_time:74297ms step_avg:61.05ms +step:1218/2245 train_time:74357ms step_avg:61.05ms +step:1219/2245 train_time:74419ms step_avg:61.05ms +step:1220/2245 train_time:74479ms step_avg:61.05ms +step:1221/2245 train_time:74542ms step_avg:61.05ms +step:1222/2245 train_time:74601ms step_avg:61.05ms +step:1223/2245 train_time:74664ms step_avg:61.05ms +step:1224/2245 train_time:74725ms step_avg:61.05ms +step:1225/2245 train_time:74788ms step_avg:61.05ms +step:1226/2245 train_time:74849ms step_avg:61.05ms +step:1227/2245 train_time:74911ms step_avg:61.05ms +step:1228/2245 train_time:74971ms step_avg:61.05ms +step:1229/2245 train_time:75035ms step_avg:61.05ms +step:1230/2245 train_time:75094ms step_avg:61.05ms +step:1231/2245 train_time:75156ms step_avg:61.05ms +step:1232/2245 train_time:75216ms step_avg:61.05ms +step:1233/2245 train_time:75278ms step_avg:61.05ms +step:1234/2245 train_time:75338ms step_avg:61.05ms +step:1235/2245 train_time:75400ms step_avg:61.05ms +step:1236/2245 train_time:75460ms step_avg:61.05ms +step:1237/2245 train_time:75522ms step_avg:61.05ms +step:1238/2245 train_time:75582ms step_avg:61.05ms +step:1239/2245 train_time:75645ms step_avg:61.05ms +step:1240/2245 train_time:75704ms step_avg:61.05ms +step:1241/2245 train_time:75767ms step_avg:61.05ms +step:1242/2245 train_time:75828ms step_avg:61.05ms +step:1243/2245 train_time:75891ms step_avg:61.05ms +step:1244/2245 train_time:75950ms step_avg:61.05ms +step:1245/2245 train_time:76012ms step_avg:61.05ms +step:1246/2245 train_time:76072ms step_avg:61.05ms +step:1247/2245 train_time:76135ms step_avg:61.05ms +step:1248/2245 train_time:76195ms step_avg:61.05ms +step:1249/2245 train_time:76259ms step_avg:61.06ms +step:1250/2245 train_time:76317ms step_avg:61.05ms +step:1250/2245 val_loss:3.5199 train_time:76380ms step_avg:61.10ms +step:1251/2245 train_time:76399ms step_avg:61.07ms +step:1252/2245 train_time:76442ms step_avg:61.06ms +step:1253/2245 train_time:76506ms step_avg:61.06ms +step:1254/2245 train_time:76567ms step_avg:61.06ms +step:1255/2245 train_time:76630ms step_avg:61.06ms +step:1256/2245 train_time:76689ms step_avg:61.06ms +step:1257/2245 train_time:76751ms step_avg:61.06ms +step:1258/2245 train_time:76810ms step_avg:61.06ms +step:1259/2245 train_time:76873ms step_avg:61.06ms +step:1260/2245 train_time:76932ms step_avg:61.06ms 
+step:1261/2245 train_time:76994ms step_avg:61.06ms +step:1262/2245 train_time:77054ms step_avg:61.06ms +step:1263/2245 train_time:77117ms step_avg:61.06ms +step:1264/2245 train_time:77178ms step_avg:61.06ms +step:1265/2245 train_time:77239ms step_avg:61.06ms +step:1266/2245 train_time:77299ms step_avg:61.06ms +step:1267/2245 train_time:77363ms step_avg:61.06ms +step:1268/2245 train_time:77424ms step_avg:61.06ms +step:1269/2245 train_time:77487ms step_avg:61.06ms +step:1270/2245 train_time:77547ms step_avg:61.06ms +step:1271/2245 train_time:77610ms step_avg:61.06ms +step:1272/2245 train_time:77670ms step_avg:61.06ms +step:1273/2245 train_time:77732ms step_avg:61.06ms +step:1274/2245 train_time:77791ms step_avg:61.06ms +step:1275/2245 train_time:77853ms step_avg:61.06ms +step:1276/2245 train_time:77913ms step_avg:61.06ms +step:1277/2245 train_time:77975ms step_avg:61.06ms +step:1278/2245 train_time:78034ms step_avg:61.06ms +step:1279/2245 train_time:78097ms step_avg:61.06ms +step:1280/2245 train_time:78157ms step_avg:61.06ms +step:1281/2245 train_time:78220ms step_avg:61.06ms +step:1282/2245 train_time:78280ms step_avg:61.06ms +step:1283/2245 train_time:78343ms step_avg:61.06ms +step:1284/2245 train_time:78404ms step_avg:61.06ms +step:1285/2245 train_time:78466ms step_avg:61.06ms +step:1286/2245 train_time:78526ms step_avg:61.06ms +step:1287/2245 train_time:78589ms step_avg:61.06ms +step:1288/2245 train_time:78649ms step_avg:61.06ms +step:1289/2245 train_time:78711ms step_avg:61.06ms +step:1290/2245 train_time:78771ms step_avg:61.06ms +step:1291/2245 train_time:78833ms step_avg:61.06ms +step:1292/2245 train_time:78893ms step_avg:61.06ms +step:1293/2245 train_time:78955ms step_avg:61.06ms +step:1294/2245 train_time:79015ms step_avg:61.06ms +step:1295/2245 train_time:79078ms step_avg:61.06ms +step:1296/2245 train_time:79138ms step_avg:61.06ms +step:1297/2245 train_time:79200ms step_avg:61.06ms +step:1298/2245 train_time:79260ms step_avg:61.06ms +step:1299/2245 train_time:79323ms step_avg:61.06ms +step:1300/2245 train_time:79383ms step_avg:61.06ms +step:1301/2245 train_time:79445ms step_avg:61.06ms +step:1302/2245 train_time:79505ms step_avg:61.06ms +step:1303/2245 train_time:79568ms step_avg:61.07ms +step:1304/2245 train_time:79627ms step_avg:61.06ms +step:1305/2245 train_time:79690ms step_avg:61.06ms +step:1306/2245 train_time:79749ms step_avg:61.06ms +step:1307/2245 train_time:79812ms step_avg:61.06ms +step:1308/2245 train_time:79872ms step_avg:61.06ms +step:1309/2245 train_time:79934ms step_avg:61.06ms +step:1310/2245 train_time:79994ms step_avg:61.06ms +step:1311/2245 train_time:80056ms step_avg:61.06ms +step:1312/2245 train_time:80117ms step_avg:61.06ms +step:1313/2245 train_time:80180ms step_avg:61.07ms +step:1314/2245 train_time:80240ms step_avg:61.07ms +step:1315/2245 train_time:80302ms step_avg:61.07ms +step:1316/2245 train_time:80363ms step_avg:61.07ms +step:1317/2245 train_time:80425ms step_avg:61.07ms +step:1318/2245 train_time:80485ms step_avg:61.07ms +step:1319/2245 train_time:80547ms step_avg:61.07ms +step:1320/2245 train_time:80607ms step_avg:61.07ms +step:1321/2245 train_time:80670ms step_avg:61.07ms +step:1322/2245 train_time:80729ms step_avg:61.07ms +step:1323/2245 train_time:80792ms step_avg:61.07ms +step:1324/2245 train_time:80852ms step_avg:61.07ms +step:1325/2245 train_time:80914ms step_avg:61.07ms +step:1326/2245 train_time:80974ms step_avg:61.07ms +step:1327/2245 train_time:81036ms step_avg:61.07ms +step:1328/2245 train_time:81096ms step_avg:61.07ms +step:1329/2245 
train_time:81158ms step_avg:61.07ms +step:1330/2245 train_time:81218ms step_avg:61.07ms +step:1331/2245 train_time:81281ms step_avg:61.07ms +step:1332/2245 train_time:81342ms step_avg:61.07ms +step:1333/2245 train_time:81405ms step_avg:61.07ms +step:1334/2245 train_time:81465ms step_avg:61.07ms +step:1335/2245 train_time:81527ms step_avg:61.07ms +step:1336/2245 train_time:81586ms step_avg:61.07ms +step:1337/2245 train_time:81648ms step_avg:61.07ms +step:1338/2245 train_time:81708ms step_avg:61.07ms +step:1339/2245 train_time:81770ms step_avg:61.07ms +step:1340/2245 train_time:81831ms step_avg:61.07ms +step:1341/2245 train_time:81893ms step_avg:61.07ms +step:1342/2245 train_time:81953ms step_avg:61.07ms +step:1343/2245 train_time:82015ms step_avg:61.07ms +step:1344/2245 train_time:82076ms step_avg:61.07ms +step:1345/2245 train_time:82138ms step_avg:61.07ms +step:1346/2245 train_time:82199ms step_avg:61.07ms +step:1347/2245 train_time:82262ms step_avg:61.07ms +step:1348/2245 train_time:82321ms step_avg:61.07ms +step:1349/2245 train_time:82384ms step_avg:61.07ms +step:1350/2245 train_time:82444ms step_avg:61.07ms +step:1351/2245 train_time:82506ms step_avg:61.07ms +step:1352/2245 train_time:82566ms step_avg:61.07ms +step:1353/2245 train_time:82628ms step_avg:61.07ms +step:1354/2245 train_time:82688ms step_avg:61.07ms +step:1355/2245 train_time:82750ms step_avg:61.07ms +step:1356/2245 train_time:82810ms step_avg:61.07ms +step:1357/2245 train_time:82874ms step_avg:61.07ms +step:1358/2245 train_time:82933ms step_avg:61.07ms +step:1359/2245 train_time:82995ms step_avg:61.07ms +step:1360/2245 train_time:83056ms step_avg:61.07ms +step:1361/2245 train_time:83119ms step_avg:61.07ms +step:1362/2245 train_time:83179ms step_avg:61.07ms +step:1363/2245 train_time:83241ms step_avg:61.07ms +step:1364/2245 train_time:83301ms step_avg:61.07ms +step:1365/2245 train_time:83364ms step_avg:61.07ms +step:1366/2245 train_time:83424ms step_avg:61.07ms +step:1367/2245 train_time:83486ms step_avg:61.07ms +step:1368/2245 train_time:83546ms step_avg:61.07ms +step:1369/2245 train_time:83608ms step_avg:61.07ms +step:1370/2245 train_time:83668ms step_avg:61.07ms +step:1371/2245 train_time:83730ms step_avg:61.07ms +step:1372/2245 train_time:83790ms step_avg:61.07ms +step:1373/2245 train_time:83852ms step_avg:61.07ms +step:1374/2245 train_time:83912ms step_avg:61.07ms +step:1375/2245 train_time:83974ms step_avg:61.07ms +step:1376/2245 train_time:84035ms step_avg:61.07ms +step:1377/2245 train_time:84099ms step_avg:61.07ms +step:1378/2245 train_time:84161ms step_avg:61.07ms +step:1379/2245 train_time:84223ms step_avg:61.08ms +step:1380/2245 train_time:84284ms step_avg:61.08ms +step:1381/2245 train_time:84346ms step_avg:61.08ms +step:1382/2245 train_time:84407ms step_avg:61.08ms +step:1383/2245 train_time:84469ms step_avg:61.08ms +step:1384/2245 train_time:84529ms step_avg:61.08ms +step:1385/2245 train_time:84591ms step_avg:61.08ms +step:1386/2245 train_time:84651ms step_avg:61.08ms +step:1387/2245 train_time:84714ms step_avg:61.08ms +step:1388/2245 train_time:84774ms step_avg:61.08ms +step:1389/2245 train_time:84836ms step_avg:61.08ms +step:1390/2245 train_time:84896ms step_avg:61.08ms +step:1391/2245 train_time:84959ms step_avg:61.08ms +step:1392/2245 train_time:85019ms step_avg:61.08ms +step:1393/2245 train_time:85082ms step_avg:61.08ms +step:1394/2245 train_time:85141ms step_avg:61.08ms +step:1395/2245 train_time:85204ms step_avg:61.08ms +step:1396/2245 train_time:85263ms step_avg:61.08ms +step:1397/2245 train_time:85325ms 
step_avg:61.08ms +step:1398/2245 train_time:85385ms step_avg:61.08ms +step:1399/2245 train_time:85447ms step_avg:61.08ms +step:1400/2245 train_time:85507ms step_avg:61.08ms +step:1401/2245 train_time:85569ms step_avg:61.08ms +step:1402/2245 train_time:85629ms step_avg:61.08ms +step:1403/2245 train_time:85692ms step_avg:61.08ms +step:1404/2245 train_time:85752ms step_avg:61.08ms +step:1405/2245 train_time:85814ms step_avg:61.08ms +step:1406/2245 train_time:85875ms step_avg:61.08ms +step:1407/2245 train_time:85937ms step_avg:61.08ms +step:1408/2245 train_time:85998ms step_avg:61.08ms +step:1409/2245 train_time:86060ms step_avg:61.08ms +step:1410/2245 train_time:86120ms step_avg:61.08ms +step:1411/2245 train_time:86183ms step_avg:61.08ms +step:1412/2245 train_time:86243ms step_avg:61.08ms +step:1413/2245 train_time:86305ms step_avg:61.08ms +step:1414/2245 train_time:86365ms step_avg:61.08ms +step:1415/2245 train_time:86427ms step_avg:61.08ms +step:1416/2245 train_time:86487ms step_avg:61.08ms +step:1417/2245 train_time:86549ms step_avg:61.08ms +step:1418/2245 train_time:86608ms step_avg:61.08ms +step:1419/2245 train_time:86671ms step_avg:61.08ms +step:1420/2245 train_time:86731ms step_avg:61.08ms +step:1421/2245 train_time:86794ms step_avg:61.08ms +step:1422/2245 train_time:86854ms step_avg:61.08ms +step:1423/2245 train_time:86916ms step_avg:61.08ms +step:1424/2245 train_time:86976ms step_avg:61.08ms +step:1425/2245 train_time:87039ms step_avg:61.08ms +step:1426/2245 train_time:87099ms step_avg:61.08ms +step:1427/2245 train_time:87161ms step_avg:61.08ms +step:1428/2245 train_time:87221ms step_avg:61.08ms +step:1429/2245 train_time:87284ms step_avg:61.08ms +step:1430/2245 train_time:87344ms step_avg:61.08ms +step:1431/2245 train_time:87406ms step_avg:61.08ms +step:1432/2245 train_time:87465ms step_avg:61.08ms +step:1433/2245 train_time:87527ms step_avg:61.08ms +step:1434/2245 train_time:87587ms step_avg:61.08ms +step:1435/2245 train_time:87650ms step_avg:61.08ms +step:1436/2245 train_time:87709ms step_avg:61.08ms +step:1437/2245 train_time:87772ms step_avg:61.08ms +step:1438/2245 train_time:87832ms step_avg:61.08ms +step:1439/2245 train_time:87895ms step_avg:61.08ms +step:1440/2245 train_time:87955ms step_avg:61.08ms +step:1441/2245 train_time:88018ms step_avg:61.08ms +step:1442/2245 train_time:88079ms step_avg:61.08ms +step:1443/2245 train_time:88141ms step_avg:61.08ms +step:1444/2245 train_time:88201ms step_avg:61.08ms +step:1445/2245 train_time:88264ms step_avg:61.08ms +step:1446/2245 train_time:88324ms step_avg:61.08ms +step:1447/2245 train_time:88387ms step_avg:61.08ms +step:1448/2245 train_time:88446ms step_avg:61.08ms +step:1449/2245 train_time:88507ms step_avg:61.08ms +step:1450/2245 train_time:88568ms step_avg:61.08ms +step:1451/2245 train_time:88630ms step_avg:61.08ms +step:1452/2245 train_time:88690ms step_avg:61.08ms +step:1453/2245 train_time:88753ms step_avg:61.08ms +step:1454/2245 train_time:88813ms step_avg:61.08ms +step:1455/2245 train_time:88875ms step_avg:61.08ms +step:1456/2245 train_time:88935ms step_avg:61.08ms +step:1457/2245 train_time:88998ms step_avg:61.08ms +step:1458/2245 train_time:89059ms step_avg:61.08ms +step:1459/2245 train_time:89122ms step_avg:61.08ms +step:1460/2245 train_time:89182ms step_avg:61.08ms +step:1461/2245 train_time:89244ms step_avg:61.08ms +step:1462/2245 train_time:89304ms step_avg:61.08ms +step:1463/2245 train_time:89366ms step_avg:61.08ms +step:1464/2245 train_time:89426ms step_avg:61.08ms +step:1465/2245 train_time:89488ms step_avg:61.08ms 
+step:1466/2245 train_time:89549ms step_avg:61.08ms +step:1467/2245 train_time:89611ms step_avg:61.08ms +step:1468/2245 train_time:89671ms step_avg:61.08ms +step:1469/2245 train_time:89733ms step_avg:61.08ms +step:1470/2245 train_time:89793ms step_avg:61.08ms +step:1471/2245 train_time:89855ms step_avg:61.08ms +step:1472/2245 train_time:89916ms step_avg:61.08ms +step:1473/2245 train_time:89980ms step_avg:61.09ms +step:1474/2245 train_time:90041ms step_avg:61.09ms +step:1475/2245 train_time:90103ms step_avg:61.09ms +step:1476/2245 train_time:90165ms step_avg:61.09ms +step:1477/2245 train_time:90227ms step_avg:61.09ms +step:1478/2245 train_time:90291ms step_avg:61.09ms +step:1479/2245 train_time:90349ms step_avg:61.09ms +step:1480/2245 train_time:90410ms step_avg:61.09ms +step:1481/2245 train_time:90473ms step_avg:61.09ms +step:1482/2245 train_time:90533ms step_avg:61.09ms +step:1483/2245 train_time:90596ms step_avg:61.09ms +step:1484/2245 train_time:90657ms step_avg:61.09ms +step:1485/2245 train_time:90720ms step_avg:61.09ms +step:1486/2245 train_time:90779ms step_avg:61.09ms +step:1487/2245 train_time:90842ms step_avg:61.09ms +step:1488/2245 train_time:90902ms step_avg:61.09ms +step:1489/2245 train_time:90966ms step_avg:61.09ms +step:1490/2245 train_time:91026ms step_avg:61.09ms +step:1491/2245 train_time:91089ms step_avg:61.09ms +step:1492/2245 train_time:91150ms step_avg:61.09ms +step:1493/2245 train_time:91214ms step_avg:61.09ms +step:1494/2245 train_time:91274ms step_avg:61.09ms +step:1495/2245 train_time:91338ms step_avg:61.10ms +step:1496/2245 train_time:91400ms step_avg:61.10ms +step:1497/2245 train_time:91460ms step_avg:61.10ms +step:1498/2245 train_time:91521ms step_avg:61.10ms +step:1499/2245 train_time:91583ms step_avg:61.10ms +step:1500/2245 train_time:91643ms step_avg:61.10ms +step:1500/2245 val_loss:3.4412 train_time:91707ms step_avg:61.14ms +step:1501/2245 train_time:91726ms step_avg:61.11ms +step:1502/2245 train_time:91768ms step_avg:61.10ms +step:1503/2245 train_time:91830ms step_avg:61.10ms +step:1504/2245 train_time:91892ms step_avg:61.10ms +step:1505/2245 train_time:91955ms step_avg:61.10ms +step:1506/2245 train_time:92015ms step_avg:61.10ms +step:1507/2245 train_time:92077ms step_avg:61.10ms +step:1508/2245 train_time:92138ms step_avg:61.10ms +step:1509/2245 train_time:92200ms step_avg:61.10ms +step:1510/2245 train_time:92259ms step_avg:61.10ms +step:1511/2245 train_time:92322ms step_avg:61.10ms +step:1512/2245 train_time:92382ms step_avg:61.10ms +step:1513/2245 train_time:92447ms step_avg:61.10ms +step:1514/2245 train_time:92508ms step_avg:61.10ms +step:1515/2245 train_time:92570ms step_avg:61.10ms +step:1516/2245 train_time:92631ms step_avg:61.10ms +step:1517/2245 train_time:92695ms step_avg:61.10ms +step:1518/2245 train_time:92756ms step_avg:61.10ms +step:1519/2245 train_time:92821ms step_avg:61.11ms +step:1520/2245 train_time:92883ms step_avg:61.11ms +step:1521/2245 train_time:92946ms step_avg:61.11ms +step:1522/2245 train_time:93006ms step_avg:61.11ms +step:1523/2245 train_time:93069ms step_avg:61.11ms +step:1524/2245 train_time:93129ms step_avg:61.11ms +step:1525/2245 train_time:93191ms step_avg:61.11ms +step:1526/2245 train_time:93250ms step_avg:61.11ms +step:1527/2245 train_time:93312ms step_avg:61.11ms +step:1528/2245 train_time:93373ms step_avg:61.11ms +step:1529/2245 train_time:93436ms step_avg:61.11ms +step:1530/2245 train_time:93496ms step_avg:61.11ms +step:1531/2245 train_time:93559ms step_avg:61.11ms +step:1532/2245 train_time:93621ms step_avg:61.11ms 
+step:1533/2245 train_time:93685ms step_avg:61.11ms +step:1534/2245 train_time:93747ms step_avg:61.11ms +step:1535/2245 train_time:93810ms step_avg:61.11ms +step:1536/2245 train_time:93870ms step_avg:61.11ms +step:1537/2245 train_time:93933ms step_avg:61.11ms +step:1538/2245 train_time:93993ms step_avg:61.11ms +step:1539/2245 train_time:94056ms step_avg:61.12ms +step:1540/2245 train_time:94117ms step_avg:61.11ms +step:1541/2245 train_time:94179ms step_avg:61.12ms +step:1542/2245 train_time:94239ms step_avg:61.11ms +step:1543/2245 train_time:94301ms step_avg:61.12ms +step:1544/2245 train_time:94361ms step_avg:61.11ms +step:1545/2245 train_time:94424ms step_avg:61.12ms +step:1546/2245 train_time:94486ms step_avg:61.12ms +step:1547/2245 train_time:94548ms step_avg:61.12ms +step:1548/2245 train_time:94610ms step_avg:61.12ms +step:1549/2245 train_time:94673ms step_avg:61.12ms +step:1550/2245 train_time:94733ms step_avg:61.12ms +step:1551/2245 train_time:94797ms step_avg:61.12ms +step:1552/2245 train_time:94858ms step_avg:61.12ms +step:1553/2245 train_time:94921ms step_avg:61.12ms +step:1554/2245 train_time:94983ms step_avg:61.12ms +step:1555/2245 train_time:95046ms step_avg:61.12ms +step:1556/2245 train_time:95106ms step_avg:61.12ms +step:1557/2245 train_time:95169ms step_avg:61.12ms +step:1558/2245 train_time:95229ms step_avg:61.12ms +step:1559/2245 train_time:95291ms step_avg:61.12ms +step:1560/2245 train_time:95350ms step_avg:61.12ms +step:1561/2245 train_time:95413ms step_avg:61.12ms +step:1562/2245 train_time:95474ms step_avg:61.12ms +step:1563/2245 train_time:95537ms step_avg:61.12ms +step:1564/2245 train_time:95598ms step_avg:61.12ms +step:1565/2245 train_time:95661ms step_avg:61.13ms +step:1566/2245 train_time:95722ms step_avg:61.13ms +step:1567/2245 train_time:95786ms step_avg:61.13ms +step:1568/2245 train_time:95846ms step_avg:61.13ms +step:1569/2245 train_time:95909ms step_avg:61.13ms +step:1570/2245 train_time:95969ms step_avg:61.13ms +step:1571/2245 train_time:96032ms step_avg:61.13ms +step:1572/2245 train_time:96093ms step_avg:61.13ms +step:1573/2245 train_time:96155ms step_avg:61.13ms +step:1574/2245 train_time:96215ms step_avg:61.13ms +step:1575/2245 train_time:96278ms step_avg:61.13ms +step:1576/2245 train_time:96338ms step_avg:61.13ms +step:1577/2245 train_time:96400ms step_avg:61.13ms +step:1578/2245 train_time:96461ms step_avg:61.13ms +step:1579/2245 train_time:96525ms step_avg:61.13ms +step:1580/2245 train_time:96586ms step_avg:61.13ms +step:1581/2245 train_time:96649ms step_avg:61.13ms +step:1582/2245 train_time:96709ms step_avg:61.13ms +step:1583/2245 train_time:96772ms step_avg:61.13ms +step:1584/2245 train_time:96833ms step_avg:61.13ms +step:1585/2245 train_time:96895ms step_avg:61.13ms +step:1586/2245 train_time:96955ms step_avg:61.13ms +step:1587/2245 train_time:97018ms step_avg:61.13ms +step:1588/2245 train_time:97078ms step_avg:61.13ms +step:1589/2245 train_time:97140ms step_avg:61.13ms +step:1590/2245 train_time:97201ms step_avg:61.13ms +step:1591/2245 train_time:97265ms step_avg:61.13ms +step:1592/2245 train_time:97325ms step_avg:61.13ms +step:1593/2245 train_time:97388ms step_avg:61.14ms +step:1594/2245 train_time:97449ms step_avg:61.13ms +step:1595/2245 train_time:97511ms step_avg:61.14ms +step:1596/2245 train_time:97572ms step_avg:61.14ms +step:1597/2245 train_time:97634ms step_avg:61.14ms +step:1598/2245 train_time:97695ms step_avg:61.14ms +step:1599/2245 train_time:97758ms step_avg:61.14ms +step:1600/2245 train_time:97818ms step_avg:61.14ms +step:1601/2245 
train_time:97882ms step_avg:61.14ms +step:1602/2245 train_time:97943ms step_avg:61.14ms +step:1603/2245 train_time:98006ms step_avg:61.14ms +step:1604/2245 train_time:98067ms step_avg:61.14ms +step:1605/2245 train_time:98129ms step_avg:61.14ms +step:1606/2245 train_time:98189ms step_avg:61.14ms +step:1607/2245 train_time:98251ms step_avg:61.14ms +step:1608/2245 train_time:98311ms step_avg:61.14ms +step:1609/2245 train_time:98373ms step_avg:61.14ms +step:1610/2245 train_time:98434ms step_avg:61.14ms +step:1611/2245 train_time:98496ms step_avg:61.14ms +step:1612/2245 train_time:98556ms step_avg:61.14ms +step:1613/2245 train_time:98620ms step_avg:61.14ms +step:1614/2245 train_time:98681ms step_avg:61.14ms +step:1615/2245 train_time:98744ms step_avg:61.14ms +step:1616/2245 train_time:98805ms step_avg:61.14ms +step:1617/2245 train_time:98867ms step_avg:61.14ms +step:1618/2245 train_time:98929ms step_avg:61.14ms +step:1619/2245 train_time:98990ms step_avg:61.14ms +step:1620/2245 train_time:99051ms step_avg:61.14ms +step:1621/2245 train_time:99114ms step_avg:61.14ms +step:1622/2245 train_time:99175ms step_avg:61.14ms +step:1623/2245 train_time:99238ms step_avg:61.14ms +step:1624/2245 train_time:99298ms step_avg:61.14ms +step:1625/2245 train_time:99361ms step_avg:61.15ms +step:1626/2245 train_time:99421ms step_avg:61.14ms +step:1627/2245 train_time:99484ms step_avg:61.15ms +step:1628/2245 train_time:99545ms step_avg:61.15ms +step:1629/2245 train_time:99608ms step_avg:61.15ms +step:1630/2245 train_time:99668ms step_avg:61.15ms +step:1631/2245 train_time:99731ms step_avg:61.15ms +step:1632/2245 train_time:99791ms step_avg:61.15ms +step:1633/2245 train_time:99854ms step_avg:61.15ms +step:1634/2245 train_time:99914ms step_avg:61.15ms +step:1635/2245 train_time:99976ms step_avg:61.15ms +step:1636/2245 train_time:100037ms step_avg:61.15ms +step:1637/2245 train_time:100100ms step_avg:61.15ms +step:1638/2245 train_time:100162ms step_avg:61.15ms +step:1639/2245 train_time:100225ms step_avg:61.15ms +step:1640/2245 train_time:100286ms step_avg:61.15ms +step:1641/2245 train_time:100349ms step_avg:61.15ms +step:1642/2245 train_time:100409ms step_avg:61.15ms +step:1643/2245 train_time:100472ms step_avg:61.15ms +step:1644/2245 train_time:100532ms step_avg:61.15ms +step:1645/2245 train_time:100595ms step_avg:61.15ms +step:1646/2245 train_time:100655ms step_avg:61.15ms +step:1647/2245 train_time:100718ms step_avg:61.15ms +step:1648/2245 train_time:100778ms step_avg:61.15ms +step:1649/2245 train_time:100842ms step_avg:61.15ms +step:1650/2245 train_time:100903ms step_avg:61.15ms +step:1651/2245 train_time:100966ms step_avg:61.15ms +step:1652/2245 train_time:101026ms step_avg:61.15ms +step:1653/2245 train_time:101089ms step_avg:61.16ms +step:1654/2245 train_time:101150ms step_avg:61.15ms +step:1655/2245 train_time:101212ms step_avg:61.16ms +step:1656/2245 train_time:101272ms step_avg:61.15ms +step:1657/2245 train_time:101335ms step_avg:61.16ms +step:1658/2245 train_time:101396ms step_avg:61.16ms +step:1659/2245 train_time:101459ms step_avg:61.16ms +step:1660/2245 train_time:101520ms step_avg:61.16ms +step:1661/2245 train_time:101583ms step_avg:61.16ms +step:1662/2245 train_time:101644ms step_avg:61.16ms +step:1663/2245 train_time:101706ms step_avg:61.16ms +step:1664/2245 train_time:101766ms step_avg:61.16ms +step:1665/2245 train_time:101829ms step_avg:61.16ms +step:1666/2245 train_time:101889ms step_avg:61.16ms +step:1667/2245 train_time:101952ms step_avg:61.16ms +step:1668/2245 train_time:102012ms step_avg:61.16ms 
+step:1669/2245 train_time:102075ms step_avg:61.16ms +step:1670/2245 train_time:102135ms step_avg:61.16ms +step:1671/2245 train_time:102198ms step_avg:61.16ms +step:1672/2245 train_time:102259ms step_avg:61.16ms +step:1673/2245 train_time:102322ms step_avg:61.16ms +step:1674/2245 train_time:102383ms step_avg:61.16ms +step:1675/2245 train_time:102445ms step_avg:61.16ms +step:1676/2245 train_time:102506ms step_avg:61.16ms +step:1677/2245 train_time:102569ms step_avg:61.16ms +step:1678/2245 train_time:102628ms step_avg:61.16ms +step:1679/2245 train_time:102691ms step_avg:61.16ms +step:1680/2245 train_time:102751ms step_avg:61.16ms +step:1681/2245 train_time:102814ms step_avg:61.16ms +step:1682/2245 train_time:102874ms step_avg:61.16ms +step:1683/2245 train_time:102937ms step_avg:61.16ms +step:1684/2245 train_time:102997ms step_avg:61.16ms +step:1685/2245 train_time:103060ms step_avg:61.16ms +step:1686/2245 train_time:103121ms step_avg:61.16ms +step:1687/2245 train_time:103185ms step_avg:61.16ms +step:1688/2245 train_time:103245ms step_avg:61.16ms +step:1689/2245 train_time:103308ms step_avg:61.17ms +step:1690/2245 train_time:103369ms step_avg:61.17ms +step:1691/2245 train_time:103432ms step_avg:61.17ms +step:1692/2245 train_time:103493ms step_avg:61.17ms +step:1693/2245 train_time:103556ms step_avg:61.17ms +step:1694/2245 train_time:103616ms step_avg:61.17ms +step:1695/2245 train_time:103679ms step_avg:61.17ms +step:1696/2245 train_time:103740ms step_avg:61.17ms +step:1697/2245 train_time:103803ms step_avg:61.17ms +step:1698/2245 train_time:103864ms step_avg:61.17ms +step:1699/2245 train_time:103928ms step_avg:61.17ms +step:1700/2245 train_time:103988ms step_avg:61.17ms +step:1701/2245 train_time:104050ms step_avg:61.17ms +step:1702/2245 train_time:104111ms step_avg:61.17ms +step:1703/2245 train_time:104174ms step_avg:61.17ms +step:1704/2245 train_time:104235ms step_avg:61.17ms +step:1705/2245 train_time:104298ms step_avg:61.17ms +step:1706/2245 train_time:104358ms step_avg:61.17ms +step:1707/2245 train_time:104422ms step_avg:61.17ms +step:1708/2245 train_time:104483ms step_avg:61.17ms +step:1709/2245 train_time:104547ms step_avg:61.17ms +step:1710/2245 train_time:104607ms step_avg:61.17ms +step:1711/2245 train_time:104670ms step_avg:61.17ms +step:1712/2245 train_time:104730ms step_avg:61.17ms +step:1713/2245 train_time:104792ms step_avg:61.17ms +step:1714/2245 train_time:104853ms step_avg:61.17ms +step:1715/2245 train_time:104916ms step_avg:61.18ms +step:1716/2245 train_time:104977ms step_avg:61.18ms +step:1717/2245 train_time:105040ms step_avg:61.18ms +step:1718/2245 train_time:105100ms step_avg:61.18ms +step:1719/2245 train_time:105163ms step_avg:61.18ms +step:1720/2245 train_time:105224ms step_avg:61.18ms +step:1721/2245 train_time:105287ms step_avg:61.18ms +step:1722/2245 train_time:105348ms step_avg:61.18ms +step:1723/2245 train_time:105411ms step_avg:61.18ms +step:1724/2245 train_time:105471ms step_avg:61.18ms +step:1725/2245 train_time:105534ms step_avg:61.18ms +step:1726/2245 train_time:105595ms step_avg:61.18ms +step:1727/2245 train_time:105658ms step_avg:61.18ms +step:1728/2245 train_time:105720ms step_avg:61.18ms +step:1729/2245 train_time:105783ms step_avg:61.18ms +step:1730/2245 train_time:105843ms step_avg:61.18ms +step:1731/2245 train_time:105906ms step_avg:61.18ms +step:1732/2245 train_time:105967ms step_avg:61.18ms +step:1733/2245 train_time:106029ms step_avg:61.18ms +step:1734/2245 train_time:106090ms step_avg:61.18ms +step:1735/2245 train_time:106152ms step_avg:61.18ms 
+step:1736/2245 train_time:106213ms step_avg:61.18ms +step:1737/2245 train_time:106275ms step_avg:61.18ms +step:1738/2245 train_time:106336ms step_avg:61.18ms +step:1739/2245 train_time:106399ms step_avg:61.18ms +step:1740/2245 train_time:106460ms step_avg:61.18ms +step:1741/2245 train_time:106523ms step_avg:61.18ms +step:1742/2245 train_time:106584ms step_avg:61.19ms +step:1743/2245 train_time:106648ms step_avg:61.19ms +step:1744/2245 train_time:106708ms step_avg:61.19ms +step:1745/2245 train_time:106771ms step_avg:61.19ms +step:1746/2245 train_time:106831ms step_avg:61.19ms +step:1747/2245 train_time:106894ms step_avg:61.19ms +step:1748/2245 train_time:106955ms step_avg:61.19ms +step:1749/2245 train_time:107018ms step_avg:61.19ms +step:1750/2245 train_time:107080ms step_avg:61.19ms +step:1750/2245 val_loss:3.3771 train_time:107143ms step_avg:61.22ms +step:1751/2245 train_time:107167ms step_avg:61.20ms +step:1752/2245 train_time:107207ms step_avg:61.19ms +step:1753/2245 train_time:107272ms step_avg:61.19ms +step:1754/2245 train_time:107333ms step_avg:61.19ms +step:1755/2245 train_time:107396ms step_avg:61.19ms +step:1756/2245 train_time:107456ms step_avg:61.19ms +step:1757/2245 train_time:107518ms step_avg:61.19ms +step:1758/2245 train_time:107577ms step_avg:61.19ms +step:1759/2245 train_time:107639ms step_avg:61.19ms +step:1760/2245 train_time:107699ms step_avg:61.19ms +step:1761/2245 train_time:107761ms step_avg:61.19ms +step:1762/2245 train_time:107821ms step_avg:61.19ms +step:1763/2245 train_time:107883ms step_avg:61.19ms +step:1764/2245 train_time:107944ms step_avg:61.19ms +step:1765/2245 train_time:108007ms step_avg:61.19ms +step:1766/2245 train_time:108068ms step_avg:61.19ms +step:1767/2245 train_time:108133ms step_avg:61.20ms +step:1768/2245 train_time:108195ms step_avg:61.20ms +step:1769/2245 train_time:108260ms step_avg:61.20ms +step:1770/2245 train_time:108322ms step_avg:61.20ms +step:1771/2245 train_time:108386ms step_avg:61.20ms +step:1772/2245 train_time:108445ms step_avg:61.20ms +step:1773/2245 train_time:108508ms step_avg:61.20ms +step:1774/2245 train_time:108569ms step_avg:61.20ms +step:1775/2245 train_time:108631ms step_avg:61.20ms +step:1776/2245 train_time:108691ms step_avg:61.20ms +step:1777/2245 train_time:108754ms step_avg:61.20ms +step:1778/2245 train_time:108814ms step_avg:61.20ms +step:1779/2245 train_time:108876ms step_avg:61.20ms +step:1780/2245 train_time:108937ms step_avg:61.20ms +step:1781/2245 train_time:109000ms step_avg:61.20ms +step:1782/2245 train_time:109061ms step_avg:61.20ms +step:1783/2245 train_time:109125ms step_avg:61.20ms +step:1784/2245 train_time:109187ms step_avg:61.20ms +step:1785/2245 train_time:109250ms step_avg:61.20ms +step:1786/2245 train_time:109311ms step_avg:61.20ms +step:1787/2245 train_time:109374ms step_avg:61.21ms +step:1788/2245 train_time:109435ms step_avg:61.21ms +step:1789/2245 train_time:109498ms step_avg:61.21ms +step:1790/2245 train_time:109558ms step_avg:61.21ms +step:1791/2245 train_time:109621ms step_avg:61.21ms +step:1792/2245 train_time:109681ms step_avg:61.21ms +step:1793/2245 train_time:109744ms step_avg:61.21ms +step:1794/2245 train_time:109803ms step_avg:61.21ms +step:1795/2245 train_time:109866ms step_avg:61.21ms +step:1796/2245 train_time:109926ms step_avg:61.21ms +step:1797/2245 train_time:109989ms step_avg:61.21ms +step:1798/2245 train_time:110050ms step_avg:61.21ms +step:1799/2245 train_time:110113ms step_avg:61.21ms +step:1800/2245 train_time:110174ms step_avg:61.21ms +step:1801/2245 train_time:110241ms 
step_avg:61.21ms +step:1802/2245 train_time:110300ms step_avg:61.21ms +step:1803/2245 train_time:110363ms step_avg:61.21ms +step:1804/2245 train_time:110424ms step_avg:61.21ms +step:1805/2245 train_time:110486ms step_avg:61.21ms +step:1806/2245 train_time:110547ms step_avg:61.21ms +step:1807/2245 train_time:110609ms step_avg:61.21ms +step:1808/2245 train_time:110669ms step_avg:61.21ms +step:1809/2245 train_time:110732ms step_avg:61.21ms +step:1810/2245 train_time:110792ms step_avg:61.21ms +step:1811/2245 train_time:110855ms step_avg:61.21ms +step:1812/2245 train_time:110915ms step_avg:61.21ms +step:1813/2245 train_time:110978ms step_avg:61.21ms +step:1814/2245 train_time:111039ms step_avg:61.21ms +step:1815/2245 train_time:111102ms step_avg:61.21ms +step:1816/2245 train_time:111162ms step_avg:61.21ms +step:1817/2245 train_time:111225ms step_avg:61.21ms +step:1818/2245 train_time:111286ms step_avg:61.21ms +step:1819/2245 train_time:111349ms step_avg:61.21ms +step:1820/2245 train_time:111409ms step_avg:61.21ms +step:1821/2245 train_time:111472ms step_avg:61.21ms +step:1822/2245 train_time:111533ms step_avg:61.21ms +step:1823/2245 train_time:111595ms step_avg:61.22ms +step:1824/2245 train_time:111655ms step_avg:61.21ms +step:1825/2245 train_time:111718ms step_avg:61.22ms +step:1826/2245 train_time:111778ms step_avg:61.21ms +step:1827/2245 train_time:111840ms step_avg:61.22ms +step:1828/2245 train_time:111901ms step_avg:61.21ms +step:1829/2245 train_time:111963ms step_avg:61.22ms +step:1830/2245 train_time:112023ms step_avg:61.22ms +step:1831/2245 train_time:112086ms step_avg:61.22ms +step:1832/2245 train_time:112146ms step_avg:61.22ms +step:1833/2245 train_time:112210ms step_avg:61.22ms +step:1834/2245 train_time:112270ms step_avg:61.22ms +step:1835/2245 train_time:112333ms step_avg:61.22ms +step:1836/2245 train_time:112393ms step_avg:61.22ms +step:1837/2245 train_time:112457ms step_avg:61.22ms +step:1838/2245 train_time:112518ms step_avg:61.22ms +step:1839/2245 train_time:112580ms step_avg:61.22ms +step:1840/2245 train_time:112641ms step_avg:61.22ms +step:1841/2245 train_time:112705ms step_avg:61.22ms +step:1842/2245 train_time:112766ms step_avg:61.22ms +step:1843/2245 train_time:112828ms step_avg:61.22ms +step:1844/2245 train_time:112888ms step_avg:61.22ms +step:1845/2245 train_time:112951ms step_avg:61.22ms +step:1846/2245 train_time:113012ms step_avg:61.22ms +step:1847/2245 train_time:113075ms step_avg:61.22ms +step:1848/2245 train_time:113135ms step_avg:61.22ms +step:1849/2245 train_time:113199ms step_avg:61.22ms +step:1850/2245 train_time:113261ms step_avg:61.22ms +step:1851/2245 train_time:113324ms step_avg:61.22ms +step:1852/2245 train_time:113384ms step_avg:61.22ms +step:1853/2245 train_time:113447ms step_avg:61.22ms +step:1854/2245 train_time:113507ms step_avg:61.22ms +step:1855/2245 train_time:113570ms step_avg:61.22ms +step:1856/2245 train_time:113630ms step_avg:61.22ms +step:1857/2245 train_time:113693ms step_avg:61.22ms +step:1858/2245 train_time:113754ms step_avg:61.22ms +step:1859/2245 train_time:113817ms step_avg:61.22ms +step:1860/2245 train_time:113877ms step_avg:61.22ms +step:1861/2245 train_time:113942ms step_avg:61.23ms +step:1862/2245 train_time:114001ms step_avg:61.23ms +step:1863/2245 train_time:114064ms step_avg:61.23ms +step:1864/2245 train_time:114125ms step_avg:61.23ms +step:1865/2245 train_time:114187ms step_avg:61.23ms +step:1866/2245 train_time:114248ms step_avg:61.23ms +step:1867/2245 train_time:114310ms step_avg:61.23ms +step:1868/2245 train_time:114370ms 
step_avg:61.23ms +step:1869/2245 train_time:114433ms step_avg:61.23ms +step:1870/2245 train_time:114493ms step_avg:61.23ms +step:1871/2245 train_time:114556ms step_avg:61.23ms +step:1872/2245 train_time:114617ms step_avg:61.23ms +step:1873/2245 train_time:114681ms step_avg:61.23ms +step:1874/2245 train_time:114742ms step_avg:61.23ms +step:1875/2245 train_time:114805ms step_avg:61.23ms +step:1876/2245 train_time:114865ms step_avg:61.23ms +step:1877/2245 train_time:114928ms step_avg:61.23ms +step:1878/2245 train_time:114988ms step_avg:61.23ms +step:1879/2245 train_time:115050ms step_avg:61.23ms +step:1880/2245 train_time:115111ms step_avg:61.23ms +step:1881/2245 train_time:115174ms step_avg:61.23ms +step:1882/2245 train_time:115234ms step_avg:61.23ms +step:1883/2245 train_time:115297ms step_avg:61.23ms +step:1884/2245 train_time:115358ms step_avg:61.23ms +step:1885/2245 train_time:115422ms step_avg:61.23ms +step:1886/2245 train_time:115482ms step_avg:61.23ms +step:1887/2245 train_time:115545ms step_avg:61.23ms +step:1888/2245 train_time:115605ms step_avg:61.23ms +step:1889/2245 train_time:115668ms step_avg:61.23ms +step:1890/2245 train_time:115728ms step_avg:61.23ms +step:1891/2245 train_time:115791ms step_avg:61.23ms +step:1892/2245 train_time:115851ms step_avg:61.23ms +step:1893/2245 train_time:115914ms step_avg:61.23ms +step:1894/2245 train_time:115974ms step_avg:61.23ms +step:1895/2245 train_time:116037ms step_avg:61.23ms +step:1896/2245 train_time:116099ms step_avg:61.23ms +step:1897/2245 train_time:116161ms step_avg:61.23ms +step:1898/2245 train_time:116222ms step_avg:61.23ms +step:1899/2245 train_time:116285ms step_avg:61.23ms +step:1900/2245 train_time:116344ms step_avg:61.23ms +step:1901/2245 train_time:116407ms step_avg:61.23ms +step:1902/2245 train_time:116468ms step_avg:61.23ms +step:1903/2245 train_time:116530ms step_avg:61.23ms +step:1904/2245 train_time:116591ms step_avg:61.23ms +step:1905/2245 train_time:116653ms step_avg:61.24ms +step:1906/2245 train_time:116713ms step_avg:61.23ms +step:1907/2245 train_time:116777ms step_avg:61.24ms +step:1908/2245 train_time:116839ms step_avg:61.24ms +step:1909/2245 train_time:116901ms step_avg:61.24ms +step:1910/2245 train_time:116962ms step_avg:61.24ms +step:1911/2245 train_time:117025ms step_avg:61.24ms +step:1912/2245 train_time:117085ms step_avg:61.24ms +step:1913/2245 train_time:117147ms step_avg:61.24ms +step:1914/2245 train_time:117208ms step_avg:61.24ms +step:1915/2245 train_time:117271ms step_avg:61.24ms +step:1916/2245 train_time:117331ms step_avg:61.24ms +step:1917/2245 train_time:117394ms step_avg:61.24ms +step:1918/2245 train_time:117455ms step_avg:61.24ms +step:1919/2245 train_time:117518ms step_avg:61.24ms +step:1920/2245 train_time:117580ms step_avg:61.24ms +step:1921/2245 train_time:117643ms step_avg:61.24ms +step:1922/2245 train_time:117704ms step_avg:61.24ms +step:1923/2245 train_time:117767ms step_avg:61.24ms +step:1924/2245 train_time:117827ms step_avg:61.24ms +step:1925/2245 train_time:117890ms step_avg:61.24ms +step:1926/2245 train_time:117949ms step_avg:61.24ms +step:1927/2245 train_time:118013ms step_avg:61.24ms +step:1928/2245 train_time:118073ms step_avg:61.24ms +step:1929/2245 train_time:118136ms step_avg:61.24ms +step:1930/2245 train_time:118197ms step_avg:61.24ms +step:1931/2245 train_time:118260ms step_avg:61.24ms +step:1932/2245 train_time:118321ms step_avg:61.24ms +step:1933/2245 train_time:118383ms step_avg:61.24ms +step:1934/2245 train_time:118444ms step_avg:61.24ms +step:1935/2245 train_time:118506ms 
step_avg:61.24ms +step:1936/2245 train_time:118567ms step_avg:61.24ms +step:1937/2245 train_time:118630ms step_avg:61.24ms +step:1938/2245 train_time:118690ms step_avg:61.24ms +step:1939/2245 train_time:118753ms step_avg:61.24ms +step:1940/2245 train_time:118815ms step_avg:61.24ms +step:1941/2245 train_time:118878ms step_avg:61.25ms +step:1942/2245 train_time:118939ms step_avg:61.25ms +step:1943/2245 train_time:119002ms step_avg:61.25ms +step:1944/2245 train_time:119062ms step_avg:61.25ms +step:1945/2245 train_time:119125ms step_avg:61.25ms +step:1946/2245 train_time:119185ms step_avg:61.25ms +step:1947/2245 train_time:119248ms step_avg:61.25ms +step:1948/2245 train_time:119308ms step_avg:61.25ms +step:1949/2245 train_time:119371ms step_avg:61.25ms +step:1950/2245 train_time:119431ms step_avg:61.25ms +step:1951/2245 train_time:119494ms step_avg:61.25ms +step:1952/2245 train_time:119554ms step_avg:61.25ms +step:1953/2245 train_time:119617ms step_avg:61.25ms +step:1954/2245 train_time:119677ms step_avg:61.25ms +step:1955/2245 train_time:119740ms step_avg:61.25ms +step:1956/2245 train_time:119801ms step_avg:61.25ms +step:1957/2245 train_time:119864ms step_avg:61.25ms +step:1958/2245 train_time:119924ms step_avg:61.25ms +step:1959/2245 train_time:119987ms step_avg:61.25ms +step:1960/2245 train_time:120047ms step_avg:61.25ms +step:1961/2245 train_time:120110ms step_avg:61.25ms +step:1962/2245 train_time:120170ms step_avg:61.25ms +step:1963/2245 train_time:120233ms step_avg:61.25ms +step:1964/2245 train_time:120293ms step_avg:61.25ms +step:1965/2245 train_time:120356ms step_avg:61.25ms +step:1966/2245 train_time:120416ms step_avg:61.25ms +step:1967/2245 train_time:120479ms step_avg:61.25ms +step:1968/2245 train_time:120540ms step_avg:61.25ms +step:1969/2245 train_time:120604ms step_avg:61.25ms +step:1970/2245 train_time:120665ms step_avg:61.25ms +step:1971/2245 train_time:120727ms step_avg:61.25ms +step:1972/2245 train_time:120786ms step_avg:61.25ms +step:1973/2245 train_time:120849ms step_avg:61.25ms +step:1974/2245 train_time:120910ms step_avg:61.25ms +step:1975/2245 train_time:120972ms step_avg:61.25ms +step:1976/2245 train_time:121033ms step_avg:61.25ms +step:1977/2245 train_time:121095ms step_avg:61.25ms +step:1978/2245 train_time:121156ms step_avg:61.25ms +step:1979/2245 train_time:121219ms step_avg:61.25ms +step:1980/2245 train_time:121280ms step_avg:61.25ms +step:1981/2245 train_time:121343ms step_avg:61.25ms +step:1982/2245 train_time:121404ms step_avg:61.25ms +step:1983/2245 train_time:121468ms step_avg:61.25ms +step:1984/2245 train_time:121527ms step_avg:61.25ms +step:1985/2245 train_time:121590ms step_avg:61.25ms +step:1986/2245 train_time:121651ms step_avg:61.25ms +step:1987/2245 train_time:121713ms step_avg:61.25ms +step:1988/2245 train_time:121774ms step_avg:61.25ms +step:1989/2245 train_time:121837ms step_avg:61.26ms +step:1990/2245 train_time:121897ms step_avg:61.26ms +step:1991/2245 train_time:121960ms step_avg:61.26ms +step:1992/2245 train_time:122021ms step_avg:61.26ms +step:1993/2245 train_time:122084ms step_avg:61.26ms +step:1994/2245 train_time:122144ms step_avg:61.26ms +step:1995/2245 train_time:122207ms step_avg:61.26ms +step:1996/2245 train_time:122268ms step_avg:61.26ms +step:1997/2245 train_time:122331ms step_avg:61.26ms +step:1998/2245 train_time:122392ms step_avg:61.26ms +step:1999/2245 train_time:122454ms step_avg:61.26ms +step:2000/2245 train_time:122515ms step_avg:61.26ms +step:2000/2245 val_loss:3.3226 train_time:122579ms step_avg:61.29ms +step:2001/2245 
train_time:122598ms step_avg:61.27ms +step:2002/2245 train_time:122643ms step_avg:61.26ms +step:2003/2245 train_time:122708ms step_avg:61.26ms +step:2004/2245 train_time:122769ms step_avg:61.26ms +step:2005/2245 train_time:122831ms step_avg:61.26ms +step:2006/2245 train_time:122891ms step_avg:61.26ms +step:2007/2245 train_time:122953ms step_avg:61.26ms +step:2008/2245 train_time:123013ms step_avg:61.26ms +step:2009/2245 train_time:123075ms step_avg:61.26ms +step:2010/2245 train_time:123135ms step_avg:61.26ms +step:2011/2245 train_time:123197ms step_avg:61.26ms +step:2012/2245 train_time:123257ms step_avg:61.26ms +step:2013/2245 train_time:123319ms step_avg:61.26ms +step:2014/2245 train_time:123379ms step_avg:61.26ms +step:2015/2245 train_time:123441ms step_avg:61.26ms +step:2016/2245 train_time:123502ms step_avg:61.26ms +step:2017/2245 train_time:123567ms step_avg:61.26ms +step:2018/2245 train_time:123630ms step_avg:61.26ms +step:2019/2245 train_time:123693ms step_avg:61.26ms +step:2020/2245 train_time:123753ms step_avg:61.26ms +step:2021/2245 train_time:123816ms step_avg:61.26ms +step:2022/2245 train_time:123877ms step_avg:61.26ms +step:2023/2245 train_time:123939ms step_avg:61.27ms +step:2024/2245 train_time:123999ms step_avg:61.26ms +step:2025/2245 train_time:124062ms step_avg:61.27ms +step:2026/2245 train_time:124122ms step_avg:61.26ms +step:2027/2245 train_time:124185ms step_avg:61.27ms +step:2028/2245 train_time:124246ms step_avg:61.27ms +step:2029/2245 train_time:124309ms step_avg:61.27ms +step:2030/2245 train_time:124369ms step_avg:61.27ms +step:2031/2245 train_time:124431ms step_avg:61.27ms +step:2032/2245 train_time:124492ms step_avg:61.27ms +step:2033/2245 train_time:124556ms step_avg:61.27ms +step:2034/2245 train_time:124617ms step_avg:61.27ms +step:2035/2245 train_time:124680ms step_avg:61.27ms +step:2036/2245 train_time:124741ms step_avg:61.27ms +step:2037/2245 train_time:124805ms step_avg:61.27ms +step:2038/2245 train_time:124865ms step_avg:61.27ms +step:2039/2245 train_time:124929ms step_avg:61.27ms +step:2040/2245 train_time:124989ms step_avg:61.27ms +step:2041/2245 train_time:125052ms step_avg:61.27ms +step:2042/2245 train_time:125112ms step_avg:61.27ms +step:2043/2245 train_time:125174ms step_avg:61.27ms +step:2044/2245 train_time:125235ms step_avg:61.27ms +step:2045/2245 train_time:125297ms step_avg:61.27ms +step:2046/2245 train_time:125358ms step_avg:61.27ms +step:2047/2245 train_time:125422ms step_avg:61.27ms +step:2048/2245 train_time:125483ms step_avg:61.27ms +step:2049/2245 train_time:125546ms step_avg:61.27ms +step:2050/2245 train_time:125608ms step_avg:61.27ms +step:2051/2245 train_time:125672ms step_avg:61.27ms +step:2052/2245 train_time:125733ms step_avg:61.27ms +step:2053/2245 train_time:125796ms step_avg:61.27ms +step:2054/2245 train_time:125856ms step_avg:61.27ms +step:2055/2245 train_time:125918ms step_avg:61.27ms +step:2056/2245 train_time:125979ms step_avg:61.27ms +step:2057/2245 train_time:126042ms step_avg:61.27ms +step:2058/2245 train_time:126102ms step_avg:61.27ms +step:2059/2245 train_time:126166ms step_avg:61.28ms +step:2060/2245 train_time:126226ms step_avg:61.27ms +step:2061/2245 train_time:126290ms step_avg:61.28ms +step:2062/2245 train_time:126350ms step_avg:61.28ms +step:2063/2245 train_time:126413ms step_avg:61.28ms +step:2064/2245 train_time:126477ms step_avg:61.28ms +step:2065/2245 train_time:126537ms step_avg:61.28ms +step:2066/2245 train_time:126597ms step_avg:61.28ms +step:2067/2245 train_time:126660ms step_avg:61.28ms +step:2068/2245 
train_time:126721ms step_avg:61.28ms +step:2069/2245 train_time:126785ms step_avg:61.28ms +step:2070/2245 train_time:126846ms step_avg:61.28ms +step:2071/2245 train_time:126909ms step_avg:61.28ms +step:2072/2245 train_time:126969ms step_avg:61.28ms +step:2073/2245 train_time:127032ms step_avg:61.28ms +step:2074/2245 train_time:127093ms step_avg:61.28ms +step:2075/2245 train_time:127156ms step_avg:61.28ms +step:2076/2245 train_time:127216ms step_avg:61.28ms +step:2077/2245 train_time:127278ms step_avg:61.28ms +step:2078/2245 train_time:127340ms step_avg:61.28ms +step:2079/2245 train_time:127403ms step_avg:61.28ms +step:2080/2245 train_time:127464ms step_avg:61.28ms +step:2081/2245 train_time:127527ms step_avg:61.28ms +step:2082/2245 train_time:127588ms step_avg:61.28ms +step:2083/2245 train_time:127651ms step_avg:61.28ms +step:2084/2245 train_time:127711ms step_avg:61.28ms +step:2085/2245 train_time:127774ms step_avg:61.28ms +step:2086/2245 train_time:127835ms step_avg:61.28ms +step:2087/2245 train_time:127899ms step_avg:61.28ms +step:2088/2245 train_time:127960ms step_avg:61.28ms +step:2089/2245 train_time:128023ms step_avg:61.28ms +step:2090/2245 train_time:128083ms step_avg:61.28ms +step:2091/2245 train_time:128146ms step_avg:61.28ms +step:2092/2245 train_time:128206ms step_avg:61.28ms +step:2093/2245 train_time:128269ms step_avg:61.28ms +step:2094/2245 train_time:128330ms step_avg:61.28ms +step:2095/2245 train_time:128392ms step_avg:61.28ms +step:2096/2245 train_time:128452ms step_avg:61.28ms +step:2097/2245 train_time:128514ms step_avg:61.28ms +step:2098/2245 train_time:128575ms step_avg:61.28ms +step:2099/2245 train_time:128637ms step_avg:61.29ms +step:2100/2245 train_time:128698ms step_avg:61.28ms +step:2101/2245 train_time:128763ms step_avg:61.29ms +step:2102/2245 train_time:128824ms step_avg:61.29ms +step:2103/2245 train_time:128887ms step_avg:61.29ms +step:2104/2245 train_time:128948ms step_avg:61.29ms +step:2105/2245 train_time:129010ms step_avg:61.29ms +step:2106/2245 train_time:129071ms step_avg:61.29ms +step:2107/2245 train_time:129133ms step_avg:61.29ms +step:2108/2245 train_time:129193ms step_avg:61.29ms +step:2109/2245 train_time:129256ms step_avg:61.29ms +step:2110/2245 train_time:129316ms step_avg:61.29ms +step:2111/2245 train_time:129379ms step_avg:61.29ms +step:2112/2245 train_time:129440ms step_avg:61.29ms +step:2113/2245 train_time:129503ms step_avg:61.29ms +step:2114/2245 train_time:129565ms step_avg:61.29ms +step:2115/2245 train_time:129629ms step_avg:61.29ms +step:2116/2245 train_time:129689ms step_avg:61.29ms +step:2117/2245 train_time:129752ms step_avg:61.29ms +step:2118/2245 train_time:129812ms step_avg:61.29ms +step:2119/2245 train_time:129875ms step_avg:61.29ms +step:2120/2245 train_time:129936ms step_avg:61.29ms +step:2121/2245 train_time:129998ms step_avg:61.29ms +step:2122/2245 train_time:130059ms step_avg:61.29ms +step:2123/2245 train_time:130122ms step_avg:61.29ms +step:2124/2245 train_time:130182ms step_avg:61.29ms +step:2125/2245 train_time:130245ms step_avg:61.29ms +step:2126/2245 train_time:130305ms step_avg:61.29ms +step:2127/2245 train_time:130368ms step_avg:61.29ms +step:2128/2245 train_time:130429ms step_avg:61.29ms +step:2129/2245 train_time:130491ms step_avg:61.29ms +step:2130/2245 train_time:130552ms step_avg:61.29ms +step:2131/2245 train_time:130615ms step_avg:61.29ms +step:2132/2245 train_time:130676ms step_avg:61.29ms +step:2133/2245 train_time:130738ms step_avg:61.29ms +step:2134/2245 train_time:130798ms step_avg:61.29ms +step:2135/2245 
train_time:130861ms step_avg:61.29ms +step:2136/2245 train_time:130923ms step_avg:61.29ms +step:2137/2245 train_time:130986ms step_avg:61.29ms +step:2138/2245 train_time:131047ms step_avg:61.29ms +step:2139/2245 train_time:131109ms step_avg:61.29ms +step:2140/2245 train_time:131170ms step_avg:61.29ms +step:2141/2245 train_time:131232ms step_avg:61.29ms +step:2142/2245 train_time:131292ms step_avg:61.29ms +step:2143/2245 train_time:131355ms step_avg:61.29ms +step:2144/2245 train_time:131415ms step_avg:61.29ms +step:2145/2245 train_time:131478ms step_avg:61.30ms +step:2146/2245 train_time:131538ms step_avg:61.29ms +step:2147/2245 train_time:131602ms step_avg:61.30ms +step:2148/2245 train_time:131663ms step_avg:61.30ms +step:2149/2245 train_time:131726ms step_avg:61.30ms +step:2150/2245 train_time:131786ms step_avg:61.30ms +step:2151/2245 train_time:131850ms step_avg:61.30ms +step:2152/2245 train_time:131910ms step_avg:61.30ms +step:2153/2245 train_time:131973ms step_avg:61.30ms +step:2154/2245 train_time:132033ms step_avg:61.30ms +step:2155/2245 train_time:132095ms step_avg:61.30ms +step:2156/2245 train_time:132156ms step_avg:61.30ms +step:2157/2245 train_time:132218ms step_avg:61.30ms +step:2158/2245 train_time:132279ms step_avg:61.30ms +step:2159/2245 train_time:132342ms step_avg:61.30ms +step:2160/2245 train_time:132403ms step_avg:61.30ms +step:2161/2245 train_time:132466ms step_avg:61.30ms +step:2162/2245 train_time:132527ms step_avg:61.30ms +step:2163/2245 train_time:132590ms step_avg:61.30ms +step:2164/2245 train_time:132651ms step_avg:61.30ms +step:2165/2245 train_time:132714ms step_avg:61.30ms +step:2166/2245 train_time:132775ms step_avg:61.30ms +step:2167/2245 train_time:132837ms step_avg:61.30ms +step:2168/2245 train_time:132897ms step_avg:61.30ms +step:2169/2245 train_time:132960ms step_avg:61.30ms +step:2170/2245 train_time:133021ms step_avg:61.30ms +step:2171/2245 train_time:133084ms step_avg:61.30ms +step:2172/2245 train_time:133145ms step_avg:61.30ms +step:2173/2245 train_time:133208ms step_avg:61.30ms +step:2174/2245 train_time:133268ms step_avg:61.30ms +step:2175/2245 train_time:133331ms step_avg:61.30ms +step:2176/2245 train_time:133392ms step_avg:61.30ms +step:2177/2245 train_time:133454ms step_avg:61.30ms +step:2178/2245 train_time:133514ms step_avg:61.30ms +step:2179/2245 train_time:133577ms step_avg:61.30ms +step:2180/2245 train_time:133638ms step_avg:61.30ms +step:2181/2245 train_time:133702ms step_avg:61.30ms +step:2182/2245 train_time:133763ms step_avg:61.30ms +step:2183/2245 train_time:133826ms step_avg:61.30ms +step:2184/2245 train_time:133887ms step_avg:61.30ms +step:2185/2245 train_time:133951ms step_avg:61.30ms +step:2186/2245 train_time:134011ms step_avg:61.30ms +step:2187/2245 train_time:134074ms step_avg:61.30ms +step:2188/2245 train_time:134134ms step_avg:61.30ms +step:2189/2245 train_time:134196ms step_avg:61.30ms +step:2190/2245 train_time:134257ms step_avg:61.30ms +step:2191/2245 train_time:134320ms step_avg:61.31ms +step:2192/2245 train_time:134381ms step_avg:61.31ms +step:2193/2245 train_time:134443ms step_avg:61.31ms +step:2194/2245 train_time:134504ms step_avg:61.31ms +step:2195/2245 train_time:134567ms step_avg:61.31ms +step:2196/2245 train_time:134628ms step_avg:61.31ms +step:2197/2245 train_time:134690ms step_avg:61.31ms +step:2198/2245 train_time:134751ms step_avg:61.31ms +step:2199/2245 train_time:134813ms step_avg:61.31ms +step:2200/2245 train_time:134877ms step_avg:61.31ms +step:2201/2245 train_time:134937ms step_avg:61.31ms +step:2202/2245 
train_time:134997ms step_avg:61.31ms +step:2203/2245 train_time:135060ms step_avg:61.31ms +step:2204/2245 train_time:135122ms step_avg:61.31ms +step:2205/2245 train_time:135186ms step_avg:61.31ms +step:2206/2245 train_time:135246ms step_avg:61.31ms +step:2207/2245 train_time:135309ms step_avg:61.31ms +step:2208/2245 train_time:135369ms step_avg:61.31ms +step:2209/2245 train_time:135431ms step_avg:61.31ms +step:2210/2245 train_time:135491ms step_avg:61.31ms +step:2211/2245 train_time:135555ms step_avg:61.31ms +step:2212/2245 train_time:135615ms step_avg:61.31ms +step:2213/2245 train_time:135678ms step_avg:61.31ms +step:2214/2245 train_time:135740ms step_avg:61.31ms +step:2215/2245 train_time:135804ms step_avg:61.31ms +step:2216/2245 train_time:135865ms step_avg:61.31ms +step:2217/2245 train_time:135928ms step_avg:61.31ms +step:2218/2245 train_time:135989ms step_avg:61.31ms +step:2219/2245 train_time:136051ms step_avg:61.31ms +step:2220/2245 train_time:136111ms step_avg:61.31ms +step:2221/2245 train_time:136175ms step_avg:61.31ms +step:2222/2245 train_time:136236ms step_avg:61.31ms +step:2223/2245 train_time:136298ms step_avg:61.31ms +step:2224/2245 train_time:136359ms step_avg:61.31ms +step:2225/2245 train_time:136422ms step_avg:61.31ms +step:2226/2245 train_time:136482ms step_avg:61.31ms +step:2227/2245 train_time:136545ms step_avg:61.31ms +step:2228/2245 train_time:136607ms step_avg:61.31ms +step:2229/2245 train_time:136670ms step_avg:61.31ms +step:2230/2245 train_time:136731ms step_avg:61.31ms +step:2231/2245 train_time:136795ms step_avg:61.32ms +step:2232/2245 train_time:136855ms step_avg:61.31ms +step:2233/2245 train_time:136918ms step_avg:61.32ms +step:2234/2245 train_time:136978ms step_avg:61.32ms +step:2235/2245 train_time:137041ms step_avg:61.32ms +step:2236/2245 train_time:137102ms step_avg:61.32ms +step:2237/2245 train_time:137166ms step_avg:61.32ms +step:2238/2245 train_time:137227ms step_avg:61.32ms +step:2239/2245 train_time:137290ms step_avg:61.32ms +step:2240/2245 train_time:137350ms step_avg:61.32ms +step:2241/2245 train_time:137413ms step_avg:61.32ms +step:2242/2245 train_time:137474ms step_avg:61.32ms +step:2243/2245 train_time:137537ms step_avg:61.32ms +step:2244/2245 train_time:137598ms step_avg:61.32ms +step:2245/2245 train_time:137661ms step_avg:61.32ms +step:2245/2245 val_loss:3.2772 train_time:137722ms step_avg:61.35ms +peak memory allocated: 29249 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-11-10_CautiousWD/assets/cwd_condition_numbers.jpg b/records/track_1_short/2025-11-10_CautiousWD/assets/cwd_condition_numbers.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..11e0a17df98d88c166acd4fe438816f5287705ba
GIT binary patch
literal 604484
[base85-encoded binary JPEG data omitted: cwd_condition_numbers.jpg]
zh-#lvGQa{?H?x4dIq*!xI9L!8Ll=hE?_fP1r7MH06SL^anF=i9A>wUS)W+Oa_R)YiiCuO&^6$|t31(AA3&pT> zFiSTG-nBz$CO?A~#Ck;9kqq$Mg7v!Dp@qu=MAw zmnHTP`1@r*I2D@-lzBt1dPxo})@oqW6jganKfHN)LQc3hmU~Od{Q^qu6)7jRl3m3* zr&l6HJv;9YG$Qb5E~2|;Xl|^0oPdo#KP#ueCuAD{kTVSjxeJQJe3)Y}R9w ze0Qsb7FAd-aZT$uJsQ9y`CvXzF?Z9qp=(QxhlzoCHWG4IRLtXaGF+gv_g(dsV)v9p z1NXe;Sa;r$N=5*MS#YDG^x%OYJpj#B=WTaNh*iM-6Bw3&#KMhR}Tj%zAq>tDGAt2=L@D#GEu z3pa;^h%Yuv;;9G0RiwqRBbK!88=vPHIRcTekm#* z;5I9rhw1cKR^~266%aOSqG=BKMR(rn-Z&jm3&NzMo#_lmF&Q?eU)lnHlzaQHz9&b2 z4(`R_E8?>phH|2xA=V3IuS6Q5iwn#jFoRq}=lT9xs>7533uUYfCrDQSaD;g39@I_w zuZRxfr=D+8R8#sZ(umh2o2(`)m!zO!jA9LDRyJdn+49-Z_w2=GRO1DiLz_1d!L{^l zrLfk1@8aX^Au6iTFzY>l5gmw?^{mI>8@AcphAyHvoQZ~uE0yn{KNaQ^YrW5zH&Wo# z(T@s%WGT)6D?)NPPK-l->$oANMR%h72mAnk*vO^ld$JV>c6=@hX5RBdEssF+Ttwr@ zlFAbx5XPVXwXliWy!!7{zx}8xPVl!>^h86V3LX4?TN{j*XFz&T72&Jtxm{OG|IgvJ zKY`>nX#pVNj%p|CGBkdsj2&=Cn0LZl0Nl%S_Kt=Rc48L5fsyP$N5a}x7&_>m{lM0> z(h%_{|JX)ITcD9R5O0+t;c(07fmCL6JnAK3dGm7^tkEBmgB%o(_Hr3bH0?>T^1jCI zls1-TVtCR=a9Vq$3VjoIYDNJ)jjtzYYi#S@xU>iIz?~|yhJZ(-6}sQzY_{a=Ie3hj z=KuODq6ob+K;Wm$@(a4(do`z)zj-?Nx-dr4Z1U{z%BSZ|>WewuqLV<>?r)8)Igz|_EO!uqbq(_A7tX(>smxmOd;8-BiP@9CP!UcB)&pOO>%C`8}3IOY75DGaVt zU@EW(ntrLWS_+h^o`eE7rMYd?-F^=x@3GhEIi5B?wz?L8OsZrTk`uP&DOAqx%B+bt zppQS__Souk?1@4rm3E2v;93aAbta$8Ii$_C%s6oityxfp&L8^?j zEoXN=ry4ofdWs5^8L9Ci*B{+P0Qh~ zOVr{!Y|GmRP^-PY28lOR+=9zC<{Qe(9eS1Db}3_cP$EA!T$@iZ`^7amCa6xbBSiWl znWA8~adgw`82fP@ScU79f}yFhtFalkAM{ka$X6AJn05DT$(6}&blzjV^i$O7TM-e^ z({_K|m0ot>$(#rEOZaPB>7ER0Z;7UAq5B+hHIs@i&;`t9#(@v(qIK4o?Yf@7-|S;X z?pgL>sMi?gDF%Tx&*#1P}38M!r=NbxtUL>d%MY+#_rEv7F8*yMoW2-pKBbC*j#=xTWGcT zgKEeK%5Gwcf}?BoLlz$1Y011ucczR?BuKS6FmxVYA@((;)t*;$4C=FBXw47Qkona& ztbHX>_KEcXc&Wf?%2J#1p~lyua@D5D15XBLHv8+xIWu*iLr(plSo!*+*jO#I_IEAX z53Tbhj&2!n2R&Hce`240W52#<5lLm}d_}wyh5OL7#{p+nT8JSu?Rfvd+#G$$gKfbc zm{Zh`8iawz#o&YLN6@(itPtRQJu6xdVO4(fHGk)p{>^9E>gaRu(FjMlL;;#VLmWZD z>z)F#fU7H|uGtYFYD(!{3zup7{3%xu0=n?)F|~&-$izYA=DE80tZe@?zOQ){yD--$ zow?C;%0;-`*Z?*Gk0PrF-3?)7cbH!HdwRxt<4oCU0P4RzixV_+Vq7Xl8N!bmC^;-7 zepnT*U84|%x%&l>)`~df9Kc+ zpXGWm{ypdPoRKnYupZQ-G+QJSO%Eb}#KZ7+-Xkhg3z#=OnilauU(MJ4dqV0l_EyQX zo%<#X{$o<8DylH^x+`fi>?>kt{R%4=t$&4;E8XG`+K3Z*EB-=JGm-%Po?e<)^Plw6 zJ{DwPY9(R_tN_5@N<2j|W`-0wviyz!D0MC3V@bAQ(7KB~y;WSF1P=)Al)ZR3RkW|l z_Sg}+?&ssboU*3bJKz56xqBBCEexOt!_TzGq`BqAh3Dx3wnqdjLoCO6La;G5`d)0J&QIA*c&`&}8 zx$;llwc#N4K^fNI3}0~%<=`$63#KTUab-ncm90Ox z?vBmgcZ6(=)dPH-l7v%5UBRdEg}v$~jN z$Mq%IM%(6351%L7^pjW2-6z&`9q3r22M+pRqL{x;?jK|8U61wZ;1A zvw>`>>lmkxyqsv65*`3BBtzxGOtB&E1@e(MA_-;NKFfQ`yDnrb`o@HwepnV>9q^oe z(Q!bxh7{bN3zhe;_xGdgdi3CLF>O*0y>Pa0;eX?hTPT|3reLPeFBh11ZzEY$YyjX zhSx(e471K(o03#DNiw8rJ@k3jr?{9gW9M`Mwb&p6v#(UB_6Ao#6dhELyqJ7)@EU+% zaY#hg&(U5{IP&veI^SI~_icQZJ-+44H1VKC+1Ra?Ui&`9vX~Y>wa@5()}RH?cW@A|8jhOe^P(X@Acow_tpCE%LnAj zAt$$XK- zCo$H-4Iq6Bxb_LFA?yqfRAKYK*n1avsP?vRe5BHOI*<@ksZ>ZODW_2?MVd-XjDv6vX#5kW1GY(^zndN_JxA)WDPkY~c-+Mp*_j%vD z&!!^`)*RoztqH%C(UV>e zFrjE{@@?>#sz@)3yb1FKQVsZ7ovBmpqT4IfAFcUh@qQF{YQqO}rToX^vZj-Gs-@VZ zz0N=jrZ$RO={OVVdzkJN`G7tz?Yy_P}NJQaGOn#O! zeVu-h*frJcK;ru8wYoZ_8{-e0U2q3E7{a^D2Q{9_VjJMAqZ?F*@<>3jAK-IYb)#ly z4P{ftZNuw5n@?D|-g8`}XESvFa7JDaur$M0%YIEsWqE_tJR(SI?_$SY99CEl#d(OCm9H8?S;{|h=*yMbd(+SvH1n+Rt!Fl)P zZ0C3@gOztQWr8t zG&v54Cb1Pn8~m_^KNld=iA#`yP@M>Tx78P`KQZ+Pwg7eHKg|>-VT(S6k0b}t`PSog z2R2cvJ3T**klCsHqgJ;WRs9WH!h9|bijU&56kM>7VHTLJo^_4F-r`|x6{sc97;VTd zl)WQNtqXa~@;;VS_^Il$53R?tU>+bic9RPs7P6q=t3F{x)Y+W*^iGEiJ=rItxaUse z-?2uN$Q5+9!G>|vY-~c6dO`%!=iZP($EFhM(Qd}a_nzhj{lr8U|EWa}TI?Y^({yCO z=?bNziZakm-4RwA3FUGD`iT-!ln1#Vd4ND3J4vgi>?n=@sO0|OHsY{gaO%7pNZUg! 
zgBgX)A+QaLB(v*S)sFR8l&e(a^Q^0{n!A|AZ;tu2NpIqjxi9I;&aQ4ox0URZ19#1i zD?FdnRsb0o&-%fIf5D~vK2rQQYFi{E9%6x|$dLUj>KypZtW(%%H5gN!QhwoSdwJ-% zEL$F_L2C_uR)?8O|FE<9^Sx;I$y}T#usZ%iz5J`R+b^UtXuip{o^pqHG|H5ct}{hP z%Wl20x*zUXZlrZ-OvJ=)+2v(>bS8<<2);cxY%6X7?qsBR^0zPfdmr_Mulb>cqNHKu zgTv$A{vKnqkKi~nxlJJHa<6xyTkSegj@ZcViM`n$i#>>mTu>o*Q``e{8LN3(vSn7d z$CAsFPDoOQEdBAteuGnB;ws`m8MdHtZ9!CJ0J7~ef|crRqU45l$3Z`85cUp+56C0k z2`HuNaNGa7{H+Km>7Fds6oI;rrc9Ut?Orwoz5k;%8V*vavX;YwgNC61+X<36WyW3e^+Ejx%aQYI-~ zb6#=(0+J*%k$6SIo2_E@IE}3wfBBf^>l&sbQ;DdL<5vA#3Ha^LK+^NQ`%X+_h>P4j zf=gTAZjyP(Jg@V_{#a4Dkm$di-<|+)(~0;f9BEAWDXRqTKFGCeL+aEGjOe&#Lg1~d zHJ#k>`K7Cs-+i>qObz$y6aSeiR;_CXh$R?Y*5@|yHdW!H|GSess)>R2!%-l{Wo?m| zBPC=tW2`d%ZlGs#-dj(Jwm@%lhpZz32fASp{5$b!Rz5*K# zLs#iR+%8w7^#T*qFS^n2(fMGBXZ&y^+rLiI*Bnxp^D@*LJ~i?c$)&xLq8XA zl~SOzQ!=#U#PNWW?PXDIF2P3PKLn8&r9s?HTg8pOZZrBf@@zn@TY!u;I}(TX_pHYQOk$M!Cl%itZB zO{}|R(vr`W<#`_6)m6E3I8R%c6Hj{;!Taz z2ZXuB%!@m}6iFi5ur8ju`3CTJj=QdKYPTB~b=bGxZmT%V-B3g+kX7tDe#6o3dg(px zaRVT6(!O|S<70i#B|&ld)TOp*m0254EsM;B8Rq%IX+I`OuuvF!WrwVsjs^95*e3Xr1Ywk zlFEh?BNSpdR?OIDXBRhQS3Z)+z4Uv}?OVsc4`2C>4WH9)CDG99b-cqp&Xx9R zw!g-kjUG)C9W4SwddhZ-6&SK<4Y$Mcb7VX(lv+bxbc#>IjrInJ5|CCxNmlf|v)W`} zWZ1;nj=Xz`^bVAv>=^3u!0?5{AD;;L&SXu)%l~iao`R%*dg_M$7Z2L zK&{{fh%oQgNVp$v*anjzG*5j;UK8B!^}jM%ma{B>ge9sxAg zMYNB-6Xo3^{$p3N_mJm3kPL7`*-H_at{Zz_Ynf<6dan;E{%G(Mm!8I3aUOB-&T9>fU)~8Bgzs1S zk6R~5uYts&!f9d{428Dny_6*@FeuHJAT4kj);)RgwGKm1MZ+l~g*aV%hv^|t%*OcgYfxo$7XAgF3#=R==>;%H+7THeq=7A z)fSujzA}wx7VaNU5XwVDPWdGwcg@gUX^|CO(686uZa?GX^H-;;DfcLB{WJ6@*eAIe zdq4JX9YDhFK>C$JzmE1z1Ko6%SxICY7V7fXax2&pAL=|)8XK`&FG2Tql@D)~rsh30 zC!_6qBKt2tK)Uy3?h(fMnK{nem_W%1U&m%#P&Vn!l@53sHYY4*XpPbl^HK!~{y@j( zSuMAB5E8eoxz_BLuAZi-xT{<_#b6(r=yI!eQ*h9v9}jd)>NZPpvWts9_aONIxcxJK zlXLoS<$dYW8lLsk;})P6NDn2rHzbLWyGe)kE1vyq!1&t+&xZ;EcrcW z_0Jx6gQm&hiDn2B5eCJTbpC2;la3tiADTLh_a!z>2N%|YFk<3(7V@LUuqDnxQ;UO{ z)$d`q{@m2zt(vPXUUK_mehkuo5`IB$HD!Z6VZ5rM;np63M2x@cwIG?<>1SYLw(Hw0 zSWM8z4Lf5JIij3!$96rzilWOlK@0{|tirP%fE5Sx;=PA+L6aPVekE0!{*fXH1at z5zPUg5NVBu&yvb65-f)2Jx)OakQ*X>FW{DF=q1T9J-D4JF0hs-u&;;=gu9PGjKmAw z%%4E9*fd#3Ih zfPkOCW;{&g??J21YVRXV(^j4lE>8kyMX+4&6~W&t`Vtdv9`c1#M&ghZ8DWfQ^CtGI zhT`_`m->Hvt&BmG$Q;)hn)-B*kJcWK>6~MwWEvRX3tk>iB3!C92f6#V$WCyBd=6nw z7Y;-aj6_5V&b2|Xf>#*mvBkf;wP+V%3bs~7KF1Ce#vH$q_(+on!;0Ww(!7aM>d4D2 zW7qEXn8QBWE*(>mK~k$$*hRj&Z4}q*T$FO=NzOy(y~X2ZY={Ek_qNzk{WFpr8ZoPG1F@U!gq=oE5SR(cu>dfdtkAA5O z-JTj|CfaAbLLQdUu4)U7f(!urbaN>C_X-)mbNoB@Y25GNpr#d444+o48$OxdJbgTH z4SUqMun9SHl}cO)xhNi1#W0!#QSci1o&TA2kt2@G=}eTOYI_vLFKQn@kZASe5mz z?9NMjEDYlwj1IlyIo)M7BxqiW$u0H=vr4i`uiJjwUOI+~c-WU+7vFUDC%cSFqHQ)~ zCwyKImSjF@R_>UU9y4w4!T7JmBph>lN9I8Yf%bp?wx@GBr8!OrYj|wclEAyz35U%& zXRwPK&|bmkuVTsZILVn$8Ny&i9zke-jaY9Yu!1sNpbCE!N966+mh_;=YU)XXhXP-> zQ+=6eQxVVj`@Q@xUzc8*3)&P5 zQYLTb#U7YR=*vb2=0@Pj=iLOba*i`>7JA1?a`^TVEy+kr3! 
zSRbm+a948}7X`{}74AXK>3n;`BY6gKkl&g1davSS8_)AnxTo~%gjG#WRp@mU-eOWd z127!PVglJY97ZD}hvsuD`jz1s3VU6zOTELsn<~kG>C}>5Je$69EkDQiASH-Oo>ycB zeuR#?Vj;0!vm9-k4g~K!zpsDC4W#|iAE)M3PX>fuk1K8O8FZFa&AGn9bypGAQA5Pt zjCOf&S=WPG<{NbarXq1#=oDR84**WvyXTZUII^NXFNn?y`kshmwzfH9-a zl0zPA?_Wl$AGr+7kKDkx4#bUy$lec_2sA~|3~%O0&$tstOm~!Z|G=rezl-=pfl++i zP;dIKnnmbhPdSPH-2rKfkrgmrh3*p%fDm~j<_m%?M>!z;-i@xktB<_u_c~q1Y!?&E zOm7hFKqhJ0u-|VF*+z@akBJLRcPvgQt@b)*{|2dh8a}torAFoj@DEMA1WFnoAUb*<2~X@M3XwRbrXv@&)r}@lP^SsUa-^T% zN?_mVAP$TrLkVN!4Me_Q=I^b|Yyp^|E5@n9(`EzvJ6no;!#9+nPvX%ABeju1Y-ns6 z#a3GW*G~ZF9wN~~{Pkl%e)SxS#iI9E+5F~V>izmT;@n-nUuA5Q|0k@Ab1kTVD@&L{ zC#2mX^w%4~8B^foMG1XQkK$(EM9YXnnP__$?lcuNcaRy2lkh$ZVdDc;7uKI~5?)h7 zdJ#@q7h1ou3~8IAh7rfcg@~5MFjhkx*$U~Jh@C9(7h%^%VspauuybpXMu40iJFBJc z*H7T>av_Y2FC4|ufGIqU`28aPX=@T!;2}1OTbOACCPXdvR~w)c{?zEX-QMgj_U1N& zkFbytTOGpg?|SUBnf9yYu(&ImYEs{@hUNFbPrx-Vc5)z%WK+=a|vG zeVF(J9BV5SymauT2SONvGkRDXW~}9AnHuOqc(QfOC}*`{L(xL#*#8Nu3lb<$mo z1`0R5i~va-uRP+R+rA$n73;#DtLEn3*feg`m{y zbr$}x8X_q$WgBIu)!vTIJNWEFS7c(@(vRq;)Wq`XpQ1l3w6$rcR1JRepH%yCA1*2V z_`}HOjPbl6{(kB^!o-Y)ryGm>2r~s6@7m3sm7HWz+i|Km!niY_WLo&yWleW*YoO8L zHNsn@AdsZgw?fCutE$$ss=d?k;{?WRkIkLr;w`l&*H*1NzdkMHkg;d1{Gvjc1Z&$p z&!0?3bW|}ah3!c8b|F@!VpBixH$xe9Gin>ZzHt{%tCe@fy!J*wO3J5Obp?-fwb#u z5-r?jA6U|7;nVl=EU#+2gun$(E3cgHn(qwK$(Nz z8`Me@uGwt8#!`D{l}FF6H|DL$*H#y#h|I(xC&bA)Xe!d(&)0%WsiTK#$S`^9?%>vx zQYE9eiBB$jxjm$;(l(BIh|GUtBP`o(`SxBCzLf-n896X3G!rVxON37}$?$U@kCX6; zCt0^E)^@#j1aCo{R(Rx;M5uZ(QXMNTdO~+kSscFGdbj+i#Iwi`WqWo8jlX1b?1Bx@ zv1)rjkA_zqdKAK8k9{0EgvB2fxu$8HZP9hzCVYcXqv!VB4pNT}9==(dPfGQ^Cw9n* zdktEb+Q74vxdxvdER06$7}B=6HAdWP#U6_HcGPWLH?K?VqQkN73uu+S`@#OKa<|jz zj+Mnr3vUc3-3dU8Q*%0HH1HR+7bQLRRc@f^XHE%$^H#BPu)}@No!YAfx zyWg6;85A#Lou3jjZkyP`)qX3LWJUo0_57oruL@2J^ZWx91;ZZ7jt}77ny`^Ll>!-d&yo@kMb42! zfh|=_D(Cw*cy2sqxtSKM!hNWC?y^UO^;x1JQWwQt4_i=AFh8EZ0Lw#XIb7#wYtLbi zIr>qhQ8IV0?IAb@P4lCY#~H5tc;MVTrPck?LZ~iz1!Ln~)fz(uvx*w3d6A^Rmi}z+ zDQrneg1*<1unM~7!TXoF$|Bd7=Cd`X2>RfgrFBRnT^jJnD}~pPx)@Y^sLEsAv(jFw zoc%~%;nS*|UGt;J%WgX<`PVCxMMgrQvJ%E)HV~`UVw2<8 z#70uUv0Ho#rl!pOtjH6s&V_FCr_S$Nt$E#TR#;{s)tmeZ@@s~a%2rdj#UBV>tfWvs z`RysR$*v@W#Ig@Fj2$Z`s4XCDTJ1spG&Tknf~JYEQg2Pry1OV>BNYo*y@i!`ew!Bu zha4IBG6rp7JLNrDAbGy=6v}?jF)i-K^DwjxQ=*D{wfvQl@Hms1n0IUN*jo z`U%NisEIUC2-v$+pPVGArVoR$U5yGEhvpp=tSSN5*~)$}QAC300}qk-+*EtEYsKqj za#o4rbgvhY(SnKSam9~yRS9m|vRQsC=ft*yyjv;ku!~M!yxfpF7W@qZW&kz zW6a!li``-&Z?-{~T7n+r(A)>o`puh(?8KwbMse4FI`Z%@na*z^MHPRA39$;bvgfGo zC{;dAY(}xyWg=}K%0lM5Ecw%O^1j6sGKo^a-2k|_u35D3lxn+Jyj%Wx4po5>mh(_g zAg9YWm<>q}aU-OCr5B|t_ir!xw;sg_Dk$P0c~c^;fL25BGD zbFbaV39vqzAnxw)+;=}V$qe1x*~F^8y~PsY>hZj+JD=%AT386R-)5a|f0F(rw$J8p zQF7#PWa0be{GHPepY|8f8lX0YX&WN@K;><-Jkh&Pc=g!Pgg|OG!!n!jB>8xO;1jbC zjaj;QB6<(<&I@}lH9upM0tIsB%MW_mx@xUGb#*z6TT>l2Mz~fRb4xKh15Ua&-s{MT ziu0+JsWmo3m*@ML9T8T!Py?xVzO_Psbj-~PoP#AZT7^aJRIdP z-l`&2wvCR%liKOM1mO(zfw%p{uLsB!i{01TyU(dC_?v*TKLa}Za*l{^gUax@8iXl{ z8Hdynhb6kvIuJJ##wE0A>KKHJGHgOV3lJ_cv?qXAwmIm?{wa<1FSMGRP2DB{3I~{D zX>{S_xYD?H8IvJ#^X<-{9;o|M#k9v;sy$DtFCZlthLJvPwN$zhz)f=T^8rpQ9_ubi~Z5I3iI-16p<^|Np zYiy&5C@UX0r&CAm-puLFJvF_k)zp3GnVGm%xEJwjtFl<*Z%16yzIEBbkZ^~>Hl|-J z;D~aSIZJX=Qma%niA$a`9N(|^40{m*uF$G=A%ZOGYs&1RmyT8bFTyp{2tQtAlAi{; zt)>sy`^?%&>@7_}j$l7wIzEFoCXj|@s)M!h)ey2N2-W^)wl&_N?sK}d9_Q)|g zy)`k7vb3XQZ$QV@JU4ThHw=QA=V=eRRjF%Pjt zb09oz@oP;)-#YZi}imbuq780oU95dF~xDx!I69M+k&`>r*}l$^HbdO zXZN(617|I^$%HjnWB4Z`S0ca3)QuXUsxI;WqY^yzzm#*HZ z3A<)d3fgEH_P1?oh*z3Xe@)q?_L9jVxq!qA3S}^bSkF@Kb`P&%jAkrjb2eXALGFN5*g|5I$8R<+b z7Q1H#^~7YPht_OVHW%e|6ZnzC`v!himm&gVtg;C@6MjUK_7AX0Se1Rb@FAIF%(n$k 
zxrvozEA%Du?HFWk+`o3tzEXAb)pCk(?L-M}*-qB9l;P;}uX4;BYC}(Ob4~X7^)2$f#+B|h}UxTfU<5j(}XpVy26#eovE)@*`pe}PPnUh{RYi7X<3BPR={bdKUr^QAZOLwTIa6iT-yUW-G(PvO&j_As z)Aw3DZhOK!Lg+jNn}%qC1RtC1#CFUMrK)qH^G9(aG8)?pZah-0Hr&>+^ORpz$&Tn? zzMc%>2oAF+)`5*@@1|5aa}9)PE_B`kpYHtV!$BQ#a}BB~lRDg$FU*}(Qk4ym`{mwg zo5ZgV&2@Wzaa+KlqJpIjjRn{zdqEFU7h>iUfYv>xuuq`=Xk83z#!1CY#ro*;QQ@uK zr)SnV7%K23cb$8`-aq!r)ac8zXYI+`K9xSOHE)@SJ)3~dQ03wIIKh4hGhAGKa*=Z=HslDQAwr#Zh^TJ3n`*u0pFZ%wTa%Qc@4+3%gz)~Hhq z9+!`IhGKn@DDPB$Hil31+F7zL-8Ek6QM}Fm`0n^RouFLtBjfHLekftAGi4%f9J&Y= zk;vZ4s635L(d93OJ)aa(5q7Yy+bH|G0sg#dn0oo{wnOvE<*Virg3|CQ~BFE2|((|TjuDk>S>TuJO1&l&R5hWTh83LwhchqCP_u9t%(O$ zJSFekHZML2_hLK_dlhr<3VY2&U`r{&l)%MN;;0DEGl&BQWHu8*OLgRR5?>z><&%)M z%)MxgN|huwr3S*G0`}a{M#S4{1Fx~JTQTR>I_ZlyJs(&cxe`^A*`&|WRGHiTatjii zur^$gR|jGG^ns*I*wRxz6<1BfOJg z9>pttb;wPbn0(UzLRxBneI9S0g}=*^klM&;W;u&5534-r zQ44o%ax<8-I=lLrGvzHRm^J4WWhPn|kFu=@zAW~Bp^g5t$80e#+ccW4THHDA&2A*= zB!9Qq>mR*H6HU3R`|htE{4^bhxKO0CCU`6q-q)7tAc|#ay4o9Z9KBsGOm=&4Hoox*MVn3_+P z9Z#y>2>D2A(Q)NZrw(skHDAoawC-nZ0!cjrfk?h2Id$Wx`&)BQXo_@}UH7GAKm!5p1+ zB%HwPKG|7#q*Kv3mgErA5tLcR(jcrkWF&v^_5|8eG`)qNg1Lf%((A2a<`AEx3dD=J zt?9+<;yibDQZ>@ak`qF0D^m=2Uv?O*bsmtos4#1ac$nF3`B16LQ5WuYR-A%cyGYC$ z&(4c6TG~V5^N_qyY*xUIXXbD81k(VfLS4)d%$01oRL3Lj%;%RN9Ah=2McLtEBb~tx-Gom?@XdL;gV%>c_g=vd-JtJQ#@<`z2l zhtIlUaIxI!3rA%&?!9fNAMK|l%+ONgigJ(gIQ3{1v0jyyi=V!Yy|{~AC`!Av{GNvC zi4F2;SE%i>>N7HCoc78sEWN9dOw(Q2Fl~nVv2f|r_HV0XH}h1?6$n+NQJkkLqRwKQ zb+72(&bwL1H!ryUT){9X)Jfskj|&>L)AgDurFB+n2^MC0L`9D=~V!vkGMRrr{jRi)dO(ncBYsGZAkemS)L)Q-7{)*sLz(}Na-m(L7U zFs&s($`tRqrkAggCv6Ex%B0$Cg;M(|ykiS!OG#G?_(>F(ZB&Yv#j}tObTQ&;U+1|$ zj{w|IH`^hYWHgF$lgjcE`6R)nux7~#FyeR1A<<*KD-T)Gy^-U%)F3cYj5lG;v+%&^61|gLZeGW7e6p*$PKC9CpX%5;r&GV+E5|G6smJiFN*ITU%1=DYE9( zRy*vt2DcZz)K(kzUm!Jdw6T-&EVh0+`53F)ls5z!B>{j7&P(k&=;&Kf#|e;vDCfmv zBnr`aIO)Kx5h|2bf>iNzEVzgaJu4@onjX2evv)mm!z$?gj+l}Wv3^Q9u}y{80)k`d z?uzpxt;~8nNQ0350v^KEh#c4tV?YH-n3&@pEu5(E6TT^Jpc^Z5|ms-d&=r#_6a zv*L}g^Zl43OwIfN>_aB80K;{I`O$#Q>MYF|L*4>7LT8XPG7WYL?Wa!@gHN+Y#&B8w zj9skmZ9w%tcGl(;(#X`JVf6EzFN9plgjNCT^Esb!!0(WHQTX1giDS@=&mSf?_F=wo zO^B{mW@`M@^#P}SwLAp2_YwMlB)uOdJ!WcrM&|ze>r}zEiV+9bt@fvt8pN->lAnw7 zD2R0>?Ip}QdN%!bix1|G9#r7hmbZ7-%Wi$5U8xT6XA@YAm@@gIoHH}rx3ES;$yF(+=o$`43y;?|bpzmsu7zOIZ)2f?hxwAR^>j zf?e$EQNqx}+Dx^P{!yH=Ip0V4aKtQw*MQ(pm65vmbi6r5R>iVLQzPhD>j%DBP;D5y$L$Pu94%EYQoc9Ytu0^)LL&SGp;fN3j7P~Vd`^vNF8BVLKzv$?(Lp9iM=gGPv9L* z86z;2H2A(7K3lTy67dbKaKmGIouvWa*_cq(k0cOk!E-v6DTB>4Gysp|Gx*YSKbt7< zQFR>YwhGdbbB4{d{wvi2Mf6{36_<7(*th`|S9Zq$1X zO=vVswLhI&X7<*I=tHMmX3b1ybcJo}+!1cr zZ@%-xD$#i#o#^=zBk&N%+i{kaAoh75njI25MkJaeOr9>i9%d%^AUrrDI60@C#CT4m{3TLEVD-_=-w8EWFK4WEK%(8gjCsrMYu5a~ z&(-xcF@9&M;yZx@)Tw0G3Lzl7o3A4m^bc_+DEw~0%`9sXr0A8$u=UNRR-zD=T2=ov z@8-{OCj6^Bx?AWuVv|Z7!Zi9!2kP+K*MC1r-+X!$*Jpeb>HMh>xA&hd#8v!zg*ZiC zoG^gce1cR31$S>?Cnn*U+*nF3rOOUT6t7ZC=-kCB*i~05%!JY(V~?^2j#Y7ye!XsQ zC$37RLBrOHwMnm}6Z3ane?n#t*A);`dFeZnYLeb%rCNS`zor3AF5)fIs!J65V+)_R zjciFoSS5+0xTTE7Qtqh_fQmaz_Zbm=`kJAs_h8E>+g^)Bv;x&(MB4bNt+&6WGLJ5q zW4zHRSl*$i7f+h8@YTs&3$y*9N9Egf7$wI!%~3X8CCNv%_sz&)awhXV1R8{C=ury6 z_1NkZg)-KjfPDwXQ67H6Wt{3?Pd9wV=oZ|Wf|(QRQZbyTSYxk`w!HZkUpt1~(oc^& zhSu+We1(2*$4r4%T;m4*1nFU$uoN#+(||U`ug&>Dq_O99>RqV%%VCRk8y!Ek?qats zep7$DGgI?gjVV_Aj62RDvT`dmmfRl%!mq%|4}HYztWSFWqpPy&yg5&f9tzwn#y1Cn z$)bh`WkiFlq9<7&G*`Ue{K35E163>~gvtG=Z>)(b15J)%dm#Is-?D#M-i32S3$vCY zuUAxK>GAGLF0GU~#3l(4PN)RURmWZZyI79rz2eC0*}zYV4kTRs>P5sf*3Xk(Eo{#c zn@my#_DP?cFaH}HQkT5af#rkMR{fYb8$Av^FinxQqw`4~tCPzGM$Y29 z-^82NSlTh)KfaewxVhxfhv6)pV4 z&GPMzMP>C%T%P_u|GIPk>Q1Ztt>Pl$v6<^>QM*f$3uKp;A3fP-b6(<2mj$m2l%}1ok!-4VBQ;ksVr}nm3BW9idOv 
zRyk}Ux0KzIdd$Hwxp3F`=PBWfbPqH-d7Xdw);$ilZIVR$$qS+tXNh+Je#ui+!zMj{ z$ouH}LZDSv|Fm|o`abub2qoVEDSDwXOvBiYYg@H}0cEE11QG4&SRpcPpx28iqNy6q z(|ehw-j!6k?)8?L+wip-&@A=Pce_}silSgNas1rg^LM-r4TlF!!_b4yTx$B zNuHMqHgy!ozlS;9Y+^@l#bnd}4*x#Gt7(z)2>LGac`wmRyn|{h#MKDN` zXHO$WQE5oFyn-@c?Z|4VT4$97cQd~uYWb!1t>S`1)8feHko*<9!{`# zKTM5d_6ea<@YX}%$-YdzHKX@^!zQ8P$T>!H5jK^!%CSo2Wih^v-A8}l;>b?Gv_n@K zsqFJCn)NgAvoYbyT2b1OsKa@R_d2tUf_TQ=@VsO<5gDzIDcBlWts(sdwW!D`- zu}PGec^S4Gyg~k)O1nM)v_=ObU4udyaj__5EuFnZu~)H?>E)6?7 z*v?_zI6~;#(}i>f0YXVGQ~Mt(gz}DGU>^T= zoJJ1mApB{M)fj_XPv5iN?&YI6Yk+I62e@VwB7r0l1qm8Z8vhu~8ZnFj=khN>NhW*+ zEBT|}=hJD#rtK9-=gU?Ya(D~d1l91jD#LoVIlcU4Ou~r6V@79VKD}?TJ#nxeZ2!GN zWC0XU4@wFmlNwG7gY9X;^&fYOad-TT@G(T5QG+{lhYrvAh)E$wx;G{}(!#Yqd`Q)` zWpvDnp4Wc7lP)-wc@3LPOZdn;!>wm8PW2hrVjkx*igUNO+k==$kg+L*2yHLEDGz%M zxIwkOe4WO5rj0v%Fduqe#%ZoDQ(2(F-zCs4XKe5z%(A;p8xnu-M{087G4ps&?yT)f zpCc%KfVE;KMA-tZcgV_5NvVy_{0QL9zbM9A5j>D{%1vZp-9b`||gWiG{Qi z`*!yrpRkaQkY;z1AXtMwM2vEe`ukB%3w1rjX@NH}4hVB2W9P31tl`|Z{6o`E!+$Rt zJi_zGbW%Tq$3hOe!t~QKj(>jo>A#W`kru1*@a?kVL6eZMi{}aqz`Y=;_#>_bB`n`#r|%I9@E^Fb*Nj3Mfo3;>(so zWm>%)IZ`Gv%KZN+RD^I*djijVxypHj z*4s>#dxNy=z2eM$eo;zA>R_i1MFQoxiJAzos*=oc{GI9jPqb&%6cC zihd3R|Gr{(`^=N6_Rv!gyExck46w{b^D+#5Q+rlR|4**UABCUz)n#bYJ%Fe%iNc^x zKwGP-3@>{ci9u-8MYFoIv0Z2CSEBWM%JUD8TiJ2e2COfp4fD*PF@@v{JggAuC3Nb@ z`9g#s_hD?xaR_O**)OonUwjX-Z^14pxj^v_iI{8DLP*4n;<_JXQmW$rwwC^uQz0P$ zXh3!n*zjOFZ<(GjuzM62ZE20BtiTkJx=K(%`Xqt4JeGfI6jz#%s6(gVT|;@GW-4`wTFkIV|3(ksFq{ zz@^v$9q`QcL_vzqJnM%ZHQ&_%w>mk4sa^ube!=rz)Wj7u z56h2a3ga~mr$-BBQ?)kRTrAThMr@robTb~7D6j<$QIY34Q;Lha84YJHfRfwou_Caz z>Bye8xmP{qpPVA{2CPwfkemiH*D1i!&MzX}h}PukIEQpAeu=P*`Vy9Uc1ty1pLdl> z?9|){K2z2)2VnU?WA=@AVKh5qlZsArNmV5yfW*qr_eqG%7pA5<3a_INcb`~Gx$lZ- zA5EyatsUpH#6#R80Ysg0{7m+r*}PvSp|_;6`u03Rzqs5hGfMcFzG^7I>PJMGQNQfT zKiukn^l2al8rY`m^g*y?KOXrk39P{cye;bI1 zl_YUQiP;CKfr)#Nw;I|CNaZ$2ER;HUBNRCR8vSv*)mh?`&)Md&jI(aZb>K1Gfz-vR z<4TdEo^ChMMsCT=#(DV$C;d-1HrvQ1Qf^V_mPZw9>N-3!OAItOxD*}WGk%`Ms{Ce3 zBAsm-uX;~-|D@KTwKsY)G^PY<6rZK`mM8b3wMayY*P|dsy&zbj`Hk6dekr>-iW@=Xs+L~ge0B6um?onoh%w&rbg^RHd$TIG<16arfJDU zXsv2fEMdk*hqSdN{O|^k@-L}HtU~h zYqWO#rVm~o_suS=f2gVPrt2JR^GK@Dg*YNZe!%?o%YQ(lkr8+lCi&Pif^b4-x#HVi z2)iGC1YZI|?ubmBd(>C%79p1;kXZ|T0pNC?_kv7G4}HX~B93pu_`S(ByTwL}|2?-{ z09)j|57-l3qFwLD{Jbm~cIpeXGBlSZ8?PECwK9PrSSzn&GCPkyevOC&QAzv2>&6sB ziOAkgtHL)ByCDH5wpDPH*UgcyuZUxpS}`;58kRhB$4`b?@#Ttuvkx6-UY>jPC~Re;f(^0dS3A zqvO!J@KdLgu*xja^|OYPmVA1j*ea8e?zWi7>yQ@0!#U5iyiKYcSF-B)TGxwjv~L7L zN)V>eL3cS>>X|T8bJE)lG`=C)x48~66I}CKQj{}qKSWIZ+H*ECH{2zV-eGAz0ji7g zu_Z?5s)0>$ZP`9s{tD+2M1NrOi508v+uRbKFf_O6T|A)K#^W1_5h~|k#=~xqEj)SK zh(qf_+)h-?HrI>nSbsXOs9L4zGQ#>q?(&tI$0+LMt(Xzw^oaV(N;4&O&yBjI8+;|n z>wRjN`h$!|we;i{WZDp=>G+|2!|W8vYXVw}Gd8)P;dzD>-;Rgh?|a%OJ-m!DlvVDa zw1U_hEg=jnY&6_k{*F@VtUY6mx)UXb)pag*(|u|=I-PMW5cX*OlG2%U!~O_HPn6Cw z?GR45%)YFN_caU#G__}k@|7YhF`wIb*^aX|{H2G6Ug3pTAmDoG_HZm6=O{P>Sa4?4 z4||}4Jd(c8jUiCD#Sn_LfM0_0Y6$G`Bj|t_*c!PpTA%j)a=|~0wP^q5{owabZi$0- zZ8wzDi>XVyi?pKbu#UKV?v{LPguvfL+hyt>? 
zfk#tT!PuKZ1Smw2HoO`Jsw=Mzq_ZCi4H25GF|4=@L;uA|NTn;dF%8X8kt>tjn$Vh1v@}F>>b6`az}9wT)Owmg%r8o5xHmS1XVQ3$ZoN# zJh8l6se(mpBx+;4pR-~?2`}pfzKeXh15cZNrT8)0Q zLGPz_OntemWECJA0W}7t{@%8O&u_!(wT=txih}6**sAMzdC!91DHk1lu$?vJ!9%Q5=(QrAvaSrRs-b|;SM;jF zf4E>HmcqD1!e)2ZZXd}~9K{`{#t1Ts>kd5F;YJJIKLuu&*px0fKip3%A6J0ip7NWtZMkuNIurK^q5ukXQ>vh=!W-98QNu({5k65!Y$ zhD~~wN98*X%*%)(4$npxTfOI;yj~vUJbzbAYjKL_EB?23&m}r%NgjheZhiRLULH(bwN{c|QBRcHBB@ zZJI$>HqySF{sXM?f|ObK?ht#tK+qn#8lPz;;cX`XWy*(gML4~S^%Q7z8rxgV zSqa3Nr$J~G^1~h&yGLR-cvSv~=8njBJTvk52yi+`%|{Tnf2-p1-;Vi2DK7+lYaWBV z(HpD(>V4)n<8{7M#Sbsz5(i(8rB*4&@z7-MER{u_t>%DCIQ`}+yG!C2r(y&)FA_s5%_&yrG7=forcGNMba7>wm8iV zhGCJ*(Sh5W7~Ol%eyh^(=Z__v&cRl)8RwHtGj=pi-&MM!)ZyZxw9K2zb(fLkp3Srz zKU`Z3<7$=+l|_75B^%TNx=Q<4l_V z=ZEY2^#=|tw9Mq_5bp0IJ_)3qmrC0ItjjP!S$ENRrNHcjk}WUzHo>8fd;H%V6T1ypq(Q1qV+Q`vjcW zpoE|I8aHyx50ep0tt0T3sdPcBkJuEfkeH;9PpY~N9F!;Nm&~-25<0G@lGV&k1s2MN zKZup&9!?Lh36UP}vSi!5akvH2Skq_7wdgfYtH4*fNa$(-5|>1IO%zCN9mOr{;)RXk z4oWwU;tZck2t8c;u`L0%SjI|Zz?9KMz^2`Uk>sa5c0uto0Ak$-)sPW=^Yh*r1HUa3 z9qtCLR@HXJyfaILxqC%sVM3(ii`JRRN@Ar$J}?%vNG4w?$m{$W=0E`?5}%e38!CA! zIn@I6$q}RKp6W34@a6fK97P{Hth%g1*|3{Gg)K0-tA0o2EJNbH!jhpH%%|yjDc3aK z-i^8)D?{t(wTtAI3IjbvsWQiX4Ci`%G}G8bdsUF@d4qlmT}7-5l3OabKt@3orOBHd=fO$m3dGqlHDK+_4Yg=#9H>*elbgf@Z3a<@JF7YYy}UxSOjCW?D7 zsrw%&wts^h&cKihF7NlAls|!ynHfF032_pMwk>LK& zPl(?%q`lSbGh-W_x84@p3qJ4!v9W=-#DV^JGB;j_F#6%&D`_b5qQ5kykgyymu9p~& zLsR@PD3aTd{y+BKJszrk{~sTz6iM!dOi@un7b0XSNk}SXM9eO^hg5QjS*2Zym`bH6 zBN8Phxs}_vEBDBq#{Cj9BgPuDX7zikz4xhe_TFdjbI$Mk{eC`Y|KZ`XmYFs0%j^BR zJYUZz`$#w|9K6d)2=K+l2#p>4QvU=ncPCzE8YbYoF@$Aa*N_-q!U{nGB0tbp08G(M zRu8c~qNrx;MDmp63><3=@JIV;U9TcTTM&HL7=P<|Nv-bd3Dk(W!I1J4dk~!P4h^Jc z&e5o}`i7l3QNy1MCGo_~;%pUPs#tgLplElRWc7o$6RBKRo(sVaH5>Qp-`GoSk`3y( zE*c)m4q-LbD^Jl{rF%>tynZb!{VpCquFt!h_B43rRBmst*Rs7iz6^-u254VS?yFs#!N3?<6aisZBI}H>PqctW0u|&@gdG zVL^%Uwo7m{e^gc%`3-!_v;_;IpUv~|(;#J25T_^s?Udc~9KZ0hB<^?S~# z&rp<$HcziBYFFoy_zVri2p<$$0C4aRp>|}wn0RbEi!#Ny{k|Q3N{mSn=z-9SW6(&q zQXcSx4$`*kN&IZ;{?+eF(M&`1f(mUU z1ij@F%^1j!&Vr2QYl;t8{pac5erCX@no!k{DA4WxRS$gV&d;Q8Q}d`llivSg8UTbp zvldM;;%ovX%Fv&5y46u@Qad#9N?{ZnLB}+(wvj3Bqq=#)v!(d|^w+NKvVR0Jc z1n{Kd6)=9`pPLK7W;edeT|3#tN#{Wsz)80~3J6)~-R#s1NNo23>qfvk4J_mZDZz!mQ?T``Y2W}MR>y&9Xbsa=fSeo<&;%gX{Ft|p^v~Qxb z?c>{os!t#4kx7myMolIh8$F3=2>q$9KH?>yF_^lB$&v7=Yl zyA_{lXYH@;AM9mPYbA3cKWN5A+?aEM_;|o$g!qE8_>44%h{#QcWh^YCXZnO`r{hNj zk~HDSNHl6q{Fuu@%7l4Dw=~}9Ja@_}Enp>E2@mp|g{rG1C|PC~?%(Hg+W~PtRQSW| zu+rGF8lbBTyc|E<2imMgB;YJGINK#kyPgeOVQ=la?CP3_NXu&20S4Uhw|LVc=;Hz6 zd#>3Ag!WmH+Z@zxGnU z^S4=6#c#R>zU(aE|E;Jrn*rtJ7wR}a@r0R6pq=YbWU@2%5bhLr__ue+27jZkOch<= z)JvOH@sUfWq5wpr?^4fHKrf`wMlJ&5Sk1tr!I?A$#1%~Beg02O&>t~fyj40Nz5NSr zDVP2YrpalJ-@m)>zeh07vIKtvrWoRF{Px87^?+!W7M6k$me10{8hdcp)VXpF{(IJ+ z7C}YUsiM*_$X*p5{ER!n;y%h(^grIHv#~qzOp@s)RUyEfb*hpT(Ei9Mo>Cd&>weHW ztFz{u&fE^>s<59;d0_2li>qa&&77skis5I{EYjs#+5r~i7@E>x&d0h zv>}XtGt->q%#GF&=SY%jvYNNZsv-@iJtZD`K7M80GU;k3D>{Z%l+{X0jF9A=pDD?aKn1| z!6zIZ7rGd&>pYQRn+D8HX`}!4fFyr0qy#XN+@X7_vE>5A8ER#)vRI;$6= zI4=`TA%;%^9q!*EO&}cG~vU6$U*!r;w4Kw!q56o(zzE6+~HUK-(MNxYO zv4A4#6T$Ljq}ysR^THmiYTI&g+dGSmA0&(&ts@47o>5w)dm_f3Fwb>y$Af7eLYFKke7VOXA2qRNmWL&~Ju6)u zST4G{mf|w8OW<|=+8y44_l68`0i_1w=m)qE?B=e12p}-9g%>${k6;y_aZ)o!Cxye#>i~?OoGuttTMhgyh)(byL3GCf#7YPN zz5~yU*y8iw#(?;Hne4~+mip3en{nAov|Pj^z(3jgJYFt}Eu3b8c|3j{D%twFMGG#; zE&+`$6a(K%f1?KBVDHnBp-uUG$b%l@<;X~FgB~SK z_Z1W8I|o}+rS*m@jH?2|<6#3jBV7$3o+p5dkZ*`ISDifMbFgBqA~&iOOiIxEzVLy< z;y0F)ACxNLZs+08xKF*O*+R@XOd^^Ba^e~g@DikE(wYEjeS(fy`N1gCf$5`QTU<&= z03#e)+Xt1+V%q<@`dg#nOz622U80!!YOn0jTA>V;jz3-B)fJGa>0qV zBn5h_aH5Pf#3w_D7rJ8cgTxjMc?P~$Pd389_XhGH;YnfY*Ey_*i|QIc&9`C@W6xfH 
zFe|yLEC^x#De(9WOzp4B|I>%X*1i{FFSMKuW`W&!y0bUv-6fgfosYD1=Kr1WX1@Y#O5|&5)QN5oq)dyAaZy{+82}Y5{*Eec(y9_62#g8S!!^3s3j(!%bJr>TCZspZ|>;sGfW^ z(pQtSe0G=X2?ObX9c|RFDD(tE6*d=%NC>q)1R}VXAaig|bvo*Op7yNaxW*0pYDtX$ zvLydK_LI8)9o+X6*!`(e+N4C1AamW@gQ?G7%^p9#(NCCCWzdy78f6uOa~9vlmn^{2 zc_#Bf9^%`h<-1emi=UJ5?a}hB6{_Qle`^cg>zh*zIOx9|Xgsq$s%g+fI2}=hxM9Pf z-|vA7EyHCt(caa13UvE%K>BU0_X@}?F+oNj^!$CU|GSL-qt`WUyR>%&Y*XZUN9b1_ z1KGNH#lQ(S!z!ijw+WTuL$T!{2QoX-EQdGgW}FsCICw{StBUQK&o~FY3^rR8U7ieL zN1~}YXA2}bGLGMVQcZxS1cFRD2qE_IP39Poc3`y-!Z`VnT~L$Tc6DQKcaYqN?4lb(05D7FC%qa?Q$gy9=N&CC&@(ZPn)e+J^Wx3A|5&5u3XjbP6S@SXVbTU>#5%N%}U7L_AFnZmX zsfuaE($;jDbvjr+gQ#7j{)YS}5yH0QsfW``2IH00eU>H=MhrXV{n2{D|4sp!$J>Ok zny-z^pcnhAo=hFzlRr@~{jH8Jp1;$K@eKnJ(du8y3J?BJ73X(n$2Xy*My-7ZG}!G5 za;dsthrZs_3`oUreye|}9Bg;}TMzRu?}#t;9!vW|7FR`M{e+z|g(ZHaw4law?04%l zHH}_28u|^78>jB|rl)7>uWG0&(u^}$w!Z457|*-=JVY{07VheT5qbLP@RK7>q^(Hr zLl;R_@n9BXrZ(leA!EFuCwfEi+XtR>F&>VB0jw@TVVotS!e#Sv}s-2xtu={0AP*p`!)V^oWU_nB|^ zQR>e_W{Qu*{v5>vfoTmAF(408aRBamh@VxSeBc#~y-PxX;k)Pv`ZCu-K(3Uty!|t7 zZ+0mVyE^zF)x@YBD7Mb><`)u0-yk8~TRkb03k~~MrhLK{G_JjY$t8B{)$fHEN~qFp zU`mSHPcVsrAIWHDW)8qKzX919kWxW~pYF?|2(Q2<{q_U%L~6j)*<6Ud7C*%@t8y|@ zQOufR;9pfwf6==O%zH^;wKN@tCd!HmVa@GW2KSuk#A#xA7Al?r-oqUdxyEfb)r9-f z9cSM5XrLiaKI5*E_RORJYklL~l;M3;B+Y^wN>Tu!mrlm`Vp?;5E^&mo@`H*M`z1AB z7iTrjJ)ghv4Q0wNB8dEsT~D2&UZ7Tl+|kV#@m#-+8suY$rUhX)>^`|f_gm?n0hA9t z_kPGdE6xR|!Ba}Ah@5y!COPVg)FkI}V-|Sxd>nPqGyg=~89CZhI^M^jc{Uve8N?T! z#~027G|1Qu-ZIz>_m(sd=*Sy(mc5ybmml^aMQkcD;atOD_-Lp8C?#XC_#Oet7|{e& zN15%!_P|MMH(JZT0iDpL(zRvxCrE=kPOq@FXAAEG)NpA87F<+x^0a^dFGW`fn;- zROkqS(nTj|xh5nv-}1p@`q@C*l1ljZ$^)~8>N|$c=BIzp%z~RT{-!Vj{Yr&``l}7u ze~8xco2tz~u1&A%0)L1xRU6{({nW}S+cbEYimJzB6IY*oUXC|+BtG3{H0QjQl@ep5 z7e-|O5M!kkT2cAvWKG3}Us*#s;!*ua1KAzau(V;Ce`jBbDG zGiY5ex6L7k!DVho^u?%bWgKjzMks(^0EKQ1Y$2WFY^(o>R7H_QHF_TG?G`AHFZW`( z?MeJVDm^9lssQBW{0Vg9))>AoN1ai-_*dga$Oq`QLO7R?Sy?;-p(1d1#AVc@%Sdfi z#G$+<_w_5Iw^?@Vm0B%n^&n()$pTBaYblm!2?Ga3n_WuqKxI37@H69yWHfz*JMYIB z)-Z9PJO(b@2HevjGeC?`xW^spO3L;qu$haR<=k|RmTXGj96vw0n4_MGYtru>pz^#4Cy#;xgEm<2>e3nn&cc6T}8UM%{)=90tD=M+Qc#6m5 zoYcIGcgpo2YYs-P6}OSyvgp|}%@tuWEb1FNvYw8L)D{vM7Q|Nm-Y(lXzevg8Nt295 zbmOxZlGY~3yN5R+qxMpq7uyVTnTK1xR{28@R3>QKlw=}aPd*AB_B*V>@%CQg^Ob_mQ z1CsQH38ktWMc{!9XO;opfGc$dQe}B!IEX*sZ&JB@;Kr=QLC#dt8`=_O_wq{FtKPbB z4$G%1ht{RNne2jXP~W!6_2rKeYX|n2Rpcpg?jgOgZ+f5~brl&CTXgAh^!9mWr|OF; zpmo@)bnX7PQ)Ake*CkA}uQ>dm`Fht}a(#Yp5F@p$e`!_z;|4RP1*(O=%gko^WUEKG zO2Ce0A>J@={#GCBp6blALlI1qlSz%ql)ih0_)=*w{vGe{t(QunNF+E7F^;TzBbqpO zljgkLs1gEm6LptR3m|g-TI4{g@4jL0iGbUF@Wbk>%AnMhvL}OU2u5iWX(O#nwH03+ z(OmCAt6MVU?F-cZgp{H;V_|R4%7hcEQQm8J@RG#!NpypObok*%MoHdn;}(rRAVpF^MI>_(LpYSGeAzGu;B5?3uUA|e#d!WNzYrhUSt3z)Wydi!d=54=NjwjXq4KgRDpWCOCPM)m!p93S%9HlBN_egwLr|1avS5o zL;FGIVb%6sTP{qBaW6i{-!?M9Sxgw(6D!CH+}r4-!~X>X^{+3g0aTwyN}1mIrH8io zuqHGkT4 zweR5f)bRgPZQ=h>>wJ)-1$)9PZQT-!qL1ZXV{iT7nZ1*rH=ZX{=>BUnZ)Q z#<*hmL#ngHrw~GR)c(Az)T=u6H%Dddi|)v2Xm2kHzd$VgjJpEc6OpZWOrTc*U4>7# zcuoUHLg6gUvRCYTBb9Jh^{i<)O}_J-Pvpm z_v3}@`1Y?_{9;#utcLv47tvKbGQp{`*fS?yq#aof5&>3NAxSXHghHkZqepv_7#oVp zN86Oc;dbQ+b8=B1p$#DZ3IW=JvCHI~YAR*E>orfZ1|}lFR2Wt?Lr>-BbI*hJEPNV} zdvs;pN3L>I(WMM>aC#09bKKvt<;hbQMr(OOPJ3rS_;FLgcQ3-_=le|WiaWODY^E4i z3Fso5nGM(?KtC3E&Zl?JU6h|mzosLQ_?+9^7Ts?7;u@3OskUz~iQ+=7QI^CO4gyDV zC0xkw5DrEt6$%b28^0{EcvVF4oj{WztJON=ZxcMU)+jt_uy z1kGyFEsHgjM=j2#8yk{)&L4U05K(DhIw2!ioxbQkZj*19Mdxn<`@~Umtc(JXt78ga z_IJR;Yc3GmN?|eZiJ4m`_BN{pdxr@ z`T>3`?s3lf$61|n4~#&(Q38!kcUMvuV=eS0C#Zb%#(|f6>8ooL@H3lek7|)swO5kW zYg;^uCGHSE3A}u_p3Y@@DPee}WD*W8Jp6+zu|27588l+o@Ca{~sz<9f1uj~l@D%Sk 
z&N^3%_$?ts(f7NyeeyWbhn;Ypu*Gw0g`3GFd!z$75j5?a>_?*ANaxz3wWj`B`*rl`QgD9)nD@JH%#@}=Y5|!cJHfY*# z;>pm@*#=%Dg_{WjIgr)}ti*$x(LBO2qV}}2fS(U4Bo0K5Fpbqrq>imQzy2G`{awM)a%tJK3<4i#;<07{l-LQ`SI6EC0uJV|aoS=yVpCt{n1 z#0W7SnhSMRQ4s~9K(!H9awtieNe<0W$LCvqbepV{1Y)j=F}?siH%Ab*($!@?j$n)w zTG7HX;=R+~o>1{xSYo&SScblQmtU7B@h;3Trhq260y?5Zx#UmmC7-tyWoiVeQRjDB zS$3HAXu2Kc7?jqlMg1U=rLk<{?ZNypT7YNtNghTno#7mT3cI^Pa`<_!&yCtFQT>2C zQq&`mP7P}=gwuu4WkilNlG1jL)|%5p3C@-FE4X{KNiMVL**n(0)a3gyaC&{JELVxM z%c5HAJXA~E4SFPG05V}dn&v7c6Uy1{FI-QUW^JkOaJ9J86o>g1aJ&SykM)NXCA<81 zlfxvomyd4c92_QQ){frj!nft@d^^D4Qp3{kW@t_h_aBaKCvDf4yCjqz$m@3Gn#|(c zQ>PW}75#*RE1uV%I1^%@-q5rA*pAWz>I>pL-xMYC-aO?qq9-+Z0QAKCae)e@U(bWX zE{A@dj}(;Q)+x@G;7DR{phl0~OP|K{}=VJK{fVFQcxPXnA zV(MVc&%1r#IO3)7gE}xxk1{~4TZn(G)Jqd@XSpg~MP-CWh?F;{m{M5&8}`K`_su;< zx*&i2l~Q;)#IW!cLNydZgdSP>01(=4SI$z;a${@@SZ>cYIkE#|U%ut?)F8D(jww2)FHGS$-jNH${tTL^;nrYFF~3H94Pq9^+H-zAp(ldR>xCM zt+9i&5H36gcReMh8=|7chKL6voTZ1@K_#`pkYrzDSKY&~ttmHaIV;`vypT3II(Q^i zQr~6mPHB^^1^o0H_j|y4&cKCkU~?zS%3bUgLdDPZ;?WgtN2shQH<4~yhP$%yJLn>LZ2vCv>c<7^G8g8DO_57ggJv%b*@g2K%b{OH@?TH5fpuGe! z01xB|td(-E;NxP+5;u>se6<|S&C;R9($Z&-a$k?F2z?uH?ODJ`hZhY|OT!jaCpiyP zloyd!)c48HIcG{@yhPXdf=udiY9kR@1EXTr-QK9UC<}bV%Q1*XclEhqE8Y(gS_g%F z0+oDH6kH9+Rd+VJZ7cG%?kL+-m?iHY^8=;M4~NOpoIH?CWR_pPh%5dDREA%IILxd! znMb{<+|{^pldAU4rXEPTHUE99W8A{t2cH5BGrp3_V=FTe-XVv{^8uS zF^jdkgF;71tI=(Bn4I&zH}f9nC0My^FjS>mevCbQ$a!sCL!Ld~8?x%u$_lOLb2mkI4F)$-8&|5YieFBy3m*kL zd*AOFvv04-+C7HfY=89`cY20@GGWZt^}Ap7M1IE>{~=pWi0~P~c4QMcW1y|v&^FH2 zrkBd*A101xVV9QU{7(EOXz@>u4#FKG@SDtEfN}-j7eNcHiON|)i?hU^1uYVOE|7!o z|B9f+yYJILv)yEvAnXw8SBxzX1is%SlGP15l9Hv1>GNQxR(B5_^`2HyhX{{trOoX} z!<@q`HoDhWx@xXEQFSV_Ww#3k>ij)qL7#^08BDW_83Jwull*HYZr+aT%skW_;|%3Q zo+YI6iKF}&NYXM~=%`p0D}(z4TjOt8Pg?JzhFF)$ZfMFijCanCj(0v;%!fjq-!abs7Wc@f-M(9!yVCm( zl#eS&8qJ|Om`VEkq&JNof0aXHRK?3ZzoXHz^16UC`{*ZCrE(VxgqdB}X?R_EQXLqY zGiGJfu+KOVlB^45(Nrf~khZP)QpSPf`3>lmNvG?^{x8YnBCBtc4A@8PHO6Ts z5W9)H;ld@VAJ1$P>QSD`Na~ah3DuGlMReltU*mc+$!78XAiXv(rm>nn=I!PZ7iBuX z36F-TQ8@bubeORtd96^P$I=n5|CqB}=g5oqn?W^+(B~swTWX$Vf*BV75EbohUWo#} zUy6*&Me?X`Z5JIiP(T&50FQ108a4Zjra0mI2fF+SBS9U}pg_N{CJAu=0P_uI_6tYy z;GtQ{WRORKnc~AOvbTO*bS6fq&khs7+ewp}GG}UAW$R51x2NB+=?bm7Su44W7uiSX zfEYWec$XPMFHXsFTPW;4rDDK>Af;S_m(r1>8sE2Zs<*zp zyzTwjV~~&<0RRN}Gh7cgVXKBtXN|AAX#8?NR;7IM87w$LywMGxF1;Ca7SL$<>(1lo zeDEBHD+b32r~<$~^MFw=2W4bH4om^$!1E{*FrfUsldx@V<4pH7-#xe};3qel11^nw-lm`S6gj;Y$8;r1;s3 z@eII!-7qRgYU@?bA%)tS2(pyyV2C&A`oCQQsY~lh#Cp zMdrRmpZa}`h5k)H1p`r(tJ|&sM@&eZ8|m&Y@5GNN#t;rX>S?eY)uWEZ(n6^1))+yK zdoOR6Gb=Nj3p?d$#t8gcPWI=&9rFn3E*q7jH+XzjMy;)?r)W$rRpg4!NVM5`QJ zD6-q~s-7`6o`g2$f*5H&fRGj**70J&$X?P$gwIkUCzq1>A+qGXE$ZG*+pAgg_P*CF zQTybB^q0XOS7)c&-&`oRN$dV)zK62!Z=a%7N9lp0A_R4%ycbEwsu>YwwR49p;vQK< z)?~~f86_lgmAtW)s-VsUME+h1Ed<&G+kvRr#5^1v*DsEMm}Tw?aM2rUpfgNWTjfhA zys)d#(qU{1Q-a#V$@6{&|IZb}0Jmc7K^875z?Enx45{LnDosKz35%24RZY9~~ zf#{bE>OMt+-OKn~TXC!kcq*w7Fc07C?KQj_gR#Pd`Zvv%(3(IaL$K-$Vpi!1O&l`p z2emfJTj4S#1SA6jpFN;w1%>fV@48RBeNCKu=S&za1O$NVR#BX|IsU#Iw{rV$!gPs~ z!M5@cV)mGpPg@gUCP8Ics*aPc%`~%=vqZt%s_Salk)t=V6`>~DO0UT`M?MkL>Ce@# zdmwAd%i15bRC%ulWx8mKhcw&U2sLG8#i!DM=q3LX9(+}^ot!n`2n!ZX316dsFGdK=E;L1MtO2ReB zt4qX{gD|EF^6FRQ*@o{*frKMzLhmy~B>A2Bld3v^^!60*$c}#op_sjXeKP(Lkawx{ z`kqPH_wqxKV395V%AwWJ8?*3RDyDzsXQgUg1^D%*1`MQ3Mni2u!R}inK-{F#*E*WI zRRb49tl8_T|5SPBXH)X+_aBgOOs|48qid)bI5F{(+G5AJw&qrNmJXuPcRN^%T}>4J_PA zo%?j(pAvE;uERR;X`xwTz8pD!Bko9_;Os3g2oz2`tFTn#fJ?HK! 
zaNOg(yd~bK9N4<|wZl&B>p!H-*PDBubn|x#5Ip7n`k;GG2xLvEUkgtc;KEy*c2O8+ z9>MEMK*S>;;sJ!on}_(ug1}()VkL0;U@1|CRR0;b1UurIpK&ff-$t9g3k=A=c$YWD1C#)w18HGG?SlsC z;6!i*RY@yr{8U6wABC7+7VgNT6Pr!TFUV<@PJ)AB5C{qpD?oA`SbgPg)i(OU8^^t+ zZij}JSChLg9jnE8ZKPD>H(yXX)tj)G$=%P0USY_+d_Tg76f~}iiV_GJ&9q~b1p@8{ zFY~e>R?WaZ8Zyv^9fn*$>GKXar$6JI&6fFZNHx9Fwl{u7mihFLn@KY=(SsFd251je zkyVdxJVLTqmiP8t$}ddrEO^=LvdKUZ&lZJ3NrtmtM(#X-eK#Dtns}r%TsGserEL;? z?l|mFi*2Wt=^QO6)ViJhG*s0v3B6jTaw0-#zlXS3{s_L#0~PfskI09(*U)7FPb&xB zJ2y=Xhx?IJjhVZk@%s3HPT~~`V^@wcD7mHGNqg9_QPkP0uYjj*S?tyDDLOHwygTLE zE{&Hngrs6GZi>wgxYOjLg>JEO(%6Rd{yy64NyR>+eL9XS8b)f4Bd+nr*zpLBWx6Au zaU~5wn+u=4pQ}F>h;Re20sFENa zBe_HBU@6YBU>%JEyg1e61GL#i_CN6vE-=b>@nO0@eWE$an=xZS&`wfEh=*L*S;mNc zoX{15!k8rfbYlPzDBd03eVJUQ^3(0S4T;n(-;VAR&s zRc6)}3*tHEz%?~`$rnxDX)0F3|{&JelQC%hLjYFnsZHY0fMWmJwKIR_CbG?cB)8al}Gv z%SP5u|Mb_P1>C2nnXFzJD*T ziIU+G9rv1fur>_`bur1jAk@#OEvB@R3n1Kh@-xy3B+HOd-K zs6AmL`%*4#+vzOnvQv;Q&5qn!UW8}ZdLWwmy_7Q5wpMxxaj~v&M@M_B6Im%am(?Wc zVfXm(_4wUcN-`1V>$;3quDl?!B|sDtFk*yvtAefv%@CFh!>rJ3!LY1w{oc4P-=@l4 zsael6g+LKxeaH>51y>e}Ejq8cPkSHVv?|BM->V*Wz@N8O;XXtSV_ouB)T5H|9j=m1 zTB|%wNvzC}e7i-bP3CM2+fXn5iD%s-MdeG-Mldd$XRR$6boy7WtQl1d79L0HWTYVrkfrUf0>qdX>9N+ z+c0>0#;vfsD|&~CMG>f!u=-$;YFifT757$5q}ga5KlWyHh$!~}gLj=aV-iDPWe`hMM{PhZ=EiI;W|b;- zb4ZMbn>?EjFvovv7=E^CqnyByBtaXPJVg@#RQ_dIVRNex*|-|f`M`y zj&6WR&-)$l|((hcP}TecLO%#M!mbtrD!;Y;W2B=q?vv2+N=CREjayB=54 zh)&43M@9J*(?FdBp6G~f%OA}*e)70VncxRjAZ5fhWVvs1NiBPxHQek)H3&Pu>US`0 zf2cViaESox`f!`$$-jr^gqyopbY5zFHe~z zf?jo6Po7H@>6w#kPT- zFo=6&q#^MSjm+pw_6LJI?k~dvaKfpyLeeCkKz8Th4ap@*of_@f`Hcof6LdZ*&b zbye$8o~}G%xz>A9d2CAuu{Mi(`^06#@-8Km1$A}MT7WmhuvBep?j@k&0vJD!J)Zoi zAp>bDHhfp`vLWFeRljOXN1^Ygv^Km(QMp4++j^d-R4|U~37!cIdSRm!9u@bBN4hQt?#X{(KdUPKk)X%k}aV8!1j^kbsy{Kb;bhWt{6&|(0RD) z0Wob2l|%49#2BFUmsIF`W}9_sPNDarcDbwjB^AOi$uzT0bIzd(^}5=m z5}E`iIFjm{`F<6HlyS^wZc?T5ie1EAhQ$v@D+F;0qZ%A%e?0h5fut(grQB!O-1t%3 zWz2)8)pNCO8jAOcSevfSO*NG{VElA=m1ErXT?r66}{?Z3NO2V{xnBFI%n z`=J1ak9XO`5i2r#P~xGZd#Pb2axedxD`)v{+u3hz?$1Bdg6YT>7&{OSN>mGG#iwYK zBhLt|cw(ssDh++ctpzooz#rk3kMtR@CPiM(0nlrkU~c`fj;|L&B{o7m?LNk4rpDVr zp4Te>7`S)437I^D;^}pFG4o;~0WLfSPVbQI-M-))+IWnzr)Uaj!-*(gBc_KXr`|Reu3PF_<&?A5JGA+0kS?fR0%X*KT4^ZhQat0PO&k$IN zTdlCwP*>~=P((H95pNmt5(GZ+S#@#BmCv}3X9&|Y{STUH+%^hA+eNw$){oXRZ@6#^ zNuxODs&d99^U3GbqGbCGiAEol!0CZ4^mUqs8D>zhMMB?Zqo;<^FFSt-nrZHC3XN?Y zA+5-)Q!N{E|7e#5jk2#vo}50yBtH4!g-dJ1yki@dy15uXP&xGeZAjLg@)CywWEH+P zZ6$S5i1it02hWc zHVU-;SlhA2cat_dY&;G%*6FYAV?NvSd~JvPp#wg(w!Q_!MDN|_i|G19!9upd37QJa z@{)6AyOndS%{HNiQyp0!Ea9Uy@xxiYb|^<|^C@k@bSfiyaE!9ZU;FHQy2hULfc&Uf z^I%NWUq1mD1|;fW!;z00Z1=tzzQ2+B5zjjLl0zsEnrqYR4(J=4gJdZ~LvXTxP>>4p z%y@!$uZxkCo}`~aGjC4>O=u@MOMcIs{e11OCbYkr(6(G8w#EO+DkV`8E;OJyC7ZE~ zxtDa6NpEL(*tq7a|6e^tzq!*i~?mcU(rcHXlI9DDQpxJenG zajg_nL=WQ$+R4ahzG`kBZYvdDp`0>0wsTE~W&4kvtmO8Rj)bwQDq z$=U5M!sMH6VyLK7)?EYov~^eIU))VxQkH!w-RMMdZLUxMyw%x}{gh3+pY7Yl-<7If z3m$f1F7yzw#uk&gRCHT+rRMte>(j@j+S{$IcReqjyU{0P&%vELc@F7y;WqFYkafLTMQoDYj^efb6oJA}xRH8Ti=jbldLaCGSN~ z_~xdN>H!g2S^=EySV-YVeGxnDBM4lkW%@WRA@}@xibL+T-51`8h!<$voI3klj;2KG z>hZYbr%*~pdZ2#(!1a1@RRfD!uQbE{XSaY8YW>f)2Yly=*FWjX4h5^@jZ^($ ztX^tOTK`zsNbz&krrik?=%E#Mzs9qaBm@__!adBU z*Fd+=)uNJ;#d8L`p}<_1B^w*2j7AHG$r;5oJo!d2TXpG%v+-An-SCY;dusd1u^b}9 z8zqB-HETI5?Z=3J6 z6PvtYB1%TjsKW;ndi2 z!&vnHq;Od7LQuR2u*R|}T*0IU*Q)P-|IO|4$H#9y4a`k~J&0X|u)wJgUi}dOZgp^! 
zHUiR(b++fX?fuyo1mA^v5->pkAio8+ULkPmM!n6rIwOa;NdzXy$;__;gJTBVYQVHH z%P!;v*o6{#KR5kjM&>han+KvmD68d%3#Fm1o^digjXGv>Hh(E2sC0xgP@iF)F>#{X zWF#(e*!;~#Z418O!^)kXGFEdqJ6;h9U`0>n-H8YO_X?~itL@y?o;#lk=gF>F^o+u| z`TEeUum#)58Srj^NWddW z=@Zri6ZdY#9WNVxH4RG=MQL5Y1|#btenj={pdwW~0+sl=hXeuk$tS)ENA#Yyzk6in zAy?3T^@QPO#jPto>ESyQP_ZbC-;J{cF5C>b{5tJ=`O5PA-l%)p%MZ!oMO9g0sXp`N zFE7}2OXO2zUIZ z(bn}%o?(h}!u{QbajPs`fo3ZjyG87q!T>yx;HqQpYEyUI8 z+G6lAAwY9!S%>B|GG|y2-bT;L4DnJJ3uP`ov`p~X9>WF!V!byiLfac|%KZ@)<&8Fa zcbD1V#Vsr)?JwSVD-AWUcB^JT@?T@9S3j?=r=IzO4_tA9Xzrc8bb?C0G-f@YHa-%0QOWC}kc-TZ`b z2cXRK{Sw~6Lmqmv@!vGB{wbK%H~+_X_XBZazZ*P4!Cp@dH~t;h`=9hZ4gTSNpdcao zFAKy{(G)|@X831)SQmAaKx(5+yuy!G|Dq3@SoeFMoKL>DmfsQ?DQrs!oZ6sUnbg{0 zSzB(IBERuD8Vd7L7_Q_o{ca&dL3PZLNy8QzaBPU}h+&C$+o)-rDJ)m$>iJ~2du%XE zBwsS1%63m?ZjZqMu{*~Fu|Rcn8nD2CUHg8#m5TAF2H?x2-40=R)I2rI#H%?u8EbHZ z7j2!0%g+?VM?Lp{vPtXplS`{_9K!MSm|ck%h6|Zma=q9QkUFSbVtvgM@;xx3 zM#Kb!yD2gW!Ttih;|pAx&kpyuxmxC~6u!`WNZptCNHOn8~v8=&4^&Qx>Kw;)y3cRE4 zjK?BpiRk5w&PP_H2S?|gZOXhe1dRRBeOG0iULM@JPyCP`Zt?B9KM585wKR(o=K!@Q z3ljuk$*o7=Id?##-X2`75s>_T0#YvxPZsow@vud~q4y&=qWLw757=oo?p}ti+0q7GjUYc2KdM zKLT1B^fPV)StY=9I+cAE_zSDm@ezHniA^ZHgvi)8-vc>8xx&Ad_s}hv4yjI3$C#vU zYWE0>@xXzbsw?K;%q~n+wDsr~Oh4S36WNH3j zTgLLesz-S3+?5#e1~~#xt#U(NdtLujUQsYhjdKN@5?W5LxWPJy5(Yim)T1VAHitHy zEslq^>K-?WcyHTwi8R&y;JuB12Y%LO8y|{aEyqBz>{&P=lX6A6roCl4$kNg7??sWF~1RXSODf2w7;DY5tM@^`iSUFvA zpgjv)W$}bOE$J~yeVwa&$1M&kf80BNxaai=`#2R7`~XPZ6vh)7 zhsOnwk$sK+O1)IRyr!oPmj@4JFvTYYbxv&=JZwE`q%?npNRF@Ec_@a0Xr06OOBs?~ z)X~E|?F@hi=*`;@p=bP-3=jMX+M~V($oEm(w#L-~kb= z0#o3U!GU*^xR4~_!+O_(1Yi`NgZu zGihnrxZf+v_3C(mK+_=!h@yJqXVxaM=YuM!0FN=I*JPwe`8QUe!R+9>XH) z5@dNQz_SKy8K`oMy%(n_{b6&dSASz(*t*-cSL14{pJQ}B3$hRGHMSSLEakn9>^$ys zR0viTCfaVfJE#r!vKI*e@X7|wsX4S$P}g&kIB19uVr_WmE4ME4G(EZJx^GI2s&f5?PW=hopERt@k6_rP5`G z=Oout-po7q|FQSxaWU@w|M+x55<(KC21yZxB-*Y>g^;yX4U$wcLR!>Z6sNthC2NBy zvuSC5o71ZNM5@4Y z+@eK6&ir$3b$}BkpyE{Mz@-~HYlin2QLgw}{!S(OmsB`l){rB1Lyk5IJ)m!=RDaI+ z;#rl5(ii+je} z!^1N4ee|#35zruV_KT7Bu=LLbDj3!BnFb!Sri0?fQ{3?JHy=;VrhroQd z1Tf!?`85`@eEIPQl7)cXQW=vW@`Smx9q2Kp8G1`icOO^OGDXYU;!-kmYyZ(p@{e3@ zla{-P98T7la&F^bt8YRYn24;7Dd>P7(LygL9%SLA$Rup>%#hjB7N;jyYu;|Luj@$7 zNH**XGzV^?p9|$}4mfsx)j*?qP7sch!bz##P@t{`XM$i=rA{<_ef_M*GKtTxDtcr1 z2|O_T#A{Rjtpf+8kL|-l?8G@{{*;B_;&MrnXV1%v8Nb*hpYrWpviy_y70|aE9v#?1 zI;0|sDhmK0z#bk5ys5C>KaZGJn%f|Ee%|dx8CBTgr8P-)3!6hMN!!+3nYlq`%@vWN zet&+H;68ku>Q77|=VnW^j`j)?8_?@{c1zP_*30nfrgdF=Vr4nS#CWou*0jzyaRg)4 z&oHVYhDoXNK*_G(RcuLBjtr%|N#<2fX4E%#PG((M(w%Xzc5G?tYr@uV?ny0h4WF`b zHtD??wxG1tESNPNSHyv~e!&PO>sLNTmG`D*ZN2hY&ewRs$(1MX=-1vel!1odUw|^- z{~jv%=irZ+G-vm5x9)i2wpB561Gst_qCp+pM?~uU8D+&UO9q4Z#(CxWzj0)NJI(ceE$B5k~R9k zVr_@kG;a_0s!aGRXHidkAG{BUP{X0vVMsTI|y zak;8hDYi=*_9g2)N)J;q>`oMsuCnM#mhUPYQ|xszyQ2l5kCb)4g&%+U`qzBO?pzIg zlR0ao(WTt}FKDid0@2Y$wePUp?VT=B+zp^wS00ElV?HLSYea(3dKYoSHt#NhO>`&%6@D|)TJV!7niOF<7B z-Ex;Iifv$GF2|$JW~{&MLAeV1G;BGjrpFBiIJG3I$;8E*IRO4J<037k*xWO`_T+$; znGd5~x>xpUuHl8~Bpk!^N=n+R)UT@FjQ5*+Wu4d6TcM8*iTb?;{u($R?D(A3+1K}= zMjRN#uyu+evy<=J?{yz9nfXwbm^&|LOaQz`_7GhHr59#+TtwIph%;I_Sz4!;N}<%e z3RAr&aU0)w#UEp>4s=Ur=q65ald~Ard3?OyEMPeOtnHKzU{If|@H!WxBi^XJ6Z}EaKel zgeNMB$J;;6UEh}f{0bQF$2LUYtbsXSIXi%;^|+8P^3X0&TqjIPtS zEP*A6sV`)RDWwHLVuI=3fmp2D!5l?XK~$&@NFa5guNx$Yq;gt0gw6J=Qtpj>`cUW6 zo%WB$daL?CIm^(&+|$PFHKd9TYJ@3YpN3y64Q+9TdpE4OPah|KBIox!^kBQr>5%iNZAjJAGLA_^ zBMa^1IX9y+sKJ^NJJ*)qGd`Ai`_Rsk(fu-S-u13k4M*kA@aX;_ai;)B11UJz7XfM_C zyWaOT9A08Cc=wppMJhU%%-3{U)2Ye_(3YQVV?65x*ftp=n30{aF+@|>) zX4>0b@h+PLLZ^Sg)4%LK|7VY-+dV+K^1uoB`*bB3fRYGGwzelwf0%&s{EMJWo=KvH=(G-q5y+@MG(e}~0(cpqFDK+m$DfD$b>n2)axblYwXr?k0r!6T6+3wZmk)t&#PgGFKyYl_S?^Z0A}UP|*I 
zOirHLQN$u5>~P|AcT%8K@%g8o4s=XQzUqVZwiCm}0pV41&Kpg6YtXCsK$7))PW(nl z9)5fGIjGUS|NXc-i2Z)bKCoaT=wmT*;B~m`RUTsST^ltRh}##(z21U^=d42|d39(I z+g9x0wey*fFz$u88uGtcT+ zxKb(+#1<*us>M*VST)kA3U!!Zi^dVbWJj!1w5pT765t{-Aiw>fV1f_{U>_>F32@>~ zHyeXT{=oZzZwobVoKWqvBZ1CoH9BB}1=d&{&n9yGx&fWneA?gyp{Goh$cXK`%;up^ z{JowXdGqUEI4A9tZS5~S?Slpg_*Y%VcL}KOa-?t&Cd~_ZWOwMD`@@5sbB69ooZ|I# z+lSqz}e&z3;Z! za4~AfG^ZxH^!`;gN3X7SKU_7-lRi+KOHoV#_~qGBY0Iz|&gNHajWuXDk6I{1rPG`J zTSCNP4f_dkP`rlTTbb71d^^Ik4-`k%AievfJHj9bNs*q6bM#Y7w2iWJG-!8f zy;gnlFeH6Ykgwuxh&P@Gg7rvp)oA(=pNP2Ked9!rayvVp*>rc?BOnj%h*bf^~n8#S)gHaQDM`(qIRE`ZL0ChH44aJdFu}3w14F zFDiC`$hizWv|*XMV5l3u;)0%9@Q@k0Ic0({E&-B%zQ_bY+5}r{JwXsm{tS#9K<0uU zNSZ#%s>Ej{z_{NK%mSyga1YpQbMUvDypVx3&;b5aw4(F>!sV!N>~bWr%Rk-u)Zx-G z6)mVA-eh+}5cj!JJ=boLS)j$F?jhgO0VI}R-8F+vo278)#|;qas>Umf?afYU10 zS3;7-VCai+C3!Is02cSk;lpwh1Pa7^D%xe)y?tmd$1G6H_DB5y|3Y^Di%#(0`t3&{ z_^Y)4%=`k<;N%Qp_Y@YV3L2SI8-$R;@cKmc0tKOMd3h380J7G2B(4P>#3{dt66X>) zpjL6`XB=vU2h#ZaMF#f&t%&|}6w$xE)!4(%_92e_A=*olVmP{d8pXGLL>n}ZoPkx zFx>jVt%s=a&*?yhU}o0@A@;sAlH!do2Z3;%8}+-1tcongY`~w`YDKI-Qi2H+WnX6s zhu{#>e0^zbG6$fLh2*fDC6zu(&AzJd`6mz?ve$@BM^#FDeOp+y9WBd}d>LiS zg#9XNtso6LHbEewq*g63Q@!$_R$ww~n#(z@j2No>`Hin{1WqcrItf2|MPbSDqpNS4 z)K?2m`78rr;xL@MNM9W_SD%M^v<_Te*XbZ}OEdFj^ucI_=ZBD4+ah!qF1Ne2VF@y= z0iS+H5FY#h&^wj1LRUjzMhX??JAxRfU=HhL>9xh*y|GIi;`Z5p5&^}XS zlLv22>J!}5pWmIzN-_&xrLXR@`HuMFiWBR04>`HnN((-vc|;%8->hJ@*dyX)+BAK) z7MSC#+*`x81$kc%k8R0Rm+Y>|h(4ZLn0V)-YLd&+z%;4% zqkA{bH@pyXUCQDTge`#h^15!Zm>Nk|Qk^(BVBdh(SqQ7$#PF%n91nad$nZ%>;L=*5 zDfD@O0JK2XAOmNHPX%Mrukx~^NY~;<@tTGtK{;r4jTKf>|JFZxOC(S2`<7lOGyWVI zKm&4i8Xk$bAm|G9UTOXmPk$p$S|6D^rxKK3E=f6GqOBTd^V$bhJQVsScT(``t&`?R zroc)t$8@0Ho=xsk0>$;VK3;-HIbuCrPybxI#T?zR*zPNZE8RY(aynnU7GI)x;nkL& z_u=>4z&My;l|%@aqR*JAUk>Ig0@p*z1XH$?1%z}UL~exv(OK>O(-VX^XnPyrz0u|{ zu~9I@0(i|qF;KYA`nUTJ!mVLK!ZYQUSO{LtT-M49LoX)7AI|IsU7d~q{;jm=H6vc< zuR>RTM)9||p2-aoL^Cp5@J`#tY zew^b_-j}8_`E^6@RQiSHSf8TI5v=qBx)IJxTtPv`_wd}M!8yq*hD@hW7 zK#g3Hg9+C4KTVXw=P6%CI*=~#j+q2p3RhgvwmmX8{iWwpLY&`Yg-Z(6xQ{xSr~1q_ zDdo{=Xaz3#%gLnQ!=F=Zo4N>r1nr;%#TrMFCRevuSj( zaV%|QjHp~ttrwZ}%$|Q%aBHBW263c)<6^N=qS5pGj)*{fsG(@*`4Jx?f11yJPJc9? 
zi0R!K>{uqRAa}48_l4Z3`3}C$$ZXw}QzpN`;`#^c@1f3n2c*Y{M*b0v+Efrs_5WzQQhKWz)=Sw#X; z0k;DmT@xkgt}N`itc-kgxth7=PXz=&NlZc*V$Y^8g`K7n#NE{5{~ zy@mnNaL?Zx{`V31vr54bXbTAuTJrD&;jmCi5DLE{(2461STJ&=%>w6L^Nqu;hHLRl zut!t27A`d)F;l;|j8Bm1TL3w%sQCx6SgZFUMgagI$Qy^?55m>-dknQ}zd#wp>{o2NX z&wgbiFX(8ihW>J^K)m#&l_cOBU{{i`!xjY8FF~N>>OPKFxu8*k7bC7379fjW1RXnH zvYw~66H_{47y(6$J{P79*-Meq+%b5ps@KB-ncsW+ekzTR?^Zz9Vu zwxRfJ=io|}p|x>a_i7#TzxH(3>KW&aD0Topu!7m3HfjdUPJokAL-^W0d${_~ns-7* z+!;t%FIacL-HgFq;l>wlx|zCi9Vct>(w6L(hUO zOM`}c*KVla1o+Q(Sa6*{P!0@--(|{25{I1D_l_&ZUZ}5Co})@Bf##mPzPnu4lr1o3 z+GLd5HQrX#GUrufT?lNmOx)v_?6>q9;nPFG{OkDiNI`f+s{{J3Rc$_8oDw-fkc+DX zGlr3u1#jO0%|b0QT$PFI#b+a3_X!jMs)$Q~aAeX|?r1%xkR@Ys3x zwOVH|udfhHn|^kYf+`n~@d5uyY%R{!zwh|+_`JN*E#>w( zhI5nNdAYY046ayzq_#+j9JOukvYMf@V4IX+`^&B@&TCzV3BrXM5_S@VJyT36j7^Wp z=)s(|V?|JFeK8daFd#lBaa>J$6F4?J5LEb)8&K6|kMJxVuGGP5&CD>DU}Io_h^w{edn=DOF(y zVtXix>D;A}*3XR=zSAsBc=9Z4%VLW|Hb#ewPBZF}-T!Nqcz`@$vW)@cQ0^8wX8j}@ z?lc@{wvA#^bIuV{Y64@U8=BPdO=kqEBboTLXCgG-?hNQt)ymS(Y}gv}2g~9jLgpXx zbbzxWqX$!NBKjYZHh04s<#ufIYtMk`6NLORS`~lc7PAz18kEvH4{&HHt?BgH_8jgy zn1^J%gQ~PDkjupS5|Frah5{3j;kDiQU<_z`B6;=`XG@q%2`F>B4%Y3GPiYr~PpuR2 z=V@J17*Amih9mdXgUyQ_xH<*q7dnco)u(u$y&X5c8(fq@emh>oyVdEgngozS=Fm&N zL2U|^!CGjwv9S&H-?yW6LBpvYf$F$ezA`^nvfpq}-!X`VEn|Z`4udDcSKFZmc_PHorjw(>?bL825<*o{lXZ>w&S>}GMxR2z{-&zwht%u(9)gDWPFegOd{ob{kM)Y4 zU!0w((VV@`eq&L?Spby0v*^=HgvodXMX&k}Npl%@=A-iUYxoa!o>R6HFx5E!XEtjK z^xx47;j4{B2lf`9DjRS_;avTl)%L)}Prj`mA@BqvZ^;xUe~zc~CpA{mLFd^WOu-6s zJe}TVVNT>~UFXodhw!OgXF70(dD-l=x7F$v8+)$NfdgpIp?9p5=}*s>mmJtn8e4tJ zzP&)RiTE5cH`!#mRePGXV6|Y;AE}M|u>>y-EbuidxU_>*{K7~VmwC&XIJl=5?zsxi zfI_yuHW-Nm2C)1q|FSAuJTU)NP!EE?t?qIM-P}mq0-%@xwSMFa~> zqCW5vd>y#rP2MBh2Q!6}s8gXICs9B9vMHZyA*|$l--;y13W9cmSw{6UVCRTmuhIKc zHG)2^Ts|*Xa07Nw??q52R*?Pz^t|A82MkPUp2>3?C)x@A`cj>@e8bq=?Os<0>h5PU zH@u}^f{p`ZoQBP=#sneRBj2%H*EQO%Gct!1G~CiNd3dM9S_%Nxch?!# zUC@gIV(6C(=gz?aB9nUrfSW*S@|w8cCN-O(7NmOCQT$!FxGor71ivB=>9(IZ!C%tX zK+QT+k4OV7`<5VtY8Ha8A@T-*h!;H-ZKk115Nw_t+YFmcF4A8CzWo@^#5Kz?66cf4 zKIa^1G=N>4K2ow)rJonKs^A=XOWkpK2~$+TefJ~{Dtw)eS`h`NTv5!oTH_^hjvd!k=Dhf7{3-wK3nk|6mWxA z9dlOeul6jS-yu){%Mey377pj00r?7`v+DABA5cRGlOo~bL-aZq8EE#&MU_CDWnH9Z zNEy6%VPd5eu5))lQ781g!;J@_MiHTz=?%hpM7gp!0TUAzN#2Dd@R|6>sMQttBG2=t zFK8dQ%R;md)^=N8id>sL{Gxb)!%>^0b=sbn%_dv!Rw;e0sml-HX`sc1FBpWHvqQ`Jcuz9kAevTUj^v*t|T*h&k%z)xU?H| zzO+O51(z{!jQo`+2<`GLAkn>#N?_9-|s0==zMOz zd!XyR;Rb28i_FR%s+YI92*2ot$|KOVgoN4b@vlax&pT=qga;lNWN@V*z69 zN+Vj;&s00f&i=TDmeNqT20%ggf^oo_c{NInG?!9ntrjKcRW?BH6$h1^F{jq_zPgB7 zw02;C3rJp_Q;}_)D!s6Ivc`SFc|@{|lB7 z_~?LTDnPIr8kb-`W8NeI)3HxjTrVax9fT3>FZ%DG-)N{L6`Emk8i0jV51bSK2|{f@ zZQQE{43v5xIZ^&~{&e|FgE5nJ$-Bi`BC)wcxI{G{P*16j%*Q5kgF#hnL6dG&<4JoC z67a}sU`epq$mxSwVGGxH3M|f#g(7Per{Ui2>ZrcVn?2ogjG`$YwFc|``LES{_RO7G z+#eGAR;+9dJsg|~;tSw{5>Zt%Y+fbrE}F#mszrkObR`qJEn+uktG)O{+WFL!?6RngJ2^vUzSId{)Ee!BA>B&&~ziwpSTp58^P)Hk8(Y!^Apxl2zqpx${Z9zRsu6~kcP$4n_zT_Iwm72)Ds_o-6_F8pumTegqv*uc}zbmE-{XW^ifmo z8}5cOWT?pe6Ioz3yhr3`FO|$xvg_-v^wtD|_x;~$Q`USX6YM@$q#2%r0CQW{)9TO{6U(K) zp~!Q6o1(&`lgDc!U7l{6e!@dKEH>=DnpYm%f?Lif_^d;{IDF7l2vUv_ei59L{9i|;!uZ-V92J)R>77440N8x5AT_%f15lv^hMI8L-4tb6@DE8 zL*Eq+Ny}`AMG#|uQe-PB^g`%mSC74iOGguz(!0;_c5!&Xd79qH{+7R#q<=X!tq~ud z>M~}IGMB?4Im=IJWLjjCKOUp^5P8vC?7?3@p-@iK)?=tvIAF;Zygm z(m3w;iYDRdLqmr`@Uh1L3$V!tbE7&W!F6_LI#rLp6>R7Ln=H*4#l{Ljo~#_y%g_Um zj*sg6&l%HHL?@y4`Dlyq84_sLU}Jne9|7$}zAIyDO9RytXTeEsi?2{ZRr!&68|bb6 z__WMsIZ&(JV7L7BdEYBN;!eII_@JC;wEYADo;^WWff8qNlAc8t#mnTXVyXJ3A=NL+ zB59oz9-O;!9Yg8K7L*)^!YqikL7;6q$;zw4RM~F)O@bA;4-{;Uop9b1O7%O1I$M{@sq)8HZ*{ zK7n;voP9)e<31jUD*R5Oydic(ZI!+v%HK&=15A9Jft!ab+vW=z8>=qchi@ul%#SEz 
zYsVN-I>4k|wM;haTZP43(zmXFG}y`eKwo>jj)&LvT#0o~;bV%iDKHK6vpxh#n+n0* z2?E)d)I#q=_>RXxVrou`ud-eE`}Yjn!uDr5uMOSZJ=`VQ)#N_}&)PoC@w}_D({9Na zO-WQ=Iq%#kc}8S!oyXEcSb}nhVp!m)=#E4Bw>8ahMJi`hAh!i1pDlq2e4l}T7-rRy zc*+&4%u9EmdhGMZ_a8Em42W0UH2jTh8Q}X`AqWFT_!!^t5bVH8 zgiR+UDF;GM@;Yw@*eYk$=QesCwaj^xLA%d$F*T*>e3)g^S6ZLiTkDpj6MhOhsOpFM za?+YvIdZ(e zm)fB{4YLBb=>GN{|A?oKASRnQrld#t1n2H(9)j(uQd*-*FBq-vv+xu&04&PLV(0{E zvPjXfOybz`)%4P&>JuH3fHV4rfk<<=(< zr@(vO`A*{oV)z-2B66-?qPaqnsMyGIzKrv3 z$=ZQ8X^~dS?Eihk^pI zjAy*TMVKAqNegztx$`)%!_J*zTe#Bo_--5KV9E2uhSbt0uo=SH0#a+OvROt)#Kw`? z?;8+S3eXeNaQg_zu?X#iDX}g69vpCE_0hQGAms4vn!;5k^4mz!w3tNz_niMlpZr&# z{*SNkRk)EH@Py!%FpAx9Q6F!EhsEfo^mRGFG=+hE`#w8!_E$gp<5&EV|1IwL0QmZf zL#zM@N1bf8+Ly_hDv?feTDOd#+}t@Ob{wFSXAj)@r(6#IEZLOHRmLtRB0c^<7)ylw z?Wu^6zdg>hfHpWq*l8sxP}KsRRx9d(EC^ez9?Z=9SF%k%d{%Nb#|-;6#caISr9#qr z2%ZE)(M6bvDau|n@<6=v53p4H-kqtw9{_ot=e3}{T@(P>+1^jQKw<)mupxd&Z1zBM z{}&**C>kdKSaiD^t9~C|5HmrjaGiprY{yps5D6YT=ogDK7p37bGO&juhDY+e?8_h3UzMb#e_`1)4 zcoj;E#pLe;`5zTr(O<;Ei0Hm2-H42j$&(#5%xlLh3|*u1*PYnY%0F=#E844_Km96? z6{R|)Mg~wzZpeGQgy|>`wZwtQCnBX5&V3Fv1i2PzA@?4gAf$=)$*@ZlTO>nL1jokn z&8W6%)ZH&DPn;cmDIy>L@ZnID!AFO)#|})DB9~;24?!;&y%uvs#PTF3DGQ2gh}e+} zP?#ASZkR_LcJ+W!bD_a_kEcWITX5Q3h_jbidnT8d>Z7ngUy`%{cCz29MXM(|;vbRM zFDD43wp;|R!%9a?)6O-yUGFAx@{yM`fW-j`Xx2LnvOq2@Yyq1(#b+a@9r33So#t83 zqdFqJJZ<-^bE)NCp|+_;vd`5%A7KE}_T!>Yzu3rM_6Pj6*^J;XAXy>yzLC7Z^K@ll zj#RbJDvC&2ReVSCmDO(>*3Q*+t7USs;ls~7O$(T^4j&Iyt&58Z0*B}3>CLC$Ws@k< zg0`p#%#qSWnXW$-P({K|8p2*4bT$^OVPNo*Pc*>-lJ6V$$h5dk2^H8nf!RmQAm1)b zwx>5uKoJ8n&9P`SW)$HI(2d;GPk2Xx_mW{Ra(L;94K(5DIJ2+PSw9+V%*4ygsHjnYVjS_n5E$gfp zi;f?@>0r3>)w&az{;$oCeS^e=sp2yhfX`_<@l45ZGfT>OZFrJd;)if}s`*l)Uoc?= zL4#p@>X``w@bN(nTml(bc9&sw9`FMFV)1;KvzOaKMeQZ$Z)+(k%^x}YU)I1LwX1a=}q5P}HH z)Ht``{Kdyd$IUQhs1CS0tdqzc{s;U6-{fjqiSjZ1%4Kd$#j@REU9P{C!Vta=&jn?7 zS0U~KsBby2f}5$>jWSifCB5Y?Z!It>?+{^|mBn{$Rs_7}IM_Q?js1(Px8Yq0+8rZ- zKApB2p?<2Eh?^)$Tok8cl#{)XB z1t!m=|M-W+`RH>P3_V#rrtFr|Cw8t2mNm7_3ryoL zU!D|tE@f&FD7x!B*Wr3>b$2MX7EXeHMJy&=ib?=8xn79dR|f5 z$L)PEUN2o044CxfO--w^^x_U$?Mj%$JCn7lHm>LtaF0RF$PsGTY>Xnft*gXUYH+XO zEhl@>vUwro#(sp%ub<#RLbrnN-H)1!}7qZ63aK{?O zs2j=EUAyR2X7t}n3;+1u|0;{*&zzEk5{?)aTm$lL=iG}tMR6k+xF}QmUv^eQlm&q-z7qg29{z63JnQ}ILU?Mn`LM~{^)Lv484>aiDA04#gziLriZnzdHm{eAb=#&>4CUr z3hKsQMP{VE$o`(1*Y!`fUL!eYKAps41OKDF*~U&`I@% ze1>+iwRGZ>g&y|Xm&>>JwKd;gb=d2gWnbT_!~1uJ$jwR&zI$pX0k)#ZfnyCOy#;0D zT#Fp17G#@lza3EGa;CX&PrKQ%)i$O?!C&$G7c!>uOb(v`Tz)^b@TeI~gad<57M6+i(t7py zFy%eM*+aYRO)a9;NfwY>;F(U=@fk|hm30Sedvm^3i9ODoaebPryqVsKN zAkNE(G6FWtco8T7)?dxDQ5>_)2lH!tx1HX{i{qHi)Ob1>)O}x#9A|c5rfLTprVF6q z=)opcDyG(~gcAiScv|AERgW+6rXUwf!;P-Dl0pI(u;N)-xUn-nO26(N7dv`U8WZ=i z$u?yL+bL!+#!Gf_k&Tv2TFcyl-bcQY^XZ=|Mxc0L?v$pl66-Z=-it7lxLBeCDD}KR zOY2L%iKJ>#XD#fn5*_zNtC1bdVMVaTFM<>W0lG`xm1P4uyoeIgMHXjIoLMhOCYUkI zONQXb)?0Pllpe1dKZZQi)tw-Cp29oY?ekjFPq*D7N8;BgiJpdq+a7^?mzCRdu1K(K zuz)|t9B`#FD5_(nv((>fL`CrRo}uv|wvt(Qf6e2;hTYY&Wfa-a$oW9l0Gs9-^|xo3 z6NChkOJ1^(M_Z_%6gkus# zda2Bzkmve!Ss&c#of`O5m-C;|^DUZJ4o>Y~$1F{X$Lt$iVU^Fz^QkW-vYw17bck7R zc+yEh_g1-8MT{>W4+dKP;^p77?!xyLcbRVW-Ys^s`|EPjp&5qwc0KC%oR6^#gE@Z$ zwfMci`R_qAR5%b8;BGeFXGgc&!-mIZcGE`cv+e#IVDO6=^E-q6bs4GMGdl|ygi7WN zJe??*+$mc9B@puKpal|))Y5PZdEscX?PS5SG8g{-Kc{HY;UL;fOp$xO`C~SR)-w5y zv$kZ=aL;|^y(!oGOPsvq4A(i)Yfdif!$4Vvz!|bRDLdvQEY$bFNUsZLtz)q(6 ztX#3p{qrVLj}*fUcdUKOBTXuV1R(U2{`kHRV!;glY&RjJqF{J zHUd|ROdKtYmaIV{5p*#Lnt{w=xOnM2uqWA6OnDDC584vMFVp4db`@m%MdLOBd+2^YH@Rrr5Z7%BliHgAo z8djl zmneqrx1FIcml=?8n^JdX)u+3U;oC_oKN@>apXql&|Mp)Fi4cSZp7+f#DMrgBUq-<; zYDbJEe3<|{oc#rVsqzVn!9q|zd5TlYQ|>#YxwW-HA?~sDr+$2B@o{L=EPA&$5{00v 
z$N>*tab+(0?6(NXbm2S6@#9X|yEB+2P=v3=lp3DbJ(;u2kx}B}qAIV!I?W1Jn-7b_ zhe5`HeGQpCtVLc6&+rWXI3hE@VLIJe16_Utj~LX#q>2lo^Pp7n1@${S{W|u}gGi~0 zo2VEp%~S4`ZbRdE6m*Q!a&*Y9C*R3yk`+7q)j_b63TyBKV~!kJ0i8_?0R$H(9G%KX zbWFM7uTvt9;)c>FGo)(q<(9yfB}jze!&s>N$>VI7bCvy$qeaNM`v+RDpFMm%V$q@M zVE)t0$EDQ({)qXphVl3Q7rWI(8{B;o{)GEwYiAYjFelDBxobn6K(+GEb))>+X8*%f z(0s-U#{P;IP#7|-(kWWe`fSC9?(ILhkzXp1Kl8ZY*Jppm*%U0gKd(~+Wba91<`x%4 z6l79M`CvZAZZXoY6ng&NBK_KXVz)h|@Xh0Cg6|lXVwks4X)AYM8%1J{kZ~#IUoJcq zv9l8D*ImWm>DQjBo?>z}((Gile#4+7?gwUI?EgaOe{%)~6XSt8nYtw)J{|r(Y`fqI$c-I0^_7JJ?UvW*ZeQ2he2msh zt7o2W^inupb^XI2R&tlOtk=PS>1YkeyPvA{3XEPaHTm&5WNk@SH9?b(?Ic`26Y`A<~5QNjWYFhx6_!4jEaX<6f z3jx<|fAZrPTXcNJubqP*U;8!O0I7S7wPg3GTBSRn3Tlc@;~6x4e7WnBBs(nia`D7UB&(2m0F19l3dj zcJ4mus4qw8NbTDmdgT5#kON?lF=@~#fGrlpyC>|eS#$>u_Ul>Cr}TFXbxenuhxz~^ zYaa;&Ik___nhCEwV|j4ZW95FAc`J6!Jsa#!L|c7DhUt$Gb}~Fku80L5hh&_9yC^fW z_TRqY5O%+7zoI;MbqJNd9S8DmkA$A_nJkFWUCtG^Oo-QtBcF?Yn3}F|&_BF+e&6s0 z`a6yHW~wdIJkz^m!?ZxVN-@X`K>P60k}CE7_dGi@jMfI9dFr9&G-54&&GP1!w$+pD z%c?J&)*!_fZF^k9=v3HTZX|8+YZc{RzUGxyP&zi*24=d`jl#2fS{yOI1f(Z}`RK3k zMfF;!W8c^c$M!^&F@Jt?L*y=7a!fh_v~cwHg#Lt4`~~T}FJ^h#J^6}GfZ!eR&s_9B z@VVP-@}tN0|I|Vyf?41IdmKz>^}Yk}mezN$#8Znu4NENW{umbH>AyXrTr=H4z9%?B zQ_it4TUeaP6-9!O&h|w>M*F83?l#4}Y{8XRQi$LVN1j6RT@NsbozN%*c&$&wXWR#w-OQDCI zl5DUOW5?`}yE-}~<@Xw&9LaSa`Tw7uF7fnHG^I*}OULHQ zO1beB&>G#-s~#Q(m}!)TKy|E05b}9BXuTnpb-PKHQhP`hk$uIkJX)Puky{j87JzGa zyNpE(sO|Gu$pIH0_q*Q(*q86tXaFR9+D^aSOuGiuF|`=YA#q+_CN&G6^iJstxU!W# z#N`%*sfVQmt+~&k;PD+h zZSD#hsx)v>S3UEjkIm^1Nblyf>F}^N_IZvm;d4$H7Z|i9S%Mq7GnzXWew$o9k`|$J z>@YcLaM}dnzUKkj<)+a$GL%(;j=k2GpG4ElQcN@+U_Resal<~Z6pEy-n}5EJm0l`h zx;sq9UYc??ta39bQXQq?Q}-xJaW|{IU$<{ekc6pnEUehm&s+4%V6#59sfXn(wr(A$ z?HZu%q^dA_#w`ZPbJZPMa#^aqb#;tc0by)IRDPg>+arY;bV{0+P=}(qDME*+$g$2 zwM%xam6whJymtn~R|Q_`?aF=|WjbO_(mu+l-2C+@)Cj(@Dbf6ZCjk?NzRJ7@aipy&0kX7?5^7UR&H9q~opTnoX% z`wzy^J;<1&31%?TLLc8~3Smk_kl#NmB4A3^E3=j1g`S{*XUa`r5CGtd89hMh+d}L( zg}t(2q5Q145T|*Tw8JaTy}CAY>Y~>4LYRPXMVY|{6NKwG09Kv12bc{iDR9i7ad_6a`jETi?wz~Sz(T(=~_;g6rV_Xv_@&c6mC z{qP!epV}aXi;MA3Vk9)7l^({Dgk9-@lyAyLf*f;_IXQQwQo=56UO9CMJX5R! z(S+DQv(2<~W^v3)gOVET^qvxDIm`#hHh1MqO~Eykk8g2x#IqbhNBRsHSUrw|G8SFj z0t;?zgol}j4Tv@6oL$doV-h>`Ua>Gs0MA@f2Bkh=Q~){wkOdjL+eQR08n#F~Rxkxb z0SsXB+KHSC-88@<3etd}t}CMJ#%{6OU}2=5!(gNU28?Y8Afdm%PdEjj6%_W^dOOS& z`3!&cp&M|XDERXu-~D;GMGUAj2GRc1lg~hwWgpA};A~6WpgJxwgVVE@%8m^1u~T;B zPf1Mew+(4ldw9j|n%hkyF+fLpYTFwyX%5caqpKFROb~``Dbz~6uNT(BKan10iBB!D z%P|b(Z5Y=WG94)6dInRIi{niA$W;`$lapfkN&|gLUVy^GOs&$ktiNyRK}U-0#`O~f zmP$2XAvwcefg-#eXxbUJ-dWQ#F(0Sbn?Eoq_Hd<<#<~;?uAEzHubM&yB9W_~MFJO( z^o6lqo)X(LGS4rQ*Wjhs&w>VGqy_6JHMAs|ORzKbk``;G&z5;joaNVd0vDVuNRDNK($ZtZ zpgUEm*QFbpizCS+7vc8uTfmE1m>>1#LFXgH6KCq?G_wANz7e2m9>wndE^l+7eAM$JfRhH~4bm8yzoQi4{MXsSPJicZ{ zBO@=RO?5Ej1}N{A!IDKUA6aw3fOv~To(2{x<&WwWgCp>OV1Kcx! 
zr%;r+h#}*v@qnOwpV0rU*s*y)zweMcW5&fsm6s?!J$!dg^R@{3dIGOOS3fz$ReJ&% zH(3dq8(87j|z<0m8ZxzrfKh<49nW>@>n@g> z(viQpc)GUa$yb4(A8*3tO(TzDXxLCZD6Q0OX6w2Uw%Gm_Frs&*$L zP|29w_u3*X!m>-hp5Zt$-PFbD=MrTg2ebs6XXWu~p2Yv~P?ieWLxeXQEc4?GZeENg zJ6r23S!onFPnN9MN=MBik2qzhf?G@YP_TyP#%7{x9WMo_lj6^ zHX2zhJjr^0Yd=;SX`mdPVp*}hb^`uq>FZP;bGhm)>4g*Hv`M}LNgX~+BlnDwCrJ%} zSNZ;jpk7PtL!gwrodOL!?*tg6^xT_iYj@=2CXh0aEdM2k_8TWyHr1~x?FaD3Uz9Yq z{Z>h%hrspGL%`AAnQ!L&*ItW%JC&>cdtDTky$S#?6dLBa!Gji%2;f$q%Uy(!!0UYP zfPMG_pY^AH$JbZ}0In^5_DF>Y61b$}_TS*ifBVste=VIs!jP0efH;t90bU$H>AyvjMIjKy+5Ee`~)E24lI?RTUyFh(J|sx1{_IB7DY$)CG1V8TYR z&Esm6UFQ8* z%4$UU`K&84di8Ku)LUc>a{{ENT{skj`W|A-Q>tz^(?oyB?aL?DYGu}-W_lcN<-Szc zTyVKBs>)GP;+wy=6mh?IWh-O3l54>Exu=gi&cMep?P)#0$KZtMV2|iC4U#exoW$Nn zv7T%zbh0hD9(6ys;>LE<>8F8fI7u|Dj5oF)doPa=T(Po@VBD-Wrl}^RJGtS>q{@wR zC)TFj$#|)(nj_AR2AXv{fR42k?-`j!fW+f0?j8bHRC&(45v_zOle9Dq^u*3*C3q7! zyPZT3p$F{hEbe9SjomoK=$c3IQ@Fjn)V`rxCv?veeRf)@9;|D0U%#w%O`U+)G7jgz zGJ*eAKNOIsPc4Y%lNK4pKT*xs$PY!>`@9h2L7?pUQGvSX{FXEZ*mKlrc_-{#pZibL zi_h?8G~?lb|a?VB{tXijU&X!2w7V zr;#JYJ%`>1UxQb9cRA=Z#_uxe9T|p3B{~C*K0hR;_!(%*#Vk=^S6pE~^FS|S6Z^Kc z#rx@wswtnJjl6ncfbXloVSKg|YjXQFY(m-Esnv7N9+oF8uttV&bfbfqM9aoevxK`W}T zJ^PLihjPZl?wk3_bxV1`51jF?HL^}S#oJ|tk_!C}Nj~Qdj!GxR=Dt3X>F`M58|C`D z%B`{)b@ZLp9g+GDlqxdgtZJ|jeX3)slHzsDFXK#spSszmin<{speiYy;@6Jd5c+)F z=`3Nopn^Wd@Fut(70)U$VnE5N?A;nwpNwd(;aaz)REJ06_Q3;pfZ=YQ>w_2uq;Qt* zYW7?@ck8#|>E}!rLcFtxp2y`_v{8=^@u4`UlXC^P`YrDjWZ$%oyR*ocm65ihK!HA3 zdOs+PgxZhFLO!T*FR}O6R4B4d>7}Y-UCI`YUE)6)-gxg&D-8&!e?z16%f^3g2K_Q229=$lL6YQEI31y4zPTcLqx-+!WtdHlZrsq%IL zBpf?H6&%?CVk9rtvFOuY-4Z#O7M3R_Ln>&9%^7&d$M=-kO(1A#VqT+);2?&MNm86C zw)f1s5!|a#%@3gT8_o}FKP=QFy=VWxw;o7G51zma3~@0g9ks55hO8xQq=6~z9mY~* z*r44n=(}GvoYya}2@XKmYp`IL1dIdF2shxcENcfd|Z`KNv3d#TwiJouC|~c z_`#P?mvW;YG^L(w0mj7<-`%^5mrZ z(}Q_GUZSIN`(2JXnyYIF6`W8#V5^dxy6QX1LB4wfzks%U)Z66=2v5o#TSef?s)7Tc z!F#2%pQ8s?gMgD3$M`|$f08X}#?*~^zSsIJkRCd}JM~h?%%`h-yscv9(fF%oH>ck6 zao)b7r@)ExAYQfl7zhdl>fP6W;TKHv>5}VPePcY#8j=zXx=SyELUe6PX~gBG^2} zFz721?K`*Lcb7l2vwr?u>E46WTLl16{9>NFA(6$;)&~psjrWMF z#!IkDJHVdq?ALh~M%wNS8Z)_&VR@63+Yg4yUv(lLSIT zkRZf-AUn8n)f1Xg?IEq=v&HtD6jcBd z8+Hs@UK%mvL~$Q34Nr3f^DI1*`u-pS`fC&OaRY-T_^JkYH<1StGc?ISuHy!`77rN2 zdByIc3V7Xfq#D%@nK~114c2=$oH=~;meV?m<8WvBCp1SF2%;yFX=%wITe#oIQe~PL zqT_cbe~Zf@@!aFZ$x59W>>qDS%bPt`c_>TlHM%jT;9PF@aPT+~0eiv@IGQibK3#c~ zVKbR%tvxnROd8%7h}O72#DPkYjXzOGUd8DwYe^;D1&9KaUz<8%#JHlJpV_e&HA~iI>G!DdRHNo+osMRTpL8{R3)p{N(#bA= zJNgT({@)7NqAdO+*I6cV5-}swUP1Zwb=uS}0*0X)fYZD4>U#2qN6J>m$6sIfIvboN zU-A9MrGd0r6lU)1fHQOw0Sj@RB%>M$UkBZw@2JYc&$F->fduKR=!4i6vNlsh$|P1g zM)MF@Ue&J*&7e&fp`*L+7hCyhqrU;ns+t}bjE9iM62(XyRrQP!kAHSuCcxa1!E!C&_tOx<#5h+( zoG2~DTgt^+TbkX>`*G$+GaFbW{${4_D0wxgl8P$`8JZRs*E@Gr=HYET-~(Fv-%D-%txI_>DAI^CI7 zlO|WA1&>{-CzkQUS1$#$kiR$~YXD5S0>>+WX)#bfkIEI7M^+#9ka1n}`HowLb?vjG z_N9!0`CXPn2LY+G>%a$Hdms$Vtcj02FZxD%7Rn`Y<(ir9)ref!X#JA(a7b46mQtA}zuc|En&m);5+(B99 z)yUUJ8aNVUtPS8#mNf>@QBJY&wyd zH+Eg6ta5_of0KA#jpu_2#DB)-4p%jG>d_1z!1m~S1p;RnKk#@pxN()FZRXA%jX>lLZ#H@JyKB5LQefD` zJilOla@uJ|`uoNzv5_Q!a(n}LZt2_J zuDUzf^`?*Q8=G6uRVU7ada5j`VaF`1u38MZTZ))yN05!U~=I z423nlOZ;?LH$|OzjWj$g^Td-h=7g*cx`tt)88>`(b;TiZ7cn!JITvxV#q157!)!ud zTLJajz24Em9KOl?@JyTK70mfl#M*Mlvf((ijtAkOK()Tw>4bN?z^eUAT+;`F~4zyId*8FFx$1B*Hb zPJ^A=NX#%+w5T=fFX|6xP@s7KpCn(i98ld=g1=)2&HwG9IG;!jae5he;3aVj#Xg|6 z87cuA0s#86llR}5%Joe6xy@guaU}tt`Il*2s5#$XWmXWN*d!ttIbI4heiG(w|C+%uUrMe1JL>*VtvMfK z10w7W6oJtZScZXXXhE_>A;*jwxH73CwA=n&r00WkgLsML2^6%0Gy>HoOMJz-_4(;l zChy1#{E0f%o|TeYmS1#4VSo3R6oruHqN}&JUGQ2-{zPVd1pZ?ws5uGIx=CG47~Wf8 zIEeF$$>|LIG-l6_y+NF%9JnfTY!5i(ga3RKg(&i 
z5UjjI8jEmumGM+@a&_S&f4#U3g`a^do8f$N0Qnk;tw0S&&{%6Frbew-KvmbaCw4R2 zXY3+R)z=%#dZsPh+_Q;p8c|gWCJU(2l1mvz$f`!h8<7nLOdED8N3Z6=McaAlm^Om5 z-+cEfBElcp&J9NDfRYUuwhnP+qd4ZHZ~?>>TRvveiikK&KaWNF!Kg;JRD*8NeVV7a zoDh3d{ll$iZw5*aJi42O#Py0Yz68<8LZirfM*d^V8*wR?)pqs{H!3ckv$9G%{_>dY z&KIaR0*6u0gtPx(?+Y9L1AHC}j{H9u8Fm+X*Y&II`WwUX>(9B-;1!c`Dj>(LX5D(n z{G4f(!uax1>=Ak1Y3?-Kd00Nm+I#KAw?zpalujaK?s^w+`YbmsZll%e%8?s%_a8zWZ2*_JQH?Q{45#EbvP1Qx6BQ+}P2#?I^PYwka3Cba2ex z313k}e2?uf=HjPKHKODvh2$-HXOI$_eqFfT_C^s(o!D!ZGlxCrRHY z9kM50ZMOg{5uf|P$7O}@E~|wN(YdMPQ#7bEqc17;xzO`Qz!K_v*#T$UtN-*tg8#=#aw|F#PqH6G!tH{s;8{dHYtBJ_my6A zGhau}*!MA2*gAg! zz>fSXyyWNXDb8NUIEg(WYJ|05NK$={?qS{Ae!^b0Mf{sCsNgN?q%f3Q3rZxs7HRD3 ztowJ#Z#T17TW4V!*msasB<7+vZv&tp=a)X3m%uXBTFmvD9u6ol14bV0)7*%tt8qrL zRfXfDOrjYt8BzE4>d%g>*D~wZ8PS|kxiFES8#Gm8VfdUhDPNtuwWZAXMVEKb(ReVQ z=~?7Z?|GdQYa5GY@9d+6&J@Y0v)Jg2#{SgQ#^(fKI+7E}a^ZY!<&ycHgPx+vsHx=N z=wr=n+K-$RYT^*TVp4z6m`5!&9ci5tm~I6Rp2bvUktVi1_r7&dLrD2(kT2|MG4zD^ z*-Ux*7FN*F9KSw?d7q||#q0U}*d9sfGFMRe-=P+%?Ysk3Xm-C617jDZN^M9=ym zrPkMW{`ANAlUdmHNQc~R*I+0gsu7VCnAu9aSQ;t?k%e*>_CEEj+DfP@A$J(o@x3SK zQ}5~=U3I1_&s@KJJy!-~ATRmxwx^_UJi2Z@H1reIs5gMT0}~n+$mOnI5sKlFQ|Iay z_*!0*Mh-N!Yyf8YN!1^Cg1<&fWxq3I&kJ~g|Br|4!T&GM*aLwa%)*e2|8wTvU;g`< zi1B}*qDw3gK6Vy>Dy~C=JJ5`CRE}I6plNBf!;WTLb6}_P(9@3zY$aD70np7-lN`zU zJ5fKs{^_K>tPA1Wv2UpnhyXCmAVI&spv*B1 z9WQ@`H%FejG9$q!F(+^XIU*&_4e?e*gyq2&1>siZ4%}staX^OSf6U(N-hb+j`=BK| zxnn!g$4i=z(=fSCmzXAulEXZzW`CJP39uA7N8w6MFpYewtNJ{=4rvAi##=Mt<)uLF>_^veYKS|VZXA{p6nMKaBNd*%7OUw3gZ30@+!64aoIemTH#P2&%HF^lV3N3sF{OymNTdBGR}oJf{Nkdm~m@QVM&HZXtFP+1y5*De`4k>`O{-wt=57%m;5M zkPAiTM_>~u=aY@mcscDcpn_gi@LSb1M@ltK!HsgZ>F2WS7#lauYHwJm3akj*c#+b6 z$?R<#;(XUx$r=JQj*#QVEcVKrQi}jWXN$zWi8% zSMeE&>gS!^<}Ul`1H2@)e8qs=AvsSwEHoES8qj?)cZNgtoFKY z)U?6#gY5JAkME_CHbe%e$7rfG;&&jg(~-GZXlb5tBW~Tv)O#{%x(|+53)hDtf;T2U zX~?=I$8fwr7;~GKLMl^cf;jUg<<+$oox|^XIt1X@HASK+k5w;ii;zm>y=wC1VwDzq zk34N_V0I~Ydb*5LqQ4II$xg-ZI%J(3)_}xT3Z$rPMy(ZY%U_IITs=hH01b${{6txQ zD_`EqI|@1^l$$EH+j4exgC=z+a=zWVG=NB$lyvN}dXcbEj$iA^67xpZo=|qil}6=r zl2a}Q@~bJGMxH$t7uFRcW(aJ0LWTapJukOcwJ%MdI ztQX2TyDH;zjHz`t!F@APcO@~33uafi&Uw#?bP*0WKRalJ7J6^hIapK#3|7Ta3VqM% z<;!aEM7Yvw&S6fq+Xd(W>vo((okD?woY!2-L$@)6(74bH4*E^ILNzW z`{UGNGr9|OHw0M{FY>~4_>m%emKIyJkwF%sVeF&;+zQY%OU10co|2CHd z+Rg%iG;`6i&}j{TN%%?tOrmmF3eN|wa80DHRGR+F=-N-{2+x0xoBey2#KeNwT+cVP z4gO^o`LBmMqI~~HB%}X-Tc;7j3@CCX5?S`VQrN+@(^ijwtZt;cvX3B2-tsl9Ud5cm z2^U)yJ8NS1L~`RNR6U29m)@y4E&I%3PwHmAxk<7vUJI^N0(nOwF^?R)8;51K(untU zj%3=*xTT#)sTLpj)hhMV1)zJDeNGmB7+$o;ziZR)37f zG+#QSeJ(+XyrYVR|$ChDz#^4h$o zh$Lsm=Zq7O!8+YB6{*BGiVdPN^oVQ)*Qy@JLtfQT?(n_dA=M(+JZ9R?sIL!7_bbWg z)am12c*6!QIE@s&=OgA1FLZX1;`qz+fE%hKdzgvwqlEXYRG!&4kE~fTdExJJTnykY z^!o{EVO*B^4l9Ixx^NV`f{O>P7jS`Skr4lZ1;Tx<(;J+A+lvMX8|juCGAiBk^I=Y| z1dphq5ukQ}JfzoaPM9ngISJIfct*&Ya$x9R0?YErI~(>#Gfs#+Fk%c>&wO-I<(}Ch z8QoP!^+|1e9jUFY^V+I0RHGO@T^s)ZzD|CNsZa)lCbIs|@BHfnC|5YuDKqPlHKHr=Qfw19 zu2wXGdvA^-(?#A4C}m$=!7OzLKsd;x&jcDVNCF`k4!;qo(j@mkp{it!kPpE707V43 zPUnuzLddH38DJce?=(H8CAEGKn~WA66qFC2|B0HzjDUd}j_axUnMD&Xts&lY1H^LV zQtnw71)^!`KT)p-@NU&_=F%3y7=hf87oZWC_iV>6eQ-nQ?%hosJ=R%vh)b&+x(%Jm zoq3SBh?zF#?14H!QJ=i5cNtN+#?oRFd2mQ;Y1F-=l~Dom8M`j?@x%=GV0xYK{)lfa zf{0dy;HLsu5~p&L9z2(^9G`rPGV)zwfuZ@v5Bhw8+Y}zR8(L#pxIp}lGgR7nlH}a| z%zSQnCT93FZzo!C9T-+YLe{w9a=nRtlTWlNUD83J2A2K~kH_#1*jYpny^(c9>!{a3ppP>24qvRIyZI~9Sbp?%Lk6lFXDXGX0@*+jcpn2)o zP;}mzccUy|=X@S@BqaQQ#Q%zF#I@u}FSYs?j8XpY>v%0B2r{)E(#?Ukt$s(0~g z+MwZzpwthbU*vFL`RfV?9)Sesk?r6hFI#fZ?KZB-3hXHM6lQQ^zGJJ6GtWykhS5fy z#|(gw&pqkh2LA+PFWDu---lz#J!<>q^R?^syxe6_3Vf~Q=@wFd7^|pEgP4!5krL2k z3cn)GfBqB-9l?6MCr+8c)r&7dz_f0gvx?FJ5(IoTScyTxI5j88|D_H{K+i6W&QETG 
zQvBrmLBc^?QwFDXJK1P3hT=1=JK?khR?CiF)%i8)%4|$HQY{f!%`#y730xduEWM1C zG-xg5+locZ$7}l^?~O?jV5KY&EnNi?I^}-ZM#4MTtdcO77K)}F)9!2e)zmC}d#eeIR zRsvD+Z}1om{7qMzmo~p0B=K)Q*7CnCv&{MadSqxCUpp{me|;^#KMr)M-T7~8G5_3+ zWh_|B(_SCkNP6GI-W@bP2Aq@y+ux4S!~(h6y|e7vSyxHi9;6FtWlUe(h3)=ULhSgM z8l$Q1{sjNABa+THA8#Oxg2F%wJ*TgU)!vKtuA=Ga&$BN<%*lTEw5hro+HzsZuWnPOS zn0w`mO6ZQCcmo4nZBQ8k-qe4wMt2L_jj^H9vm~i-PtSJ;x8BALo>!fS2Qoy=DoIZ( z*29$_&lutjH#rEA@1ZN1`~I*n(1Tc%Du+K!Cg30}W}a6T%ox13}F9&!+_+&n7- z6;knEk=5Sy*jz`q?_?fRc%B=37c!erxsqFW>N@KDk-I2C)GX@KGIOT|mR<5xiIV)~dtU`u zwm`rmJo0hgKV&MI-V}2?`{J(dIbJsT8ngy!BMS~O&Sn*Ko8)izQ=b({?sdLI@cxd= zF0a$RYd9^_PZIW?qG4;|ZwsA2^@5#P?Uk`rT+@4d*yJ9at8#A)*A{J#3PpxP{XgTe z2$lNwT0F$DQ;oNW8$Him5F2>uBA=8h)v(wr-E=vS*zTq} zZ^DIWXvy06wajxDp6=w0V|w8O$r{Y28j%QHh**o?RTO7VQ45LDrBM|kJzprD)8up9 z%0dYuDG-fFekV>IFK2x1^?t2Mtdkx&F?Vo7%>7#?z597>O`qV9+~c>19PCq@oe1-FIS z$W>#L`iLQr`T^^cT@$REr`<|&m1uK(E#zX3GRJD{c5(hCW=WvIrZ%8=1L40`tU#~A zYqB%?-nj`!r+?QMq;CxJnj~u$pdFDs@)eN~3F-#2o$Ctr3&ozTqfwS0SAU>7JlRN0 z51+xpbqrSCb=$Zq$|xvpa;N;=1Y<1Ol1#;g>wccA4(;5(aj)2+x}Ex)Bb9*Q%|iEy zm$QxNT9F9~2l|PsK8SBFnGYC$KAo2_nrPa>;!)pO?_U>y6d#w{-^8GdQn(u+MDPeO z(5t+Sj`I@n8XtH>@kp21az_mBeK4`7652>`?K!c(@J9P)mk+!PyiC{*dmpTOYJg!7 z*BS{)x4hv3nT8xmXz!MKud`2m<$#Xt+Dl+B#&gagA0SEa^^NLLbSEQ?Q%{I*e-v~& zIXzgU54bfyoE9lq&Gk}i%<LwYCMH=hdg=|5<0h#b3Zr@7N$)ru<>oFMN+p@i0X$z7CA zI5ldnNVY0+x9lwT&A9ub5%;k+VIE5Hwev}iSt4Kz^QyTSt;(ZkTb2mU{CnAYvml?D za8k9LA%)=&g}wiYTBCDUAZue~$$naapXvppq^XTcB z4$Mq|oqd{9SB<%O^{1lp7~3O1CU5e?j&L67?(Ga>M|B+}+()>r4#u5^t{9DWSav;# zX6(ew7OVEVrnxZ93#W_q1>fgqOzgsuv9MM#fpIJ;us6JQ5K#DJCjeat5I@of(LHq2$ zLDS?8c?RhPY)j(*Kqqu1zSl*EyDC}~Bt@frm2HO`psC;wT;*`yORgMlHFKs8FZ|%N zXUUCxKcRQYUw35<0%M)%L{qA5O`WS{%vjCoo=oa98pHPH@G89bTNE&%paUSc8qKQs zf^;bdTYWotZXdBBbUcGD!d3*pv`FMnlr0Ph>~Ln-sZ-k*J0(CI*JZ{69CIYRQ94#4 z=@_Cwq~SAy5|WfYc<80Q2%S>@Gx(@7C4oH*&>6TLo>j}Hax5Fal~L-o^5a;GIv z4~z|eTW@mSt?ltY^d7&5{ru{)Wnb7q$l*A6t~Ns+h+imo^R6yUe|g1RnPtZd^O=Dg zB{Jbf&Cr<$~ao6pUr<$#=$(rnXpeVs_EE($qG5# zCB|fYKS_j|-JP@_lNZYRV%&|}uWZmU5moD}4}^Qa;G2mIhk5RL8Tw?Z2gw^-y*ONb zYAmy~@PWtHhdx11!|n`}g-8}O#H!V!g!36IsEn)i_dz2(sHNO9vLK=zZfs&H!KmfXtj zLV5hbn-M~LQ-G?WIQ(&dnpemRL35m*QCOM3*FXX~Xs!SlMUZk@LShlajAvF2-T?C` zD=xyHT+i|B-FZ$N#r8zImr=Y{Y@JB$yfV|a=F>syn_dmXz>yufpRKe=Ok(yAJ=oSs z{AM_wkV(vytY&c0Y{R@Jo?>K0K)nJb4F8!C=Gr+^De>O6@$2nDZdOZq`ax1o6}((7 znDbO{;Q3jo2m5H-e~be*um4k+?b1Oqhk_5NDA#mtq6VootRMQgC|Q`SD20hUlJCz| zz7k!qBh~))_Goi^W(3CtX)Gk?D%Lsyy79u=E``wDx2;;6k{qeipA(;Z6FyS?NAz)n z94)Rf4BV?+*(Rpg4da)NtkzSxf$E%`eUT!)r%5M~bchCWMSkGXLa6Uz3o}*9ZaK6NkGtH0Tu$;*;`J~72>ROy?*!o!IA5w*?WV9Puv$K}l z3hk(+!MSe8kNF#~$PJ(^;mHM;6^nw8qY4?M$H2CU1lRYwju+Ahl14KpyG~2YK=Xt7 za(YDPcN-$xsYo+%y$fE1yM~FWDZV=Ws^#{F0e|vn?(U8t`W9oXnbD@R7xcr(rHyb* zHa_L#nu2m>V};P z5hsiyjle7+HZu)r#^xFZRcvv}%zD#rrDAx!eo;U7u_ChKxfQeC5mDKl?{j1Oz7sQh z4=T1L=ku1sl|mIt#|rQV*xFuuzi$h&C?w>}2t@4{dEKFp*RSjx7<~G{ERN|4;wkp` zL~W9(jMDkIVwE`k5m;}tt^cXxS|*f+-Bc57>SMOo?f`W_X>s29<*GhcM`xm{hv zJcAbhsyR}PcU3sExMsw}vJSdO*v}RD>|Fv+gs2W0{y=lr^VO_*j_=ZPd8tmFIHbj* zAj@+W2H=6supm;-cl8=Mgv?en3YDw^MZnZ*&bvz7gyA?Dn_tHQ&bJ|kM>^QO27Pj| zNko~dl17fjKA*j~%`|Z(#eNZ86UP;^swMPeyD|OZqwxOm#Y9VeNJLe9TDFmO zDIQS}u}SBg9?M}zvqG6o!F;Rvk@ED5%EE~XMV{zbrwFD**qzti4jPm-obscT&p|z+ zKu`HKS$i?Db{Bs8spH7I@9`C5;H7MCKMPVlUa6n#(yNbyGj5F zQ{&iUzsz&jd0C30Lv*EL5L;fnyEKRIW)`=F{9;+tNnX(Xo$iANaM=|@O>O&5RNgfM3w-O5u$ACouGrO*6Y8hCt zTmiau#6SFF|L|gi6?`D-=}n(QFUnBTe_dNSESK_UdUXE5cLaXZ|eF(`AOV6hR6tFcIdAc1cUnu2;6K9&R)0! 
z9O;6RIoL6u`#Y^Lf7k*c5X{B#5_M5PXm3SgU)+Qgtp{CKvbGbl{kE&WG(tAqmGH@Q zun3Vny=UBMfx?~x_VXUd{}u4N9rrn*h5MM;>Jv|MSKSuEH03qIt%nSh>()>mkKeWD z<3qtFZRZ4-#;I`SBXZ7tX<||u&X$>3K%Dugw9EFPpKJ|?3S>#m`YVt%Fz9m#&a_K=H9Y*#yl7k}9e+dL??td1?xH|&p>U>nrGM+@D) zqr#`xDnFe$%hT%BwzBfBKNbmC31Yq|<+sLpmrT?CIY8;(s3urvOoKWJpD=!f_YFEH zVHS=m0B&25=RARp25bjRThsX5^Am*9{#63QF77taI=#eYW}b!tKx$ZpG>RBb4>M&F zgX>f9Pr(X{_4RNtHYa$WSAtfMfeQk2q)$!3)y{7TQ_p;P24J(#J9-0ih4djdK1*%kVR-A7Z`8{qTgU|rN_q2dP)y{-LTJYIxpNTdK}9#9Qy9^T2VxG%$-x0V;vjy{9u*8`HzUi z8xhAxL(8$tRGOO8aiX)*P79vk5vm-W{5WP^7Rmet0WOkl3%Lh*o17LyTuZSZ2wZU! zprLe%)kN9+MYq=8b{&fUySonz$bwPlgy*La5TLG z2i%67rqdR;m7(d_Ky-n45DPn@&s-%+v1wDN=d@~rhYf&e52a7!@}hOy8vXBNS~o7N zAS`tdR2g-DdEbdpDzw5j_N2w{1qFUHERcsFy=<`ioYYU5E78S5sUM>Hw(*={BkWth zu3n@edAY}^T%4>PdbR;L?oOw%1hyfvIx7t{cI;x9%AM5rLi0GK@o(YjX6H!jN-sKxHSNXA>1ga6~SUX{2v;B3(VQ zxC+k*7e&g@ZIFt2%;DAow{1Y}ffIBwge|yyI0S#1cLw||J?k(+=fMfF;qk88Ha+GA z>e4vzcFU#ecbO2$y9b%OB$AEG<(b`m%=3M)w;g
  • ihth2H)Dr@0ha-;w9Ll5^kiy)BU_gJn2wO{&yH*jY z2GF9W&p8WC_VBls|D*X!V5RSr>4G^&zGZZg-j)wwQ&x+N;WojgGNwR6AGbvM^Oet` z=k*KTnJ`e7bU)iv6cv0IFRI`T@;poYeIaSrvD@EJyJt?Ip8}!U>HCV$Rk+8Vlcw|4AC>Xm3#!!_guFlakFf?`fb18!!7^8YyU7d@B-XI0%HTm zz9(vV{SB3h19JnlzhlYyo<3;tDiNb=rFwCdP;F zt}`{JJa!W|q4+)NMc8>cRHe$|@C?^&7-yBXniz@efGZ^q0$J!3YF4zj0ZTx>qE4v1 z@Dkm^(K%l*t8L?kX1FlNVtzBc+%L~~F>$3IiT{Q>4I7FNd-SXVhzoE<7JC{T0 z$5Xs75SjL1yDxKP-WIVoywsEr^fpQt3)0^AHl|tVt9=$+yWIJK@5)cv=CTT0M!OAq zRf(eaXpeKu|Axb`{%*0eU;cLs-)zF zs}9H09|~m)*D56A*kjZE9Dpj>XM} z?zI@n=059w+~rr+t%Q>wND!k*w;CdkRGEy8M*{rTYa9(;S8K_;2}SSWH)cfxIW@C0 zx|(Mq%~1C})Rm&8E%>54Rq%%J-kKAwCd)TbT5vhlw8~ywM^mY^5PdQrn~`*BSYK!U zfgibHB7Ax-nftW?cFAxxFwL!f&{7b!h~>waLg zcsj@o9#vAgk;L|8%#{)XAYs+(-%vuT`Y%5TXUk4)&NGl_2J@O$obi6gVryAAwH9v_ zb_yR{sT9?aV}nnjI+aIRMuuPBX=)2EGvtc2*0E$nN+FuO7%QkRF&$ME9+1!$-oIi{ zG6|a8>b2JEr2V`gL)br&_4sfxeIj#J)MsSHo}u6w_m^kW$oG{<5o9yyP96>gjJ&*v zJKFHRX3D|ElV_?9Frx`ErD`=6CxrQzD~Zj%-vk2dtS^qOPO}(Umlzd04 z7-iW_4#F#`Yw;ULiNZy2x;I3D`o@lvpVu4Z+Kf831BB^@ zhh$p{JP~JaHM&VUL#B6~FMaYDdcHdip%K(t-QP>F`Y>=>XH7F6pLYjpnN^ zwjU8I5|uw5x^3kPAAYnSjE3pw>iF{RLH48#ni77-K~LpxBXb=uLfZP;YemY)lD)!L zYj=K%6$=m3aR$(zR*J%Nvg=Bd2OI>#V@Q4u%G6sK z+?9z)RbfgGUJ8Cs;_tiMg4(RK;iaJR`5Pw#HaNFo2Zn(H$VO^qdh58DQ>SC4NFC}Q zc-i$Ynrdgh2!BKE)#&s%tiq#4%#b>P_^^PJuFJ$3ppU-PSfQhoZpi07ZZ zHkEFTXl(=J#2^foE;cMm1+mYg09(+<0J_LDNe2G~U!bYiTEZa~Jx$LUaUCT7t+}-I zmfC6xq6VJK1>W`iMGlx)U;{>mplgzT`AU)AJSGT_!vOh?JVz8}Qg9@i7CDgG2{?}> zcz@$O#sg>J|CJyS!veO$*9IQg1!|~djuc4zobLDp+qe8KB@e()|3B=#c{tR4-#$)~ zN+lv>nMzU#36bs7N}?tSWt*Z1*-{Z>J|dKDicl(3Aw^8~tdo6DmdHA0gzPhxF=jq9 z{oY;Q=f0oox}WQQzSncz_jCV_?{VCJbeNGjX6CcJmh*L<=WF6;W%l-_z;h{;HyJnT zq#h7W!oweyT5U4=-a3S$4nFjGftYnKSknuv{g;O?b<7lD#xY-f5RnxFZt&$;_tNRE ztU9ia3{HXDuP+%!@tIboqE1oRx!)bbB%7?!A+JQd5`RD<~B_M=RQ}@O- zY_#3J8kx3Qrs-S>!4mjs$QO;8A3c>vl+5rQ?UTM-P)bn(!*@o}-`B37W8G&Yar>Y11H z5&sRtUF+t?Q0a53RdCm(q>;Er9|7R%0UF@g;(H~WsJzF9NC}x-hc{_%;keR?+_fb8 z0z!dBhF8b*t<7g)vg#fnrvhbV(80l{q#k|hrXM`YtdV}-1x`Qc*>f$&CFdE>pwNIK zcp-L2(EJCVTCS-uPhx7bTY}ED_wcpQr+kD)7wD_mD7Q`B{i#xRWXBhZtmK=Rr z&Lee&KdOd29riZsCEFZ(xTJRzSE2BY0X!BNx(yH&Ul7MzxoeiwB7}g*C42*RTC-Iq zd#t1hkKjnLm?&0;3rQR?OP>ZeZkf{=LUZfjyuz(osD}eI$r7BnyzX|`k-+1;zrapt z!}y(0)rJB*l;ygRq|E8E4tar)R)0n6g5iM2)Qn}QhP(jbNzuKA2yoMqE_J3{WY>fi zy*V1`Af<4#bvXKYHYOXOu~8lxD5g9ZPy03p_Pg9qN%=4n-*r}vdrE@XJlhw1NEXF; zufOd=?h1@Wj6F{sg1w^;qxP58F<#IUT}bZ%h_T)Z5;^dGO09OUsJ2udn%BUq1pR+6#LB(%N8zKy=L}1PHP}$$dU6Y2VHz_*M=SU}6 zzyQgTLh+&HR&W(=G+DWv?F}6-oE8?BZ9orv86Ju(ZjOiZ?8+!zSVJGm zvV0x;O#dN)@1vyGUwAesj(jL5C_9@j6BZu|sHpXk5&d)eXFzyG&| zkl-Y=NP~a6nf^6J{i&?}vzz{Z%0lQURS3CfMdk$0zzf!vb3(vF^9)ev>gp?is^K_5 zdQKH+aIo0Jpt=&Y{=xG;pp_fp!Ua!<8d1xJVbwo)fYlHf==@X}<@o&r_{EmG6(3rq zebuTk!iWMqM3A&H3`li3lPA9M#Qt~hRGGq^!u48mD=n`Y80;=N64?^xLJh>8 z+mi$pGT*Y%%#NC_YgU~TXjk6F=r)T1G*wPu;*?*hswR*_m<03c*ST{@8pCz zGrxwuZ@3P)Vrx0%J4Q{YB8nio^wCtWi~(PF>Y(w3nUwvFN;7eRktLTOWNs3UIWZ6& z$h#el#JiQjlc^vkOe0gn$=d+JV;Hgv|J z?;9A?I%uE4P&GKuyc+^I0q@t~APpfME@ChTPvvtWj2eY|Nih&}r*dL9rXcOE6SObi zco|J`-iP``%Dfx!-PKMnfsK{xg)&rWa|A@;T-SUJg@BvHQ>lcQ0C?I8?Bmh zYmN7Mc8g^>sfmag)+kaT0jzjz^T4Pt&bSp>W9$yg#4W$1cc0d_tP_%Sj$S^%8ht6L zReQ@udtti0Jd0C|J;|yIEf@Uy#79Dx_q9n2emHhl9qN3>w9C<6D$VTsBhmtaDNxy# z(GXf|Iv>t{!k9&bH+3jCv%RggBBvnUefb9r5!Flh7$(X8gLy1(eY> z>KL{@(1wdG`#v{<_hXj#81`TCY~omY&3Vv^md8NSywS;DIIa5byr$w!%6PZTLF))_ z*1GP6FS4pNFu0~^Mg!-+;=UGNc?r>f@LamhGu;l>(l)Rx7u{BG^|rC`*kJRCLnVx} zWW!c;jx{OuB)kC#@207sKLTC?gcoc9P<2Ml{@^huo*}BxNl2s>#Qwqa~#IjbI_t* z{Z)(lcMx4zj6`!2KvLznYo)*~*$(o=m=$RB0-+yW|5BtEz_&rQO@)=*H3i(ofsSEL zwJE5%yTjm$ZJAZrOLV1*>6Deo2%1Lx`IpVDIZYph;lj7j(5rsfYM#WDn_DIeBj!HQ 
zi8q{=>+_en3S7em7_(+M$`=#vFyg`4!3{J&q26fvxifio3x|iJ=4sw{`c?SawSOWb z{?>Q>VeKaIEJ|1WpIs_c)%mRaU5pDCx??9bKgVxAt$f zH7p_#1x^rY=t;REO&~gfN}3jTb;)%ml1_GK#6S15>2tLFcn%_7< z@&-z>`h_wvd3k;GYvT*nAS!;`aY#RoquNakLpCQ@P=N*&eOLGD8wt=3cPaF!Y#&5( z4}oqt>)Pu*HwR^=&+f;QHwTA1&tI93VIP3Qp=YhZI5jXc07TNNB;DHCDciH{2rs}Q z+6A~rNM+x@-gaFe&QrpIh6hO>@I9K;*Lb=&!-tj-nHP~);@CdDt@!Kpwt?d!@7g|a zH(P!>j#K?8aqn!6y{9*mxNI}81#L{-@~CW!e&Vad%ny#wG3h85u@GR;$vXgdZ_ezk z{7G7vAx%b{StVXYToxNbT#Kf|Ddm3!(0)>*`FWYOq?OQR;KRiXA3#bb*&b6Rd!;oA ze#b6gZWuTVu;UQNt+k%sm?D^FjgQ(iA-kJ~|CqV`x=5`;m&ya_m+==b1^SB~-px9p zMXcI(DM03xi%8mx{6ngdz^+zVO@2N42TO`>z+>&@u-vR zsM0retCmM@O~O037mcxB-1CHbiR8s`%sbC6J%{}h8h*X4BWd3QYA;y#QeJA7g_#}~ zo0Xv)7n8bvrC9k(kp1_an3P!w&(F&`R{b@X!8^mzvnG4Y#}B{@N_)kZkJjfHuD#5zs${+cy8TUVIYYkSnZqn*-TMcw%A^$`7ZBhXq$Eb>vkdyf zYheD!(fI4%U}}UADKKT;eHMiK5Wv0wt1E7Gxw7qu)G+S)mw*WJ7Opq~Cfc_=I6AKC zyd2_B66a5|c`FyU$k_7uIBi66q^C^{55s~r z61P>Q2|OA7lAf_Z}Eh?(!$0s_&K#~yf&H-BBcfSBtp;7LA^42+lmG*J+t&- zHy}?$Z*h^owaglQ(dXTvxdG0EF>sF9hipz9SH?OTratYE;<5@NyXh75o!YFkX9=$zT-Brt!uQv4w&a_KcSKP`$4hKfhLzg$7Tt?+B`~u}Y`$IIxGo4xSwY9EAzR|mS8W^UD zr3MCy4LF?%5)F??&>c%61@$&_-3C-gD(hJ0?RfR!O#t>u+kMNBMt?J4*D?`gAujbklpbZ3PbEQJlpy_d4EZpn3@=UrfCK(iN8Zj9__T z2F$L8U%Jlat=oFKai4x1LvoXO4x+<7N8)#&=0>zNdkN2w$8O*daIQ8j2)SMS*PEnwlYb?*~ zCn1-m2VN(9!?)2S{5^*;{T}nsYIrCvak&k+&p#Jwex7WeTb_%&a0Zq41H%za*WMiu zgL_F)_^&$EAgZA9mjY+y0+MdMPzsYdRjGa+@u8A(DmucuqrcA$4=d9-Cd?J7v`u&? z);3U1Gm~fjSV!}3eq-(r9*n?0tJC>6zwuA@q`W*Ua`L0ZO}g`%p37N=xGQD7x_Tw* zU6+k<*vdik1`%7g&u_?05=b+z-pk^a67zw zXWCX&QC(Lj9?|LV-BXLe-Ow%&kuiv{0~OnyBuw|H6ntx(A5!g?`AYVVyJb_HWt(}j z30#JG?bN7+#DpGDBnMe8^l%kMN)MzajG5EU-nyGaNJNCXG0_+hoG9j4S-MKKry3{X zwXN;;*uonRYfW6ANnd%d797r%8;Li%;w^K(J$tn; zoAGfP^H~8Ao2)5FQs6w-ml^!}`r?<_*fUW}H_*8v$!P~gaB<(Ad_A0>4rRca<8P~C zsF|_DyEJHh^o|96IdC3KK;G+x^1tMN*8MuJnajOo&vjR1#YfIWM#mw5BwQi2Fz>}S zbWLDVQ9>sPrL#8Viao^Z|7=$i_x#Tv#lxKx&;3A(@?!DiDfu#NEhjl zFsJjg@hP#OJ(~KG#y-is=lTHAiF)eOqbucbRj>4kQ$3=&c8cWTrh0?ayHr=~t-OYG z0iInr0T5{oY$nyseX{!KDK-YFn%1u;%!Dd&To?Ka67<&Too>UhjFv6-rzv4iL9z1ug>BUyzS^lAFtbclB9KtxviMM!*>cN!tWK z2lsi>GLxi^@1osEU1|&s;n2cxa(>i&2Gi| z^}o24{^_5Vzu`ar)A^9y{}PmJJp87@wKD<~46@npD1#7ohG^oF0xpJ3Z!*nLOP4Ih z>>B3!T3Hb}2)HAtduEZ%RNK}}U~Zp{QzttTH?yIuwTp|a<1pe`YezwCON z2H?7e-(F~vAOex|JO}EeBPB6@Xnf>BZ~p?JL68b zNK2*)xdmq!-Hd=j$_!6FPn(~Ly`H_}1)Cg5^x-@sIl~czT!BcKqRe!HNA?EF4@7+= zyWR+XRsTdrXMfzysY437d&* zK{YBp%@rxGIfjh;H$Ll@D7G8;7)Eb$}bt*ocE zbx1i7v!bdjs|`|3bn?dg=s$Qu_3Jr!5UX16V)F4}Em=|56EXLvHwNSyJgJB~A)U7a z&cbt2C8gc@wwI5~C^ALbu!o0*?#k*Hc9kCVR(?wSZn{jtZj=dnjldSd(* z=Aa|>xkmPsJvYtMK;|btoXr0Xd8|x(hpba34;FP67VS(+W@Q@29x0$a0Nf z`lWG&uL^XPgXOM4cM*G!AWV)=IAVN7A>j?CPe!>(K%k|J(>1abC3phr&(gi6tmU#P zQ}5MN_QU5^*=cJ|Tkn8(6Y|q)S22g#$9;*^BO+Z13C;1);`p_d`=oCviMrJFcOT8f zo@07}^|AMpu|gf{{*Sud@_o9k%?8)WrydoEe_J?n+gb9{?BYFNV$ATod?{)ye4IfQ z-a-e;aHeYoG`UKAf35vZcO^sQdKc)c-GpkrB%i&sIBVT7IsTOe^uW{fyI5IHmabf8 zf8)j6FUnAk@>8a6Nxwk}NBHvBZ-5AcPh;RBB@Z)*^9y+;iH+m#Zq^Rflt zZH_M*qwT_WYKlqgNuEnM25ZW0qi-Nm3Bq>FLYogUW2T)>{uxic9dwE}=&zfHLSSb? 
zc46(oa$Q_ASA-cCpz{2-zvK3jfD#{@KGjf*x{H^i?k$3gGRpYDSK{{s?&@Irm6=k0 zYHLtpYD4_6VX5uOHtCU;vc|g?li3gR9LCE@yN9qHUmax|Jx)f@t&*v}-kXQR86M7Z zOZf);N~&t_FWWzQphfIMe>OBVde|HP$F>;#n$b7enbCv={mQm7vk7QH^M%maKNFDt z<~4kcznBaY7F$86^INp~8@}Hk`;`CPd}1Db|Cw{aUAVQfJGDFUFE$xIRy+&F7$yVH zX+zTeg8q}}_t!UseviioMnrC?WgAz9J-A#!=k0cr#qE6ax09`Z%Jbg^@iaI@OFF31 zR(%HahI6E$moP&|llweBk%!y;<|lP3THnTOT<9eFB7_G!je^}dg~$!^LVGpN!qajV zSPj0`BV?mPFGF+fW2PQR;v)d=fnp*k8hXmU${gV+yCra!^Y4N^jsQO@uq9=y+{L2@ zCm_}Z4O~KX;$;bQE((BqRRC@L<`WKRNTh}21!cg|X+d{X{!MCI0iLfd+-Rtm5imdrK!J>&d8^@M#9y}rbYQ*4Glw&q~SkWF4^{pA=hz(gTk1V-0pI{ z_LFq)HvuJbq&^2DpzxZUVQwzVFt7T-vmXiOJC2jG;zUqJLmW?IGxdACJG7!)nMPk*cD7O#B$kM?n4^Kc&YKHPHH3;D zfd|egPBSNr>ehQepP6W(WM_Mi5msUz0M&ePk45V!*LBRsGncONEf;{((%Qob8-_wO zPQSL5IA+L>JpnUZmS&mTuPhhS(9!R-Z;La)Z3CNzaU1N+0S=U%(>F}!vwc`ypb6&x zxD*Umxfo(@Ur&y^P2W?UBzfZr&q7`A9c~;;Bf!^w{{ql3;+x(mpWgEbOsVD+W;vq4 zsY+md}Z`jqAFHv7U=y!dJ!Xpuw`*IR3zev-p zWkijJ2rHNoj#lI|MR^tr>|-up5$sD$>x;F%VvD76XP0duMXpnw*A-=(E@IQ1;o==| zgA{%a3j>OB(i*pjM%NCX8~ux9ukCLUQ;YID9O4aLDxt@lx+?r_(a2|&NC$4>j=c@b z`@8)`zp7{opC{l87VG$21m%qOgz~m9TmOCs_;z{cn0t0dB%f zV+eYl*g(%GK0f!3^mGxjtfY`AdnA02ctnV5IJ64XsjJ88P^*|BBXb6X%<+!gVV$!& z_j*Y2itKp1a{-D>G2{DkT z7u;!!I%zD_ZVJ*Hx;^%L_axUK6RgHeLM8ut$I{<@e4AWtr_T0ZfrOirxNvfwc(32N zi@XTh=M=tQ38KBBeK83%pwYbP+|A+Q0;oReWmH)C&t8;8kxX`K7$p9`SG zV3*E6-6;MyO*3!(AEDQgi1>TZ+%{L3Ayo8(r_Bvh4t_K5 z`X4<#M#iVyAMr9ls^GU`V!Vc1@A?rU)awgveC>QaB=DtO^TL8JSn&qBUb(#_u&5TW zT*P)T&A?7&_uJ1>lSrdi3$-K%7^SNd`j(m`5k=NDdSTEz-1b%~k!Qj!r}aX6TK3~R zF(Nm17x`T;swWQX8)G&yQ8nHd?D(#{9T$(<#D7rLrtieLA#fu!jbl%2)%6wg4ydIj%T7%G zEIyS^n!3W)KTcATd7b8}@Iu?cx(|=sxa_|LATdE7m||Ex_W8naQ!Tn|tdsAQ)-rXy z2cA7D58?B%GVgo`-8?9a;34Ow)eDLdJ==HCL}*9j_*HGU3W){KA zn@MV;-m7TmBi$u$cKqPkX~F?%E=bAQ6{?<AkGqy5&(*FOD><`XtDf>+LA{JBnZ^5BxudasR{ zE*jpcSo86H9841tMryFT8%(D$uOXTRXPX=Bx;ztDQQSVA3I(nKC%Ahwbl?NA>-xd@ zi@Ts1!#QZ{r?+#CV#7aI^K}sr5k|!n$sV;TYI%Oc3yx~!U@PVj?#nI1vz*7cZO))m zeV0(>_resksZ!d(- z{7pCTXDVkt6b4+ydSK0MmgD=Mbd>GYs0+}-ffpOv zf2U6US*!j}{s}45#1UXd#{wgFsh@z^ZxFhxf97la<~2lR5|BA1I{|nUW>^wUfBXD} zkm;|E8y_?Crwj-h*^Q!|A+u!@e(<2Rzx={P0FdnOD3^JDeEwG;a?i7k!ET*Hs6fDJ zI)7i2*;lzaE&W2HA;`v@OSVpkaMB7qU99eiI-z6Qd=WF9#@;wd&Q}S~!wZjVcW$}b z;TFli)4;7d`x^49Smmn+<1k%(Dox-XrTeC}+nGuHwR0aOvDpGo>bBmThfXqCw;i`R z!B(ySguUZ1bSaif-WcE$yJQQ<#(gL_eL4+D@BEy|pmh0DQn&wYppI9_#im0j^rC_Ykl0|u<1qO06Fcj<<$#axY6twWH-ajog+MZ`%5hCGu@QHg?ibPvKW z$u&3OROhbvEh#+QIl&M1Ybo7r_vFzA(!t});~=E12XWHdbx^x|maB&q#N?4}aBQ>ICc%`t+b(sSORVS1ZmMT~AsYpNE zETy&WWB=N~FU(-J4sx*@w*u==Bh|RAQzv{aw7)QPG_RG9Pda<7fZ3DKr-7s@H4`ME>3!>XFeRjE5ew}LbTgvWvJ?kD+NA!IvVZAT9 z;=P@3*qOWz>TGO8E-E5KZhfq>a?+3~MjP zOHwX{jhvk)eaFYtdTo#sGdSmb#+A+yt{!hoSJfIVe!P$|nkSxfbo;S6U1K?A;Ur!6 zGXa)zp3@H}&$;JWw``#2u_*;txfjIQS=w2Kjb2l(4A(k1=TR{V5gD#3*U2gDNv}#j zPBmgg7WQat---&0xwzM#UxQ;#Veos72@Somx18NmQXW6r%MEY(h85c@Z)59H-a4g- z;A;dU>~oX8aDXjOg(z;XauLnUhN*XZn=5|ZU}p2$y4126AAZ^PZOAk(+*=TWz=+7tkOvMid*VVL%Z~YJi-h0#IaARL6j^N zq6NA5`L}n)#?~s}ijW(iJiE4UG0uvx{vCYF`fU#o|GpXq>=CxNaVSRWk!&zJ8$cm` z!2S~uOVjPP)1AVHnP~Qzfg^Jh%A3$#)vha+(^4$g-f^()UwPA9a%fRv=A+k)?frLe zF4VF$vss%__uj`jzRlEgb6EQF!E(wL@lpDA|XvScMl zw%ohe>BC`6C%LXQyZ~OMzup&5ea@)G8+ME)se3)#!4zEQD}$}u#36x zsAq6gqop}hJtrw^=DHCpaSnT|?%H+_3&zP|mO|T%rFO z*x7a}r?6--E7#}4Ik6mxX#hTto2ZS~fYe>F55QoR>74h74kKZ1TK7cG)z}9*{gjue zNP4Ya2o@9YW^h8-kkN2Cp+3s~)b#T=WjtB_C%!DAgeJOJqtT;+eNy+blt#@Obq%-T z>riptUc}X16qg>{_WdN?O@3DG7R3lYy3+B6t=~;&gZ_BCg2@qF-4atmn`Ie_1c1w* zMr3%=)R05WY?vXVXL~7f`)s^3W~Oe-h70&_FGNnPD8}D~Mbte2L?x(>3g3cia>{b1 z*}Za-*X>;Yo_5e|E=gcJDGp}t#!Md-m=>mU0|ctB-^Z#@P->@h1>2pCWU)gmon7Wr zXEpRDC+BCjMGNw8S-3W$bDgmWV3R;x>@&ygkCXv(`VGT{WcC4EH?#_N2{^!7qL}x` 
z9Efi$Jg~H8q(uTdWq*+0+PPxoLPdoH@vSQp0r&1j>;-5o$(6HWaHP?hyVFSBD9mk1 z%4&n&Xq5O@RpQj@8)=n2ow?F#Z!P*K4dSaByyj{saTjLa1DGh_;N;)!zW>q<-u3`F z_nT&VfZCh^q0J2Gyow6s{S= z*bb6^774hr01p3uVQT)vz{Bqzt>1@dx!oDRE7k_vm^**6YJq3q&*3G#4y0;fPe3L0 zhz5ND^bq&J07`|AVrV9!yV=4_%>pnC1|?>zE*H7_Xv_|wkb z1?V=(MG7Ti@DwUgcfEaCKc{x?nBmZy$E@x}8O$AEkOT{ApZArj^W0jE_P?U&V?PmN z0IkO_cX!Z2Je;0X_?&_$9Au=LtShM>Q9bKEWN&wZXeNOxXn49g!k~*fv*imc~ zn+R0TSnZOv&uza|Da`Tb1+z8MhmXb{d}L`sk`mmWfqkgoc9pA$J(zYNlUgEkP#n5i za^m9;p4-`&fiGdKIF==Cm=+*1OzplqeWKy%g>1CD<6%sHRLr%4<9dN&lU~&a<&Vgs z7j&cfjB4LKbpNH}Irg*Tfp|$nh1py-B`>`!|E2=qOYDQ5U&qjnyOI@aak?FrnKdEr z2utJr@bt+=52jMwz1Rxws?J;U{L`HmKrd1cIy2|)Ah<$Yc^xcOYUaSHd~eWhB0o_4&?7Bzm~9N)w4Mi1G&2nRs}mbo!B0}0i|keYvfS& z#$`JkQIBj#a@Gg(xE|to6?YAVD#SE`9Z38GVdQVtr=0OKocP<@`Do^1EJzZ!q+DgW zfLJ1Y^U-SwJ0ant4crZNeQTDSJ54c(gtY%_68M=6{td_e)l>4{a4ZDh$4o`I?*y~%{Q zCuKpnv{9U^;|j{r;xQW$HQH=V$u47=wG+nHX@_Itmy92@;3<}oLiUyA2_b6yA$7Gg zjYh;ded~z#(D{%0=Lc%i<|bfWoG6X66pk3C`C1!~&YG13eG;p)cwV3~aR_l`KcErA z-j&Kz+%&7U*xxPexHgn~|J=pZLx;ju7sL<(TycOaIe1h`m3(Mv>~#!XNbv>w0Z8B@B#A##IW#^+R2kz6ln zbM?u^*GOw0?fpn)EouPs5RslJkUQBqb0c4e+oD)4CGn}!z>8_Q#81PY+tOM|T0a>M z4bL0rCP1^t(pcSeJp0giodI)f_|zKfeZ8)%L=#VHExSeAm2Dze?)W(d>K7{ z`ZD6VKX?|c7T5k38~tY)HNwEC5&9P!HA?;O8#Qvgz@XQz%-}D^jsNF)pq|06idbOB z#9jXa$evQ*ilgw9ABLklY5vQ;_Me0axbZAB^peETQ{g7vOb4tDz8`J>3K$fW_v*WV z0P5=h2w?tOF#3NRe*B3(uUKJJ zEvslvWF zE}P;h7J+xn7Mx%Z1jc%`1uj$K{dZ+Pa#du|z50+FVnKC0Z^i0otf zxH1ADbiU_>=yAtKcZ!Xq_`C}aLV+K7gvO#b*L&Sgy@HY4Io{ER$t*DJ8M$!91fz7# zvO+Xw-Wa)$TZt$zy=i?PH*Y}&sYkf!_e$J#9Wi)yw6H{r_c6g@^aIrAKrO}a{S+&R z`7%0$2W#xR;!k~!h{$UHD@C%!I3 zS^6nq?O@a8k8{|fHBa=kq#EDU@g7G@U-n%`R!)B0`6~5td~eUfGSochYahAW(R`o8 z#^@MU(?_R&`31k3!Am;>kCPkoMUyq=1?}J%i3a`Nm!SqFT?Co ziiU1q@t2SdNEx1KkmN|>CRJ9EHhj`jSz{HJV4MxX@w3_j<2&xT~1@r^Yyh=+&zn_P)h z+pKh?76>Gq4}4%;vJ(%xuv1f&dQC&xyf=CFBuDfBKKYj6=yK;|>NNwJ#j@yX`2!AO z!Ta5o1Bvh6mGd%#mKf(_8S%omQ=e#VR!n ziP;)=w)w*?-$b;HZY*aDfc@mEDl1cW*MdDs#S73`t~72?8()QDDm-tk2-q^E61jyN zu&33z>TsHk{jH~IXD<83qW06OX6VbyH1Q!~1nH_uxRp<3>ni2RfkPz?YU|#q9xE9% z9Iw2?e75UbIQ?Fg6Y;bnD+XNMCfr6(WE*ooZXYwJ$E;db)jVD|i>zQ$fA!IB_UmhF zFrC*{uEVNw`;xuEB(D5wCVzZQo)2K8T_5>ExOR2RdfbMs#nAAyIIJ z5mz*=66zhRW>~b{zLnTX>a&=xmBwIO1GV`dop|A=G6aF^Xeb&Z-in7EONcz=0(V3P zmQQ5&_HR-*Ok3(b$Jch;zhBJ4SGv7mRaOErTuAZmsYiXG$9V=o^&H<b zR~mVmmxe#a%%G&sz5J3{jbAlAy5?M54B&4Yo-GqN9aQ$tLE4gdByUeg)Jcy4m&wly z+dGUhLtcd|nuh!ZWBuEo|HfSZU&CBYP2dk#`It?dBB>V6yHek=^oeLfOeSoEp`H4G zkiHT=77ZU*QkXA#)ID#+c5A}-Hc_R5mu^N7kQ-Fe>sh8k9_lhDvUH6rzT}F+qW7;+TbkV3Jg|`Q^wv2-CF^GZh>?8 zC63&QdRZ{a{@#X3*O;@M{zBoW7f#)$^Bvt?`ud?pFEQf+UKf|${SuhqYc*=8i*DR*0!vF>m~8!MM?G!mb%h*e9f>ard@m*@973#QMs2JGVp^( zbTsAh_|o(MK$Hz2N0LM@q7B9?L*y-FG{fIk=X_rLNSRV~;7NF83w)kbc)YzASI^Z> zIyOk^$qEU09`u99iY>c4wIz`#2Bn-deW~7ufVr~j34kmMyyd`0>89FjeKR}`=b7Sv zvN^xg7IDq7`VTX5Uy}E>itqC9dY(OuTF%S6lOWq8R-bUY6 zB)`plJ2D7Fe!VwY2kgqy)_<`ja-BwsBEC`m&}Ae=ROd7xL_QyT^v?-(oX9QCAIoJg(1w`ex2ImQ_K@>^cmM&ex z9WC;W>3P}K)>EHa8qaN3UghvWI(}1@?8WLj zQ76VU2EBPf=L4oI1!&N}u@lr>)CehxJm3?@jbYw{)j*`0c)MM}6w7J(^aH7{Cq%C0 zbSba6uax3FmomfCZ202({xr#D+z7-}x|KcZOa7RsASk_AJXZPO@|A}I+y)Ff?(&!S zPinNqy8ZQ|tY6L*m3tpW#h7OvZJbQ+tOSQw@oHNWJ-CLu?eHDvl)6;`D?4_hSSnyM z-41`tA)s89GQebG4ZFb<>U{5&(qmv`uPjMgkVi6>ip91aQb_gkx-px0&h{N?rJuB< zaqOPgZ1TK*04JkdbO3STBq64>j*~yvJf6+DoU102&3U=n_l{o9M6zDS@+jv4bcxgp z_zYJ`8(hlOX!)a2S14sC`%H_AN+avUS#r0NY_56c#NRlxRz$!qj$?Z6%Au9rn=wiD z`yTDNG~O3)Xy;VLyyoWT8=iPlCbW9(R+}55snO2+J=gD+yk;5gpIuNlrw0!4F&eWC za)U+t_gb%d$sh8ujt(cMTp_8;(TF`{<_`2dC?CFjey{GV4qMHIDAJLAEW28zohX=y5C$|_CXiZ$neHh^%&~rv8)xQoAHm>C_RU0OR#T;DYLXfnBWmeu@Uv2dNPk-sZwXQ1oqSrn_Iv9?)2tY z&G0V`um;hj`fI5y 
zOxBkcP0wsJk`DLZ|9&gpUh!T>7-Hjg93Bs2TBcWV-{OKol8Z%Or4(6xo*p6#x!Na;%>mT|;L+6HfG$ z3JDx?)3OhBucZw^__G`*)5f|;5H$3-_fiLJPhOpdEQ;r_0@w|7U}a1#=MwL?jI=~$ zs?4g*Uf+W)3gG8_Sw`K#(5(IWBg~jL-)^7Z>z8tkJR@ZfphsB^rmcfx_tJZ3I{Yr_ z`0-(O^;&KaJXE0V*8p@3E-CQa@{%xUu}i~)lZFX}1l$bL=)skV`&OXyCpW8JtF!HR z!OdqStUCvUpO?wURaywKbPYwPm_kcb1FBysXRk%WI)}CCJbdrPF8$z%7&jk6E3Z6V za!#l*Q)}4^gV$^Al}l0j21qIe`DqNy^o3zIKNv3FALtxV#pektL-Z-G1_vb33LW>_ zRJMe#$oF-^&U=r+-D^?+0C2emCjwWPb2qsr@o?U%AZ}jnn@s4}&J0*JlWwL2?V>z^ z1})Bf^V9AoqiBYNUPZ15w0Z)QOS<>`c-Q@6wuzo(=FaPmXttB0yKK7y({&+hD@X|S z?-zHuHaT*XJSqqYo*Z;K@I>qCNF4VYoTNb>hJ2!V15M$kV8++XIcN3XX~zEYf1sw2 z`Zs;`f4rlb=eKm#e>#1&{a=LJivAtLvp@R#+r;0qJDexpC(W$`nw=~%Th8)T6*m(e z5am_++lKOwKG!HVd-Fo*lLvFR&&Qk0Hn!@7U&y2Fr;IDBCkmJ!9Yjwao7k`cx*0!o z{mfdf7j&5%WrpQpPE*MDx-ka`($tQ#c7Z+p+zGQgoKE}Uf#a~V<*oRof|wqg7rva5e?2Zmd3& znPA!5ZQMj0c)6S=Nx2e2G~D+pzxdR0CdrJg*P)ch_IJ>@>s1{StL#uKWf zH<%IIwe^4xSHF8ub5f#n=LtN*D`_}l=ly2Vk91Et5%fUCNplulo%`#68ZY3$DcB0L zl5`2*<52}|m{M;*BvV~-c3iqbIFlbeK6-PgWH~UBa8cle*g|(05FsnlT#o|*S6XOZ zrn=!0uHlNP6g==XS#+EwY*O_?>li*Y)!g zy~v!4<;La@F&dsC@*Zr*mt_$fZxh?X!}mqVe%>eG5>J9^?MK3i+Vg`Z94Y?0pyb@VS%c~n_5R$`b@n? zc1*#fdmo&9xG$Z%SfzJa5!XGp6$OaeHlQ_M;xQeLGa3x-4%P!$E`y_(Y*LsobJ5^x zkJo+G-#&WxN!Fd-IJ*h38#;pfzsG%lOEE3T0yOd0?C_o+$ac`ac5Hg5TAjK$`)h=9 z<@d%xUd+*_(8IB@wRq{j zhEztjCHE~2QD$sdS;7j~kD9s;kB>MJ)LwqsNcTH0tV)L;wnc_>@{7O~6gaqG-Qv<> zrb8TsdHUP>Jma%pQgRy1~;sg7J#d!Snq@*>|MQV zQfil)Z`tUdGFThuxF)db!jaFA5Os^+-4C2|a45-7x2vd#;x<-(;JjasoZV?`&xrsz zRt(*YBV=|G{>)xCX{<9cfj{_Nyif0Y=5I3_bN%KG(V1IoL$Ey#eKA$o1nNgpwGA_u z^??x|k{$E)Q(lpJ$fWJgsvDZs=-k7Z5M!6Y!#m_@j*6VvBZ53V4dGciTtpPxTvvhF z^srN3cB#)On#*H_Z;Sx8e%&ZG*V*EYb$0ORjMj$qv>CmL#OMHDd@Se1b;j?xg0h%>KF=)oDSg{UWB1HByChdJtHE@~+OMw$7!Hyewj40GxZ zs`xdRZ?q$&fjA%COVMGpaJJQ-hriJG{-jnH2)r{Cb0x`l1?ahLSDV_sY`e=_=H)eU z#IX!FOce4v!|WUj&yQ2>ttixtX=-Va`POy;@(>gf+;aTFp^h;Qmgc8u7M`V6f5srI zkR2ECE-570Hht)z5Cc}uGV$b)X)jgC{_h96`+5fZzuXlrIK{uK<=NvPW2mRuLh3DL z0t9|z!(Wrvants~?)Uc_V7EKXSAQ^SP|U1KP{A_R>?U{$FGt@GJ%9R8Z$3x`o#@1i za12|ucaM*h4e(zmteB7exMzNc3P^U@KRiMDs%ctjuy@$!eyY;YWqoe1@)F)L>8y!(yV5-N)f_AzI)er3#W7$8a+urPgd`0gWuJa~c|Oj#ut^NF&{>b2e!acM?5Ad6 zZee?Q8omBA*OyNH?CLm73Y%BVzyGy#Q~&Nrznr=>o{^}7=@m`M#Uv`k168S|5Rgq& ze+J#+QxBc=+fC-y7iQZ|KwIO3`mi+xyIZ3NiD)ZFdCjQO@bSPMnV4js zqii$vAeIWJjT896ZYm*dtxneayT}eRo<@q2msPS-AG*algqX-aKCOP#=89mS1cfj@ zA?O0{gu$(C^TWW2sor!QfemRxQwt?$x@Xo8$p}4x?maXK-LX}U{T$X|9;t!0xm#5mU62v$t;yZLe8p2*n5DTUQ9W-@Ur`tC8g^M=62ftl z0J-#PNB;k$GT^^^&wt`W`1kyy|5(!hxA{kQ#M>PcPyK67ZtF(&lU+>**q8nEAj3Nx z{CSs^SZ>gTv}uTTx$ej{i=WX`%hueSmPEE8=d|>D%HpA4D8H{$T)CB8Zb)_7bQo7hOm(It(9mnQn;O-Uhjpmr%sO()Q_Y&6_p4C3`#XWz#3P1bS zs^mr&@ve3NH%#7eY{reYQTT4INw^VvF%F64CVrS9(SIVFI&Ksa3!uPP?!<)%(6dJN z%vD_=O$&&wt?(k)B_B>$WdR%KXaN>5%o6#;g0ZgO{uOZalq+UX_{&h$wXAi(c4Vrm zwe&1zn1Lwm4r!6}g#gQRy>i9gELZF-67>{~rgIx!kW=vhw5+;1qX~RQ1Q}n*|DI`q znQM^702c?8W(7Gq$Mx#XT8O+>$_pL>TTY$eH;svZ#3@h4>@9JjvNOUy^O~% zs|EA#;ZmFH7yEQi?f4$>!vWB-8l5x$qdM0y#)UY4VHzO8k9|7tD;9%dH^CO9h>~?- zEokXNu3~8^p9Y4iNPeR&2P$4&S1ByR;~ zl_2K*P2c8zhUEiWN@G}AQRt9(I!A6s4TJvn_+D}=?ghioKWwDx?D^X=)uH$ zp7+dddASbc`Jm)htY8d#C(J)g=4_j*#%}bxzcn+y#AouOwC@FL+mUm>QZ3mHlP+Bz3E8ek4{@(qz zggjcWb)pr9|An3RS4An|*O_~SxSE7PG`xHCxiq|IILYbQ)+y`bYIUb>pMqGx)jZ1q zs~Du`ZE>j;Ymc4LV0#rB+L{!V1^diUH=cQIDplV&Og28z?T&lqxLBU8ofi)^=`1FT zznienyT;zyrYlTO9&atG&);0UnlgBdVO%9|1S{(&!l-{7?FHFRZPwW~e*l`+yU;edfBmGOf>h)#0EL7E!_(+wVb8Mt0odg9|B z)^!&^IInrTDL)0+(%$Lb+gNWHn6TJ_`{w0kg4BVBBV}P|8VU4(15&^9x;Qk*K*Y8* z6@_w%TCZ^(KXbTgQMw9dQRmW?uai~#bQU+}bp?V3_=BJ%=TGeHAO0k1z!7ysS1>LC zVqLNh1EgURkJbB-5+Zk9E3cO=fQbMgRYY#gGO_q8@xou}ZT?5?yMN*!fAjMo2V^t; 
z5vf8~VHoxY&wWs@iCZ=`ssVg>Lh=X!bmAtUrvFzA;oJtO0e2Pj0R)4Kmis)u?-h*w zp_B=V;epC+9V|cnFFzaK{9_~J>^&m)t{ZnvM(5PuQmVc}oBhpx@chG3@W=t^y4$j) zFY6iTpV$v(v?rb<&6xk|xAiy6|L3+HL;qiS6ex>KCNFYI!40We!mcP}Ec1dk;eF7E z6lNRf(s=_y)pvd3DCEod|Av>6q>7x4ooCHMtji>$Y3#9)ZodSgyH% zIg3hO%}=Zj3%mh_(CAgp^`eJg!&w>DI47_8#}dvLdY2vKa>XC?>%87KB5Zc&rTfy| zr;z@>>_|n$0&%~EJ&iN28atks32eSBBv+-cfTurV@CJZdqM9lnw*WtBbWh&5-Gdqy zH&&u|*z(AhO8uBO(sIe|xoCXlyz69hy6(vXhuWl>M8F4=_Ck9tY-WO6wk5sAolwYI zrS!z_z<-Iu^kVY$Y7(u<%nMF)9mxH!k$T*D0X!ERecMctIhI*ok=ydUu9>%($V;-w zH(V%@VKj3Pr1lDA18C6&%UsiS`R$yA?HC1yW~Oe?G28gj0nBIjwxqTZOF zd%WSSYGa0gGUP7cGTIJsFG=$O<*tk<{#KHVbdxPI~(7(*(`^QccUd z!O&j2<;X+bSFTG_puGS9vv!*JqrT#zdwYP$e@=O1~O|LAXWe>eN) ze|;8C$G<_@zcwMR)_H97F&lz7BC0f#*}@?zn&aKJThQaZU}Ac#Wnr|&tw6#j{bHYc zPdw_eCCW|S4DTCzy`O~9UVgA@K0ITrr9(h4P@gW6}1EI&P z3ik=BD)ti{YAsYx>ObvzkrSzZ_n4&78>L;MuwJpu62yzOBLzUYNSbdJXHTnihugh1 zVYkfnWfO1R*^*U@?f911${&;wFf>i56b1PiOLEli7!YdAX($?4r2a0?0HX?bj&YcgIw~;J7z~d zmIuDNaiy+tDnB+pXyZ&ccL{iVhd4XEXUZ-y3+9!joCtaap6d=z*LI|{WWS6_nh7tW zX8<${5LbJ)F5HRi8TWZxv8_`HO9$abL`%>}C?{`zslGsma}ND2>hvAZn7Ibw^EP8a zuvMy`Jj2~Hw_{7-(@XJj`rAuOedA=u5AveSddDoy47SAkimRVDEt!s|`HARM$9`N6 z&Cf1x;3$z@)uk%Q`@WFI&xo;sZH|P8;90)opP0J{d6W{;lu>4 zjYW+YUd{mcxdnVlsg}^Abg|Q;`&{AR_GE?gJ9)1J&cEa3MhiubajPb9sx?>UJ~#+X z;`CA$e4yw>KmCA}F5JbWd{dWI&;zj=Vp1aH%TQ!~g*hvaL->0~oY4)p&Yct(o7i4S z>cYS>#OZ4CZ+UISkbes7m&xwa>b0aD4NC)~XCmwdEN%{_lK3*V) z;)Nnx-cK$8BI@ls?4wjMy4IP#d`M@-20!Um{06@RZFmv*_8Ql>eVMue^)Ggf>Z94` zN1k(UtF5h=i}kBt@hzg-R56|tm8}))jRGg_kcV@hw@B?Df^oDRb5aUYAL)#i`-syOz;x}Q} zGPAVxo|5KMM%tH&&m6YEW>4^+sX4?NYTuEwU1bUxE@t9_`9nRl;)iEauD1K~Kdx$f zbG^7{ca)AhIn|Fh+)o39lcBeZ`VVjyHE?a?oI*~N^Y;DqN3-H*yk=LR8X;hS|E-Eu9?ldGck1)JK-d{m0mp}|FVD5JhR39_K#nvZBh~A;Rn;pW-g%UDDMgh zh76Lb?bWzadT+YysdnkM=k?OB4bbQJf1B)V0j@zhHMbU84>@U&gq*f)%)^O=es?uN z&C{Q6VLh+fJ|XQ^r)_dis%AzBmPwllp%K@3qpe^CW~g;RTV`IUtDZqJa_`~#!e@c< ze9b1ODK#I)oHzr!G2Z9y2G+iK%(shZrFx_u%$33Cs@NhfFBH?FNYSfkgU zUR;3?rB^cGLU1VX+?WbU8CHcL$kEiP=Dgll;005nv z7D;L*Fjdcg*Oa(zci`z+vP*$Q~OlW zcQdxwi;ultd+y8>W%Nv6Z;8=kP&-KI?Vy!sCgf8>>&R)kOWApjs%MY%*XP^wX%xl> z8gXw7ZvyV3ufz$D5l_(LUAwR+adgf^&q!ybpCS4g6AFs*JlsisT3wm<5|4mEoabZK z545Uu%oADXH)gvHr_HQq;Aly&O5v1Y0|AWR^)th97aXJ2?RQ(21|p3Kh!jU_n6Sf7 zjZ*h|Xo}6{y%;d_({-<}(|0ec#uKO|so3>k_8B!yO2$G`s&~%(dv=({b+h8)Co44j zgaW*D7t1dLJ-0HkWU+Yk7wT&2Efa&mMZkCf86+zLg-Joat%!zo=rRVwGN(MlcIsyj zof)q7soxPL*77DPcFCxc*oS4gao?cwVb@{b`}a0)o}fVE6x*n_3jP3%`eL8ExUYbm z0UV8j&@j$UKqbNdSnFe!K+?e13;k!)+Qc4KYh+ejome16aP9*4lgxLI!T@811@`_t zNq+Q0$CtnFOTLf#EJ|1_Jf=P8b=}o2aZC7v_84r5AGJ5o{OV-D{SaC_4*9$6o>_NC z>%CtU?lUm}Ou-`-nAOaPX&xJzlNiu?Yk8{BUB`R!l?T;*2O+R^GfhygEC%6=VBb;2 ziQ~=vLFg(?DL1P1;|Ur|lR+r+lFmYnxH@x2Dh9Z5rif?}9*N*0&mhY-7L#i|Md1DoehyZFP5 znFhL*&S70=1lw*-``yz3bqvX@g4{yxqKrIbzMzFg_~zJ76QTBX@81oM!D%^>TL?E; zd_~@bc&&*q{?t(?B>x&4TZxpYW=V9BW4s|+4zXZ+mF*jB*O$=wXC`rA)9R3J)v#P~ z8QD&GDoBew>jdkm+XTho?f@=TiV&>~p|4c-g9bH$y8Lv1keSm_=Hxm9vNLL*>5tHY zF8Q04lQ})7mN{02?kW)DGSqjci{+B;XC=NoCL^HT5$SJp6uae4C!i#PUMPisG$8jw zc49(yX3Qy&<-5cMS+{X_BaUd#FBCKUG`OJ*e7|!ss68o1Q;a}`7G0RXHnMOTeEK<* zbwvqVgj7eY(JwP&&Y?NNp{!wnkMNuRU#NcVzF(-vU3d~$c3p1KqJ+*SW=3CzdkMdU z3^tH`Oqy{Nsy)+`^>+0_1VB0M%LJ&O|(^oOn9`BV8Z6UDl;wh|a^3ZRK@lsDMMUnd%41|g) z*ts1nj|<@>kz;s~sRRH6?NnI6-9dC2M}uxAmZyJ%0CvC_Bp2+QpmU9W8X>y_*}h13 zM91)kbSyAkCj7Ogzk4uc%O6KTWO#vS*vWmx-%g;;0tR1nqETveFg>KS0b* zVepAUubFUIdRQ@!3~;5DPJ`N)5@3>*(cB3s#cC|j%sgy|!F_{ra5Sb+Z{`tpG-V2c zxBNnNr63Y%tk1ttZ=$ong6n}Cd@39ZkRhTG1GCci05Jnk>&ZL~zLvmI)th;!?x)uU zZe98pDoc-}YBUo8D&nMpq*oG=*w$^V2-Z9|$RqYW0q-E}RX%VbvedcCtYT?L3;Ff8P z5TY0h-Ng0gVV3tH;7+O{0q$S|zWuq;>MpGT_f~}a0d}}%R`PcRBjL)vg>)69+{FF1 
z_(dzV)QK0-N*Gtb(q2&utP+;pz1bHk7q3_>*Ev&GpI}j`(k0?Hv2k_PjQh3r-^G2V zax~3Yt(c$M*1u5H3J!Yqz{3p*gBG@b#=>D&l$L)gRbCvP-py88 zRZf)`A8sIrWsv)tSfwv*5fQRC3A*~E%Fn3Vrjv&GpgtW|ZgGi0BH>X9`{sq$ zx<4X1xb1QxY*rCBt(mGZtQTgBVTlxD*cXB9d!K(s91P~Vyk#B zhMdSw)ZVKVj&Q{Ab!b`+9N+r=qO4?YYU_yhZumKd&2I@bRpSfq*b}b0FPb8pl$JeO zW?vadcpROnCO;g|U{!KI%xrMgmhq0+vG{%L7ST+(XW|bPqja{vfco1#6<(gD)IFL$ zVp|APdYn^V55VcPu>^GrLaMdS>5VV!UQxezn+uqB!)4Hl&`?v3@HYb6)sX=+RdVQxG) zWd+RDv$98;3p}oyk|~JdY4A&+0CJShzK!p;O{?{H0c3CHDadXIG~$n~O#OW2cOlPn zliaVvxB9_i^35H9*;cQMJcsBZngSjyz+?#a`ZVe)=j8YE*s%{Mc5)Jqn3fq--Ah%f z#7+I7kZu5lv<{FPHv{t$6oc=N-}2}!bDLv`3%00M+u{NS7e_W&I9rr(8pOA4!tn=Q)KtM$JRY=F@%DC;T^aypm9 zns$3sG6d9IfzV%NoSz^(gzE-UkT+|tng-;0>l~|zn?X{G8J2x?89>MB&BPV{@X!u0|JGS0P`+O>kpiO9Un+_w|=QA`OLOd;3_g9QiTe z?LynSp_`Tv^Iow%+YX_Kduo4UO4qLYh3YU^ehW;^^9~?SF(Ei%QSdA`Uw6GE^RNT^ zH7<^lBeEnAJ`DTjjl{=kwfv*~hr)&ni*Sus65f3WD(nrlpe|F$+IwXJxUMbR6=>Yu zVR{oLKtal_|J_*d3{M|e)gQUTmBz*XF7S&YZZX!S@8v@{BukcirwFXh_)b$Dd=HeL z>jB3MZ&@Heho%?NwPhI23KT#Uv3Uw(XpmcJ$9Lm$EnNtG7*F|y$_H-O6e0sZ|Ai|3 zV5v0kZX&}ak)0aO_f+hGW6`fWXZ4WpK)|ej7cktEz2Gw{4Z5QNE>i5T3kfh+FOoN< zI*Z=}M-yzgq&tw&;rw{kH*U5qN)N0I&y@du#IAkSc>STt(Tksg23J?br(f<{#a{P! zy+DIw`EidS#u2nZK9UQN`xaV(l|1`A351?G413Ga`!{597_-OmjdFj)xrUc z*@;)?BqALOrK~78$S01u7wEgC&L+xY7F%p@OOM{r_+o1&VN5)MiwUPHUJ4?^3?s5I_Se<|bTcpDvzc{E@Y)_QF zee#8EJ5PUI`#|B#RI=Ps=k};h()Q~&DE04K6P_9y7?!*&+ww3=N9i_6(DcUu(RCBZ z=`3HGKKW!J^-X~05wNJ2T$yRK&@gQ=%Me-9oTvLDHa~cAo2i2Bjy4W$Cr4rgy+bYG zdvsUT&%HNXT#a-}_316^WecDDhTUL{8?bvNTt)h!f$-@v2#7Fd#)QO;_3VKkJ$C3; z8sK+F3ezAhdk`ZIR{+h=e1A5Z&)2|Yg_z_%DA*Wxn185k#RHoG(2~JJ?f6}=s2+3l zf!Eb3l?&EYQzKsk04JJ9DSZ(*26L_oGFemxsddeOYE(#-;x0^a3e; zCbb^J$ifSF$Jd{6=SII9uo9#mH9i8qzoLM`)T0skoVAZJ@$s~QT?PYFq6MrAOyZzr zgY-pc+c(GO(jH*#{Ppz3e){QnxxRbFi}^fOIP_^NI_ER~H>gToQ&96R+t&MV zuW^aw;Zei$y7jo6x^}&V+ShUmw}gJ792=S2Gh6_t2swPx*RSiLz|AZ;cV)iAqsdD8 zOkVrVw(H^T8;4FASHD3Cp|7cB7So{;grQ@WCC(<@QF zrm4ynM<*pJGTRRs>+4+uB2=KB>QhQb}c;`-PIk?7%ljiLnhf6t|agwXb1A8)C~| z!zXP?6#?~YE0iQpwAFrpx1vJ$)R^(1TiwL$@@oI?^SM8@>p%VZll-X6iA7xm7Qki$K5^Z2jiq9fKTKB`$(UC)SPw3i+Z>wv2;7N$izi z>|c1S|8}Io{{oNoKHxR{NCVCgMFl*Lc)4|1$|d0UCpm&T0T4qUcSbGx{7rfr8&2#_ zM9iePso3eW5O;5D2HTW0oW|M<9>1RKWW?SID5UiB4j`TX@|ypnk_w(DFfjkSMSuf) zjZNf=baKGUs%O}F0y+%PsT+g4?r?n!;V{2JKTd*havB3dhc~_CClGu%6fquRVv$C9 zusg0fuh7%KXS@G}y;<$xL(6wSKhmfWMH#G~NVyi#guXPIX%71ozPW^4LlC&B{@VN2bZw|}Ufe9kVK$&-L{w&T^6ewIXGF&N zxo!h`Va;5xRNpY^bCy`10a_rd>apxoS>Ld?VXv)%RH0a%xPI>jiM!3%H#;s0H&hOn zzZCQhtENp}d*2s_^D)X9vQ>iw!s;&7eTP$A)tw6aRf9IDUqoFQ%Rdr%J^Aa2O42sl z{_Z=E^XElp$ZRchd^={%J?w_QI>9VQBr%mNX7T*l5I!n3g*6b}u+IO!q>70^pv6!| z^W!U~X(?eXVKvp|V#Wz=8JHTk8q6bWS%5C&3U-`qVj1D|@iiqt1&$)|t(NEnF`NYbjNudtv6Wq()r*ymRAw%lAfa;8hncr979}i$^5cqWy98;lCkUY zp`X8t9O2PBn%aXKLyUBOW(#~^s5KVbo7|!*!IhNdRL!ot+^4fEgMHuf3ZC6=oX#qn-kBkE8Wm|)h;(Dbcf&kv4e$72 z^WO>!3f@VearxiobpxqR8V$%~0-~_zSz}+~)ARb8PmS)lOcl9=+B`p*dX}PpBa7>U z5&f)bC|}_5zD>U{V__Wf%lRghlbLhS{@Oc!*)3)VmuO0brpw40Msp8)+Vy%Qf)`?8T0*WDbixX z4W1EyjOj@Jj>`K>r;P9jq6&0W1khd3N|sN^-_7GZF7^6fo$CQUzd@S}3Xpt>E2A#x zO3b|ciNzRzS336tefteWd*f8h1S|nfgENqxo;%CuK!bS{9hXEcTEW zR9Cy1{*L^=bX#g0hBge(G0Sp&?or>_T4w3M@SLe-( z=te*=26@=o^LJal)>dduxcNR*=78BbplR$n^0&wN-hWare-ZU%_rWUDOI7PUvkvSt zcYe_B{B=2I2$}{EwVyCkm-@Iu#M_*r*Xic9p*XVi(}qcD!LK=_6Zvbi)#}tp??TU- zpO3xg+?-iFd3Nq=^J8(3H?Pm{SNw8?v)cPm&XIQ$N51_A|1=6PxHn{YS}wRdtGs>| zH-A?GV^69lz> z>j!JhG%^;tO|s#?_=9t_le_4X84QtpN0^4>yI*lns-I9IZAOcJ-cDQZ{@X>l@G0 z7#Znt=8HV25bkipKFRS@X6Dz|2D5~N?ITyfEss<5iavAWu@3BwSz1TJwBXA@kU;^zsU7@|g$ z@bkE3U-^8@_!mlkq;5~-%Jv_dANXYXjY*7L?!0098gs|o56tW)OHaVkhsILmg>m8H zBqJE}e)_|~qN7EU0&5A*u-A?3%X!+%QtO#G)-`wyv*spIxNJ|F^Uqh+mS04gVtHO| 
zJ8gPArNdM1Bkaa-9CECh!LZRo4uQ6>%~b~4dX7iC^|cmX;Eztu)F+*PKlM~?vwFr1 z_vQW49?Mwa?sVJjJncm|DUKq1pQ(2|9$;Nt7JLm|j1n_Njpf794wjJ_f?tcI^L=*q zs$b;F`7FHeBq*F25csSmgEw)(GW6!UJIuP?bt+8XQ_xmDCc;_@ZMpcN8h zaid;elgaPg!xO1Ih34|Q;@>^%Ye-W;3)2lkKNV|mt+DRkRwR-GOo#?&G*JRRTiv252fM8+vXPm|{t zO~YX&xsr_DYzUJ}nVyT+i@&NU)ELjHNx@EM%5qlp=o}0~7uRFHQ73>!cIG64$lYFK*GYZj}1q1-Beq13+Nb8eceC}0;YRzK@a zU}FUUP)A>qo+!}g8)_zI*zaRL|06Yff(-3xO6PU6VyLysjIyC@bxrcXJ z&-0MmP+}r(Wh{LF!xc0owf#aVCT~y^&iiz(q-&)NVhAby}n#9OJQ$A4nxq`cl-~$V=q)CL$;YR z-C(r2|0~q@{T7*ZRis1?3>1!^KIYm{b#-M@mCA_@=L){bvb*ZV<*A=N@y8k(v!fY1 zu#9uK?mJS+8zPuxrC7FkmMUkP97{@$0H4?e5bshlqJ{HX_x?baUObQtO)xV9(XC$} zikqJW;)!kHYz3jgj%TrMJO>CYr8%?Z^9MktB*+A*DvJSRDDQEgAw(A+62N=b+(o8| z!l|hNsUF-9G*75+`|1jIfXz?m2Ia=kjQ;Xw08|HEMyj0eXo5A3l>;?XXGm?5uUbz= z3842@YH5?uob}uOe1Q063=zC-T`u>*rCC_W9ncQQ%^_#rlZPnekFDFKHgXnMUz1t| zg5s~YaUnF=nByi~W@0Dd^`GGX1EGf-vqcDtfm~Q(szfj>K+ko)K z6T9x3Zvm6bur>`o-h4edTy|m{a)&a&Hhs2vN5|E@*5pws49o}OwThI76F^mHi(?pu zc43EqR7R^sJ+$p$P%&R#$X-jZ9eeiGo* z#tj8R)$$13=RRCd5pH=AQ;bOln^>iYwR!a1wz82rhZ`(J`b}4lrY%zAy`3SX33Uum zKzTo@?CZ6Kv|%D^ujOCYA|;?k&)J;~q=FZC0$$VV*wNz!Sc^ok(fR3MlWyQ^fzpWP zLv#2_Kt`c+3IF-slUEn-D_Z_S{kfjVdmY3dJjtj(SPug0Xg;$uI~o&ojjh>WVN+5{ z=QH9gu}J0}SQ-(VQvsY3slhcJ4U%tv%7F9HPigHXa-I!nAjMcDpmXB>T5x|>k{=o1 zD48vaM-vz+WR`Ov?OefXQm#JCb1E&pEbV%Em(%j1p{vm9J^`?p}b6_ zjkw~NKA6raTj2`7dG^V2UEymCXT4a{nul=Ph1q#&Cd+5EjsVEOf-rA3KRpgtGh z0BlLA`)a%mLudzB2D4366zg~zz3>#m76HhH{2*-Y$pIhMthc`TmS3vST z*r7@tWK#%xb9yD?B#+pu>f^^5r{3dq)2FZDmMv*Pfik2;)-T#pjgqR+Puv zUA}GHuz2k63MA}4UhH=Ree@bw_OtlfkH_3#y}2EjA=k5Gn1lxOl$B&5XDgW{{NM*H zjmMJHsj5GVNUh}&`uSi*Hv=U=U)p)qQy6LjCK}!a(OzIaPD{HM1}aec2ds$PEQ4Pt z!EfBKmO+Mcjug=e>@E=3D%9e;8RN_I@_$t#`)S~4`~~& znx$Q2_!QU-#56Xom>6JT8S1p^jVfw&;Wk@#@8ebs z{38SRHu3n|>hKo!HYI3&7z^_qx$h~8_yZxWybT`zAW$R$7)EdkoxM~@;EXE2B{B?^ zy79XnFofT)^jj7pourZ=!G%&0B1`c&$?c=A_(bD-2+pJ?o z13T`E)TuNTU_w`JNIf`)$uR4c5Q%XPciGznfT_Re<*u!v95$h!+yHk$uBY_$QKo?zyRL-vPA&dUDb2nC+#NbDSCspR%yuDBSgHRd$XpFH z|K)#>5SHPJ=A;@Re4-%Q$%Q8Ru99Ya$l(}RB^0@s3Sur`LURBzF_Q@tT+tf|;Fl_@zhpu4wYSBivq{qz{kOp~ZATbT=S<1v9H z038lEBTVL7_!Q~ZdFWY&has+mM7w{#nSbhvdY)^Bxz7SNfe7zG*I{6wTJwbqgO%R) z+0*&GI>KW@w$*L^NX#}#ktE;8bruYTBU_GI40sJKMP-)hI+pZZY@M?$Cgem7cO7p6 zGc%qLQY@(C`6LWPK$0rh<8$ z%2#!Pq<4wH=HK({;v}2mJ|u8g6}C3K%TJkoWWYOU7(M@XYs zgkOSo^{7%WT9?^ia+z{8n-)?=`$w}F2w!kLu zxHMd*n)B6JaV4gzll$)Kd2CbKGJf4NV?&2NoQnUM$$jr~U0?D3e)`V|YIKbMktc}R z9T|=?-_NVH|38u<|F!>edth20M7ab4tpwEUG(>O2EQv0ge8>PA|Ic2`3x9Vm{@Ho? zZ~ndk^UwM$yO^?^fLowaA_6+_g@LMn;3~wT0yk9r|EmT}$KZ`fcbC>IdUed0$KTN+ z{J#ttGyC_v>;Goo|8H``8{Z9Ts0@LZlnun~xcNfE)YoayhPVoA9s&}n;A7{pX3u{z zNt>chfD(aKN`=+wQuRi#yipYyKC6wsK^6A^cKR$McDVj_rF}^1&}CUk)|ZdA-0T$L zWl$Tvt^%&W3nQJT>Ixk*9UK!Lrx?ZCZ!SG7bYsKobPDI>R@~rv2K@|N&d9ZZ-AMaX z5MFgaY`hKa4gV$SBGKTfHy?>oKm*unn30uT@TeL`YajQF6! zk=Q#cO^(&L=*3?iAKW5<0lbYR0);bKiVb3IH@Q`9KQ|`R6_>_pT zrRKG9w@hwhrMzi)`N=#FB$YT>a_H{ircYiOCP_n>hV<)|VXz3WmaeVEk z$#BS^leabfa9YuNTNsa?*dbR$r$(-sx){5ld;aW!F&dU5B1f;4+Rc0z&DI>hGuo&7 zh&s8(sF5PNzd%@fLTYrX{X_?O)T{>A6H|}bqP+%;5D08<9>7-3@fn~DTU2Hpsv7!! zyf)L^5!@#Xf(;>LmkUk=l%@f2P4lEtpK@aHt+!tW3crpA(EQT1O|}mm7v_2+jo93R zB*tBqWh)}!(wf=cbnSe4CbrASvp7kDT`963nGENe2}eBC3w(VNBe`hR1>An=d3kM| zCi`Hm$@{lkKh3wzlcG5-NHvrvP(RHC^G~ezR6NNJe&0~y@IgIr^VrfI+rrr08IJ5? 
z+^{e=^!SPOrDjYcwvpd&-z!jl9!zwNTv0=QZn47kF~r=wk*C)G@NWU`twC~&vei_^ zAZL3jRFt#k0Z+?FbCg6QH?|FB*lkV*-gh_t3d6DK5^vk!o(AWO2JyK;>!X z!KSxwx}PDa`mFZVTj zzV!Tpi5XnWFd~p|7=@_}Zea`;qz)_Gv5b4dlnH&YC%U*dT-!lSjVJXc2MBWDde&2He*^C)$m4n#iwg#6L$wiUOF2sPmjCnI$6eFOgw8HLt z^K<$>^)s03N2r&)H0+l^uD){frr#mkW8s6%^WXLNe0S&e?710gSUGa)rb=++(@j`1 zb*=W{ag3C<5b*Pa{f?^iP8Lk<{W^Q-jM~#9l_T4Qn-982*U7;d!8O)9wWT0Y{~-5N zJ^eT<|~n8&6Xi2wiuT+ajf^t439MWDr1ePr5+IgHgH|NW94#FKpdVKX0a^+|XjiKw)IZ~yJXn|NNnvt9=(irmp{uFAxu1Pp z^L!N+HApj8a;iU8tTo+fn-@YMF(|A}xc&GxOio%QvDas|<`SIsF0ZiHeA$y?`f0nv z6~bN>&5XVGQKY6mG38kX%biOMW5vMYxj8>y8bpH@Ih$CutLHT^kvN7vrl$z*7}gq_ z;TgWN<|8B5pJ-@i8Kz{9OE>|t?w11l?gn>)Oj&TSwHKph$SVjzdmpZ^#0cKJbiUo= z$NtS54c0up^7Kla%u{`t80$xy2EN%OIS>xkmgyPBKUHn`lpj!C7m`2KJ2t-rp3Dsf zpf=wUE20oidSh|3L(}Ef-q`OprU>=NGk5Nf?&Vqi1=4FQ?I|eioqMP$H$>RVk$}T8 z`QJF_E+x*k2RSzG#BJma~}in^P`g{4`n1jeLWeD9L8_LcLJCZSwoIx zwA2=g6ilAJN!;3NOP;kn{7^b$vs4mDoRl-C=;*p{7beLtATSl(seqewdRndaT&Jy_VBSh7i4&AX2q zf5AUvS(JgDHeg?ZDb-%qYOQ{v_}+VO@eX!I2d?`3LcI@ReNj5EvcJWd$>phqw&H*D zPIs;ls~|JF3hmrSPo6vcdSU0H)Q8F-_3&{&$$;7uHN20?b;l|FQTyEE>`yv-_ll&A zQwHv`6Z<8|XU`%|hy=KGn+h%1<)oE#eaZQxVgBa3n=BrGkD zD5A&l^}eK!nJXi=Wq`2tVy@lvn~DYh&EC}g&xBs~&gXDNK_n@T-24ILpp$cxcZ;`~ zF7EwvP$M#&KxUY?JSNUlsuSe=H`jn_(iHC5Z{%qIDNduM>$6`d4fYG}UiYc<4ag#- zk>JJ=$sOLjV-nf$v1s7C?mZc7hCH#s%=aC**<13P`aW>fpY8&EX!n$37E~s#ULM-q zg_)o519t9+xBJ6OO3U|Q>^%0q3On5MZk-GBa0|k3!DxNl8yb{v&k8?-!$b{nLOfQ1 zLTCKi1~N9p?|52|Le4K#%2X1g?gn|JWdw0x4oAKR^B6{!z@_X8#D5Yt{HAPWhLoC( z_)!_=q`*qdC-{=pbu5`>2cY6!*0c5y>I{ic&wt5?wb2ekTPUx-V zfQM=cg*UmYn1OEPzPuG z`J=>5E{8!Zbb;3#W;p)!}v0h-^$1b#QmjhW_-3l+sOEYRy~`&>1* zcpX?NZ$E{9zVR2TIYSTVwL(U)ibSPZ@Ps#!1r@Z)g;)T8gdsHE+MeU2XV-ulTg(yKlyLL zv||9z(SqP_s@2OR&N9brIfCl~lHTPU%rd3~v}ITKm2ugk&~K6-JUM3_7A( zQVB^VH9Ag5oolAUG|jA8-rqG-wtL_EexK)k?&taA{d}H3_6KXvTGw^`j^E$mx-QEX zbaHG|W!X+|&Kv93s1au@?DcB+u3*{8?ClhBIyI`3ITWq~OSmFgb-H;IC(96SRK{BP zHBy~)pQ#tcLm%f-Ts1d}ZYo)9&l>3gq#EEt^v;U)G7Qn_J&?VFaSfQ=ErJh-rJW8X zw>}eeys8U}-3NsgcR<9-Ft7J|czOf2w}P4^05T-nxKV3COs!&ddihk<6HP=>F^RvE zp6u=1^hrxlf{W%Gw3H4TaF{k~^oemU{#2ReQ6wT>1K!&6iE}mu3rlHI&Xy#1U8pFG zSbx5QkPE3j<@GsngH_EKnU)MYeTIKK!St04O&YTXtBj`)-d016g0q^$2~g&UXEuF; zIhxR`s9jWS9o{MR!D@Xzs_o*bN|Poz3pf;=!Jj|G)kx=YR9-1v{s|(s7hMFJf%Rl2 zf`Ja_SI9n5IWwb}2?bgvtC#+KhHnu|w}Jp8Rn^5-z#6Os>QVs^Aq&Xv|Je(~B%GPS zlz$+Sdi;&uSw$)C*mw<>Q&FP%v>@R2H)0ZGCnJPM&Bx*+J8MY({Pus~ed?0=I}q z2Cc&@V|lSYCQO>>GJW-_VqEwwSez1q=lb^ytlkg8z{w(6+#J71g&s~8(0A}%DLeKI zpL=c%h|f_T=@TAADO>OdP8$H6*o1#YwCrB**NOG=)BaTW7T5Gm`+cuE3vzI zJnZ|a7K!n#bxzK(r$({Bobg_{Q5ZvN^AUs~ogDDzCmJgrz&5%~FzQA~t>}Aht>a|O zUr8yCmte$&`SH1;)#R^6XS8Co6Y8g2A{=Wtk#6wA!H-1aFDLkn;X(wPSgX@TFy&0nZ_%r!xic@Erj$4G!=gniw;iOiSAJK>w7dag!c@FuQdnht1S^y1 z;;m+D`EoJ&na8m=OyZizLv+41tGIlP4NJx2tZw5Z;802RV>*)EPbbHFVbyzJ3eF(w z59GS?;b&cuJS}(!3^L-7P!e(ok}~|sO}zE!*HLhXUIVcgTAbC5VCVBmAuw&d)b1lu zpDsD`6OV<8yP__i2qnEhtnz(wA~^nDo0Oy{O7V`3Q$SfhQ#tb~PC7NqEm9^W`4nd_ zv$IP2`sYt5$v@!@E72)_6-7=@)uh{;K$D<}=Jm8dUTe7y;!zBGL_PWT*AE~$LvuI@ zbNEE%@VW*Hl%z<#YD2NAP9pZ0R3UlAcLkrvU@=(T$B??x+6MorESxZvB`gc(WsKj; zD}F_tw3|v!lfBIp!LnM)hA{jQls{SY^}E97P=WBx3g285?8AM4G!Mnf=G^%$XL2s`0No|d)NpKcO zDEdap0z6MSgAl2JuD;hJ_+q!>z$9Dcj3}xcy?pj^ANhi1S|;wLb4pc=PM2|OX{5;s zxY%gBmvQxO&~Ek#Cbce_&KGlv39xpcsLm9*H3nosA8%>{nJ}T3{bVnm*=*E-8GOal zobbI=#U#N5{Dn#k-kPIBnnVa8xDR*B;E8D9!`|yJs%pu>vwv39uP$*ljiICm(Ep3i zr%71V1n7@fEP-~IjgTTr+ehsEhUW0&6MAggH)m=GmIT_+XY$^-MA0{JYc)xe(?X}^ zBVH<7^_gN3dZ zzE#8+ywVtA=@|v!26(X!Qe`^;*{^{pYs2{zWT9aB^kx&QK4j1`q~tOXu_h6%)=iPl z4-~P1U9EWUEHr+k^d|ERzx`ZVm&apx#nmZz(3U~WzRZTce8#HK?^sTIYt`d%QDTQuFwtCG(8gOAyQTf@{#tqqyKa-=-ML;PsEL$8LVKXe0%Ibf9n2vf 
zN<71h*G*x^($h0`eGs{=XMQMIMj7}lIK@*>2Gd?q*p7F(TM2hbbgM82=D=x}l>Fmz zrJ9$B-)a~=8+73G>Ku=0SSuXl>3ZUpDoRWp_6hx0G5cSiPgUr?H&2*6kLi@xlKgc zW>3B~s68&%dDe`9Mkj}DpY&yRz2i6z;1^fk*J!LH>y77;T>tnPpS?`)#QA;L*C{+c z?GF`xcH-rG(N!mB9`JzM6`N}eUc*t1a0U+VBFo=Iv!O@BIM47-*rP?hq}e(`%l@l? zr{pX4u{F|mzn-pS9UZuh3bT39yLuO6%++s)-hPhs+$3>vD0oLz4_TP06p7Dc#xh7X zW{trcHHI>2xm9rKjGCqOzM;fPTH6NqZ?t`jyvL2ysvx(63OyC<`7q9CSf!24*6|s^ zJ)8rysJ5C*^X#QHBCC&FfiMs~R?%C*LT98-c^*mBv+u5Y3v@K8iv8aI3Rsm(X zkJZ@SZ5ul+>*Qd3j&&lbMhG9i5*V3GS#LXhp@vI`N%qtW9i7Jj2%rM9We+)aLe0^X z%5u|m>nwl0Kf*RYzg_GQtGlu;`Q7FvBiaLBeA3O`q_rw7;)JjK|>% z`}SBiJTIGaebWiDrI0*HPUWu?aXzwy*y%rs@j`Xq--3spRQTKfpdGGBU=gv5vs~}Pf`irl> zT)b%YhRHW)Dneb=T{6)l^{u|j{EG{jQBUKp&2R@dcbZ#%uX1k^ey&0-L6Cbgo-?F8 z`bNC7@`c4J1!La6&rQ@XIQ6K38)fNzon3Ztud3P({dJ6JN6G}-&CsCvv*Z0w#+SKt z>U-?Zt{=DlV`(B~sM*LYu4tKT$mKINg&cpQm1^mGD>^rYO`l}$SejTckhr@|7Jdvf z*Jp@U#@nA%5`V=+(Nfxlj*FYLBQ=sd+zq$4q|o4Q*=6ZF8@_~}57v(69U)c;8r06h+N6aVpehVE21d2Ec(_-_8RSqqtin-fX&fQR<2VpJb-HigGASuUL@P1+V1@@QXN>`E}3Wu(KF%?G|={89zAV9x8%^A06^LeKaha7>6ka?wzM}NT| zy?A+L*Ljmi8M8U8!*tPXI47oBX99g-BelR+Gb`I&&ph3cx3+w&iSfJ1XM~hx>U`1z zTH_P9Eh93qU&@<4n{3~8_16ReM5jyCM@@lJzml7izolrDAS%6L^o6X{8T{0UsD>c# z?Dwq|OA-Wrh9^s+L~CCgpU?C*8FOm(YI| zqP>?Ki?-cZwq)@3O)He{&)qs#DTn#CzC|K&^~qz$dgYgR7;cleUiH5P_(ivb-LrZH zBU%mfN0|tSno7a$$HPIc?c*55_v`^^VKYJ;qy@u4=q^w%Mf#2RO^TlrIX) zF5ldNg;4{VOj!#yu?#O*P)v~_Jpqe@!!7Vir535t%h8*tncOLcGoKc0XIxF6xOVz4 z(I*B^U>k24X*!=DObvf1(IPjD`YJlr;WnI?suc7vs*MyxB-|O!olqV zGfwCHgmi3JSJanC|6RB3J8Vj|>Q(go>sEB!JhsGeg^^W^myPrk?!-D3ID?wTk1UFa z%7n=L}6L$JH|;-%uF38 zunQiWShexjQj6yy?vpa_#prD?zBSB3q|spM%c-(YV~6Gz0&ejK}Nw>QFPH%KXv~85dXN#<9d7ccy>IyIm+(l zQhuQ-x9r3=SHpB>u$q!m54+H$E^=ZQCxba0PjgNkU%s!&XkTT@x=$in$h>a>0LB!j zo?+cDX@(HaztR*}ZYHjQVol~ukXp;b9GdGyv+G!|S86?F+TyJy0>eoQ7t!ES^VC

zoh>XF_}byX6-<@0;dq3k515;-CsZhcHV!!xB-eW`t?cYU?b_ILivyI}aX}m^&w413 zZa#IrUJ1#Q$Ct)Rm?X0=c-zmYe3~Ai`tdBiow3$>Xzq39ApIF#>KB+lr?^<#(mQEy;lAXZMiGyT_W%Wzxwv zkf{6UlI^kPjUz>YvmrV4aQ715(@ZUch%0sZ2P)?~CGt*$^3W3w*vOkXw>tq=TkhRM(A05fpdbg4(s}vVw;k}S^X6tb~ifO0ph|DAL$*RLD zlGQkTCa{qJ@Yr8@?`$}el$&e|{1+uaYatT{E1Ixwd?jV`GqiB6F)#Gn3#QP*6lgx7 zb$PZSO5BNpGNZbV2t&lms~n?uQRBSi3xymO+_>oU2D9U;{4aeEq^mHl@2m6d7Zfou z$;`JU5ttB>Np;|#p}3ws^Yt)bIXNa z)tRfGoVV7TU1_=fF_-c0pNuy!pI1S=IGDKIHVQ!&L5REWESBJ6*BeiPxx70ZP)Irf zcNBbC(#WKwaHyCGYpFoTlMNH~?y{0%*`~^j+Xz$FH}M|=uWG7jJoRpT+B-of#>8k+ z|Mlu%q#5z&t(7r-0%zT|xdwFLydB1pv?Wy`3Ml7gyZPJ_Hd0*ZMgsl{)Z@TCLx$?% zf(W-NxE?n^Uk4c%9lPwH`yAk?aOH2JHcrU;s;m#DY#^fr+3W+LN`Es_N9* zVBnLL+$b)E)_gNf<~Q@v9q~Dl5xk%}=9BM}_mj5OMgd`L;UYt zVWOts#`|Lkjt0TVi!y*o8RY&KfT)TH`q?=+gj&dv(AlnZYboN*iFnQLdxEan!h2mT z+gdf_KUeyw8$_or;GVfB7o&cnL29dYb0QKenR3AXTXhtEAag`c(OAh|G{0~Rbsc(4 zJ^Iy6rFTgPq7vaYgZskAGD6sHshm;sXCmDOta|dQ@baQB$ll(l*BdWl^1e0P5_y7% z=qG`X_bsU(Gf7a@9h$qoz8n2y9ot*PxyQ^am-_A+q2{|QdLWhr>jf)vH!VIZu{2Mu z{BtZQ2u*N%eHyyH8I%?vpnf`x`^fc?4ekYrWTIP_;*Qq4uf|5spOqVYmEc)fUQB${ zN=}99hu{f2JhmRc1Oj#+ zW}P6t(ktRj?S(_>^iy#&$*v3=?;Bq!H|Kwf4D)R>WNnRG(F9haJd*FT;?}L3a5?s- zUW>-$qwX0`JNwDb>m0W9uyw;e9iOL6EpG1l?c~g}hbP?OGMPS$tb)LE{)<85GZtJ` zWRPF$VTNLx?BF9JdU`+Zq9A6-fO%@I#qW^yPSh(b@Y!1wf}+IzS$_zWCEY!tZH!Qa)CTrF|?rFuYP- zL5Sg^rWh}KqG3-iMj$(r$-?V{$M8~1!@Sk&eLo)cY$Uj`rMf>)({P34Jf)FZ|oP73S~A7GNI5HtTye&EQ?`=edFH9%mU;caki^bXhI71IWftt8>C5;VC%gLx6$vmkZM-yR+W{Q+XNCnc3AWF5pF zB%q!fRWx;}C$EfbMZ$N9y60vb7q9=Tz5drNiH@7A4zi5Oa~`a>@7?6s7Zj!CEPgZB zcIM9oj*GngA}t9Ke^~`f>Ri(-n#0FV_*zGFh9ml74S`_Z8mE?Tt&04L3P<52U~ldK+@UV+y7>SPu7HCmY~n0Gf0}IHiCVj&3i1s zo7&0xTW$F|S&%8q579F3(Vrp{B732l3F~vY&5X4%^j5D^8(E*~??52C^*fNA;WSr@ z1|%JfZ<7Ehby$TpRrGr3C2P(;7lpa6)+y}?1Qv_4WuQ{83Qu9T1)G~gi6 zs+wTiOr$c2_$Yj0b@$5I?R5Vq+*j@ohh6k~n=;I#e+L~Hg7lAYBHF^BXAf9nj^FCf zR-{`!#`4a0uu_L!tI9^SQmCSVDLyTF{J9CI>hg1fTM}05lXb;H{n~YgCk{g=89MWn z|EM@R|C8T%jSBmQ_NP719kR=O%s6@jA+K9nu`&8K&WwH8HMgWXuNe>NSkq~z%~5H3 zO!AjJ=r$5NmFl}_c03;x$kJs=*+Ai|8z`Wz{bK0{PnyyG*{J|q*`E)Mc&v2Q9(XLr zhcQVA%7slgr==8WF?GG2^2X}8{pvy$*N^iir)mE@>Qzb89mI^vS^nI3_|NW(!S9V- zyDS;}Y&o;;S{ZAaTHg*aJCO&Dk1E;5l~|2N8RyStBdedZac=3;F3}}~+FY_6In2Y9 zPc5O9z*ekNgU*V3pMQQbH>B_S*1sceRh=B*-AA`^CG31HBUTK*NC#@`D!ulr1PsaQ z>Lc0V;&mMp=w$mDHANx25?rESLfOB2)@CW4REvSXS*Z=%|5%1Sa|8caL8P$DfCDTa zQO%;|rWp=KOp&9jjUV2Lxv|)L)u@#SL*2@vZv2Rv8(cU3ifzauf(cv6&_^I^kT=nA zy`%siR#^?UdkaUsRjGQuWe@njy3sK9a=2$^wd%u3Nf}O@s*le(@IATjju#|b+jWgc zY?Yh&{|~#1Aboz-I&I&I%6#rk$_`c@{^fic5{uReg$SWrQ?BNvc$W&_u-*=w_i(k# z#Tn8F_?-!Fnx04YSr*QvVrJi-6Z9+^OH{*SF3+9UPxU)X_stMf`m5F^NFHBec!a%v z)&ItABeIS=c@3F6i@0`236SuiRnL#sGxpSw8!QM#AU-zprnj%Yh@dfdI%e;Eixc zkrkhW1SY!?UT}I_-=fWA*4wW(CLEh2Ae(@T9BCqnXB;U|GlPFm*2j4#2}_z8o) zi*V;(H@J_{ksy${Qs%&caA)B<2{T2mRbRe~>yl!F#V&t4&m-xVT*y0@rVI{=KPxfk zpX}IKMa=U}HQU7TZQ+MZB~43u4zO}bpDY|-pnjlCyAtoI5ZXbQD{6BaF$JB}eh8DeIS0?m&)ZH}-;p%ph2@N7 zY^I=my;_Tb4He%Rxsmb%-~4Kt<~pwj zEPe-s(mFgX6vr>EjGm{j7MmcnlF&r*j?|a!y5&2A0 zkm{{-Z5NWyk4}Gy6z+@C(Aryf-DKoy7i%t6J)J~*Tj9Rp%IC-nC}!#bI0&T}zMR|i zMTe7_N{&;k*REZIVh2BT?LfpTucmVQkpb^TSRtqfO|{GS%<(q|pT|2R79CI>G^KTb z@*Crmy_a2*GM7ERqVJq=H#AQPUs8k8hg#KzpU z;)tSzoU>hRWYsiH{VIUc?tj`#?Fe|bobOk)A_NSAnbkA0wV@1BObky5eb0!Vp5J)r zdi$Ex{v}I)zE7Jpo!1X<6zCgh6fD_&(2>Bc#%8#b%*n|q_}P;|ZqTd%gTMZ3$TBe# zn-vq1+CIwoqL^5S(dPWZm%b4NcdFp+uo5}Kgo@o-Z%3gddNYNeQ=jH*d1V$FyXmd8 zCf8aV22ikO+AV(+blT$tu;2&+k050DnOx{J64l~4`qGZC>t(0RddexrznD4QlamnQ zJw43)sHD4fb?P0Z>H>osk#{N`AJdg-|e7 z!zDCzbRK8TB0FM`T{bV^*1}PKLj1jR!ydcz$T^%`Je6RUHHGQfCxcf5g{QR~y@=u^ zRMMd4!>@si)2Dn7VSJf>o(HGYjXc(MP_=-iwFh+6DZ_F~m{V>6gQd=FJwCYlKvk{U 
zQQj3s5%nZv5ydY12|z!Hj%jg+ie(u}P~(=;54sooXYE^wYIAxL>r{|UGfYqL>LcL= zZha>cYkK^@Kdkk_zuj*<-?Y3<`&^5C;j9<(dZg-d!4?4Vsu=tDoO4A1cu?nDD_Qbs zK0HNc>TgvT6O=vYK_9l+h2#6)^X#wmRK0I~*!kpKvyoutsG&PiCUCQ|x{68Sq^}Cq zuKRGs#0Kcja?8r~C(!myx9X#(dW&?$?>J% zh$L`1wqr9=H+KLI>dUr&DrdFQYrQsHT_2Wwngbm1I>sRy2gsvMlW;hi*nPp z&VA44d4IT1(Zx8nNh)f^Xv_yFa9YNEaPD{HFqMlu5h*yK#!j?D?>%Oi^CiYwYyGp` zrqx5X@R;X%V^2Fz&58J^#lP>)ss%{s9EP0`5^1WPmq;M6a zoxaihS%#3R{&|tUT2E1kyuVdHzYt`9b~6%cgt{_#=Yf1J6nGR|*Zpt%9@dU28MO#& z^;bw2DvWV&T}mSkxHmx^Ip492^*xNsz|}gCDvv)MKd-&h}5`E^7r111?D3HeyXz!o9^HS!5aAp*;j zy$FJ;Q+95Yhw~d(61MfoP7AdQ#FHSG@wku7UA^6D-YlpcD|eG{Nsf-3MxrXBcftbS zg7qb&BYu~bILyJI5Hyy)K=44Fn{H2B{Uk?X0?kB|lGi(Xo@fI-P+Qsmn9)Lkxa|nk z3$Su^<9}7#p;!lPjhTTWE=O_0yL^E`rlA7rkFG3HWNUqIAPh=+ma>E6Pn=<&e6IUz z+jqK_uLMZ;*qdx8a*NSQ%&u>2>h%rcpse+=?}w#uwwMb`+5ws@~597y9|{jp?VI=2$A$fkO6xT_qcme z*;E)y=WAc4^C$G8d*mlpMHGogT}@oV?CNCANZjLg9GT#NSoj>09z#f0y)Om#g7lOE z7JJ&s0Jh*4*GZyxU)NI5+nC#)vQ;Wd_y*SH#g@d+NRh&*CqMb90I8T3^hh!s`^G!R zIp4*{J=(L(U#urN47058(Wg(IgjzDvVkvi;cb09r3Fdy#Z3l%2$Z1xuqN&!2F{&~6 zP}Mmrlo3jcF4LAu=UMKC`xP9fY6)b#!vfLT*r!OZU0emg43di2}oP z#f$c=Ub4ki`5yO8N%T!R^<6!+T^QCOZKz`83w$VZW~ZfTSq>=#3E(dv-8g8s`N6nA z3b@n&oeie{0bmAJ6*RzWK^W}7xNmfls&HPchXX8F$hsG_Sq#O+#b7}x3o<4!vhdY< zp@4YNS@z`_8uWDz0wf8zFE7esW##SbX}GM6;2iyQAp0(+P^;o5@tBLyo2~f}&ArdK zX{rYJIy;?V;r6Ax<&1e8T1*=99|JoM>sleIq#e`vgaZ6fPX`;E;~=F17s|2}GeE7q zHfhj=UN0n7?oi0J*XQ83XlKaLFP^L1e)ZXE7;F#_^(AXelXHySwWS)^5nhg1$t}@a98~5H>g@pf7;^91mwMDC+_gIf@yQ+SIbHI&om; zg^V3}il5PW5b(Bz$OKH$xCp`(w476{qcOc0t!>x-T6OZba?U^jM)L{x2sQp-(F~)raEwhfPV3Pu<0zdUk z>!q)xTA`3+>J#g1^QWng1U-WbgjBq&m0 zfzAUeZFT ziX?fd#LUWtvB%-21(GUett(zR?+gYpr4u1MD`1yVKmC0i#4gPsObi_;vM_rW$@=cY z2gC#n^eE7oeGNtNQ~GakVUq)P8KO4QrvybDL!VQ7ya=CZG8imXt~S#nGGq89Q790F zo2Zf(YRK#ff(`-`RSLf3@aJ~~^>MGIYmcWs)*TGs6!Jc*8`!Gvh0G{N=t}< zqI83lh=6pL0@B^_&xQJa@Ar>$#yDq>J=|lsm1nK>toy#_yyg{N9^!Z6IC$ARU?~07 z%!d%Uvb55$rRUkqKn%b8*^Cs4V}`kpIyuX6Y^w<(9)SQmOgL6re%**mNEoskwkeSp zAA_WnPCJj~kuvb8Qjb=EZ#vFwf2@S%*OVzqTvN)yQSn)%uHX4+i-r=~6uj=`mn_~! zGl*GH(-$J|lV3Mp(wqAAx9!)$(NLMl=27Iy(XrkC7a;8`2ew%u2nUQQE;@* zb!QRZh^B|EpBj1)uvka2-q?)xIZRKjwgwu;$A|3+Y|+1V4!W{b8Nee2RXbE7+$bcc3EX;3T@|)tpWDE_3q^E&flpkAk3EUv!r-bqa15wB{s$bce z1`>ZGVlM++^Pb+`IxE3+cmZBsH9w13kb(g`TRZ#KDVkbD;4LonbWlz^957WRuB?ENAPr3VAb> z-w0D=t})1DUQPSCE8+R+v1HUWLc1_UGc2BR3+r92mGe#uFKJz@46c6>VpX|3W*gE# zb!=+AV}dpF^IYff`>@|DaOxy~JLyfRjW$i`4eTrWcQlK@`b=E``D!p0eNznHR)KG;pz zB1c~zdP@Q;1SXnYe<5+QB5r?a{O(=V2e;;Nhrd7a9~v=jQ`?f0FiE(b%_%hT)ea*L zy?3d(;o7EL_K)B*0vAn?`oqvF^Gta7((ZCLnct`N!jO4SS0gX3;78eO3u9+Kcdw=8 z$hJTEcVI@~4zlsTQgVfp4oTmA8KG@pFm6|oosCnQ6P{89g*vh!`tRrKln{aJCIBcN zu>S$a#RF1}h}CkqJ^PUuP(G)*PRH%R8XunqYp7^Zpo_)9M3*`OQaw2yR;9kJn(_lw zOxrIQT0*^}l6A#|2s~FHP0giZatGUI34&Ij=7l{Mw9}YHttR{sLx#$7;Y}V}U2+nc z+}NY(n**x*#$J8Y!p^~@OGUsCrj%J}=*X+-q|7_HOz~|E$N5l(0O{86Itu91UG_Rt z#&Zzl(9~;oHKx^M_PkI+Gw;ZWawS89-86cQSnw>(j&8J}@rlsv2DN_6_|D*T(gI(^ z*7)O$W?uE#&~CnECz0uG>h<&XcpaoQ6T(w*X5UGh>QrTt;sVN(ZL!d6&M%ZKhm4m? 
zW}LhdqmZmGG(!K(A{kiNP~blV9L8t2`A1^X>j$>cj~~}Yj=t#RKSSF#%8_k1{o}i$ z($YJBeb!mwsfgy7S)3#mm0Mqgr#3fsU)^u>i0pQa!a3G+i<%HdzpmCkU>7Z8=~`gV z27hW&cq+c8=B}ai_W7X_+vqaKS)ev$eFIq_U=qknW?^Z$79x3w28Rt(lFzXl)Ko0* z%Ic3MtAMuylYEDkBd7$LJXWeges^cFkl=vp9_FnVm+J)jb;){ZXf(m9(C)y=Ef4`~ zm$p;9^~#-q({lr?bs-qTe+cq5-8bj_t}~Kc9Ww{)(GUCq;q~=@)?J%JR}LDifcA$n z;2)@#cqdoR#>S>WL>2p0$E=K=o}SEy`1p88W5T2x&es!lS{y)}|3Nr`V zy;@5vho~qmG@(!lfI`|tEfGFCo?8#z5*cr8K=pvRjLPD+dc6Gvun-$y6oY3W?qJz# z7#4%X2>3WbUIYhQ3)p44QH6zt!Ql;WUH}gtKn=vPaIR|tKX-1+{pZi0sOtmvFmpii z<<{37fFX=oWNKo<2z4D)v*102XQ7$fJ2zS(gqoPB3t(a|gK`W0CQRtSWR*V~7{P=P zcnB{!lO;rc%=H(*ZiRR=E3k2L;zQd2iHBfPEvuFYv5h4*qcm_;J1fLeB*5H1lh{t# z(+5~ZMrJ0`(ZLBVWev9syakwAE|vlt3>E`oaSh*Pz>tX3oG*|dfOjwr(q$=Y2rjO2 z-$F+L)q(LRZuX;&B$&q|&q@aOc{fn`yK*$yL28x){)n2YyoemlZk|u4k5#cJ{w_rf?e79VlJ9T z(ZlEA$Xlz&=Xo$$uj$+gj-W^uZe?X9Tl606Q`68)+wKZWN-`LDt~Clr?=9K-gcnrr zoGkyyX0rZ$A8llLf9r^A4bU2ov z5;>GrWKO>MMx0x+$i}T={A`)bHatPFdy$5Q5q_`Un0oo~=_6#cpT)5q^&Y6EBifZ= zh`ssq_w<9(xrN|l+5`A{WZ&oDOneI99;lo&Sa;#OWT%;gCcf#7JSW(5PQrJD>%sp6 z_G6HT5gMcA%~my;i^*f2R{q~3BnXWcRwOytEwK#)DO?G_$f*kO^;q6JeeTjwPx@;aEbeCHjeUxby=E*H| zv`<~Mm;flA?d_i-oajDs@zk6Ota++kEi;t63a5`#=LWus>{`>m;%vgwOH!mZ&m4(EYw>{Ap0Sdb{%lIPHG6?Eg!zAS?2Y z^4drjJ6_sYwp9BH&yJKQ@(N3h-fqHy&Ly$ZS$X_p9kZZOTAX13B?A5(dY3bGP=Zzi zof*Y_5Poe0r9FgbGN4e@QZcdswT-)RwD85@btXoxuP4(5^g0;*c#odVm_{F4z`WH zW^bH?hSA-tp7%V||395sW2K3dj;6YLm))=lzku6oZoTpp5C9NLCVRcDw}WTQ|H?@J z!Aq$3IrK5#@_n67Ke!X4%l1G+n@c}2CXBOUAq8jSUFo-(X2-<&)sf8mareAZ>EMsU z%-k1ukORVJtw#6T`j@~7A)Uq+;3L8k?e}*{Kc2kug2-Lc-czFQlME%g^Sx`;loM7Jk~MX`cRDw^hxCqEd1LyC8y<)b722# z78~UQyd*$;XH)btKDscmVnY{B@Ai`vMkFX-Fz*Ex{@LC`=sL|g+^e8dLr86r+oQ$S zwAFAB1}DS5N%!w++TKw0uBBDry|Cr+GyO%e+Dj={l|)d5{&<#sdX1FL=@#CRyXkdI z7?*uX_Ren74xwAcx<7{Xr)_jL(-Jk2QxD{5T4`K?o)w{-8+WD2eYj6saO)08JZWt! zNrAZO)_~YG1b|HOH+b=esX$wMsPo~0RQj~bLLVV=p}Tv^T3xwXw-@9 z@K}>7jT!>(xT`jQZ0yqw~b>#u)$kVlOB*LdGV+Ut8+-)4}JG9C|`IWVfP9Nq)O@dVeVH=DYtOK6mx@E+d>} z2nWXnaVQ8EoS+*7Ob12+nVDxmw+j}sxFR#_Z9L6D!lR4bL%Lan6Gkg!69@`mTkj28 zI9DG4R0mZaBGUs$=E2SyIDime157_{ltk=}bSWqa9C-plrvd>JCu~aPL5e2Xxdsvui|W0t@tw9?gy2c`1^*7Sn{O>&?GZBJs4!du6(3*>6+or zG%#5^d6b_BS4Ga9eW=Yv8r1x6U+J^Z7k01GN15#^`iN!Ueiu9py%?Zu9G5H(Iql%I zg_(p9;tK$%-plt=AVEgZRwR}cRPpH7oY(U0sP)I-0V+i@VS@M>K2pETnHJ|Ivr;_W!{O65wGGD0D#8Eam1-x znHoZ#0c#Dg9xhG=cm^nu5X~&^tBE@RMF9)AxIy#N7)JN>qJ0m^SeHfP+0x+wRN)S%5(7^b3Y?SIJb zdzl|Qts3ym{+>3^aYcU^Mp1Q&3u*1wbnf?}%S{nhbZn2ir7q4|UQKIS{f8D6?FPOF z;%gWMC&G{i#Wy(bmwrD*s^+>dAc7#W)fH?qnxM)|a9RYH4PK)!p)cR3=LzfCzH}Z9 z&3FQ@64$%WsW+<@v0W|kCkKQWmcOExT4{As|432k2;tt@QA=>greo1$Wx9BBZ_dF7 zd!a96LS)mvrF1NI0Wd{7vvPyzjJggfo6zN{4`mNoljxuOJ2#K+mbRP;=PZr#>t>%y zXBNFf?>rT(e+I@|8x4+Wy#HA|Q_o`PgdI%Mts%p_2~;N?mgful@xvR@;a3ne;h=-7 z$FsX!vB)-gpXBXD;?{-}aAfnFKJj@Jsgfl+%ZDeUoe^<>lQiZddo)uqDZb};lc7H~ zdQxQl+@{xZaOyZl%*9A${dAa)6~~^D8O~k%#e`igW)d>YFp^bITFa@k@buB0sU|Uu za`xtepOna!StMbv(1L`5j}n4E3MlQZ1c%+CooCoDL8V#xJ2TO!@CE|OJOhZ&E<;5h zA13XFqYjdX#SL|Ihui~?CGa1u)jT+VNbq^^1Y4?a(-LJyj{b0~$Jpo*p~T>$<^<#X z7>OWC9{T$_0c)>lbQ5#s!`KA-&m#nav_38yZMtiY;rp)?dOxj1Z>qAzrpiatBjeds zV9S0u#zC$zh!qF@^XNlnuYG+W6XHmnb6@gq$gbg!om_p>}q`1 z#kp5!%gOFweklJIB$-$IpcPG2%-h=wd9K-cWM8h4Zy^Qa{yh-to)9`gy(5jJF zisZ=fAX`*97-R%$3F`@B9p>P+(|{`k#FW$@=WvGEl{p)>stBli{kBYVmZtseF)xBLJ|MTTZHGYN@u`!mza^lI& zJX5(1M3==Ud~=`DeH7P>E4|p^YelxuhT#510s#AS>z0zBVq#+hK)%Fkq$q+7R2p%5 z)rOU3x<1(-r?=qRSbDIl2tXosb;29NVXU{L!N$Y+ z@RvWn=_%#z_r1-v(55k5_irbC@u6xOv4~n6{PZf~ZQ$yjC-gS(YkL(ra~K9PU2EiQ z>ZRk68lup7DmHHrV}3Xy7d~6$`Hg+{$uho+EmuOuaVutC z(msY@g)}2tU5UQK#6Sm)4iOW zu&nxulJN`5wLNC2zdn)0gg{`n%qO5CvD}1e-~dRme@^t!^!axD$19PF+2G 
z7xd1-J@MjqXB&XB&sP{g)4#)?qW4m-2pOmMxU(*%>1V-agGns5ih^`=kGkftG;l&t z1N+~Q=S5t9OLQcr6f6sGug^FBeV1p#?A6I9CCbUSbe03g-FdATUa`>y`>9IxZ`{Et zoZAsRU$nm}l}aI+TWL7_od1F13nFdPg%qvTHtP$7p7j3|Im#`|aOQt{*zVu{P_!WY z0#1ESfy0}@cw8|0bpOL{%aN_Y-cA!T99|=rRka?Fz|*9UkkR)q?=~eYxR)rF%waUF zKrAs}*G;5;d9J&7`2`~quGL~^!>-=3Yi99St8{+FRkINc`sKB87}=H zjSvTq-3xr`xB4}VUWZNv{M8eICpjjHgB%ZeNVCpLFufh~Iq^d9;1BonPUBbh_xD}7 zPtPZgQ06lBZ&W-%d)nhNhV;}!QUta44pnb^zrX+hl4<|7dix`_R^F0mQl6+Q#} zqJ~1+rS?0~l%A#iXF`P5gX~YIDZh|u3IOIX7eOo%2CXDkreR8^?>dQYBK`2zlR0FHrY*$RbpVoq~@K9=&Cs-$Oo=qZ-C7Tkcgzz&q&N>K>St0^Bt^w+3a) zsmZ|wqFk07@XAzt%Riq>4-{cp~0-yhR&=D;#zgMT|T(3d2Ts!fn ziBI%l77}GjwQu#DU9F`rWj+il;L&-wG0l@`u~t8G|9r{E>dQsCc}$Qo_Z!|FWa^gq z`ZWOx6dF=HS(%xFBN9&8YgF$*q>S=~n38MnqcU#f_ND=!kBI?A^7JYY;WF|B8cTJ@ zaWnZ=n3lrOv-!F4hMS#D_%6k7-Z+{$00r3%fn@!Q4Uuj)D*4AEQkNR~<44_p1bFxq zP8^Bf84!EVbBjyWGR&U2+zz3L!E-g0fu_KFVnKn>qTDo8%tCyzx5arTe?A)gu;PVbR7>0IdCnbcna-#_y+L^{Ap^&J zC%m?wRprp$S)f{C(KmX_6nS^7u)D3@N{6<+@Kc%`2W>T@=8X@5CPrTej%2t;BeK z>!41noZ$2ov`xV4h*qcrme$kvL|*2$J7KOhoxum*g%2!)MB!C1N-b)-M*4=~uXYhc zWHhxoKQJD#nHJsW%CFwH2u;HQ!cC55))dIa8x&E0-ehCT|(d>rp!F#{zyd>|f7Ul`|AAH{TP9Y)DYsklMvz6@^QPS`E_&7>|qc5_wY&(bohsUiCVW<@to? zbMB45K(p>{MoUz}CzmfGp3I0G<|~WJVR;D}U}20n(y_Cr)x-wN|2rSM`!_Of*5$VS zH;&K9^ zL3$W%CztT&87}q9ML{qcYeTjQKBnOta6lquN)HV%YeDZs{rX_MF~Vy z?ZPivaf9dG(W+rqAe5jh)lvJzh!`pZcN|Ot1 z-Ujpa$#---w;MnAc4f?w@vNi1Q_8c_gT-|Tu0l^P9C!wBn5|6?_HmI zEbp8sqZ}?+tUACq3A>z0wRQiqnfm1r%N|H?JIhau6p!j@FrQYQy{PuMoIGmEr@nZ} z9&<2>J-D3z`aLpm3k_J$O&|5wxbjiKDO5 zhy(TMb>hGEyBudG?qJ1oIp?zWLkdhk+*3uxv&&MMfz|yCDedL-#d;m}a2q5zW26fO zY;yiFQLgX) z^6xThHPSD8km7fDc*<`K!9qXXCe zl(mLLF#?MZEYA^|1Vm_kv>lfKP!1mBbHyNpbATtO6o2b%rWyx$EbR%&{Mq7+Bd|4r z8Om>nCI-avedWh>Iz_+V!|KL9!FbdvO}#&c?KOVDs7aF`X0T(zO3z?epXnfktSN;@ zjM<}0*pw8*Ye(wr`3t7-9aywX9dZdjqx{SLV z=fpydQ27n`54c=W-UnSDyiA6Oc!`ejKa0;?kCD^9Upq~;=KKWX7SXkd%^;uQ7;BA* zOfeSs%s+T$*B+EdeJeZ0a1tTzq97Dlg{`*MN>E`5qmElj31vp^HgU4M`#}&0kD_RbWpM@-GCBr`^*YAUr03e)lTAf0 zkLwb&y)$3!T?)65h`k>?zJ046-?)7)kp@uS@@boqvT_>y?ZJt4tom&xbXe*LRUG8d zxPzy8aLDbyfBzm|q!?yb)Ph#jvg#(MJx1u76V=xFP~P(d#nXape&o!C6fbr_g(gM9 zeeImbeTX0JD=>&YI@oD;HAS{K*}O#lw>;nC^L=xzu%J;7DsWM?!|uXv9_ybJ+eIV5 zpbx60S4~iKK;dLI1IZ_LARy9*H4KnbbyF{^e>Ko;z{Z5F{yh?2JjG z>qlOwlApZ*nFjFo{6HFCYR|&GDPjZ@c3MFz6zZi?=4n+)5*X|wdqMn<2v1%<=3+|6 z5#5hRP^*jw_+EyGKkiNfNMRa;nyucphA-#BbAprp{!pss1U_EzLzRKZUp6C0#sY95 z4Z+jgVnwNJR{YyafGq(jKL7|a>Uq)k>Hn?fThLhm%#8AlWmlcbD;XJ|#KE?qcndg|6Og4U3ruYh=Xsz_;lq3q;9$drzB~}q@Yunr3>7C#&5;o&eI6W#BBDhTT7H5R}I?|n-B?a8_*SI|&AeJ*rtKScsf z8w=n;!bOjhsEkw|nNfR<*x4%_rm()gu|U0 zYba8pd{wd<5RY~6O1tW*$Q(8$d}3a7AN<6`@0QvzftrUo{uI8spx`f!0Z3Vl_ZAu$ z9Gt;1w5aY|^#3({%kZ8s*4R*e_RNBr8lxQ9sLk-Ra>sa4q?)tfI(z(v0R|{=wBsWt zG{_7U>HdLL7YxjqKLR`8G$7)6#Pl7sRX129aS>S+bC^IG8(k~Qm~Vc+H8Mn4E-aAV zFKv6#;I&ALfYuGQ?2P>xLgw6AJmm>k-_0!PVZ$ITB5A)9*lKpw2nYJ3Wu_XfyYgsyX$}#+>>gUldD7y3}ZgEB* zqZ=Yy9+#B#ElLvBFvYhBb0I||C&A;QEIIa{#Vy3(+^lf3!ie^n{1F+y@GgX{9Dm?eA%Xd)k_?wGU7^o z9&j1NE$AQq$C(1Jgi2F@%i&$6S$aq=GknyKKz;q`?t3?T4qe4Pi&jzYBj2v@$+^yz|V!_{RBAjM*W%ZuSb zDUdUv0Z}(tWkgTfztLQtT7qWb=Cm-VFjAz5)H2dvRY?;;+q~VWjWF^6YjhZoh>#%4hgabe3Sxlv!0lYYi3hLzx>Z z>(|T9W-4X=>Gr+-bqQsXD^eHi$pCxv2Z`5p0+rBx3w-rY4p67~U`=<%rzrg%?tB3^ zw)XTQhj2CV_~K|Z3pa*t8VyZdWIhe8cdsb+sK|`qW=-}d$_D-(s79kGSXD5;yX&Z% ztxa`Ua+QA0rTpYRZ>Uztk3hdgy8bPf%@gC2lcRy;V`5?gets!s@nnuRCA7#OElcGM zj0F-w15IxH=k+d;fT48uxP#=#TZ6zA#+H<}Jwk$jq(I$a`lNU5aLf`Xg)z|5Y4&=H z!LtgyWNp@Y^!5(J*tl`y^89KMZlofj?tTCR5|hiuoy`ACec<)Cj{74lk&Sa*CX-C@ z-WXlIR~%wZ6?em|{F`sC38`!i%ap6XGW>o^{_$!t^4n%X;0;=D8mTf7#pZtKF1k{(y3LMU5j||TX_*>d96sFp^sv#P 
z^aHeRkc~bqaoY5eHb#x>Q}(Wn&0Ej*sxp!Bvfc{eqZ8OQNhZv_rFX4DO{z|T*yZ*; zJGoPkJQX*#55R4Js+S?q9J!)6ge9QGUP`t&s>?N?#(UQm=-kzy%mJ{r8tFJlw zZF*;FM&gXqEe%1pNGI?AR5F>OCRAMa$=fDF6O>3WTYjZdIWaPu|M|{z6XJ3L(iL02 z7M)X;xWk6?vh!ZJ;W-On%QlCdWj}oM%QXD5zSxJ%GqL3s_R6Z_1ef$}SoHt=PkmU7*jIJbWmfcSi^sDUPN;eC1lAqnj2BPbT6$)(n=XFZUrmOQd+?|^Y3RzFdRI)045c{#r`jo3PjoZBjf5Xsa|i-+s4U`9gGk*_%<_Y&LwAR-I8>YF))L86SF6~m**O#d2m>TKlg5<}3(dpjZZ|vyy zE|I={_kxr zUP5a~Ekl1Bqp~Q-ZZVPAc1ti+pA+*fDS+0UzmDk@HCKX<9lRc8=x$!rHEjc%BbQz4 zkbtI*;pnuP19P><)n^r8#vhbw8Qb3{q#RTeB>+aaDwi)6H!S1lYw0YnBFHTDh)R&K z8*t(rquYZ1lozi#_@yE;X}rn6v~UV(xgq%COo`~`!u8p7V6isk?l&IpZKQ%LHKcc& zn3@JHX@VUv94n|7q5PTs5I&jY} zpR9)DR^DaH^o`k#Js6wmBWy}9 zeUaz_sG1QUG?1B~pa68S5}2cDUf2(e@k4f_Ry zORY~|@PLG zk`I7Mi7=M|Fv5;+qgZ*HR5CR+6+8@F>GsmIaJn(472nEKky>vli4ifyAkPVq8Il7i z72$itNBi{`0#_Z_fZn6SzEsVx08F6}q4EV~?OIlE?{4N7G#_760_4TLTkq0h19Dkl zry10EQ^i&GSQNQ}i2)yUx`_G$7DHoEzdmjA^Q2@b{g6Z*LP^NfX3d=L2O>DYkh1pn zd?2Kz6{>dgx#ipH_C^CoH&b*Zf!*M8Ue;OAxAVfqg(>64y}88-muX=J3?x3Z`6C*pBX zqzr_OJKEY9*Kf9>y;_V{E(aiuCX(&v!29=?3C{6j0*~$sB#n<*;ogJF3)uIFNYNEo zi|SfhSg4lP);idK>ATAyLW&k-fiPbOf;$pPGC5>fU}{uM3a(@BRT6V6;h`BY_(d#{ z!h#uNp@Ss?8d@YkstsC&jHLl{Ayund*KScVn-7eq{C0PaHP8g(G^VBAx#Z0HQ{37N zVHM40_lB?w^%&ij7w{f!^Szx{`K0ER)4zTWajAqNx;6*j>5W2Gx4=k#UY;CGz6Yj1 z6&4~n$>)Ky=p8QB1|coHpb~K0es#^B8QBD=34$^6@DTS(B-Hu+-H+&C-r~|}?3xM$ zPH45fze~Ur%4&VLT`s+gk)1v3pCA8O<)6R+wJV-{hbZ&s^t zr|fydE%RelDCOpS*Uz%s4ro^#HmT!t>OwrM0qds&N9Ji8pgr=5>qNtum;KZFu`$+P zyA5P=YROBj2EHL#-RGQq6crU4mcVHL23aAvC!@Rv&4)*TPmXY;l@bNHGN)ZYgal_z zBbd5&f(-=l_ysR0gN!vCF$siR?Vj*fNTHfS*qm^N5C2q6LX7Ys%{u8*$MMkttxdAZ zF$COWqSqO;K|ZA#2RTgpR^0;%xH;q75Drq9QOh{W?%h>Lc(tn-T|EdWp@3rw{sjhd zOzzdMCWXOzy&#@Qqt;EomELuoEf!QWwIF_=>tERs@_b;2tb2>1$f(@^TN%TcF|+n+ zy7)_T=cHX=gvUOgSbu4kSn%Yr?bH!PUCPweQ!xs&>qsdK-O^KQoLENW3Tmr%SQ$yg;tvM!ZhMbW&5!Lg46Jx_ z^zO4Iv;6q%@U2bLSl8p6#L&u(SK_-TElb#l-JzEJ`BDk_NVFV>mR}A@EXR_6f3j|M z)Wkw4NMR63z=S=Sb5!WVD<;=q8M!PCQGk{hgp~y4l0rfyliQ%-Ls)?^#u$Dmh&(4m z8Q!A#atm;3h)mZ-`NB-|PqZWa%^=7VI-Df-sSRQ94uu5=CZAW2$hNw+1OLl)(!za7ox`l$u#8n%niU*rvk1Ag$<#?a(A2}E6ArY{=e&GY60Gp zRxueZcMbqGin6kD%lBVE6h#QW;9R?CJWyaD36DPR`SUsp!u@fg`B<}$YR#(MiTU2db0Je;Z0|XyHDTaJQ!aR^w zhlI~_$8(w}gAy3bB{1yfWnO11}F>RjsC!n!7O^E#O4IpWmoeDpOa%Cj+mO7 z!ZVVx1{%c;h*^j17KG;1g9KlkTE#a+CC0~N!&nLac1m%vV0IoTHO$7V#gU6*kT~Cy z*=2?-P!!~(U`r?Yx^FLn+d>Gy#)5Hb!dyTJK_*pgzwq|eu&!-kc9+;1pt9&2J|$Oz z`UK(9g3Tv<31Zn?5cC0GSWy20;Xwu4{CRtkrr6lnE|{6%gNWrtVj_Otoi<0z4pJ1n zt$D1kSzrsHG15&{yKw0I1;iS!nO`{uI@;nIKOat#~XCCz}cXy_-J@1VB|o)triTzsPtXYAd~{rZvG3{M>7BX^^4Wy_&THSt1FL?84}7DTxyY^EUK$F{+hHV zCDCuXe*NVD)E}@XTK(A#n_=*F-};j{7!po4gdlD@)C6a|nOUEzW_(8yC`v|E^HiH% zVIl)N@otVY8GVCY4QB_bm^_MW`n)=56iKG<=e-cssgG!Lly1zi-Ff@TlzvWMnR zm|Snz9nBAVH*Ub#eZ0O@m6^oyo}Tnp^zN$6E8591*H)~F^~-b*O)SxD&2lg!JGlXy?3U*%g5uF7q{u%BjTWY++9R|2$LD1G+-kjJ%KcW=offhavn5_;}&@ zdKH47N5Qxku{QzM2`mwWX1)oF2R7_+?*lGGmjt$>P=g{i=U@)5_~jXk`{KvuRp|K8 zuRYf@D6*jVx&DRxx51}}%AJkvg0^FxQbqcz>?T+E`PJ0aNTOKM}(Pv@`i0gU0t0>`N$JY&PgGWL-|Ruu|d4TM#j z)ZGdgufFh%{Pn#7!|jTnqmZYNs*!_28G!MT+OQ0_q?*{zl#mghQjRfnG2l#h|hkJ;k2((}Nd zlLbuH^A9NIKfyx8Ia%`*j*x*1OjLBVcp6f(IoI)NxWRq1VII#T^N{WbvY2yN|Z0_e# z$^;Xb@4B_Um@~+J!vp!Ip9V)(HwL$ELKz8#B){qW1H@JF*^Nrb2gfttxnnD}RcM$! zY$uAIRGvKp@Qd*k0as`LbBH}K@h8&a(!P+8zP)-zI_|4)}7$&ie*!-FSVaGs_J=pt?Coi^_zZ^ zuA-l+&(xeDQA+PhfkKq)lAo&VM})x(RS#<}P#g7HQ z+B-6^>}R|);%9)bQJ|nS?lKV75EFp>vKg8PYCd$tV-DH1qjM6THyYIeeUH}r`}glE z10zsXozU2@v$CebIf|QDApk`YcvswPdA=QbljiT+Ka;u4i}oAZM*G!IzPe;TmW4m! zJx|^Va2_@$@k-TBtXA7w>UDK{eXf6IK9$Jv=z~TsgQ|6Q<9Kmj>eq7;=EDj9DQ>@! 
z6#`Q)d)xBfy>p1IAO!9aY3n&tNF47P-_7g0t4d2orQyIxf-~_{p5FXg9axv>I<&GX zCxKl-^5gOLM4@ShlU@0S4y2VN4|}PdY}O0fZU`2f*eN-tdM`^J_NPF%z{M=X141_Q z(L3qvD_MaqIIa(d*RFzy2ODtr>{-pPSV|pz)4gjs)T8Qj#U1-(HX}vlG~is(FdhAy zijM5>&q-@BN{T_WO-XJu=Ll%7W=~dkw+@1_)8Sr+PZOMe_DG)q1<&;ePBvLnNYpAl zTyOgb*1wKyk@)!d_Ml*&f?1_)sV(&FQdEN?d7nR%!{KPLTLQ*9AB?$miU~9`mFm*u z!qp567>c7|XowIbzzm!c`k2}rnqm;t>;xSqz;ck4bxI;{x}2eV$ck7h^6NdkSpfrJ z0Z#Vxhg(*=OU-FXJizH^z-FC(@|4X<8E@2S5*LLZzEnEfY+=RyxFmZ=6Zs+DM%8&g zO!y7K_yHl(LYWZ=WC+97XRJW;w{-o8U^_tXn*gHLbui&TqU*jvatpZ7eANGrHT=;S z8$q*x_iFO!Q93XPKrIYK1x(W+%tsa^%AI`RP2;BYEN!^NmN=mTsy_G=pp`*xm4^t(?pAvWTM6pOI241&&c|NX_Q(Wb#3QFtrpw0eje_rMZbU7jvF`f3ysz`y?QEaw;k+-)2sUbZJekac98h zm2_xPI9un$10Z933o0)`PzvBj1)NB&hnA#T2r-xMVWP#m^ctFA6H+dIGXAB`{(X$N z;?vgvg!Jej#03smGFcIArqak>8jQ(TH*RlVrn?Jg!Ww`pM(qg#QUY?AQqao~Ey+>o z1p?YW`Ro?Af|*xO05&o5|H0OQHY_~c9@4LZ&!U4cYiYPdUSFRcBB{e+SGr!%pX^u( z-Eai;S{wdH#L)x+Wo}9Wk_=^GT1pBEjCc?uo6D5=u)sj+0t$6Vu!w*<)Ofxp2WTUA zZ`VBfTZjBHi_dPdcmjjHce%Mw6B6*=Y7CCFF{M%eg<H5SS5 z`T$W8F{3G2v7bOphoqGPSG=pn=Vaz?=7ql$JlQxLG;NA`e0pEUlM2IO(*!3(3u7HE zOt@m7260H<63b249@g3R*!A13f*0*zBP~=`3WVWrk!*WHjCeOF-(f8?gRq$jskkjf zkLn>q4I5EyRJv^>7(aQ&yoG42{`k8_n+xMITb%p5ZT7Wjek5{Un61;U5Vfj!cXaJS zq$jPSroPgQ;~x55hgv^VP&)>-w&9`Jocbopbupgxa^*&V5A+}N$fB2li{{=n47u6` zO|Utvf;O;j$pyOzAcY@4emuPY#tEb@3wGDkbaf*@Dj^M6!GFR2S$&EGu7*6GgvCp$ z*Ea`cL~-!u4#x1OedU=rj5bR2Y<4Ckg9NR_f8!44YvR1@)r`JXVUZeB6v2Py&%BOI zuCGwy5*A~8n?2mT@$umvVs>LwI(Y^a+S>1Ndd7!N$;DhWob%xMk+l+3-nQ9Eltv7O zrRfd|^x(+ZWbz%9o<8&z^%yd-T~Z9?WbKn=YjA!ZS2p?UX-&o!!y1jd%f-!vk78GO zU7KiI{RC2UCjk_H$bWhSf;A2ZJ-(4i%7B-jw9OplkL3~usrGlAhWDH)4vW+IUOB3X z3h1A=Y}%^Y>}-)4mwY~W==#u@OaEfF%-aTrhnP?sf%~l^&%{+geoWh5AhmWQux?3l z4mEIoJ85A-hEf}M#rv`sH{oJnXB@lwCQEwMkompn3Hre@m7lLr_Nd;z@Fi7GY(V`} zR~)G@?>SLTYQXWaklH+;LgT4x(vV=h93ue+6cC*h2|oD{M1rmL@)VnD1fzG5uzpDp zu!Psu2mxhOvOY~7cf(W+(c4+}U53hr0vf@x7q{|48F9eGV?^)N=oE3?j*a^Z+72z( z+&BuBtktX*$<_mV03eP4e=q#!Q(Ku{Z>t{11ObN;3Ep$@PHPiY;({Z1{t$NAv4vdt8thF zm~^<1l}DjstuqMJl5_kns~8A9{Ff?G>d9`}5KMUboOuk0a(nR@orJ8wjt8ucg3eue z2&Nb;^ld4EkbjB5Pc@><&(8cAHs<81y(N|(>QM&KE*2&ye<6= zC&++@q24hlpy1bh<0Y8u(=sd+ZUs5Oi<}M2T$`+>9|10&ZY^jAcKVdq3!bh&XK5ljdM5Vu&%Eq8 z9fOf`@~I-K$uq2m6nI$Ni24ZmCUhNe!LYVMrLl zu^_${1y0^d*;jbvf2{VW^14L3tAPJ`QP~-#mYxZi<6Gsd25#;FF+=&6&9|>z)7qn- z)mmJ8od6U!YkX@0{JOc}5+39h!I`DOO0e0O=JYO^71V)!{JYzwu6V(1z(pJz(?2$2 zx$sxE*qS)O5rbdqN)<2=vuD3`a@viPkQFC8O&>|qstg=SEvbOyd%4I^D~HT+BVsN& z|I}bCt@!c`)s{Qx=fP6#GT6a!zG%>hKo${f-1jWto@H#qo@+r`N!f#)Bohy@ndC^F#WPFwy$lEx;x5*Nr{w@(k`h z3%Wm91i&Ul2N~L>d!c|wGfjw*7Ds0u5F%XFpF`Vxi6u?Hnw#ZZ4xPSH6zd_*SE&u3 z>D{yc3@DwL?!J11VV{`s75PF?fmRid_0C){q2pteEgAyR;84C&SL)G`v)Nc3eRBv z2`V~I2?NK!5lhHQOHbb;-q8xsx$Hm`8+6XbU$+?m@46fg-ih_s_AK-3T09gb8Kl zcued}dyMkYQ&;n-y%#lidX*sg{mMt18dD~EsDp*qch#q4lkPnMmTHVOp7JJgi;;{?^V1p@ zTK~<1_>$jr`xZnX+krFyOhJe)J%9JV-rFFuBYJR^$6ScYLaX8Y9H4|HQ-l3rZif*w zi0$D<`4O{(q**f+Hl5jf4L_x8MIM4@i_Vu?+VP($(rmW!0QOHdJ<#*oFB zR^0MPJOX^L8}y3BtSUwZFo3WZSl(S=B(kCq!f?X%*~n4ONn1<14R*W?yDTuTP@u7d zC;}{?IG!iMM6XOiow|1;gv1^vJYz}?Zv8XeV0NC_Mqs|9LtMW;YKoX6dmns!***+X zmMR$~SF^kPBJ;nL>6@1%wZ?oyfw3D+hD2Q`fcKCzzjwax~1}@-H8AZE@(qe z>5sA(r1;;Wfv`jxbiJuoLYFf1Ht(y~{8oYR8a&lsO)uMXV{liKRJ#To!DsBsja;DQ zSh&4B4TOMI&il4ZQiHufYCYQr@@Zs-yk0DUqz;7LNg=)!Y;2D7`QbJpmI`srKqM$K z`Af7QRd@}Qs|7dwA?0yw>+dH)znSXQ4~Z_t|E#y4s0rO3>QFI=Yd&yWGKWv#MnXhj z)$4MHFyj1=7#YfkDfvu=0Q1cXaDqI8r#T+8?VEen+0xekSK^b^zGdrmvHNmPYnB_B z6RYd~U|>TEhAf~EFq7QRxN{KR4@bB=*7F&Cl4 z;)%eZ_e&}L5bjh3%Y%ZotJklW6H9kcFQl#fcghp~I(Upd&+I}@?Mq12X)mCt@2nrii7JwPZ(?+R z=8RDN=G zbryND#(x<5KuAUTONV3}K6)x>jSYS!At3TZcw_RtQ~{GYcRjv;EL2t;5xBNj8CwXJ 
zz^dN-^bAk^A=u=BTKOfb8pTH_u&wxa^nF4KPq&=V8QTw~$wFBd%d{Ma5sTv&=V3Qoj*uE1PS&k<) zlOpBh3VS|5g0l~Cr00^7tzyqcM=MGFb%p-W6e$uy+bSa9o{`O$6HfW6Y1LNQv0YJ2 zgc++MSM08umAt*OSx#X39ZCI5*)rff2r8>)UndGosnAx0nnOCtN4UtEH2w+y;m6$h$B(H)0OucZkB==$xQRv9M0)J zPPN6XuAZ)q`L(Kq@+Q* z1nzn;-uS-%j(^-SuER5YL(bV}KYOp7b1uMjoGxju;#y&BJPmTIdf(Sr=)Y|;-&Mgm zTw3Z6z~^{hgqCxoH?DWcCJi?8(l0Z8?CGyEx`CVUlO9a zK%+W4`kZ**w+o8S$|PUo+ZBHR;n#T1fXAKf$M6?~9I+jMnq!htdyXCM73kdlwzKgh z*Ka6do#8~7tFiOXg`_opf-z0HbI*sp6e*wozU?7Nn^Js2i2RPl3lsyNa)hbfI;P+w zEI65j4(EEp<)>o<_+mK1{0?!v&{HCQJGBSUZ7DApD+=u+UQ`HEs|#D>saBI>AjMO; zHze8+H1MTea8gw9a#|^>nRxaLQsHZ^=M-Qzh%$OW)9F&>(Z7RaQa7QHKD0c*+hZvp zJ;}0FiMOf9aQ;@Z&cf>6Cu!xV1ZO7EXZT)wJ8`gd0hlr3dyxqk$V z{o+rs6F5e%Bm}*o)0+w%Zg@99k~o2&82Kn}eHg!<3;Mn6RXO!Q*?hjgTPGR6UiZr? zAAhy@P|0L;?|GXTt$QHs|7LdklP-T8w9Qlc*}aTBt5mNhrkk>?uj{pW~~fp`s!0ZPDcFrj!b(oy|jTBr2+;*quT^?dL& z%eaxAH|vwtAKA-%=IYBvIm~TwFLx?X7Yn;^LX|M$SR?E>ih{$&3itL7sl`njX^=N? z9DdrHRgM}YV)rrki1XbEwT%mg4~vP2mvNres0pORIw|gaGIXi+fnk1=qcN19pTBm4 z_bI#onshBJfvWjmhr*5Zc#1g02Djg2pz%M<`ZEV>oO~Y}cu2e(tarCCZm@;c)4aYd zS6g;qiXdhSav+)mOf&EB0oqwQe##Ks`1eE3sC8C16tKslhN*;J4RUS*WUp==v(Wb6 zaUm4cIAkLJ03j280wEO&639_dUWExgd?1a0B0r?P@t*l4EMLrtZZf?m;1ht8f8_LR zYp;Yv8_F-)Sx_b-*^F}k;_SG)}J*VO1zobu}-mSl{MD2Ftn9K2NUGX zw7>>487mx7Vft^@J@KVbPRTKwXG}03fU(qWgU*%4mWN0S`JR+jV@tR7Nr3bP10r7W z0D}GxsfxvqflSQy5&oey1h=>p)B=FejO&RiuZQWU2xT_iJK!tV)+NJX0@q@FsYl#S z2adg2rB{pa#A{i;((2}Adx<=gy>fc0SWe>!I|hvN%Y7b$*RXWx>xk$Isi}FKm?Ljp zP7^E=y0u<-rBM2}8UJ}VFkvtp5aRge124f?Pu9Sr>Vl7hKQ`-9Rn`p<{7E}s49~?k zBqACQNX}vnK5!0ch%g;SAskw$sV*FqnEMC=|GrNI51B8g5<_7B#}MjcJSNt@M`+nB zfKZE@d+$8mN2KTXd!VpUK>&pXo*R?-QsVHQA!&7h{aqb)6iC#ssjeFZ#>asH<<4+F z8*V-=T*BV|-_i8LEO(@F5dFG0EyCAqHRMSmI{I*jT=7kyy6hia&%ZDk94gazTKNmV zN440ax-T41XB{8Y&*2`g=j2xW-tTQE%`%a14-yxOT&lepUjBRhm7w+xkO%}o;qp4k z-whdFi5RfEdLDwF9`R?b=R)2aJDuceDH7Nx6LolswZp=oqQi4#qL{mB>E^1K+bK0% zix2acj)Ik_QEpoHs|yt2rw#3>yz?aOLX7U;Ned5qw}45u-PIvZtZiV$FeL8ReHpF_ zUyr37TzzyC6txf*M3O49>bps05ZzaQYDMLsEY~!@DA27TqWZ;ESi8>pX9(bY0LAw9NhyC@1wssIZxd)5guA-ds5!2@zpf(XAGAg+O5)4W1u(AX z_u7G&At_z-w2(jGF=6TpWPg80kX{#3y8BG|iY63(ybCCbKLS)5X7j=m)VGTN=6Wl2 zkB%UlP~>XPfiPMSD9_M+-2?|oEOJ3B)Oy00`#bO=mUrT?J=~3++e}KW1e1>b+KS`4 z&qE-E-F7Y-?t2tbbSn9rN9_I0bHg1vgWll;$uYvM_y>25-3CF?#(d3lXP-X}p`)1H zDe&pJ_iWYijPX$%w#enr zN!Q~0N`63R?52<};H|WWy;^)Ot%OwSltSpI%n^!-00*ldtFGiPEI*$RCq$fi>gLnm z!8V8^`#y+Z!~q=b8@(S#v5%JViehr?U}~U>+dNFYBR?RAr2@<1+oeY}i16Zwgx7sr zg-4W<+FZ_3qGhV zp$uy}bfgDxmoik#tBFNv#0@MGA|GLe(1v%G$&ihbQv+>cM1IwJ)K=$9@q`I72_5>$ zCtvZ@I*2whwU0lK{m*#~NLhi%2t;A(Uam+#v)Y#eh$LLsFs^Gb`jEG8+`oh*+Dgv$ zy?V^0y22`feaL{m!YHzfk93{|hsznfGmZb2thKGe#oFJJ3h^F@r z`&K(I5Hf!SbdQmq8#KoYOR{PgHYcj~moEq(Pj7?J{>523R0~eyH{fAdE<2na_D&fu z#=kf>lcjvQr8tCl(0C&aFCUNETZFE_B>cRCq}2Q02?;@XD5vyHt$DLcT>hQooyHrTFge#~8|N#>&K*Pcn73ngq=_-xQ*828QKz-aX~9ow zE8kfE1U?L__p#TjIda2nmO_o9)^C^RigW5cfaw-*k`7rDb|*dmtscuOS0y%Vn#%KA zBbYBUkJ@M?G|8hNdW%5SeFZ~dnZIl!V{pk zuwks`kZqrBN*Htba^>(KetSw$zN!=}7~}d^u2sGl{ddn|1;^EUOt$V!eju{wY^mKE zR#~b`0Bnp)TOo_`fXyXEXd`iuXhuAOajE8QjF-6K`?G&D2lozY-!}oS3q^Dg#<6Lg zIUezMH-4S4F|c#`2SK;aA|Fzyf!HgDRN8EU(?0ZnXwahM>zV_p2M`x(L6|?AL-uo6zJDDp_ zc;0;?y{qea_HOPc6H40Phg~LBd-?Zr1#&-OX*_%O>{RYcRHY{;$nMJ6jc8qE(mh+}_)coQ$3c>FJ*@k8C zRg-JP?`3@LF_*Em;g9gFd&V^9>JVvXWmb`mgAc`t5tH@mw)I)3kel;9I6h=Kk)J4s zystjP)iof}R8j|-tGFc~pvW$BT)VH%bFqsDxmN21& z%jGf&PVGh6VpFGJ){5nG>_M)<>0bkc$afZB$lxpVk!YOD4`rw{DLLG!wNarZ<-OWJ zv_gF9!CZ!6vEzZ|tHY{7X&2*b`MR6}o~E&de!iUNs6#{m-q0^XQ<->yx`YSh&HWRh zhcnx+WMWjfK&e_Ge{{HAO)LZhm-5jrVs+{}#E}1RtOiiP^!BSDh3U33y2#R} zOm0#Ks}D(P4l=O=MIq^*uImQme(ilWg>^>JqcGu2XIPd>xon84ou0E%JLk4?<@cHk 
z?w+28K?azZnDdAGDdh;QmnF8ofzs)?0gi!BjTGhqy+&fBF<5BO{eE!E#1t zN*G2u$DNh=OOFV&N;dd|m3q3P%2G{A?i#g@oqSou%Ucu%M|@*8!Dk*&6MB7)1a`A` zFmBv<!9}w`9bVy7eIyL4`lA(^+7S_+Q?u2oi;SOLl zb&nU|Bogdc&2scPd-+075Z_(KBp)%4Et%N`<(wCc1pXX{v7=*3K?PAm_e5vz4Nnz> zmk9niv;B_TYlP4{W5IUz{0d|GS z5)PB7e*qw_6`2!Y=@fKyk|2-D262#=hN-Eku#k{n!(`f~yckKqW9|bBp>6yI=7 zyyM38>k0Ak@rv{z<(3P{0C#MjS?}A$^~R#*#q{8Zg_BcJWtmd4vMK;9Sx!jE4e;Lj zxFA7vHmEK&W9$y!+eYJJ3K|-5_GBwg9{7M+AVNoPIH5 zQ2cOz_wI?KV--Y)2Dl5OmF4~c0n=9H23}3ia&-aq>7Urz+NwxH2O9qF-rllLpT+@X zOnOpYRMZm|JMVxl&c4I|81w0=DS*jj*5`6L?2&SDslpCPSy@?mX~5)_`LXKi*B{lR z4v-&x`t&JGU+@J>1fSQJ5d!$}G+-0Kpom_UZ_im6Y`&`tb;E8NRH6VC;K}&-IPBUx z;D{+s$^-U4Fffn^uEQiBM%p~8HVk_6m)0>#fo)2M(G}#Rq-TS)S*e+rWYf~ppekv} z)xT9Hi0iWoGDAs;iIE$e6D5G#YMQe!^ZAN@$IbqeY1BBrUs%{Jchii%p=BpQT(}eG z;bYJG_%S?>zw2?6jQmIjRk`qbepN+>%tq5p-aYXE z_Bl+v)!rXP&kVj0zQ$-^V6u7Qe63Yc0Xu1hq6Xthn;9B)%!Z>t5*O#!?wv+D6$ja} zn%c}Sge8KH^nc(ZC^R8S0Fz}Us>#{pNZ>iaJzaTsba+^ZDfRu+7#0ILJXCRx)iovYb^9q*WtGBwSGr$Saf(tb}8ZTI2) zC&%61vW0d>gSnOMRV*JjH@}{_DX9f*us@u#%U8b;CbpaUK(8YXt5`9s)i5nV?f5@t z+xvXC;b42Q^PIHaXMuc$(c7Sjv*u|4xzz!Gk`lgDrwV)r9HatsgFQ_!8e4J_$;$FI8=)F@$6WH-S;mUpB~)lA{xCD}bOAqm~re??i-# zrUEFCIpZ?oZ+O;G9WcnpHvOB6IpTOuxhdGkcwjKD-5H$d(9O@C3f$Xhk2cHVno27c ztEkOvL3=nGsp*>eZOG%z36$`+6R38v%^Arw1{Ei=)szfAookTR1 z`IbjA`ypfMb?!M7^^3o~@24H7u*`9HTqe15^z=6vzmbLEan2jgG#x$>_&jV;@zuj4`jFngvvL!fFq%$ zl>}8_1{ec{t>%34Z_BhXA~7hr>pGV9{Q+~@vjGqCv9-jL(Fuh^ws|*O0xIRV0%N4x zLcQ24`x^ZF+mes)t|wI1sZxEG$zJi1?bkOismZy?6f%3@fT_ZDpI3)uNQ=!Bvho*veH(TFcOR~(@-}-RYOZY5|P3sm#H7@*JUO*p)6jDGLgn64_+29oNvAeNl>6k`jYT|Qe__PNl_3~( z<&?h{#%W)q-4Q*KSd>+mdJ|betU3MgT@2fyk@Ey*9^_UPv3W+~WV2+MeXt6Z(#sVT zfap5MVY@Y3Tx12llJW;=X@syV#tP&s3qU0|O{VQJbQl*)RTo~Tjc7At!KI7X#W>^n4(p7k*YS(n zR$D3Xu6ViNxezS~)>gF6dt~KDv75(vQ*(*&t91ky+t_FL)N0b{S%{b8$6#d)cg!Y_ zUJb_Fo?8_$StK=Z>59w0UH_{7DqbIQK*O1g!ZE3pQ^D=n4I7PBsQ_uL5R*m~cW+RP zc_guBGAa+w?O%BUVSPjgZoimB%2>vm zurPPrP4=*wXY;xkf$Zx>cq$Ca{4L)al`0Z88=lhXy`XW8=DPQ|I1)c8dYnpx%8+J* zJf&cc-k_`W>Lkx3slxNufK6Z68a4^()0a6qJm>>MUiaw}NJylmral4~bWY)He}P^P zuh#JyLD3bFaU0yfK|941Wa9LAdiWdJET(lM>w*~hw+}RFJC}^M+FSCe} z9NZpEJ6DNkXJM3Egu$T9Ty#fGvvh;IHeS)x0;SW|&o_&Nl9IAuX??fAeCF}em(u*v zEpWNv?0`L~KD1Myq&iARQv_%^;#2z|24Ib9_r^zmDJ$T4s{ZGz7~fpJAYOng_T8Oo z$zAUm>8hnLyKDI-O;M#h-W#m28b^hMGllaknc4v_UIlOZ*?Qi}6L{cx%8T%hQd@`P z7QI*`iv)#iW@o(@P9cQw)8D?OCni3WlsQ=h(K9xiMJkC{cp}scD4<8=Ea+8s+W7&f zOQ1S`^u|B`l8NW9m2l;A3P=zVNPqHG%XqO*Pd;{-`Fhkp-0Ss;((URy*7Zte$w$(D zgap2Y!!KoVoLdI+j>JP^#?B-l=LGVi2i1-Ty}-Ovff`Cyw^|IBLtsS2Fu8+JHz=($ z+HWn&E*zVC^^4l}o)ETXzKbiC6KRcp&n0~S_Q2u5bhC|AdzBp7+Qz=tQh#;bfR&;c z60m6+)X|ExMB=}JvA#2i^q9VHAei3HN6)r~j)=5-!h2U>&%l;F!guOv zxTYOUa*w@W`BBpOkxLzK@%jiwd%9~GYPH*`ChHuE{t|JObya8`BHjhMigqO=#H=^8@6ZkpUV&R2 z%8t$lv$5IiHd0|6ky_&k3$Z^g+Gggd>Bf^{TJiJ-ty}I`Rw0^c3FwBMDefaQ6i1aO z*ox*!c3bkW7tN(#t6cW<=<8%ra$b5{KZqDkFX8th{%~#DmzO#TbsFZZuRmj~3Q_rH zXO_jP<%qa&W*W})UG^5~SuQuSQeAVzi=TgwY%0q0^Z6{-dyN(eZqWXq4l;3&oMu0w z+*M)3L9Iytd~j1{@u0lsjB-w6&$1tJym-{XrcdtN$JN|svduXU$`CrA-Kn)R(*>b{ zm_~iOWRHmj`(nJYunc-WeAdOY+}+$g5pozcEm>N;6=E+^`Jl2ohnHY6r?W)WN=)Rk z+)?P;>mN}%Z~6M79j#Xf!ba|#CSqLWP3=DDC1JE6651E%vfGPw&K>7Uo#YtE?V zG+y0qyymB@Uw9WcFNN6% z@=%vG12PaPWL;^}Bb|9v_g_#jUJU6cY6$we1F37LBM(^)27cDPwO7568|X zNM^aAm!L@avfYKh^07opO}r}Paq|Ed=Kpl!)bF|zVSPEiQ{96gBU@#Ra@FjpP|kKk zxt&-Xtp&$qVcDbjL;jj{T7_B+`dvI$e;STC+Gz+Vg|uCQXG&ylrB+9cJnNp)bWJ|K zU#xmdudvQw3-WH=(As!IiG1}~%iR|9jBEMjYQZNhJDzYVKrl9u>^PDbmS;(pbx~I@ zSR@ATd0OWPkG5?0t#ePy?sF;=P8iXJk?NqtPWt+C^4~fZ?uWc?`ZnO6!{B=6oz_^23DFF_1{)T0=>stpt~n z!JC)-KGZuBAYX5sOq*F?Kq+H*hI^ms_*Jox|B#5n?n_MI^FdW4Hq$IBOs#IN$eE@Z 
z?TY96&x^5{H5VlHZJk5)Be*os=c|dkJ@Gl;-*Wf9sI|;KduesKRtLb z7vYTWVBVSVAeTu@#9H6*_W{Mc3oHiYmm<+Dl~LF3D|cnogCIJE8b!T_N7L`a8HgE` zXOdV7S*s;Q=#=*kbw>(KtTAj0A*@_}oS&ThcHzqn*Sm%h}oW`0t>Bq7I>vqITZ!#N!ux!BpukYXq3&#QMHO01D&zzk-$yQ1p^gDYjBka6ug z481zogU|r^DzO*Xx0Z6G!8K&{$28&7=*(-GC3eWlcbIF^DJf>H%nMVT+OvjZr&T6B zl}1Rf)CR{cJQzQFe3i-H3q|n_cn9}ihqutiqj+I9{;5%S@nDyfcR`q0az`-LxM7%R zmKyH|;^fqIQ!sEn6 z$@2UYXfkKN7AMJA`<@rT)uFkN99x_yWYjiZIsGU~CMa@=HoH$J9nJZ{64{oXkbp~7 z+d63{PdyQNYa*l?Y}|cmEP}(WUzgmG6Ly2=)g4sl2eusetG1`qMgO>f5>tM!1LyPH zQJJwh@l7+Kkg}D39m`|6KHaQLYF&TQ)+alw*dcZzmfVUX#T~Ds>6+hN!J%o`vS2Dl ziwq(iDD$Az4>8sGQkv z$%iP8EXc%OI5^eOEK!9T&WnPExIxgAp>+zs9J*Vt)~UM%90N(>bjob<%Q>9hu?2Sr z5->{fFL@X0DV9lT*bQmXO0|P;|Gf&?@kdopOr@%2=gP)X=~fo(hFyps%l3xe<7@gn zpD8itj=SoFyw>oBspMcSIEgFzAhEk58HHC4N>whzB?F{B7$qfS${Q5H$&zI>hd zf$-?WQu^mj{|ZzbjE&%CO%H!6$H1C12a-jhcf{sfusNSVfS;1uvl38jQ~u@Fg{fxw z+fGf{b{(rBi;{!i#?lSfO|-mjQ48)(nh>V9l!7E6Hku4f`D3Y%gS~}MTIhe((>d$S*G?K`RbeUU@E!--#{tr@ zGgN=^QT@s2>rJzxl+0Jv92r70gJnKJghoY?b0IC)x>D*UzoWk1+wv78(2>?1qXiM` z{f7z+R8-2>18~phV=>cSh;<~=6qgVXZ-jwvQ2a(pc1$=K$|nW*{zT&mSrDh&u2%=K zo4-;HGog55`W`;#8i{mUq*w%*h|LP7&r|VSaS=N54WnqmS7>z95>H8@VkUY6rBj~a z)rScQ!>SWrp1atzu!+>JKr3Ss73KczKkfn7s7{lPjPhw@&9!ED8eP9*R=W+vw;O}q0H=2A~ntx#4S)Y zJ)e70S}n`&^hD8bq03#_2d!aKjG``2lFz@TI@3J5plpe= z{hIu;0~vBV2yDbZHcym^pGT9n)lDP6Ox2g$$J{cy&%xTQ-Qu@)J#2~go~46pUm{uB zI2%n}8pI}w;ET2eDHaUN^BXgw_*FNbhg&{SENNeweDbF4h1q2+o)p1fu~OtUQSFFD zp_`4-7O!4U9#VFEq?u}0zHQxq7qv3UGUmocQ^0;2M`_Ib=Xo{c??me5Yl#DkrYMy} zZP<`b4?@|5Ym3d)n{xr4=czg@TX0fGn8q+V7ucH%o?DkfC~xt!Osewkqj4002Mo2o zl$ajRG&Hnwu~P1{y*&CWEb3ASrF?3(5k@T-Gk1RP%i5G9x!kvtwt3vropP$%La&73 z9|Q00^EFNQ65592>V>pKzwm1bu9NjM_chS2Sj3qqhv_>g*`i0dqn}a7p zxVXP|r@pz51S8*VB1DthmD*oI(4+%b?Na9MnFMT3y8>w21^%q@S#LZ?oEpmPjn`G@ zOU=20*)?8$VI>f8v`#>5dEPxWBEdaFR)6Zirg@GugT0dvvRJ3O8g5|>IkgU`4|7RC z)N^6r6B6}A{r&2oojhS%sS4Hpu(sae6P|DXcqBo&Yu|&L0}kSp!Rj;}NT2Uj*)L=Z zV3gbn6qyo@p|W2gqf=NEtAdHQy~}rFf;DAl!3i=3zIQuMuP5v-HIu@Iu`tV#FX46^ z_8i|+S!roq=!s&pJddrZt%cm~?}?FjHu+bwV0Qal#LThoxZE(bA;};LGwd=;R3#T2 zG90u&dPI_+)iltO^L-)z6tzPbg#I~)yD9R@DR3c zVYzG{h1KjIiJq{2^+myBs#!UB@$wq5X7ad$k}45CrEhvi4BR$xmp8 zwO&fmw0omu(dYZ94fjg67OSy?|6F(6+R(u2nIw!-8Mm5xOCz*9XE- zA6x#4?NA=S9=w|K{nY@{PF^dAg2)TGm{ z&?qfF_$rVz!UOgRr&b+D1w)1klaEdAyUkxXUHI*%yEw{y zdlN^*Ee}bOA#Z&W>hg)Q^Xj9<==fieKX_NOA(QL<@E1;F?+e8 ziBKjB4_=lY(;XJ#etR7Wpi&{6hV~vfAGzC41TD=`RPg`!W%vXt?RSO=_>xnNXQnBJ z?t)=Lo`YIh&Zzl|x?!23IY{X2p7ek2nTDqh&p{z!uU~h7@JoEg+lTt!UOuO4F>$F_%wU5rx%@_7y%CM+C3ZQcI#GT8Z7A;o}ZVY^a^WP8lSUYcy6{cNZtzLic z&p7FML}?!Z&GU4^U-?L@A621io;UyV@7%@#O2l-@rceQimu zvR$3GiFHqB$PgS4#H~a^bP=N1LFxmLP~-_a>@g(pwwRBs_0#$>g8eu6W&gKLXKNTz zt`TBCzJsK){J|po?i_5gqkLUj81pw529663evT*83GX834+VwoPC!%QdwdNx@>lYMLh&$%KU6^5*M%%I4fZ z`YDD_ck6^vW7z8JXpbDmarY?Y!nEJ4_og$V$>dt-PbO?Q zIb`0N^!M6*@-(CV3BuoZ11dh`WMmS>#v_`AYo({u8U6(PES+W_U#bPUhO|V%B2TT5 ze`Ouo5hBU2xpq$uqMxWc`3d&3zBE^Q=lSrdno>ItHLB3cp%we<&U6;N7Us+!59KHn z>FIw~{Kz`;YL>*gt$LhOXn)V^QBq#3e@Fp?Dob_N>MJKIg$E9IlcrzgfGv_a5KiBt z8`j9!JCBB>ip005v)a~Eq@msO7ls*|XDs+%aG!n)(}IEm0usJ|e}33_>f&F8{xd_~P1i`k!-J#Gj_si8 z_}GDj+TvShkh&rl+1H?ZGa9yLT>Z59DqB+*rbQ#NcDQaXq~GxiXT7c5V)kVZYA<}# zVyR-C@0MIt;XujHUPxhZ-2Z7WFV<^XHu{;l;gW@r&Svj=rT4MzG9S7$5-oW!@y z!)yvDe3ZvBJ!|d<7MhXRHoRTjt0$Qd;w6PRD#1$2n6ZbyYv#RYI)i?v5%=#0Fe1dnu2PG2mEms+O9dmBKlR=g-Q`S)YegdbHPJs{!jNT@<^{3L_oL!cgZl=_;SR?L%g*Jj(zY z!1gk2Z4oolCR zD&M739shn*ib(2U_0~pktJ7|2!vG>vpH6NUKE6{Wj3pkeDfZCWc>&rJIdP)7WV|3g(D#H}9i3j(X zzT6{A`c$?kzP??Grsu(ChVo$g0NFMgQn_4KQAjKgzA4#_^7eOi5XCDGN=;^;@$w#S z_7P84%gDcOjt$9EJ}50%bi?o+$;JzW@cw0@qnK0bV&GDM-_SoxiLZIY)gp& 
zy$~mLh=quaT622qe+ct`77gEt@ketY(|bwneJk&HyHRxMlz!auPmBvvo9~|x>8tiz zLp`97L;jhuV6COIV==w+^P1ddONCHbwYq0=Fn5~GblQpJK$)K};re2p zLMWq0_Qx9+(a35{&;Lg-t*8QUr;eF`zUBcelt17*1b|1nqnQUu75m(J>s2y3SN;~c zf~1dD_$~(d2Y0h9mE$U(>M#${5MSyH`c;+uSvebeJl(Y-y)&nY>L8}}xkdAm+?QS$ zcI%M&LrcMIhSKle452MLRnU)RyAF!`DYNVilnyhYStmvG69$B!ns1gT72v<v9MBSI}uBrKQdN2jCAtf2J({ zG}5qsnX+Z(^w#~dQLCqJ0T_(mgrH!fG-DdjM<%$&!0MkO6U0b13T8*pY~}@}$ssuz zmM>8lhP2a){^oXezgE&wPM1I0tJs3^M@dY$Y0`;piTU>I45dDNb0egBFTMGH1+XY> z*I=V2QXGj*R;^?f?S!I#Y?_7O1rg?zFLDY{yy&FS^0?JKbtp;_08L*l54>#X7|4E* zA{AdwK|UG&fb5r8$Vz5bv}2g9XWc3_@uOk`26;hZ@1RN8&oc-q*WN<~qzM#YC)JID z=lrv_2Q|-6nDo+JI-EJ6B!c&)QVGVAO551jcAFqX24(YQ3{0Qw5b|Q>IGXQM&Jm84#WM4YIZJl9Zt+EhAq`r^7p48=Y zp!4*7`c&pjr)1cHu`qV52sr=i2$EN^%t9|1b0-YlL= zzsEm0F~Okz0fm^DnC$LdUO;RDf?=pYF975ZW+OpqWxjMKdO>o4$?kX39-fYgHcT$Q z)xd~}cA+uRp=eccAvCLPiy))B_qh;i{%h?bAi17h+kOgwQ!v08S zcXxfp`uKyCBqZWSvg9A)^2|Uc`K!Hrs=j|vqoTe>eUP9@u0lqzEe-Z6qlx&Ef35c% zjO*HaU>E;vk&5Q2o0Ry>PHOxGD1bBi#{-;;llb>Dm5D03ADXlHvTT}CLb6@3>}%)c zf)0*!xFD=SU3MBV8~vlT*yD}8tWA%j2AxU_qcjEXvvE60Dc1pkZ*&T;QN6)y)s@{xol)iR*c763`n(pReZCd3S#8Nqn_@vaS61$wC(xbF;-Z#Y&h5QF3Fc$Wl>jdbfXPN+Ao*d!AP$gcz7d8 z*$y0c5a1+$=Sw~o0o;;n?(k>8_ZYQwS7Fbt!_h%1FqYTSq`-k@%qnUwI)`@I8Uv?C zX=`fEj7U5rtAQn~H5mC6>~n)?41qVi+}yene7Mp@s7n(?RQGLoi^yOS;PoN&%Q@lR zD^}$gXHV-FfG}S`pMKE50{qT0ZGD!Dpe3;`f{bck*<09YK;bq_`i-sRn6_%?H^J|j z^)x_O*3&}J<63(sO&@c0RZXXOI13Pkm+(2{q2imKnF-!7BVVn0j0TXulcE3rk-RA> zDHP=7>0;4*%U5LpOnVa$i%X_Hq*PRw>Pl+Ib+@Yt!`E=ld{S#TC#p*XrS+OQ+pX(UBN8~b}2=m94$jHd@hL;$jpGT zN<4h{25y+NyuAG6mp5-tZ!V80fmsF>LpTZA`Fa2W_D_U4g#%@`Hdn^f07e5W*a5)F z0lwD5LrPfKodaJyt3jVG%wlg&MokM;$$7GH6ciQ10Z zKJy$Pw0ZKB%}q@b(7W=|XliPj2aqige?S>pW>Owj1n~STcIe^^nvGWmm#iRK2iQ<* zeeS1E4*|TJ4o?ln0KjQ9E;fKU0RD+EWwWoW=@Qm`9yofiJ~>B6M=uQo3v6bV_r=&c zix&7;UVs+%M}W%<{3M*5DqUS&Up?v?-F`P&?UdzmE3eHrBqRmK zhwy?5e^FvO&I(LHVzij;q&8ogq}R0Hn2iIlI`f9PM_v5;_xAw?)#)Lgqgu(^P+oIS zvZ2_g!`I8xb4tp<`Ndt!XVowCM=W)!3pcj+Z9L)LXMkgDHPov&mmkoW#{bC4JCD2l z4&pYKRti{VZQ|lnX7x669K)sz?!Sn4_5E@a=GneD{a+BO+m{$Ak6dDjjzv)BkrOangg^2SDyUkV2htJD<IdR=)O`WDBAbQl0zu(rBK|M<$1;#EsJvMbI?Zfa{IpzCp;Oh#=ycPQGXlEyujJHbN;a476qBE_Eauk5 z%jWm1-F)!r6t?27Nz|O`6>_~R8#fv&k1iq5xiHG}BB8hK*Z2Xb9>S=dDwg{0JE;C$6}C zONRIob+7Eci~CvJKHY75wjNHkrb8o{ORpUPeG}JQOCeo$c%DeW?=0|}gR&p@92F!CRh;Oc+M8Uf9k0Ue6R$~Mo<@mWFt->q@ zurq~}16c{#WhrgJ5#J%ND5$gK|3^q;;T zosprs7|fv({B9}$KLOgp2#Cn!BN}wY=mz68J-GSCosIpev~}|d_S0UzF=PZ1g}axR zk>oJX?O0HN7034x^w%sNwj8ycv1~Fpb$kkA&Gp;-(e5~ga10V4O&?(*=(L9%f7kv4 zN{CVH8l-GD*n=FmtJ66uGW%)&6OE*_GwbEXUzu(@6J^Z+^OLEROP(YpC2bZBS2>g; zL++_mD_A!2BaF+0!2A{tY3t}vzy;ng`S==j$;#;GWx%Rk^+zlAUQDiz98h77!c6*T zT4Xm&K%@74dsc2!{Rc_q%~kek`L}(8{c;5g(_`G&iddO8yM30_7r#L|>=N@hoBo!P z%JjGv`45Y{FRU?nU4}(&uGy2kj%#r~n)E!6wO`4LBCKZ{CWdlV7 zdj`gMh=F$GQuhGb@p^a$o<9?EhV~C!@r(~68~$h-UslNJDrCbjXe82U-I;Pn;XnPaj}f-F-1sYMug_pn}OhWHN10 z>~aHYPL-6wdnsntTLw6OZQ4=*8Z<20<0d=VX^F7!{(3FZ(AK06I^Sh9P=8Z|VN*!i zkYsY%7Mn)z%C7HQ+v?shj2r{q>1r*%9IMG#r0xrFk~M8Fs@aSufOa4SFjO1cSSmu5 z_}t|@D+*(B*0WtFKjkz9-t5>_!onYQQ zHnXv^t`o>WhjZFnl`EIu0n`n8mR$)TZnU?5o}!oSNi{f~;ZRR*fotUl zb%km;N%}k42W0#GD)2Ng@#0G*E<%CFzIfJfn>%QBOeMi+JH`+i8jtevJ+1-}_BZ0o ztaQqPE$jh=82`LVy9`N*4=l-^^0T#6KTMf9df<1+SLm&yAgFKp>D16oE2~O}jSS^9 zk!Gii8)GCwk~_Y`Hyrxc+Z-&1%zq*)D zT8=5uh6ifO$)CFQF|%2oYC|mz?D3r1ft9ZH5dB(2EwRvWZF}DZ+7;$d&!9wyr#cHR zq1%O=i^@X&fX2$;#3zjX)r>?@f36)ML|Epk_i+3E!3%k?L_Z$r@d-sT&8E0}mOnYg zp{D2I5dMNhFd$n_@BBcT}XOPaRDQNa7e?&W1uPu%GVp$jOhVzm4Y9AB}3<4 zgPy?A^E&o<@$121&kDO}6PlVnPeR3S@AkxrFhdC{yu7&kd_UobC@lP&d*VXQ3a1DQ zvOZ*Ju~IwOHzqXKx3n=ejlR#fWZ1K}G>KZi`05Si#U@QUBz4fFv-~f+v%og!k9JQZ zXu?sD>%l{pT1+9}fzG@5AsuJ=9_O{A7F>Hn$)qNGaYoXxmv 
zL?v>k>__XFEN^59Kt>xbGBPVw#^bz!Vg%jszd0{vl9WO%V$w)VVR>nQ47`Ox-}I&( zKR^11(7)%H?@Tp>eT)a`r1hvc#&mym|4>j;p}o_N&iVKMB4-vT1MPw~Hf!BTu_lRD zUt|qGtiitCprHjbiC2*YHP;HNJ~h<$K#JH02|sU3@1x(f;|Flk)@`J$O0~ znOVsUezr~yY1<+c3^a)}pFRQ#9E9b0@{iXNa8OiE{^L*^8v#`Ec;S3EqBm4PPC|ND9qA+84hNk&PT`Cs286OR)mmI}sO0#UtFhGLvyfPsvGo@fl`|SIck<#byb3E&pa#vjv{2yq9js+m(TD}{5;1$ zdHKkBRyb~R!7u>$%6Oa?QH2{i#)(|7ocgn*eE7P$2DCUHxC5 z?EiS%{&Acm+LB=a%b;P=zX&MDHA}C2s|}QwMe6y3z2M`#@zGu~{k@tAq!}-bw{Ux* zG6dl(r1}5DLTY@0xC{Yx64XBa>*xG#)c-UGe_P<+PD_n1>?tTSX)>)-pkrPA*RyK} z%k7NEdA6V$YIX?e*UimMvD(oAp1{xdU?Ki)`r8y0T!$`KY(89yUdTUFV0^qjOg&_K z{`DnH2+0HvuEOzs(Cv(UfX(e#0eawRFmtGIeiK+#g)er^vOqD1-DcmJXuWDugKpMSlq8;vxT=>x|>`PwdN(TWH`u%pJ8~;e^UW(d>|b`S-66 zyztm%{R?1EfI*t$EO2%X|AuC+tubl5&781WpgKDVrb9 z5CE;LaA)Y(0U5%CPQVw79x<6saW>EvfoIe|2;fE-m-&7jYS_I@#7nYA{qRpd1j{Sv9audWns8R>~?2S zhW;7DqWAi|F{Foj*&L12u86J^%x_z=JAnBWZ;PO!H-h}&BiX;25Wo0tn;H^10bk8Z z1EZp{fTt_zbhsX4xiKpt)cx|rDGM0zQ8df~@jRc0n-J*U1Pm?x-s^Dqv{NvQDg!h( z3O+I+d~0OCVL)S1Pimqoz8&cXz?mie+qsW^Xm3NU+2Z@xk7&d9PyITXx_G~wrG@Mz zVok`m|0qxl&{jhGsch1lop+z6;yZ7G&}12mKLp-CW10~Z+4zS03 zgq${2sMnT*&+w@44WBe>DpkoEn{Tj1XP2w)^Rccw!l@o3a9`cou~EqCx9gZ@U932e zs z-1FZbj>L@Nmq=>4V&Ah3fm7c)Tr!}02}FdnwDc%erc%X?@=Ax~FhD%540cA5%n6zQ z0>3M-zlvM`$64aCL9R_zW!{Max2jtkjC~3mA_t=q#>l0$Qc2I;Z{Bb(P|M=Hq zetJ28P%4}rvg#H~H@iK1`LbTdTsNtpBgDay>(%^i4Bk7a3QIqouri4F1t@s}ETv=F zC&CdobAnWx5tS?QnlmvbI-@0~DpT^cNj+_~SpAUB<^Muh7s{g7IZ7u&uA3;kR2ql7)p;(ci+2eX$u;` zssw|CL*H^M=LbM7(^BixH@H4MKh&7z(8!eBjE)w(Sd-I$?Pf}(t&cHalv`R_QpDQ- zUhkRG&0le$#lQT*rc~g)6psor#8?HkI}+O&Q%Dp9FY3kqI?N&^<>3NQh0vr)qFc9a z+4k0pf`r&)%0;|aNvA4*te6n1#Ru^{v+4}KJ=^fDM)^O$#e$7>bes^3uR64*HVV0O zwOSLW_&t>B6L6BDVU>*P$FCHclwsv>82o>^<{wi~Gt+x>mATxwl5AuXK@9N8hk8 z+QCO!b)FlWBE}ly-kAm?TSG^`Wg*U%t5>t$W215{Mq*h<-Wc78j^?%aV5=l~Yiulb z;nesP_bP)V;8J68NfF&S6>u_WXm*EM7T=f-#>hVMkY=nH(UbP#`fAlv*54kO_Ux$) z*Iarc&j$-ok#&;}Z?ik7SWLH6?|L zTA5N3Q~ zo1-y=jdItoU8{XNVDiiq`G55Fx%n*Yqaal{kZ_xT02X4^JuE!w#i_~hU!;DTmy3br zsEH%g>bNa*L3(OTL(r4IexVc|A=BQ8*p>)pfF#@QIR9|)-SwH;?HFdQ#quIHk|wcL z12Bc$Pox?oZ<|CZ#|I=B+I0<&v1yI#yxU8+4En;71C|((gfti8IeOZ5IIZ*nA121^ zG7WBCz{GjEu9YZyj?I(y&?W$5aZn+fA@Rp_0URTeH?+Wa=;O!ix>sNrT6wEuqd zrHyawd!<7qmaQRJ!BdR4N(F$ zk%g*GGhKq^+~QXM8bE~UwfW=-O8OW%6_b)arrO~{ps(5j8++j{NBewfsc)G$dfD;U z(*V^sw74PX(NVHMp2NAXk1&+htTx_b1XbaVE?f|t>%^%{+iD^UBH zV0J8Kv)|p$uC732DAcpcgF9GuInJXngTkyp)BWygIOjUm87hZXtXQGp|NhmEn#&Op zzIB)K@r(mP7pw8Av^pHS+zAPXS4-ShA|n@|K3caa&n6NxbYc{Wcm(W6uj_4d*Puz}R1FS~);)M&RqvrW+5eqtR1}IG(57ucvq$z`i?#K)|4#_oG zIlr;N9UFqqg)UNM`n7iTPiG95bX?#oVOy(R7KM8t;pI)W`RqOrj-QTIj5%?-`Wrk3 zVCgjzAzY5Cj$L=QCv*+n{dIlJ4J3oF@#tmZw0xb9z)tYdSYL&~;1~nIZcjfo*2)yG?o9L1{7g&*+i zkNkaJ`VqFKEV!uW){GiylLs5Yn*0`bca-=8rola!-yPRd6~`3Tl8TY84a zXoFu6c4uPOtg#4II-(hs#c(M&0H3`y34Va=S~aztS~rA>M`WKWzZGjo=MLcoEJg8R zlqLa0zxdUQ7tC5^_;(!#4{(9MCvNkG9x+|+!(tCLr9-vJ60BC7*XCGXS9g}!Kjb-~ zem^ok(w~$cp+D-O6R~i8IdAW?_ubtyLG3CE23#gb?aI7;i+fWnN)Y%8ABCiv#V(?b zy>yv)*d8=F-fdtkLO*3MoGX7|I=L3_-@a`a{bFUV$IaYkX7^W%z~qgzjPi6y0t8+- zFZr^*zg0|(3TIT#3ed#U^S*I)W%Ze;JSvy@8Y$x(TGrb9-O+2)`_ELuM8n2ir=Uea zUfpd^RJ)eJ=VvQ4KK`H*a@w*jOsjnf_2l!FHyd_+vMmVct+ObByVG>6(sc4oAl5e2 zJ;mC%u<@kArz+81;JQE`o6e6ay64{Wooc@zBj;kidsR#_OsN{0*MSf|))r{zF)40z znI9ZD&9EE!|8I-nbaeazW=1x%Km@^;N?ws*r8goRzc`Qmd#bp4L|n0TCiY2jP>s9^kUOi%(9h1S&$?t`B=Og+qSCnNnpnwD5Ej% z@hQR3K6}FmSR-f_g}rb{U-Wm=X?5X?{2!Z9~i?;zd+@HsHS0MpSQNYl8c%40LJSUyGYygWM<@q^%fheR5 z?0)^}skl(MYdIr_T6* zl~QtEeems~5B6|EtnBDNa3&D=r%Y-T4MI-c>)vx~{c@I*%c&d+o7o&}ThT3*mbK=I zx0psS!=RF~NBRKSmeQ>RK_zUTw&ER?(-M3C^iLGoOkQn(GhTBFeXVUo-Y82fJdTP0 z{zO!EdPo+|jArpqE`&B$7s|=5;oPy`DC@){>+o9e5&G24>RGYl3I~PBN)x;X@4Vb; 
zsl|xZ!6R=j5qtuA1nhTA#=ob+6iasFp`i`Ztj%#I7#F<-e^@zc>Q=~3i_uuvltI^x)3ek@izCy zhRxZd!jo190XOf{qYuKREu?2$@Vw#u;EB?<{P%cP#*wty|9^9~v9z z0A}vV2(_&Lh@Hw-=-XkYcGWttT+5TupYq8~T&5ZF5gEr^C+IU!c1PW_7ge zq8$b8bXuC1y%YJ|;Qqq9^$WdpFD;Pkx`Rq-jcanlV`Y~Y@#BU?z+{rnf4JN=CA87X zsOkFCH( z>hPbiC{pPXCNNaAL^6To=Cb-@q zS(;o|Wk{zUjpkNDOoI0P;*j|jxtlKmNh|PuscUV08|_bNb^Ffil#8FHeyH{^fX++# zu^VT1KMvy7%;ac8YeA%yoCJW*vBOR4&wx|aDoN`(d=DWYM`w9SCg9TVkl4`O5^ z_Sh-eRQZ%3I1bkCROH^8#vtz~i*&wUqYxmhz-SrIy|3hb9c*oFGv=oc3F+>aeHV$0 z-3^TvIoXOkC7Q8WCf(=JndWCOEN3RZ!|FXIax2#~+tl)#T~5_)^_^%D3l^=l|23V; zbRr&D@O{a$wbWH_)=Ny!6i?gFEi z_q3H1&>^G>U01w|RD2)H$9ynKk0{-nTe++rxynW;k3+9RxMaQIRvG@}bMuh91Wc*U zue!W6{Y~K(zY6xj+)?}30j-(+Vb!N2f9QG0?64?v_QjFF3Qq5YSE(UVvWpcIoZL6& z7X#wg6E|3C3p4YK3{Ij1=pvzGr+B+vTI2VSPYy;XB_S6Tf^2 zI(bgxj|lQS3DR?M2%E!27G}?6#xlY;&!p_MwT;d$8{%7h_9NiIcz>nPyCd$_HQDf3 znPG=Di!vhETNwU7b@a^H_3Z9T=ES7~Rt9gF?b)jtyT0=XcFWs;tkrL)2ZLa$Ffk0% zgmqz(=>6m2%fY+Yu6M>}aEiYjQ+?74Sakl=H%vpQX7hjQHWcarZ9r#t52kFpA}y_~ z#82gX;U~vz_m7cLSIO7g;IsFXQ{USD;LgI+;Zq;2mzS4ENw#^5@aXstCQQngUkodk zfMyQMSsPzi=P4f1$Ayj>)%|~76n!Adn+Sk6K>{FqdG#5d4P+Nv{NEE7+ep%Lp)7Pk z{_^@%D%fB}q44i-OMUrn7B}JswK5Dor5ohf>ACi0POklJaTQz0B}V0Q7`ToaD?Tc! z{=Pr6=5HZP$}%FP4eOh{G|``37(%m2lh0Y!fiBp)PVqdJ3Qyq_l-ZX8P*b1|PLiaw zY9W2hnl;O43GABJll0{f!HQf78~(gB^qvTbqSXQhJ#fOU5=VUtX7@9R<7 z+=z^<>nWOwI&1GNaZnlZ>#T_xEia>21d5vrd^;A2O-|iY|8;rjIvSiWlSJs_3Cc#V z#wdzWOY8YF;Woj6^)eNj@Q|vo(Bos=1=W-W-$gv~EeZIx6^rkWvh3GtOpJVQVf8fu zbPssqLl3+&Kk<*bW$lVPlO!?~M(}qDc)Ha6{1CdDHje}vFH_pPad8J62t9%l zxP4oss6tae2MGoEn{U9@t7fs9Ac{?x^f=e2S3CpcrI^;{(Mrn7yylCz5KsRndc?H; z&%fxA3fEk0@5rXw+!JL~(i}>|u|<5fo*pv!FoY4%6KOTMqa#Gwx;^PmC9XSUaq?}ZIDkBtgP&1e?N9o;YUFzQItC+<+KKQGljH*j{ z>5f<7ASJ4U(7T-im#5WwyTtBjPiS#dH&j_V|E<}-d2xdd`;;OFA4um9S5pR;rO+kCj^#XE<5dB>TEV2pQZ zclq%$_a|u?qkpQ$BHaDH33v@@s3?^@NoBI(;I?=Xs)lsvP)(T>(HDf%jDi}#`%dT{ zZYm*zQSk%j8myOcQuguIj12*ZdU0s-(OduDFJ07lb~FxO+!h+eE&0HyFDOKF0Z;8Q``M;PaPkZ(o778?AS1 zyYUZmL=brsbq$-Khu5z+G?Q}*2%#3l#?O=wD%#o2>4ZS)o%>ftP z!KY{?>#79`2J)I3DMvgAXgS2E8B3?3`||kw_W9iKU)Ddw`KO^Zr6VH=opz}#nIj;@ zKngm#(Xp088+*|!&##KinC&+&E)D?(psAH2V?0;sxtFH!OLsye4qi-KYMT$Lo;ElT zKrcTClz7!oKb{X+^y75$br0(HIg$_AB_AyzWiFFUyw|FL{3%L_+^tRMmYX}S6EKj1K>{`W6F+K3z4ZDlwXII(;S9zfN`Fvb= zl)#V;SBn2de6-tfqb;HQbo4Ml1q>J?1z>c(SF@$-z6f$Q>Cl$nvYG!60gh3m*>$;t zok+T^O!BUlE*F`c7RWpI!PIXI#gq`IYQM)H7J%UKf>*=t1k*T_JU26sMdk+&-qoKN z>7vsPd>6l;_T&WdEp8N{)%8GKkJN&J0UI1eO`leKAqBNj1pw-l9hD|=-C%|mdsCZ3dCNTu|U$&K)Vyj zca2^4oZtsC=wh~qE+Q>RSokj<^P!;l5e*NQ+?Lm-S5e5v0DdWRNtWWA;>tWU+dyqt zMOaod${Q9;8Y{k3`3_>00umH;q);C9Ubj1oIhOfgm2g`d&v>_0vTQiTT80ql9%Jo(LufRq8qL7Ro! 
z0!`3h?Nax+BvRbs9KJVkAwBlL@H{U3OSimqz#`f}Avd{K$0@HoX3lc2SvCbmY~KlO z264Fta%!CCRz(exs3xxQ3_btB?<)kxI`HVb;Lp!h5SuOG=UsfPFB=}!z2u3ThkTAw zu4taFeUJCo0D=X<2g!@X+Yah$SD=?2D{}nn$`xxcP=R=!!b=9@NhA!rJ(=e>Z&S9a z%)?6W*1~-Q-8GGf;l}D6GBPrN;*X=?SFs-ZawumE3j=!p3$|a!K+exAc6eYJNQ`zM zbB}VpiFiM_esF985_O^eM4xsaGJP`PfY@*io0XHX@h)oF8cy{8&|dtpPwg9V;-T{I zjI>l@{J(yJQm2EA{+(Zx;!A(?$^n1%zv>@z(laS+Shbq~EU0^RTfIfhcaIOW?cq_`}+R0V>#^HZp^&p-d&~Wip1O@fWAR>YpfEcUF6Wa_lj^MKoZXCrH}@ z^t9d&s*o8#g+XJ9`+?q!e_#xmKEL`k`LP%ZgZeNrG&C%Xi^N)YAQl*c_Vue~=H?iS z+8F~n6=b!}TRu7&?@Nw^YEGSJ`c1I-e<`}ABxzTzZFmPjXGGpF^6wTvf`GLJb$7Ri z4nkVs0kjwcB^ZC?x+esuy!q$SrE1mHt80Oqrek@IC(_-ESFf!1@4t;WjJ8(;ZuVyJ z+BW1Eyn27JdpdbqIN*^=(1JMkl@9F9()_A@jZ<0;<9cf<*$9^1z@V+s)h&HQOF(SFxXd*rA~(UsOXs>Fe^DSrQSwCPMcy)KJhF;L&=p~TrZ*b; zH(-F9{`h-(+mV0fq9;Fct})98iV-?~D4WXtooM!u6!1wB(h(O_9tqDA(rMz7tK^C5 ziG;b>Rz@(Al^6wq05^hz&k;S*D)0u#8IVyVr>4?Ski~2~Y#LT+B^iES2!abL>~x+6 zbJB%-7`QzloXIaFLp*3tkN^dZISFI~hSQ06H=vpJ{N&dlW)vRla|SywSqFqDu!pnU z#1f1*QHlr|6Wt8lK~B#=mzYlE9z9X!`n6!1NZvxHuHYct$$AwWdcJ@RxFWx@!y#V;7u5ToGv8B6kLOMRz!jiu z+K1Df5w3wc1?VG>2x<&}{b{tQMKn*T+bHKG@FiPzwUad5?rFV!j>g&P(wT13S&nX0sLgofLAm#wFwi0DtceuMiuwm z4UK6X#x|fNkO~#NzV2`Z9hPh5ys763g6-aFtCWV8O2CAK?D9l_WQU_jASP`j@#lNp zz5_IrKsJ!(_^*0x3ZYxEy3q7 z(a3^RB7X-|itMKVd30XL;z4VL21KPRCKUA27qR#FD-zt`yK9|jBPQQM^zO2FiZ)PUSwB`A}7L2BAzY%OkY zD$*ON0-=I}s*c6`LQLVYcLUE!Hy9Zu@;-_UTynN;OH)%5#Xtwzigm!JWuI3gM|ooX z08IGpfFYJkh8&PENs~a1$ROe7kwwDv^U>0g$H= z79vW;Je#`{uaIzL*au@lIDWzh5)L1;ZjdsQS`ylOT+61OTZna22`4eS(fOnqmU7gm znm3Y%H!?E9$%Rb?R4{yVSRtd1xd-#mkD<_nJM_Vk)Z-MOz8waawa$+ng@<&y1Nrz-FejC{^fC{`=_HsK8iW4Tv z$$SDdkVM~t`HctFA+~5)S1i?WeL9Eq9Ee)8B$+sWciI`j#$WkFDisTQ;@ zFnt;>t6@quQH32{V#!_mX0gkueF(SUzZ)hc9h07zYYMAoh2Wd;SOd|6_Fk0pN!z#1 z6a&#ZM(9KJ1xggFir0 z-|*zFGd_&8C*1?RsDNwHJ6f+;y2hsJCswmpk^~oVYAw;HHGGs^bTQXX1EY>6b-1@V zot>SwOKDg;KsNG_gT23G0Q>|GS7I^WrS!O8hNE4i?y`tw!GrrqP)pMgY68e5Iw7Oy zFNru0g=XN;0QQxg&nK+{saZ85b1k`7nokV8?>#$vU0R;*Hh=KRnk>++=8de+)N= zPbof6YwL}F`X_ANe*!mpX*%@K-b0$bI-&X}yQRx>_w@N-#~P0ppUk*3n1<6jV)U*Y8#JH~(WfW&l~xe{Oxd zR$%Qs0F$W%w*x343nhBdvQLF4d~`18n0TP)Zlo`(>cJoX7_}h(v%|sGXtVUtfDrxHqY0vdAsEc#RdPJA)tn((P8p9X(R@z? zOdV(NYIlBz0YBY7 zmkfF|9OYxU)q0G`6A1c@O+l@1bLyd@(s1`+2Ota8=KLAEM;YqxRS?Zuax*z-wHCRI z9hbg%(c0x1Gkbe3Qzb?Cdy&P>efM^7M}809?RcvH38m<0vQjUB;*q&EaIei`Hl^>_ z_>c`1v_4A|R1(-%HmuoOig_r`5CT0J`lPpLvi{`zeX9jfwX2#CW;5*JbllL~_caM> z>Ax>8LFxbrNc25>&mAWEFTq!XI(FPEyFARtG9gAu2rz5n#U#@YW^qTY6IS;Qe1STV zD-`hp+pr3Q6J#Zmh_PI%Wl73NuQ`JW6keP4Ya=4aJ>T*#J1lJ!TJx&4cLNIK06_bR zS-_WNHr7`mg#!Yh*9FV#1Z6(zGRt}wgAEF%&uk|ruighYA?!AoEFu!H-OS%l_?3Rx zj^#0%K+tmr>Z7H-U|uhOooL()RVypnJC*>Pl=CTEe*H>VL_}i_1B2k0WFvsr(8sn6 zOd2B*{f>2R2qClMTkIt@Dca>8wmUt-Xg3MVBA1awcf{(x(43QTPY%|0Oblreh+lT| zkpp3_00ZfGyi;t@74>Qk;1MoWJr)Op(z?_2vN%OQf!o3rZ++AkGOQyJ`7$yz&TaY1 z*VZE4q@pOWA`l=7=GXu;_^S&JP7N2ucoXf@ix-hfL}rMu1$fqFq6`(qeaZBr+;>Mc zdgBngZ57S|1Oe+s>PMtYb5f3w5DkHq03}rPx*k=N@fdj8&FM4B=?NrG#x1d$-j+Ld>^LX# zM;D}&t9SUtb8JVozf!@D`(VzIT& zCDJ%qV?@3{Vmx23fjaxHi6vBmQGA<83d2e4ia*9JAau$JB@xLLawHsIA=t4)GszKweLURJKQTznlNvMP?#mEfU z1(r{nHhh?HZ3Xa>XzXK5j)gA^N*y?-IBtly96z-Hs;;E3JcHDr2z(ZZU8IvDR}PE` z8nRkIgE^^p=%&bnucD8Xv=M}IkZ^BsKft*rif-IuLXP3}dJ?JG5P{Sn9L)-W>n$Nu z30oSu$^ZQx2{3nSG||Sh&myz*V85YFn7=cdQ(WZ}O!_Immug}o|TTiKIQJh-CEn*0M2#7crBU#vig{~nW zK93TtpFq$;pn%Wc9Pt2miFVEA(4qTiiBh16ChE_I_BTfwb~$6zH15x-E{F1#By%M7 zES#NPk+3)+h7|Xz5Xr8&3V>m-I?1$M`X_nqXn4J(KZ|sL8HE>x4>LvEkcc!Eb$1npC9{K5R*;2H_ z$TkuMAj_lnH&PIxOD%Yd@o)5mgg|ch!}fEcRYnUzR$StQkY_>qWdM4Gwy_vtAWAw? 
+@torch.library.custom_op("nanogpt::mm", mutates_args=())
+def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
+    @torch.compile
+    def impl(x: Tensor, w: Tensor):
+        assert x.is_contiguous() and w.is_contiguous()
+        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
+        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
+        out = torch._scaled_mm(
+            x_f8,
+            w_f8.T,
+            out_dtype=torch.bfloat16,
+            scale_a=x.new_tensor(x_s, dtype=torch.float32),
+            scale_b=x.new_tensor(w_s, dtype=torch.float32),
+            use_fast_accum=True,
+        )
+        return out, x_f8, w_f8
+
+    return impl(x, w)
+
+@mm_op.register_fake
+def _(x: Tensor, w: Tensor, *_):
+    assert x.ndim == w.ndim == 2
+    assert x.shape[1] == w.shape[1]
+    assert x.device == w.device
+    assert x.is_contiguous() and w.is_contiguous()
+    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
+
+@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
+def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
+    @torch.compile
+    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
+        assert grad.is_contiguous()
+        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
+        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
+        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
+        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
+        grad_x = torch._scaled_mm(
+            grad_f8,
+            w_f8.T.contiguous().T,
+            out_dtype=torch.bfloat16,
+            scale_a=grad_inv_s,
+            scale_b=w_inv_s,
+            use_fast_accum=False,
+        )
+        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
+        grad_w = torch._scaled_mm(
+            x_f8.T.contiguous(),
+            grad_f8.T.contiguous().T,
+            out_dtype=torch.float32,
+            scale_a=x_inv_s,
+            scale_b=grad_inv_s,
+            use_fast_accum=False,
+        ).T
+        return grad_x, grad_w
+
+    return impl(g, x_f8, w_f8)
+
+@mm_backward_op.register_fake
+def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
+    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
+
+def backward(ctx, grad_out: Tensor, *_):
+    x_f8, w_f8 = ctx.saved_tensors
+    x_s, w_s, grad_s = ctx.scales
+    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
+        grad_out, x_f8, w_f8, x_s, w_s, grad_s
+    )
+    return grad_x, grad_w, None, None, None
+
+def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
+    *_, x_s, w_s, grad_s = inputs
+    _, x_f8, w_f8 = output
+    ctx.save_for_backward(x_f8, w_f8)
+    ctx.scales = x_s, w_s, grad_s
+    ctx.set_materialize_grads(False)
+
+mm_op.register_autograd(backward, setup_context=setup_context)
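+
+# Usage sketch (illustrative only; the shapes and the unit scale factors below
+# are hypothetical, the training code derives its own scales). The custom op
+# behaves like a differentiable bf16 x @ w.T with fp8 quantized compute:
+#
+#   x = torch.randn(128, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   w = torch.randn(768, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   out, x_f8, w_f8 = torch.ops.nanogpt.mm(x, w, 1.0, 1.0, 1.0)  # out ~= x @ w.T
+#   out.sum().backward()  # gradients are computed by nanogpt::mm_backward in fp8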
+
+# -----------------------------------------------------------------------------
+# Triton kernel for symmetric matrix multiplication by @byronxu99
+
+def _get_autotune_configs():
+    return [
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": bm,
+                "BLOCK_SIZE_N": bn,
+                "BLOCK_SIZE_K": bk,
+                "GROUP_SIZE_M": 8,
+                "LOWER_UPPER": 1,
+            },
+            num_stages=stages,
+            num_warps=warps,
+        )
+        for bm in [64, 128]
+        for bn in [64, 128, 256]
+        for bk in [64, 128]
+        for stages, warps in [(3, 4), (3, 8), (4, 4)]
+        if bm // bn <= 2 and bn // bm <= 2
+    ]
+
+@triton.jit
+def _pid_to_block(
+    pid,
+    M,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)
+
+    # Map PID to a single matrix in batch
+    batch_idx = pid // (num_pid_m * num_pid_n)
+    pid = pid % (num_pid_m * num_pid_n)
+
+    # Map PID to 2D grid of blocks
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
+
+    m_idx = pid_m * BLOCK_SIZE_M
+    n_idx = pid_n * BLOCK_SIZE_N
+    return batch_idx, m_idx, n_idx
+
+@triton.autotune(
+    configs=_get_autotune_configs(),
+    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
+)
+@triton.jit
+def XXT_kernel(
+    A_ptr, C_ptr,
+    M, K,
+    a_stride_b, a_stride_r, a_stride_c,
+    c_stride_b, c_stride_r, c_stride_c,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    LOWER_UPPER: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    batch_idx, m_idx, n_idx = _pid_to_block(
+        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
+    )
+
+    # Skip blocks that don't need to be computed
+    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
+    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
+    if skip_block_below_diag or skip_block_above_diag:
+        return
+
+    # Index into one matrix of batch
+    A_ptr += batch_idx * a_stride_b
+    C_ptr += batch_idx * c_stride_b
+
+    # Create pointer arrays for A and A.T
+    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
+    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Accumulate over blocks of K
+    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        accumulator = tl.dot(a, at, accumulator)
+        a_ptrs += BLOCK_SIZE_K * a_stride_c
+        at_ptrs += BLOCK_SIZE_K * a_stride_c
+
+    out_dtype = C_ptr.dtype.element_ty
+    output = accumulator.to(out_dtype)
+
+    # Store block of C
+    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, output, mask=c_mask)
+
+    # Store block of C mirrored across the diagonal
+    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
+    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
+
+def XXT(A: torch.Tensor, out: torch.Tensor):
+    """
+    Launch Triton kernel to compute C = A @ A.T
+    """
+    assert A.ndim == 2 or A.ndim == 3
+    M, K = A.shape[-2:]
+    assert out.size(-2) == M, "Output matrix has incorrect shape"
+    assert out.size(-1) == M, "Output matrix has incorrect shape"
+
+    batch_size = A.size(0) if A.ndim == 3 else 1
+    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
+    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    XXT_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        K=K,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+    )
+    return out
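+
+# Equivalence sketch (hypothetical check, not executed during training): the
+# kernel writes both triangles of C, so XXT should match a plain matmul up to
+# bf16 rounding; the tolerances below are loose guesses for bf16 outputs.
+#
+#   A = torch.randn(4, 256, 512, device="cuda", dtype=torch.bfloat16)
+#   C = torch.empty(4, 256, 256, device="cuda", dtype=torch.bfloat16)
+#   torch.testing.assert_close(XXT(A, out=C), A @ A.mT, rtol=2e-2, atol=1e-1)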
meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + XXT_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + K=K, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + ) + return out + +@triton.autotune( + configs=_get_autotune_configs(), + key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"], +) +@triton.jit +def ba_plus_cAA_kernel( + A_ptr, C_ptr, + M, + a_stride_b, a_stride_r, a_stride_c, + c_stride_b, c_stride_r, c_stride_c, + alpha, beta, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + LOWER_UPPER: tl.constexpr, +): + # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A + # Performance is slightly slower than XXT_kernel, so we use two separate kernels + pid = tl.program_id(axis=0) + batch_idx, m_idx, n_idx = _pid_to_block( + pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + + # Skip blocks that don't need to be computed + skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx) + skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx) + if skip_block_below_diag or skip_block_above_diag: + return + + # Index into one matrix of batch + A_ptr += batch_idx * a_stride_b + C_ptr += batch_idx * c_stride_b + + # Create pointer arrays for A and A.T + offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c) + at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Accumulate over blocks of K + for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0) + at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0) + accumulator = tl.dot(a, at, accumulator) + a_ptrs += BLOCK_SIZE_K * a_stride_c + at_ptrs += BLOCK_SIZE_K * a_stride_c + + # Load block of A to add (corresponds to the current block of C) + offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N) + a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c) + a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M) + a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32) + + # Apply alpha and beta + accumulator *= alpha + accumulator += a_add * beta + + out_dtype = C_ptr.dtype.element_ty + output = accumulator.to(out_dtype) + + # Store block of C + offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M) + offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M) + tl.store(c_ptrs, output, mask=c_mask) + + # Store block of C mirrored across the diagonal + c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c) + c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M) + tl.store(c_ptrs_t, output.T, mask=c_mask_t) + +def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor): + """ + Launch Triton kernel to compute C = alpha * A @ A.T + beta * A + """ + assert A.ndim == 2 or A.ndim == 3 + M, K = A.shape[-2:] + assert M == K, "Input matrix must be 
square" + assert out.size(-2) == M + assert out.size(-1) == M + + batch_size = A.size(0) if A.ndim == 3 else 1 + input_batch_stride = A.stride(0) if A.ndim == 3 else 0 + output_batch_stride = out.stride(0) if out.ndim == 3 else 0 + + grid = lambda meta: ( + batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]), + ) + ba_plus_cAA_kernel[grid]( + A_ptr=A, + C_ptr=out, + M=M, + a_stride_b=input_batch_stride, + a_stride_r=A.stride(-2), + a_stride_c=A.stride(-1), + c_stride_b=output_batch_stride, + c_stride_r=out.stride(-2), + c_stride_c=out.stride(-1), + alpha=alpha, + beta=beta, + ) + return out + +# Computed for num_iters=5, safety_factor=2e-2, cushion=2 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323) +] + +@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower +def polar_express(G: torch.Tensor): + """ + Polar Express Sign Method: https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + """ + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 + X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6) + + # Allocate buffers + X = X.contiguous() + A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype) + B = torch.empty_like(A) + C = torch.empty_like(X) + + aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm + + # Perform the iterations + for a, b, c in polar_express_coeffs: + XXT(X, out=A) # A = X @ X.mT + ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A + aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X + X, C = C, X # Swap references to avoid unnecessary copies + + if G.size(-2) > G.size(-1): + X = X.mT + return X + +# ----------------------------------------------------------------------------- +# Muon optimizer + +class NorMuon(torch.optim.Optimizer): + """ + Muon - MomentUm Orthogonalized by Newton-schulz + + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Warning: This optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + + Differences from standard Muon: + - Newton-Shulz is replaced with Polar Express for the orthogonalization step + - NorMuon adds a low-rank variance estimator similar to Adafactor. + - small 1D parameters handled here instead of in Adam + - Cautious weight decay, a gated version of decoupled weight decay + - Custom distributed sizing: + The model stores all attn and mlp weights in the same shape, and then updates the view as + needed on the forward pass. This enables attn and mlp weights to be contained within the same + dist.reduce_scatter_tensor() call. 
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class NorMuon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor
+    - Small 1D parameters are handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers + n_mlp_layers*2) % 8 == 0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+        1. reduce scatter smear_gate (1 param, 7 padding params)
+        2. reduce scatter attn_gate (10 params, 6 padding params)
+        3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+        4. reduce scatter attn/mlp round 2 (16 mlp params)
+        5. wait on step 1, then compute update of 1 and schedule all gather
+        6. wait on step 2, then compute update of 2 and schedule all gather
+        7. wait on step 3, then compute update of 3 and schedule all gather
+           GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP];
+           GPUs that receive params of type attn reshape before computing the update
+        8. wait on 4, then compute update of 4 and schedule all gather
+        9. wait for each all gather to complete and update params
+      Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and self.world_size == 8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
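+
+        # First pass: stack each group's gradients into one padded buffer and
+        # launch an async reduce_scatter per group, so communication overlaps
+        # with the update math below.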
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx))  # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4, hdim, dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+            group["param_wd"] = ref_param.new_tensor(
+                [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+            ).view(-1, 1, 1)
+
+            # Determine effective LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
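+            # NorMuon rescaling, in brief: each row (or column, for wide
+            # matrices) of the orthogonalized update is divided by an EMA
+            # estimate of its RMS, then the update is renormalized to keep its
+            # Frobenius norm unchanged -- an Adafactor-style low-rank second moment.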
+            if num_params == 0:
+                v_chunk = updated_grads
+            else:
+                v_chunk = polar_express(updated_grads)
+
+            # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491)
+            v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True)
+            second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"])
+            step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_()
+            v_chunk.mul_(step_size)
+            v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True)
+            v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10))
+
+            v_chunk = v_chunk.view(grad_shape)
+
+            updated_params = torch.empty_like(grad_chunk)
+            param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)
+
+            # "Cautious" weight decay (https://arxiv.org/abs/2510.12402)
+            mask = (v_chunk * param_chunk) >= 0
+            v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype))
+
+            param_chunk.addcmul_(v_chunk, -eff_lr)
+
+            updated_params[:num_params].copy_(param_chunk)
+            if num_params < chunk_size:
+                updated_params[num_params:].zero_()
+
+            stacked_params = torch.empty(
+                (padded_num_params, *param_shape),
+                dtype=updated_params.dtype,
+                device=updated_params.device,
+            )
+
+            gather_future = dist.all_gather_into_tensor(
+                stacked_params, updated_params, async_op=True
+            ).get_future()
+
+            all_gather_infos.append(
+                {
+                    "gather_future": gather_future,
+                    "stacked_params": stacked_params,
+                    "orig_params": params,
+                }
+            )
+
+        # Final pass: wait for all_gather to complete and copy results back into original parameter tensors.
+        for info in all_gather_infos:
+            info["gather_future"].wait()
+            stacked_params = info["stacked_params"]
+            orig_params = info["orig_params"]
+
+            unstacked_params = torch.unbind(stacked_params)
+            for i, p in enumerate(orig_params):
+                p.copy_(unstacked_params[i], non_blocking=True)
+
+
+class DistAdam(torch.optim.Optimizer):
+    # DistributedAdam implementation by @vagrawal
+    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
+        params = list(params)
+        sizes = {p.shape for p in params}
+        # create one buffer per unique parameter-size
+        param_groups = []
+        for size in sizes:
+            group_params = [p for p in params if p.shape == size]
+            param_groups.append(dict(params=group_params))
+        super().__init__(param_groups, defaults)
+        # init state
+        for p in params:
+            chunk_size = p.size(0) // self.world_size
+            exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p.device)
+            exp_avg_sq = torch.zeros_like(exp_avg)
+            self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq)
+
+    @torch.compile
+    @torch.no_grad()
+    def step(self):
+        rank = dist.get_rank()
+        reduce_scatter_futures: list[torch.Future] = []
+        all_gather_futures: list[torch.Future] = []
+        grad_slices = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            for param in params:
+                grad = param.grad
+                rank_size = grad.shape[0] // self.world_size
+                grad_slice = torch.empty_like(grad[:rank_size])
+                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
+                grad_slices.append(grad_slice)
+
+        idx = 0
+        for group in self.param_groups:
+            beta1, beta2 = group['betas']
+            eps = group['eps']
+            wd = group['weight_decay']
+            params = group['params']
+            for param in params:
+                reduce_scatter_futures[idx].wait()
+                rank_size = param.shape[0] // self.world_size
+                p_slice = param[rank * rank_size:(rank + 1) * rank_size]
+                lr = group['lr'] * getattr(param, "lr_mul", 1.0)
+                state = self.state[param]
+                g_slice = grad_slices[idx]
+
+                exp_avg = state["exp_avg"]
+                exp_avg_sq = state["exp_avg_sq"]
+                state["step"] += 1
+                t = state["step"]
+                # weight decay
+                if wd != 0:
+                    eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0)
+                    p_slice.mul_(1 - eff_weight_decay)
+                # update running averages
+                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
+                # bias corrections
+                bias1 = 1 - beta1 ** t
+                bias2 = 1 - beta2 ** t
+                # compute step
+                denom = exp_avg_sq.sqrt().add_(eps)
+                step_size = lr * (bias2 ** 0.5 / bias1)
+                update = exp_avg.div(denom).mul_(step_size)
+                p_slice.add_(other=update, alpha=-1.0)
+                idx += 1
+                all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future())
+        torch.futures.collect_all(all_gather_futures).wait()
+
+# -----------------------------------------------------------------------------
+# PyTorch nn.Module definitions for the model
+
+def norm(x: Tensor):
+    return F.rms_norm(x, (x.size(-1),))
+
+class CastedLinear(nn.Linear):
+    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
+        super().__init__(in_features, out_features, bias=False)
+        self.use_fp8 = use_fp8
+        self.x_s = x_s
+        self.w_s = w_s
+        self.grad_s = grad_s
+
+    def reset_parameters(self) -> None:
+        with torch.no_grad():
+            self.weight.zero_()  # @Grad62304977 and others
+
+    def forward(self, x: Tensor):
+        if self.use_fp8 and self.training:
+            _x = x.flatten(0, -2)
+            out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
+            return out.reshape(*x.shape[:-1], -1)
+        else:
+            return F.linear(x, self.weight.type_as(x))
+
+# yarn implementation @classiclarryd
+class Yarn(nn.Module):
+    def __init__(self, head_dim, max_seq_len):
+        super().__init__()
+        self.head_dim = head_dim
+        self.max_seq_len = max_seq_len
+        self.reset()
+
+    def reset(self):
+        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device)
+        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
+        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)])
+        t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device)
+        theta = torch.outer(t, angular_freq)
+        self.cos = nn.Buffer(
+            theta.cos().to(torch.bfloat16), persistent=False
+        )
+        self.sin = nn.Buffer(
+            theta.sin().to(torch.bfloat16), persistent=False
+        )
+        self.angular_freq = angular_freq
+        # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
+        self.attn_scale = 0.1
+
+    def apply(self, old_window: int, new_window: int, alpha: int = 1, beta: int = 32):
+        rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi)
+        scaling_factor = old_window / new_window
+        interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1)
+        self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor)
+        t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device)
+        theta = torch.outer(t, self.angular_freq)
+        self.cos.copy_(theta.cos())
+        self.sin.copy_(theta.sin())
+        self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1
+
+def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor):
+    assert cos.size(0) >= x_BTHD.size(-3)
+    cos, sin = (
+        cos[None, : x_BTHD.size(-3), None, :],
+        sin[None, : x_BTHD.size(-3), None, :],
+    )
+    x1, x2 = x_BTHD.chunk(2, dim=-1)
+    y1 = x1 * cos + x2 * sin
+    y2 = x1 * (-sin) + x2 * cos
+    return torch.cat((y1, y2), 3)
+
+@dataclass
+class AttnArgs:
+    ve: torch.Tensor
+    sa_lambdas: torch.Tensor
+    seqlens: torch.Tensor
+    bm_size: int
+    cos: torch.Tensor
+    sin: torch.Tensor
+    attn_scale: float
+
+flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, dim: int, head_dim: int, num_heads: int):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.dim = dim
+        self.hdim = num_heads * head_dim
+
+        assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim"
+        std = 0.5 * (self.dim ** -0.5)
+        bound = (3 ** 0.5) * std  # improved init scale by @YouJiacheng
+        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
+        # https://x.com/hi_tysam/status/1879699187107033311
+        # make matrices the same shape as MLP to enable batched call in optimizer
+        self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4))
+        # label module to enable custom optimizer sizing
+        self.qkvo_w.label = 'attn'
+
+        with torch.no_grad():
+            self.qkvo_w.view(4, self.hdim, self.dim)[:3].uniform_(-bound, bound)  # init QKV weights
+            self.qkvo_w.view(4, self.hdim, self.dim)[3].zero_()  # init output weights to zero
+
+        # sparse gated attention to enable context based no-op by @classiclarryd
+        self.attn_gate = CastedLinear(12, num_heads)
+        # label module to enable custom optimizer sizing
+        self.attn_gate.weight.label = 'attn_gate'
+
+    def forward(self, x: Tensor, attn_args: AttnArgs):
+        B, T = x.size(0), x.size(1)  # batch size, sequence length
+        assert B == 1, "varlen sequences require B == 1"
+        assert T % 16 == 0
+        # unpack attention args
+        cos, sin = attn_args.cos, attn_args.sin
+        ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas
+        seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size
+
+        q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
+        q, k = norm(q), norm(k)  # QK norm @Grad62304977
+        q, k = rotary(q, cos, sin), rotary(k, cos, sin)
+        if ve is not None:
+            v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v)  # @KoszarskyB & @Grad62304977
+        else:  # skip mid-layers token value embeddings by @YouJiacheng
+            v = sa_lambdas[0] * v
+
+        max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size))
+
+        # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng
+        y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens,
+                                                        max_seqlen_q=max_len, max_seqlen_k=max_len,
+                                                        causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0))
+        y = y.view(B, T, self.num_heads, self.head_dim)
+        y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1)
+        y = y.contiguous().view(B, T, self.num_heads * self.head_dim)  # re-assemble all head outputs side by side
+        y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y))
+        return y
+
+
+class MLP(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        hdim = 4 * dim
+        # make matrices the same shape to enable batched call in optimizer
+        self.c_fc = nn.Parameter(torch.empty(dim, hdim))
+        self.c_proj = nn.Parameter(torch.empty(dim, hdim))
+        # label modules to enable custom optimizer sizing
+        self.c_fc.label = 'mlp'
+        self.c_proj.label = 'mlp'
+        # corrective LR factor to account for the transpose: c_fc is stored as
+        # (dim, hdim), so Muon's max(1, rows/cols)**0.5 scaling misses the factor
+        # of 2 it would get in the (hdim, dim) orientation
+        self.c_fc.lr_mul = 2.
+
+        std = 0.5 * (dim ** -0.5)
+        bound = (3 ** 0.5) * std  # improved init scale by @YouJiacheng
+        with torch.no_grad():
+            self.c_fc.uniform_(-bound, bound)
+            self.c_proj.zero_()  # zero init suggested by @Grad62304977
+
+    def forward(self, x: Tensor):
+        x = F.linear(x, self.c_fc.T.type_as(x))
+        x = F.relu(x).square()  # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
+        x = F.linear(x, self.c_proj.type_as(x))
+        return x
+
+class Block(nn.Module):
+    def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int):
+        super().__init__()
+        # skip attention of blocks.0 and blocks.7 (the 8th layer) by @YouJiacheng
+        self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None
+        # skip MLP blocks for first MLP layer by @EmelyanenkoK
+        self.mlp = MLP(dim) if layer_idx != 0 else None
+
+    def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs):
+        x = lambdas[0] * x + lambdas[1] * x0
+        if self.attn is not None:
+            x = x + self.attn(norm(x), attn_args)
+        if self.mlp is not None:
+            x = x + self.mlp(norm(x))
+        return x
+
+# -----------------------------------------------------------------------------
+# The main model
+
+def next_multiple_of_n(v: float | int, *, n: int):
+    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
+
+class GPT(nn.Module):
+    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int):
+        super().__init__()
+        vocab_size = next_multiple_of_n(vocab_size, n=128)
+        self.embed = nn.Embedding(vocab_size, model_dim)
+        self.smear_gate = CastedLinear(12, 1)
+        # label modules to enable custom optimizer sizing
+        self.smear_gate.weight.label = 'smear_gate'
+        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
+        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
+        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
+        self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)])
+        self.yarn = Yarn(head_dim, max_seq_len)
+        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
+        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
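+        # fp8 note: float8_e4m3fn saturates at +-448, so the scales below are
+        # presumably chosen to keep unit-RMS activations (x_s), the tiny
+        # zero-initialized head weights (w_s), and incoming gradients (grad_s)
+        # within representable range.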
+        use_fp8 = not os.environ.get("DISABLE_FP8", False)
+        self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448)
+        # Add learnable skip connection weights for decoder layers
+        assert num_layers % 2 == 0
+        pad = (-num_layers * 5 - 2) % dist.get_world_size()
+        self.scalars = nn.Parameter(
+            torch.cat(
+                [
+                    -1.5
+                    * torch.ones(num_layers),  # skip_weights -> σ(-1.5) ≈ 0.18
+                    *[
+                        torch.tensor([1.0, 0.0]) for _ in range(num_layers)
+                    ],  # block lambdas
+                    *[
+                        torch.tensor([0.5, 0.5]) for _ in range(num_layers)
+                    ],  # SA lambdas
+                    torch.zeros(1),  # smear_lambda
+                    0.5 * torch.ones(1),  # backout_lambda
+                    torch.ones(pad),
+                ]
+            )
+        )
+        # set learning rates
+        for param in self.embed.parameters():
+            param.lr_mul = 75.
+        for param in self.value_embeds.parameters():
+            param.lr_mul = 75.
+        self.lm_head.weight.lr_mul = 1.0
+        self.scalars.lr_mul = 5.0
+
+    def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int):
+        assert input_seq.ndim == 1
+
+        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
+        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
+        # dropping first layer updates this to .12 ... 012
+        ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
+        assert len(ve) == len(self.blocks)
+
+        short_bm = ws_short * args.block_size
+        long_bm = ws_long * args.block_size
+        bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm]
+        assert len(bm_sizes) == len(self.blocks)
+
+        x = self.embed(input_seq)
+
+        skip_weights = self.scalars[:(len(self.blocks) // 2)]
+        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
+        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
+        smear_lambda = self.scalars[5 * len(self.blocks)]
+        backout_lambda = self.scalars[5 * len(self.blocks) + 1]
+
+        # smear token embed forward 1 position @classiclarryd
+        smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)]))
+        x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]])
+        x = x0 = norm(x[None])
+
+        # U-net design by @brendanh0gan
+        skip_connections = []
+        n = len(self.blocks) // 2
+
+        x_backout = None
+        backout_layer = 8
+        # skip layer zero
+        for i in range(1, len(self.blocks)):
+            attn_args = AttnArgs(
+                ve=ve[i],
+                sa_lambdas=sa_lambdas[i],
+                seqlens=seqlens,
+                bm_size=bm_sizes[i],
+                cos=self.yarn.cos,
+                sin=self.yarn.sin,
+                attn_scale=self.yarn.attn_scale
+            )
+            # since layer 0 is skipped, layer 11 does not have skip_connection
+            if i >= n and i < 11:
+                gate = torch.sigmoid(skip_weights[i - n])  # in (0, 1)
+                x = x + gate * skip_connections.pop()
+            x = self.blocks[i](x, x0, lambdas[i], attn_args)
+            if i < n:
+                skip_connections.append(x)
+            if i == backout_layer:
+                x_backout = x
+
+        # back out contributions from first 8 layers that are only required for downstream context and not direct prediction
+        x -= backout_lambda * x_backout
+        x = norm(x)
+        logits = self.lm_head(x)
+        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
+        logits = 30 * torch.sigmoid(logits / 7.5)
+        logits_for_loss = logits.float() if not self.training else logits
+        loss = F.cross_entropy(
+            logits_for_loss.view(-1, logits_for_loss.size(-1)),
+            target_seq,
+            reduction="sum" if self.training else "mean",
+        )
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files)  # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0  # for unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128)  # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted, load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1  # last document was too long to account for _targets offset
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens):  # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local, )
+            _targets = buf[1:].view(num_tokens_local, )
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "Num tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
+
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
+    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
+    train_batch_size: int = 2048 * 16 * 8
+    train_max_seq_len: int = 128 * 16
+    val_batch_size: int = 4 * 64 * 1024 * 8
+    # optimization
+    num_scheduled_iterations: int = 2205  # number of steps to complete lr and ws schedule
+    num_extension_iterations: int = 40  # number of steps to continue training at final lr and ws
+    num_iterations: int = num_scheduled_iterations + num_extension_iterations
+    cooldown_frac: float = 0.50  # fraction of num_scheduled_iterations spent cooling down the learning rate
+    # evaluation and logging
+    run_id: str = f"{uuid.uuid4()}"
+    val_loss_every: int = 250  # every how many steps to evaluate val loss? 0 for only at the end
+    save_checkpoint: bool = False
+    # attention masking
+    block_size: int = 128
+    ws_schedule: tuple = (3, 7, 11)
+    ws_final: int = 13  # increase final validation ws, used for YaRN extension and short window size @classiclarryd
+    ws_validate_post_yarn_ext: int = 20  # extend long windows out even further after applying YaRN
+
+args = Hyperparameters()
+
+data_path = os.environ.get("DATA_PATH", ".")
+args.train_files = os.path.join(data_path, args.train_files)
+args.val_files = os.path.join(data_path, args.val_files)
+
+# torchrun sets these env variables
+rank = int(os.environ["RANK"])
+world_size = int(os.environ["WORLD_SIZE"])
+assert 8 % world_size == 0, "world_size must be a divisor of 8"
+grad_accum_steps = 8 // world_size
+assert torch.cuda.is_available()
+device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
+torch.cuda.set_device(device)
+dist.init_process_group(backend="nccl", device_id=device)
+dist.barrier()
+master_process = (rank == 0)  # this process will do logging, checkpointing etc.
+
+# begin logging
+logfile = None
+if master_process:
+    run_id = args.run_id
+    os.makedirs("logs", exist_ok=True)
+    logfile = f"logs/{run_id}.txt"
+    print(logfile)
+def print0(s, console=False):
+    if master_process:
+        with open(logfile, "a") as f:
+            if console:
+                print(s)
+            print(s, file=f)
+
+# begin by printing this file (the Python code)
+print0(code)
+print0("="*100)
+# log information about the hardware/software environment this is running on
+print0(f"Running Python {sys.version}")
+print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
+print0(f"Running Triton version {triton.__version__}")
+
+def nvidia_smi():
+    import subprocess  # avoid top level import
+    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
+print0(nvidia_smi())
+print0("="*100)
+
+model: nn.Module = GPT(
+    vocab_size=50257,
+    num_layers=12,
+    num_heads=6,
+    head_dim=128,
+    model_dim=768,
+    max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size)
+).cuda()
+for m in model.modules():
+    if isinstance(m, (nn.Embedding, nn.Linear)):
+        m.bfloat16()
+for param in model.parameters():
+    dist.broadcast(param.detach(), 0)
+
+# collect the parameters to optimize
+hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n]
+embed_params = [p for n, p in model.named_parameters() if "embed" in n]
+scalar_params = [p for p in model.parameters() if p.ndim < 2]
+head_params = [model.lm_head.weight]
+gate_params = [p for n, p in model.named_parameters() if "gate" in n]
+
+# init the optimizer(s)
+# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
+# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
+optimizer1 = DistAdam(
+    scalar_params + head_params + embed_params,
+    lr=0.008,
+    betas=(0.65, 0.95),
+    eps=1e-8,
+    weight_decay=0.0,
+)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
+optimizers = [optimizer1, optimizer2]
+for opt in optimizers:
+    for group in opt.param_groups:
+        group["initial_lr"] = group["lr"]
+
+# learning rate schedule: flat, then linear decay, then flat
+def get_lr(step: int):
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
+    return lr
+
+def get_ws(step: int):
+    # set short window size to half of long window size
+    # Higher ws on "extension" steps
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
+    ws_idx = int(len(args.ws_schedule) * x)
+    return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
+
+def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95):
+    # warmup phase: linearly increase momentum from min to max
+    # cooldown phase: linearly decrease momentum from max to min
+    momentum_cd_start = args.num_iterations - muon_cooldown_steps
+    if step < muon_warmup_steps:
+        frac = step / muon_warmup_steps
+        momentum = momentum_min + frac * (momentum_max - momentum_min)
+    elif step > momentum_cd_start:
+        frac = (step - momentum_cd_start) / muon_cooldown_steps
+        momentum = momentum_max - frac * (momentum_max - momentum_min)
+    else:
+        momentum = momentum_max
+    return momentum
+
+def step_optimizers(step: int, optimizers, model):
+    # update lr
+    for optimizer in optimizers:
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * get_lr(step)
+
+    # set muon momentum based on step
+    momentum = get_muon_momentum(step)
+    for group in optimizers[1].param_groups:
+        group["momentum"] = momentum
+
+    # on even steps, only step Muon params
+    # on odd steps, step all params
+    if step % 2 == 0:
+        optimizers[1].step()
+        optimizers[1].zero_grad(set_to_none=True)
+    else:
+        for optimizer in optimizers:
+            optimizer.step()
+        model.zero_grad(set_to_none=True)
+
+model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True)
+
+########################################
+#            Warmup kernels            #
+########################################
+
+# Warmup the training kernels, then re-initialize the state so we aren't cheating
+warmup_steps = 30
+initial_state = dict(model=copy.deepcopy(model.state_dict()),
+                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers])  # save the initial state
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
+for step in range(warmup_steps):
+    inputs, targets, cum_seqlens = next(train_loader)
+    # each window size is a new graph, need to warm up each with Yarn.attn_scale
+    ws_idx = step % len(ws_schedule)
+    if ws_idx == 0:
+        model.yarn.reset()
+        ws_long = ws_schedule[0]
+    else:
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+    model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward()
+    for opt in optimizers:
+        opt.step()
+    model.zero_grad(set_to_none=True)
+model.yarn.reset()  # rotary buffer is not stored in state_dict
+model.load_state_dict(initial_state["model"])
+optimizer2.reset()  # muon momentum buffers not in state dict
+for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
+    opt.load_state_dict(opt_state)
+del train_loader, initial_state
+
+########################################
+#        Training and validation       #
+########################################
+
+train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+training_time_ms = 0
+# start the clock
+torch.cuda.synchronize()
+t0 = time.perf_counter()
+# begin training
+train_steps = args.num_iterations
+ws_short, ws_long = get_ws(0)
+for step in range(train_steps + 1):
+    last_step = (step == train_steps)
+    ws_short, new_ws_long = get_ws(step)
+    if new_ws_long != ws_long:
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
+
+    # --------------- VALIDATION SECTION -----------------
+    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
+        if last_step:
+            ws_long = args.ws_validate_post_yarn_ext
+        # stop the clock
+        torch.cuda.synchronize()
+        training_time_ms += 1000 * (time.perf_counter() - t0)
+        model.eval()
+        assert args.val_tokens % args.val_batch_size == 0
+        val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size
+        val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False)
+        val_loss = 0
+        with torch.no_grad():
+            for _ in range(val_steps):
+                inputs, targets, cum_seqlens = next(val_loader)
+                val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long)
+        val_loss /= val_steps
+        del val_loader
+        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
+        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
+        model.train()
+        # start the clock again
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+
+    if last_step:
+        if master_process and args.save_checkpoint:
+            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
+            os.makedirs(f"logs/{run_id}", exist_ok=True)
+            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
+        # the last step only has the validation loop, so break to avoid training
+        break
+
+    # --------------- TRAINING SECTION -----------------
+    for _ in range(grad_accum_steps):
+        inputs, targets, cum_seqlens = next(train_loader)
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
+    step_optimizers(step, optimizers, model)
+
+    # logging
+    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
+    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
+
+print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
+dist.destroy_process_group()
+
+====================================================================================================
+Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0]
+Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6
+Running Triton version 3.5.0
+Mon Nov 10 21:44:05 2025
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 42C P0 132W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 34C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 41C P0 131W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 124W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:123ms step_avg:122.63ms +step:2/2245 train_time:144ms step_avg:71.98ms +step:3/2245 train_time:182ms step_avg:60.74ms +step:4/2245 train_time:239ms step_avg:59.63ms +step:5/2245 train_time:298ms step_avg:59.57ms +step:6/2245 train_time:357ms step_avg:59.45ms +step:7/2245 train_time:441ms step_avg:63.00ms +step:8/2245 train_time:499ms step_avg:62.38ms +step:9/2245 train_time:560ms step_avg:62.23ms +step:10/2245 train_time:619ms step_avg:61.87ms +step:11/2245 train_time:680ms step_avg:61.82ms +step:12/2245 train_time:739ms step_avg:61.56ms +step:13/2245 train_time:800ms step_avg:61.55ms +step:14/2245 train_time:860ms step_avg:61.43ms 
+step:15/2245 train_time:921ms step_avg:61.40ms
+step:16/2245 train_time:980ms step_avg:61.27ms
+step:17/2245 train_time:1044ms step_avg:61.39ms
+step:18/2245 train_time:1104ms step_avg:61.35ms
+step:19/2245 train_time:1168ms step_avg:61.47ms
+step:20/2245 train_time:1228ms step_avg:61.40ms
+step:21/2245 train_time:1290ms step_avg:61.43ms
+step:22/2245 train_time:1349ms step_avg:61.33ms
+step:23/2245 train_time:1412ms step_avg:61.39ms
+step:24/2245 train_time:1473ms step_avg:61.37ms
+step:25/2245 train_time:1534ms step_avg:61.37ms
+step:26/2245 train_time:1594ms step_avg:61.31ms
+step:27/2245 train_time:1656ms step_avg:61.32ms
+step:28/2245 train_time:1714ms step_avg:61.22ms
+step:29/2245 train_time:1776ms step_avg:61.24ms
+step:30/2245 train_time:1835ms step_avg:61.18ms
+step:31/2245 train_time:1897ms step_avg:61.19ms
+step:32/2245 train_time:1957ms step_avg:61.15ms
+step:33/2245 train_time:2020ms step_avg:61.20ms
+step:34/2245 train_time:2081ms step_avg:61.20ms
+step:35/2245 train_time:2144ms step_avg:61.26ms
+step:36/2245 train_time:2204ms step_avg:61.21ms
+step:37/2245 train_time:2266ms step_avg:61.23ms
+step:38/2245 train_time:2325ms step_avg:61.18ms
+step:39/2245 train_time:2387ms step_avg:61.22ms
+step:40/2245 train_time:2447ms step_avg:61.18ms
+step:41/2245 train_time:2510ms step_avg:61.21ms
+step:42/2245 train_time:2569ms step_avg:61.16ms
+step:43/2245 train_time:2630ms step_avg:61.16ms
+step:44/2245 train_time:2689ms step_avg:61.11ms
+step:45/2245 train_time:2751ms step_avg:61.14ms
+step:46/2245 train_time:2810ms step_avg:61.09ms
+step:47/2245 train_time:2872ms step_avg:61.11ms
+step:48/2245 train_time:2931ms step_avg:61.07ms
+step:49/2245 train_time:2994ms step_avg:61.10ms
+step:50/2245 train_time:3054ms step_avg:61.07ms
+step:51/2245 train_time:3116ms step_avg:61.09ms
+step:52/2245 train_time:3176ms step_avg:61.07ms
+step:53/2245 train_time:3238ms step_avg:61.09ms
+step:54/2245 train_time:3297ms step_avg:61.06ms
+step:55/2245 train_time:3359ms step_avg:61.08ms
+step:56/2245 train_time:3420ms step_avg:61.07ms
+step:57/2245 train_time:3483ms step_avg:61.10ms
+step:58/2245 train_time:3542ms step_avg:61.06ms
+step:59/2245 train_time:3603ms step_avg:61.07ms
+step:60/2245 train_time:3662ms step_avg:61.04ms
+step:61/2245 train_time:3724ms step_avg:61.05ms
+step:62/2245 train_time:3783ms step_avg:61.02ms
+step:63/2245 train_time:3846ms step_avg:61.04ms
+step:64/2245 train_time:3904ms step_avg:61.01ms
+step:65/2245 train_time:3966ms step_avg:61.02ms
+step:66/2245 train_time:4026ms step_avg:61.00ms
+step:67/2245 train_time:4087ms step_avg:61.01ms
+step:68/2245 train_time:4146ms step_avg:60.98ms
+step:69/2245 train_time:4208ms step_avg:60.98ms
+step:70/2245 train_time:4267ms step_avg:60.96ms
+step:71/2245 train_time:4329ms step_avg:60.97ms
+step:72/2245 train_time:4388ms step_avg:60.95ms
+step:73/2245 train_time:4450ms step_avg:60.96ms
+step:74/2245 train_time:4510ms step_avg:60.94ms
+step:75/2245 train_time:4572ms step_avg:60.96ms
+step:76/2245 train_time:4631ms step_avg:60.94ms
+step:77/2245 train_time:4693ms step_avg:60.95ms
+step:78/2245 train_time:4752ms step_avg:60.92ms
+step:79/2245 train_time:4814ms step_avg:60.94ms
+step:80/2245 train_time:4873ms step_avg:60.92ms
+step:81/2245 train_time:4935ms step_avg:60.92ms
+step:82/2245 train_time:4994ms step_avg:60.91ms
+step:83/2245 train_time:5056ms step_avg:60.92ms
+step:84/2245 train_time:5116ms step_avg:60.90ms
+step:85/2245 train_time:5178ms step_avg:60.92ms
+step:86/2245 train_time:5238ms step_avg:60.91ms
+step:87/2245 train_time:5301ms step_avg:60.93ms
+step:88/2245 train_time:5361ms step_avg:60.92ms
+step:89/2245 train_time:5423ms step_avg:60.93ms
+step:90/2245 train_time:5482ms step_avg:60.91ms
+step:91/2245 train_time:5544ms step_avg:60.92ms
+step:92/2245 train_time:5603ms step_avg:60.91ms
+step:93/2245 train_time:5666ms step_avg:60.92ms
+step:94/2245 train_time:5725ms step_avg:60.91ms
+step:95/2245 train_time:5786ms step_avg:60.91ms
+step:96/2245 train_time:5845ms step_avg:60.88ms
+step:97/2245 train_time:5906ms step_avg:60.89ms
+step:98/2245 train_time:5967ms step_avg:60.89ms
+step:99/2245 train_time:6028ms step_avg:60.89ms
+step:100/2245 train_time:6087ms step_avg:60.87ms
+step:101/2245 train_time:6148ms step_avg:60.87ms
+step:102/2245 train_time:6207ms step_avg:60.85ms
+step:103/2245 train_time:6269ms step_avg:60.86ms
+step:104/2245 train_time:6328ms step_avg:60.85ms
+step:105/2245 train_time:6390ms step_avg:60.86ms
+step:106/2245 train_time:6450ms step_avg:60.85ms
+step:107/2245 train_time:6512ms step_avg:60.86ms
+step:108/2245 train_time:6571ms step_avg:60.84ms
+step:109/2245 train_time:6632ms step_avg:60.85ms
+step:110/2245 train_time:6691ms step_avg:60.83ms
+step:111/2245 train_time:6752ms step_avg:60.83ms
+step:112/2245 train_time:6812ms step_avg:60.82ms
+step:113/2245 train_time:6873ms step_avg:60.83ms
+step:114/2245 train_time:6933ms step_avg:60.81ms
+step:115/2245 train_time:6997ms step_avg:60.84ms
+step:116/2245 train_time:7054ms step_avg:60.81ms
+step:117/2245 train_time:7116ms step_avg:60.82ms
+step:118/2245 train_time:7175ms step_avg:60.81ms
+step:119/2245 train_time:7237ms step_avg:60.81ms
+step:120/2245 train_time:7297ms step_avg:60.81ms
+step:121/2245 train_time:7360ms step_avg:60.82ms
+step:122/2245 train_time:7420ms step_avg:60.82ms
+step:123/2245 train_time:7482ms step_avg:60.83ms
+step:124/2245 train_time:7541ms step_avg:60.82ms
+step:125/2245 train_time:7603ms step_avg:60.82ms
+step:126/2245 train_time:7662ms step_avg:60.81ms
+step:127/2245 train_time:7725ms step_avg:60.82ms
+step:128/2245 train_time:7784ms step_avg:60.81ms
+step:129/2245 train_time:7845ms step_avg:60.81ms
+step:130/2245 train_time:7904ms step_avg:60.80ms
+step:131/2245 train_time:7966ms step_avg:60.81ms
+step:132/2245 train_time:8025ms step_avg:60.79ms
+step:133/2245 train_time:8087ms step_avg:60.81ms
+step:134/2245 train_time:8146ms step_avg:60.79ms
+step:135/2245 train_time:8207ms step_avg:60.80ms
+step:136/2245 train_time:8267ms step_avg:60.78ms
+step:137/2245 train_time:8328ms step_avg:60.79ms
+step:138/2245 train_time:8387ms step_avg:60.78ms
+step:139/2245 train_time:8449ms step_avg:60.79ms
+step:140/2245 train_time:8508ms step_avg:60.77ms
+step:141/2245 train_time:8571ms step_avg:60.78ms
+step:142/2245 train_time:8630ms step_avg:60.77ms
+step:143/2245 train_time:8691ms step_avg:60.78ms
+step:144/2245 train_time:8750ms step_avg:60.76ms
+step:145/2245 train_time:8812ms step_avg:60.77ms
+step:146/2245 train_time:8871ms step_avg:60.76ms
+step:147/2245 train_time:8932ms step_avg:60.76ms
+step:148/2245 train_time:8991ms step_avg:60.75ms
+step:149/2245 train_time:9053ms step_avg:60.76ms
+step:150/2245 train_time:9112ms step_avg:60.74ms
+step:151/2245 train_time:9173ms step_avg:60.75ms
+step:152/2245 train_time:9232ms step_avg:60.74ms
+step:153/2245 train_time:9294ms step_avg:60.74ms
+step:154/2245 train_time:9353ms step_avg:60.73ms
+step:155/2245 train_time:9415ms step_avg:60.74ms
+step:156/2245 train_time:9475ms step_avg:60.74ms
+step:157/2245 train_time:9537ms step_avg:60.75ms
+step:158/2245 train_time:9597ms step_avg:60.74ms
+step:159/2245 train_time:9659ms step_avg:60.75ms
+step:160/2245 train_time:9719ms step_avg:60.75ms
+step:161/2245 train_time:9781ms step_avg:60.75ms
+step:162/2245 train_time:9841ms step_avg:60.74ms
+step:163/2245 train_time:9902ms step_avg:60.75ms
+step:164/2245 train_time:9961ms step_avg:60.74ms
+step:165/2245 train_time:10022ms step_avg:60.74ms
+step:166/2245 train_time:10081ms step_avg:60.73ms
+step:167/2245 train_time:10143ms step_avg:60.74ms
+step:168/2245 train_time:10202ms step_avg:60.73ms
+step:169/2245 train_time:10264ms step_avg:60.73ms
+step:170/2245 train_time:10323ms step_avg:60.72ms
+step:171/2245 train_time:10384ms step_avg:60.73ms
+step:172/2245 train_time:10443ms step_avg:60.72ms
+step:173/2245 train_time:10505ms step_avg:60.72ms
+step:174/2245 train_time:10564ms step_avg:60.71ms
+step:175/2245 train_time:10625ms step_avg:60.72ms
+step:176/2245 train_time:10684ms step_avg:60.71ms
+step:177/2245 train_time:10746ms step_avg:60.71ms
+step:178/2245 train_time:10805ms step_avg:60.70ms
+step:179/2245 train_time:10867ms step_avg:60.71ms
+step:180/2245 train_time:10926ms step_avg:60.70ms
+step:181/2245 train_time:10987ms step_avg:60.70ms
+step:182/2245 train_time:11046ms step_avg:60.69ms
+step:183/2245 train_time:11107ms step_avg:60.70ms
+step:184/2245 train_time:11166ms step_avg:60.69ms
+step:185/2245 train_time:11227ms step_avg:60.69ms
+step:186/2245 train_time:11286ms step_avg:60.68ms
+step:187/2245 train_time:11347ms step_avg:60.68ms
+step:188/2245 train_time:11406ms step_avg:60.67ms
+step:189/2245 train_time:11468ms step_avg:60.67ms
+step:190/2245 train_time:11526ms step_avg:60.67ms
+step:191/2245 train_time:11588ms step_avg:60.67ms
+step:192/2245 train_time:11647ms step_avg:60.66ms
+step:193/2245 train_time:11709ms step_avg:60.67ms
+step:194/2245 train_time:11768ms step_avg:60.66ms
+step:195/2245 train_time:11829ms step_avg:60.66ms
+step:196/2245 train_time:11888ms step_avg:60.65ms
+step:197/2245 train_time:11950ms step_avg:60.66ms
+step:198/2245 train_time:12009ms step_avg:60.65ms
+step:199/2245 train_time:12070ms step_avg:60.66ms
+step:200/2245 train_time:12129ms step_avg:60.64ms
+step:201/2245 train_time:12190ms step_avg:60.65ms
+step:202/2245 train_time:12250ms step_avg:60.64ms
+step:203/2245 train_time:12311ms step_avg:60.65ms
+step:204/2245 train_time:12371ms step_avg:60.64ms
+step:205/2245 train_time:12432ms step_avg:60.64ms
+step:206/2245 train_time:12491ms step_avg:60.64ms
+step:207/2245 train_time:12552ms step_avg:60.64ms
+step:208/2245 train_time:12611ms step_avg:60.63ms
+step:209/2245 train_time:12672ms step_avg:60.63ms
+step:210/2245 train_time:12731ms step_avg:60.62ms
+step:211/2245 train_time:12793ms step_avg:60.63ms
+step:212/2245 train_time:12852ms step_avg:60.62ms
+step:213/2245 train_time:12914ms step_avg:60.63ms
+step:214/2245 train_time:12974ms step_avg:60.62ms
+step:215/2245 train_time:13036ms step_avg:60.63ms
+step:216/2245 train_time:13095ms step_avg:60.63ms
+step:217/2245 train_time:13157ms step_avg:60.63ms
+step:218/2245 train_time:13216ms step_avg:60.62ms
+step:219/2245 train_time:13277ms step_avg:60.63ms
+step:220/2245 train_time:13336ms step_avg:60.62ms
+step:221/2245 train_time:13398ms step_avg:60.63ms
+step:222/2245 train_time:13458ms step_avg:60.62ms
+step:223/2245 train_time:13519ms step_avg:60.63ms
+step:224/2245 train_time:13579ms step_avg:60.62ms
+step:225/2245 train_time:13641ms step_avg:60.63ms
+step:226/2245 train_time:13700ms step_avg:60.62ms
+step:227/2245 train_time:13761ms step_avg:60.62ms
+step:228/2245 train_time:13820ms step_avg:60.62ms
+step:229/2245 train_time:13882ms step_avg:60.62ms
+step:230/2245 train_time:13942ms step_avg:60.62ms
+step:231/2245 train_time:14003ms step_avg:60.62ms
+step:232/2245 train_time:14062ms step_avg:60.61ms
+step:233/2245 train_time:14124ms step_avg:60.62ms
+step:234/2245 train_time:14183ms step_avg:60.61ms
+step:235/2245 train_time:14245ms step_avg:60.62ms
+step:236/2245 train_time:14303ms step_avg:60.60ms
+step:237/2245 train_time:14364ms step_avg:60.61ms
+step:238/2245 train_time:14423ms step_avg:60.60ms
+step:239/2245 train_time:14484ms step_avg:60.60ms
+step:240/2245 train_time:14543ms step_avg:60.60ms
+step:241/2245 train_time:14604ms step_avg:60.60ms
+step:242/2245 train_time:14663ms step_avg:60.59ms
+step:243/2245 train_time:14726ms step_avg:60.60ms
+step:244/2245 train_time:14784ms step_avg:60.59ms
+step:245/2245 train_time:14845ms step_avg:60.59ms
+step:246/2245 train_time:14904ms step_avg:60.58ms
+step:247/2245 train_time:14965ms step_avg:60.59ms
+step:248/2245 train_time:15024ms step_avg:60.58ms
+step:249/2245 train_time:15086ms step_avg:60.59ms
+step:250/2245 train_time:15145ms step_avg:60.58ms
+step:250/2245 val_loss:4.0752 train_time:15207ms step_avg:60.83ms
+step:251/2245 train_time:15226ms step_avg:60.66ms
+step:252/2245 train_time:15267ms step_avg:60.58ms
+step:253/2245 train_time:15334ms step_avg:60.61ms
+step:254/2245 train_time:15399ms step_avg:60.63ms
+step:255/2245 train_time:15460ms step_avg:60.63ms
+step:256/2245 train_time:15519ms step_avg:60.62ms
+step:257/2245 train_time:15580ms step_avg:60.62ms
+step:258/2245 train_time:15638ms step_avg:60.61ms
+step:259/2245 train_time:15699ms step_avg:60.61ms
+step:260/2245 train_time:15758ms step_avg:60.61ms
+step:261/2245 train_time:15818ms step_avg:60.61ms
+step:262/2245 train_time:15876ms step_avg:60.60ms
+step:263/2245 train_time:15937ms step_avg:60.60ms
+step:264/2245 train_time:15996ms step_avg:60.59ms
+step:265/2245 train_time:16056ms step_avg:60.59ms
+step:266/2245 train_time:16116ms step_avg:60.59ms
+step:267/2245 train_time:16179ms step_avg:60.59ms
+step:268/2245 train_time:16240ms step_avg:60.60ms
+step:269/2245 train_time:16304ms step_avg:60.61ms
+step:270/2245 train_time:16364ms step_avg:60.61ms
+step:271/2245 train_time:16426ms step_avg:60.61ms
+step:272/2245 train_time:16485ms step_avg:60.61ms
+step:273/2245 train_time:16547ms step_avg:60.61ms
+step:274/2245 train_time:16606ms step_avg:60.60ms
+step:275/2245 train_time:16668ms step_avg:60.61ms
+step:276/2245 train_time:16727ms step_avg:60.60ms
+step:277/2245 train_time:16787ms step_avg:60.60ms
+step:278/2245 train_time:16846ms step_avg:60.60ms
+step:279/2245 train_time:16907ms step_avg:60.60ms
+step:280/2245 train_time:16966ms step_avg:60.59ms
+step:281/2245 train_time:17028ms step_avg:60.60ms
+step:282/2245 train_time:17087ms step_avg:60.59ms
+step:283/2245 train_time:17148ms step_avg:60.59ms
+step:284/2245 train_time:17207ms step_avg:60.59ms
+step:285/2245 train_time:17269ms step_avg:60.59ms
+step:286/2245 train_time:17328ms step_avg:60.59ms
+step:287/2245 train_time:17389ms step_avg:60.59ms
+step:288/2245 train_time:17448ms step_avg:60.58ms
+step:289/2245 train_time:17509ms step_avg:60.59ms
+step:290/2245 train_time:17568ms step_avg:60.58ms
+step:291/2245 train_time:17630ms step_avg:60.58ms
+step:292/2245 train_time:17689ms step_avg:60.58ms
+step:293/2245 train_time:17750ms step_avg:60.58ms
+step:294/2245 train_time:17809ms step_avg:60.57ms
+step:295/2245 train_time:17870ms step_avg:60.58ms
+step:296/2245 train_time:17929ms step_avg:60.57ms
+step:297/2245 train_time:17990ms step_avg:60.57ms
+step:298/2245 train_time:18048ms step_avg:60.56ms
+step:299/2245 train_time:18110ms step_avg:60.57ms
+step:300/2245 train_time:18168ms step_avg:60.56ms
+step:301/2245 train_time:18230ms step_avg:60.56ms
+step:302/2245 train_time:18289ms step_avg:60.56ms
+step:303/2245 train_time:18350ms step_avg:60.56ms
+step:304/2245 train_time:18409ms step_avg:60.56ms
+step:305/2245 train_time:18470ms step_avg:60.56ms
+step:306/2245 train_time:18529ms step_avg:60.55ms
+step:307/2245 train_time:18590ms step_avg:60.56ms
+step:308/2245 train_time:18649ms step_avg:60.55ms
+step:309/2245 train_time:18710ms step_avg:60.55ms
+step:310/2245 train_time:18769ms step_avg:60.54ms
+step:311/2245 train_time:18830ms step_avg:60.55ms
+step:312/2245 train_time:18889ms step_avg:60.54ms
+step:313/2245 train_time:18950ms step_avg:60.54ms
+step:314/2245 train_time:19009ms step_avg:60.54ms
+step:315/2245 train_time:19070ms step_avg:60.54ms
+step:316/2245 train_time:19129ms step_avg:60.53ms
+step:317/2245 train_time:19190ms step_avg:60.54ms
+step:318/2245 train_time:19251ms step_avg:60.54ms
+step:319/2245 train_time:19309ms step_avg:60.53ms
+step:320/2245 train_time:19368ms step_avg:60.53ms
+step:321/2245 train_time:19430ms step_avg:60.53ms
+step:322/2245 train_time:19488ms step_avg:60.52ms
+step:323/2245 train_time:19549ms step_avg:60.52ms
+step:324/2245 train_time:19609ms step_avg:60.52ms
+step:325/2245 train_time:19670ms step_avg:60.52ms
+step:326/2245 train_time:19728ms step_avg:60.51ms
+step:327/2245 train_time:19789ms step_avg:60.52ms
+step:328/2245 train_time:19847ms step_avg:60.51ms
+step:329/2245 train_time:19908ms step_avg:60.51ms
+step:330/2245 train_time:19967ms step_avg:60.51ms
+step:331/2245 train_time:20029ms step_avg:60.51ms
+step:332/2245 train_time:20087ms step_avg:60.50ms
+step:333/2245 train_time:20148ms step_avg:60.51ms
+step:334/2245 train_time:20207ms step_avg:60.50ms
+step:335/2245 train_time:20268ms step_avg:60.50ms
+step:336/2245 train_time:20327ms step_avg:60.50ms
+step:337/2245 train_time:20389ms step_avg:60.50ms
+step:338/2245 train_time:20447ms step_avg:60.49ms
+step:339/2245 train_time:20509ms step_avg:60.50ms
+step:340/2245 train_time:20567ms step_avg:60.49ms
+step:341/2245 train_time:20629ms step_avg:60.50ms
+step:342/2245 train_time:20688ms step_avg:60.49ms
+step:343/2245 train_time:20748ms step_avg:60.49ms
+step:344/2245 train_time:20807ms step_avg:60.49ms
+step:345/2245 train_time:20868ms step_avg:60.49ms
+step:346/2245 train_time:20927ms step_avg:60.48ms
+step:347/2245 train_time:20988ms step_avg:60.48ms
+step:348/2245 train_time:21046ms step_avg:60.48ms
+step:349/2245 train_time:21108ms step_avg:60.48ms
+step:350/2245 train_time:21167ms step_avg:60.48ms
+step:351/2245 train_time:21228ms step_avg:60.48ms
+step:352/2245 train_time:21287ms step_avg:60.47ms
+step:353/2245 train_time:21348ms step_avg:60.48ms
+step:354/2245 train_time:21407ms step_avg:60.47ms
+step:355/2245 train_time:21470ms step_avg:60.48ms
+step:356/2245 train_time:21528ms step_avg:60.47ms
+step:357/2245 train_time:21589ms step_avg:60.47ms
+step:358/2245 train_time:21648ms step_avg:60.47ms
+step:359/2245 train_time:21709ms step_avg:60.47ms
+step:360/2245 train_time:21768ms step_avg:60.47ms
+step:361/2245 train_time:21829ms step_avg:60.47ms
+step:362/2245 train_time:21887ms step_avg:60.46ms
+step:363/2245 train_time:21948ms step_avg:60.46ms
+step:364/2245 train_time:22007ms step_avg:60.46ms
+step:365/2245 train_time:22068ms step_avg:60.46ms
+step:366/2245 train_time:22127ms step_avg:60.46ms
+step:367/2245 train_time:22188ms step_avg:60.46ms
+step:368/2245 train_time:22247ms step_avg:60.45ms
+step:369/2245 train_time:22308ms step_avg:60.46ms
+step:370/2245 train_time:22367ms step_avg:60.45ms
+step:371/2245 train_time:22428ms step_avg:60.45ms
+step:372/2245 train_time:22486ms step_avg:60.45ms
+step:373/2245 train_time:22548ms step_avg:60.45ms
+step:374/2245 train_time:22607ms step_avg:60.45ms
+step:375/2245 train_time:22669ms step_avg:60.45ms
+step:376/2245 train_time:22727ms step_avg:60.45ms
+step:377/2245 train_time:22789ms step_avg:60.45ms
+step:378/2245 train_time:22847ms step_avg:60.44ms
+step:379/2245 train_time:22908ms step_avg:60.44ms
+step:380/2245 train_time:22966ms step_avg:60.44ms
+step:381/2245 train_time:23028ms step_avg:60.44ms
+step:382/2245 train_time:23087ms step_avg:60.44ms
+step:383/2245 train_time:23148ms step_avg:60.44ms
+step:384/2245 train_time:23207ms step_avg:60.43ms
+step:385/2245 train_time:23268ms step_avg:60.44ms
+step:386/2245 train_time:23327ms step_avg:60.43ms
+step:387/2245 train_time:23389ms step_avg:60.44ms
+step:388/2245 train_time:23448ms step_avg:60.43ms
+step:389/2245 train_time:23509ms step_avg:60.43ms
+step:390/2245 train_time:23568ms step_avg:60.43ms
+step:391/2245 train_time:23629ms step_avg:60.43ms
+step:392/2245 train_time:23688ms step_avg:60.43ms
+step:393/2245 train_time:23749ms step_avg:60.43ms
+step:394/2245 train_time:23808ms step_avg:60.43ms
+step:395/2245 train_time:23869ms step_avg:60.43ms
+step:396/2245 train_time:23927ms step_avg:60.42ms
+step:397/2245 train_time:23988ms step_avg:60.42ms
+step:398/2245 train_time:24047ms step_avg:60.42ms
+step:399/2245 train_time:24108ms step_avg:60.42ms
+step:400/2245 train_time:24167ms step_avg:60.42ms
+step:401/2245 train_time:24228ms step_avg:60.42ms
+step:402/2245 train_time:24287ms step_avg:60.41ms
+step:403/2245 train_time:24350ms step_avg:60.42ms
+step:404/2245 train_time:24407ms step_avg:60.41ms
+step:405/2245 train_time:24469ms step_avg:60.42ms
+step:406/2245 train_time:24528ms step_avg:60.41ms
+step:407/2245 train_time:24590ms step_avg:60.42ms
+step:408/2245 train_time:24648ms step_avg:60.41ms
+step:409/2245 train_time:24709ms step_avg:60.41ms
+step:410/2245 train_time:24768ms step_avg:60.41ms
+step:411/2245 train_time:24830ms step_avg:60.41ms
+step:412/2245 train_time:24888ms step_avg:60.41ms
+step:413/2245 train_time:24949ms step_avg:60.41ms
+step:414/2245 train_time:25007ms step_avg:60.40ms
+step:415/2245 train_time:25070ms step_avg:60.41ms
+step:416/2245 train_time:25128ms step_avg:60.40ms
+step:417/2245 train_time:25189ms step_avg:60.41ms
+step:418/2245 train_time:25248ms step_avg:60.40ms
+step:419/2245 train_time:25310ms step_avg:60.41ms
+step:420/2245 train_time:25368ms step_avg:60.40ms
+step:421/2245 train_time:25430ms step_avg:60.40ms
+step:422/2245 train_time:25489ms step_avg:60.40ms
+step:423/2245 train_time:25550ms step_avg:60.40ms
+step:424/2245 train_time:25608ms step_avg:60.40ms
+step:425/2245 train_time:25670ms step_avg:60.40ms
+step:426/2245 train_time:25729ms step_avg:60.40ms
+step:427/2245 train_time:25790ms step_avg:60.40ms
+step:428/2245 train_time:25848ms step_avg:60.39ms
+step:429/2245 train_time:25910ms step_avg:60.40ms
+step:430/2245 train_time:25968ms step_avg:60.39ms
+step:431/2245 train_time:26030ms step_avg:60.39ms
+step:432/2245 train_time:26088ms step_avg:60.39ms
+step:433/2245 train_time:26150ms step_avg:60.39ms
+step:434/2245 train_time:26208ms step_avg:60.39ms
+step:435/2245 train_time:26270ms step_avg:60.39ms
+step:436/2245 train_time:26328ms step_avg:60.39ms
+step:437/2245 train_time:26390ms step_avg:60.39ms
+step:438/2245 train_time:26449ms step_avg:60.38ms
+step:439/2245 train_time:26510ms step_avg:60.39ms
+step:440/2245 train_time:26569ms step_avg:60.38ms
+step:441/2245 train_time:26630ms step_avg:60.39ms
+step:442/2245 train_time:26689ms step_avg:60.38ms
+step:443/2245 train_time:26750ms step_avg:60.38ms
+step:444/2245 train_time:26809ms step_avg:60.38ms
+step:445/2245 train_time:26870ms step_avg:60.38ms
+step:446/2245 train_time:26928ms step_avg:60.38ms
+step:447/2245 train_time:26989ms step_avg:60.38ms
+step:448/2245 train_time:27048ms step_avg:60.38ms
+step:449/2245 train_time:27109ms step_avg:60.38ms
+step:450/2245 train_time:27168ms step_avg:60.37ms
+step:451/2245 train_time:27229ms step_avg:60.37ms
+step:452/2245 train_time:27288ms step_avg:60.37ms
+step:453/2245 train_time:27349ms step_avg:60.37ms
+step:454/2245 train_time:27408ms step_avg:60.37ms
+step:455/2245 train_time:27469ms step_avg:60.37ms
+step:456/2245 train_time:27528ms step_avg:60.37ms
+step:457/2245 train_time:27589ms step_avg:60.37ms
+step:458/2245 train_time:27648ms step_avg:60.37ms
+step:459/2245 train_time:27709ms step_avg:60.37ms
+step:460/2245 train_time:27768ms step_avg:60.36ms
+step:461/2245 train_time:27829ms step_avg:60.37ms
+step:462/2245 train_time:27888ms step_avg:60.36ms
+step:463/2245 train_time:27949ms step_avg:60.36ms
+step:464/2245 train_time:28008ms step_avg:60.36ms
+step:465/2245 train_time:28070ms step_avg:60.36ms
+step:466/2245 train_time:28128ms step_avg:60.36ms
+step:467/2245 train_time:28190ms step_avg:60.36ms
+step:468/2245 train_time:28248ms step_avg:60.36ms
+step:469/2245 train_time:28309ms step_avg:60.36ms
+step:470/2245 train_time:28368ms step_avg:60.36ms
+step:471/2245 train_time:28429ms step_avg:60.36ms
+step:472/2245 train_time:28489ms step_avg:60.36ms
+step:473/2245 train_time:28550ms step_avg:60.36ms
+step:474/2245 train_time:28609ms step_avg:60.36ms
+step:475/2245 train_time:28670ms step_avg:60.36ms
+step:476/2245 train_time:28729ms step_avg:60.36ms
+step:477/2245 train_time:28790ms step_avg:60.36ms
+step:478/2245 train_time:28849ms step_avg:60.35ms
+step:479/2245 train_time:28910ms step_avg:60.36ms
+step:480/2245 train_time:28969ms step_avg:60.35ms
+step:481/2245 train_time:29031ms step_avg:60.35ms
+step:482/2245 train_time:29089ms step_avg:60.35ms
+step:483/2245 train_time:29152ms step_avg:60.36ms
+step:484/2245 train_time:29209ms step_avg:60.35ms
+step:485/2245 train_time:29270ms step_avg:60.35ms
+step:486/2245 train_time:29329ms step_avg:60.35ms
+step:487/2245 train_time:29390ms step_avg:60.35ms
+step:488/2245 train_time:29449ms step_avg:60.35ms
+step:489/2245 train_time:29510ms step_avg:60.35ms
+step:490/2245 train_time:29569ms step_avg:60.34ms
+step:491/2245 train_time:29630ms step_avg:60.35ms
+step:492/2245 train_time:29689ms step_avg:60.34ms
+step:493/2245 train_time:29750ms step_avg:60.34ms
+step:494/2245 train_time:29809ms step_avg:60.34ms
+step:495/2245 train_time:29870ms step_avg:60.34ms
+step:496/2245 train_time:29929ms step_avg:60.34ms
+step:497/2245 train_time:29990ms step_avg:60.34ms
+step:498/2245 train_time:30049ms step_avg:60.34ms
+step:499/2245 train_time:30110ms step_avg:60.34ms
+step:500/2245 train_time:30169ms step_avg:60.34ms
+step:500/2245 val_loss:3.8231 train_time:30231ms step_avg:60.46ms
+step:501/2245 train_time:30252ms step_avg:60.38ms
+step:502/2245 train_time:30293ms step_avg:60.34ms
+step:503/2245 train_time:30357ms step_avg:60.35ms
+step:504/2245 train_time:30417ms step_avg:60.35ms
+step:505/2245 train_time:30479ms step_avg:60.35ms
+step:506/2245 train_time:30537ms step_avg:60.35ms
+step:507/2245 train_time:30598ms step_avg:60.35ms
+step:508/2245 train_time:30657ms step_avg:60.35ms
+step:509/2245 train_time:30717ms step_avg:60.35ms
+step:510/2245 train_time:30775ms step_avg:60.34ms
+step:511/2245 train_time:30836ms step_avg:60.35ms
+step:512/2245 train_time:30895ms step_avg:60.34ms
+step:513/2245 train_time:30956ms step_avg:60.34ms
+step:514/2245 train_time:31015ms step_avg:60.34ms
+step:515/2245 train_time:31075ms step_avg:60.34ms
+step:516/2245 train_time:31134ms step_avg:60.34ms
+step:517/2245 train_time:31198ms step_avg:60.34ms
+step:518/2245 train_time:31258ms step_avg:60.34ms
+step:519/2245 train_time:31321ms step_avg:60.35ms
+step:520/2245 train_time:31381ms step_avg:60.35ms
+step:521/2245 train_time:31442ms step_avg:60.35ms
+step:522/2245 train_time:31501ms step_avg:60.35ms
+step:523/2245 train_time:31562ms step_avg:60.35ms
+step:524/2245 train_time:31621ms step_avg:60.35ms
+step:525/2245 train_time:31682ms step_avg:60.35ms
+step:526/2245 train_time:31742ms step_avg:60.35ms
+step:527/2245 train_time:31803ms step_avg:60.35ms
+step:528/2245 train_time:31862ms step_avg:60.34ms
+step:529/2245 train_time:31923ms step_avg:60.35ms
+step:530/2245 train_time:31982ms step_avg:60.34ms
+step:531/2245 train_time:32043ms step_avg:60.34ms
+step:532/2245 train_time:32102ms step_avg:60.34ms
+step:533/2245 train_time:32164ms step_avg:60.35ms
+step:534/2245 train_time:32224ms step_avg:60.34ms
+step:535/2245 train_time:32286ms step_avg:60.35ms
+step:536/2245 train_time:32346ms step_avg:60.35ms
+step:537/2245 train_time:32408ms step_avg:60.35ms
+step:538/2245 train_time:32467ms step_avg:60.35ms
+step:539/2245 train_time:32529ms step_avg:60.35ms
+step:540/2245 train_time:32588ms step_avg:60.35ms
+step:541/2245 train_time:32650ms step_avg:60.35ms
+step:542/2245 train_time:32709ms step_avg:60.35ms
+step:543/2245 train_time:32770ms step_avg:60.35ms
+step:544/2245 train_time:32830ms step_avg:60.35ms
+step:545/2245 train_time:32891ms step_avg:60.35ms
+step:546/2245 train_time:32951ms step_avg:60.35ms
+step:547/2245 train_time:33012ms step_avg:60.35ms
+step:548/2245 train_time:33071ms step_avg:60.35ms
+step:549/2245 train_time:33133ms step_avg:60.35ms
+step:550/2245 train_time:33192ms step_avg:60.35ms
+step:551/2245 train_time:33254ms step_avg:60.35ms
+step:552/2245 train_time:33312ms step_avg:60.35ms
+step:553/2245 train_time:33374ms step_avg:60.35ms
+step:554/2245 train_time:33432ms step_avg:60.35ms
+step:555/2245 train_time:33494ms step_avg:60.35ms
+step:556/2245 train_time:33553ms step_avg:60.35ms
+step:557/2245 train_time:33614ms step_avg:60.35ms
+step:558/2245 train_time:33673ms step_avg:60.35ms
+step:559/2245 train_time:33734ms step_avg:60.35ms
+step:560/2245 train_time:33793ms step_avg:60.34ms
+step:561/2245 train_time:33854ms step_avg:60.35ms
+step:562/2245 train_time:33913ms step_avg:60.34ms
+step:563/2245 train_time:33974ms step_avg:60.35ms
+step:564/2245 train_time:34033ms step_avg:60.34ms
+step:565/2245 train_time:34095ms step_avg:60.34ms
+step:566/2245 train_time:34154ms step_avg:60.34ms
+step:567/2245 train_time:34215ms step_avg:60.34ms
+step:568/2245 train_time:34274ms step_avg:60.34ms
+step:569/2245 train_time:34335ms step_avg:60.34ms
+step:570/2245 train_time:34394ms step_avg:60.34ms
+step:571/2245 train_time:34456ms step_avg:60.34ms
+step:572/2245 train_time:34514ms step_avg:60.34ms
+step:573/2245 train_time:34576ms step_avg:60.34ms
+step:574/2245 train_time:34635ms step_avg:60.34ms
+step:575/2245 train_time:34696ms step_avg:60.34ms
+step:576/2245 train_time:34755ms step_avg:60.34ms
+step:577/2245 train_time:34816ms step_avg:60.34ms
+step:578/2245 train_time:34875ms step_avg:60.34ms
+step:579/2245 train_time:34937ms step_avg:60.34ms
+step:580/2245 train_time:34995ms step_avg:60.34ms
+step:581/2245 train_time:35057ms step_avg:60.34ms
+step:582/2245 train_time:35115ms step_avg:60.34ms
+step:583/2245 train_time:35177ms step_avg:60.34ms
+step:584/2245 train_time:35236ms step_avg:60.33ms
+step:585/2245 train_time:35298ms step_avg:60.34ms
+step:586/2245 train_time:35356ms step_avg:60.34ms
+step:587/2245 train_time:35418ms step_avg:60.34ms
+step:588/2245 train_time:35476ms step_avg:60.33ms
+step:589/2245 train_time:35538ms step_avg:60.34ms
+step:590/2245 train_time:35597ms step_avg:60.33ms
+step:591/2245 train_time:35658ms step_avg:60.34ms
+step:592/2245 train_time:35717ms step_avg:60.33ms
+step:593/2245 train_time:35779ms step_avg:60.33ms
+step:594/2245 train_time:35838ms step_avg:60.33ms
+step:595/2245 train_time:35899ms step_avg:60.33ms
+step:596/2245 train_time:35958ms step_avg:60.33ms
+step:597/2245 train_time:36019ms step_avg:60.33ms
+step:598/2245 train_time:36078ms step_avg:60.33ms
+step:599/2245 train_time:36139ms step_avg:60.33ms
+step:600/2245 train_time:36199ms step_avg:60.33ms
+step:601/2245 train_time:36261ms step_avg:60.33ms
+step:602/2245 train_time:36320ms step_avg:60.33ms
+step:603/2245 train_time:36381ms step_avg:60.33ms
+step:604/2245 train_time:36440ms step_avg:60.33ms
+step:605/2245 train_time:36502ms step_avg:60.33ms
+step:606/2245 train_time:36560ms step_avg:60.33ms
+step:607/2245 train_time:36622ms step_avg:60.33ms
+step:608/2245 train_time:36681ms step_avg:60.33ms
+step:609/2245 train_time:36742ms step_avg:60.33ms
+step:610/2245 train_time:36802ms step_avg:60.33ms
+step:611/2245 train_time:36864ms step_avg:60.33ms
+step:612/2245 train_time:36923ms step_avg:60.33ms
+step:613/2245 train_time:36984ms step_avg:60.33ms
+step:614/2245 train_time:37044ms step_avg:60.33ms
+step:615/2245 train_time:37106ms step_avg:60.33ms
+step:616/2245 train_time:37165ms step_avg:60.33ms
+step:617/2245 train_time:37227ms step_avg:60.34ms
+step:618/2245 train_time:37286ms step_avg:60.33ms
+step:619/2245 train_time:37348ms step_avg:60.34ms
+step:620/2245 train_time:37408ms step_avg:60.34ms
+step:621/2245 train_time:37469ms step_avg:60.34ms
+step:622/2245 train_time:37528ms step_avg:60.33ms
+step:623/2245 train_time:37590ms step_avg:60.34ms
+step:624/2245 train_time:37650ms step_avg:60.34ms
+step:625/2245 train_time:37711ms step_avg:60.34ms
+step:626/2245 train_time:37770ms step_avg:60.34ms
+step:627/2245 train_time:37831ms step_avg:60.34ms
+step:628/2245 train_time:37890ms step_avg:60.33ms
+step:629/2245 train_time:37951ms step_avg:60.34ms
+step:630/2245 train_time:38010ms step_avg:60.33ms
+step:631/2245 train_time:38072ms step_avg:60.34ms
+step:632/2245 train_time:38131ms step_avg:60.33ms
+step:633/2245 train_time:38193ms step_avg:60.34ms
+step:634/2245 train_time:38252ms step_avg:60.33ms
+step:635/2245 train_time:38314ms step_avg:60.34ms
+step:636/2245 train_time:38372ms step_avg:60.33ms
+step:637/2245 train_time:38433ms step_avg:60.34ms
+step:638/2245 train_time:38492ms step_avg:60.33ms
+step:639/2245 train_time:38554ms step_avg:60.33ms
+step:640/2245 train_time:38613ms step_avg:60.33ms
+step:641/2245 train_time:38674ms step_avg:60.33ms
+step:642/2245 train_time:38733ms step_avg:60.33ms
+step:643/2245 train_time:38794ms step_avg:60.33ms
+step:644/2245 train_time:38853ms step_avg:60.33ms
+step:645/2245 train_time:38914ms step_avg:60.33ms
+step:646/2245 train_time:38973ms step_avg:60.33ms
+step:647/2245 train_time:39036ms step_avg:60.33ms
+step:648/2245 train_time:39095ms step_avg:60.33ms
+step:649/2245 train_time:39156ms step_avg:60.33ms
+step:650/2245 train_time:39214ms step_avg:60.33ms
+step:651/2245 train_time:39276ms step_avg:60.33ms
+step:652/2245 train_time:39335ms step_avg:60.33ms
+step:653/2245 train_time:39396ms step_avg:60.33ms
+step:654/2245 train_time:39455ms step_avg:60.33ms
+step:655/2245 train_time:39517ms step_avg:60.33ms
+step:656/2245 train_time:39576ms step_avg:60.33ms
+step:657/2245 train_time:39638ms step_avg:60.33ms
+step:658/2245 train_time:39697ms step_avg:60.33ms
+step:659/2245 train_time:39758ms step_avg:60.33ms
+step:660/2245 train_time:39817ms step_avg:60.33ms
+step:661/2245 train_time:39879ms step_avg:60.33ms
+step:662/2245 train_time:39938ms step_avg:60.33ms
+step:663/2245 train_time:39999ms step_avg:60.33ms
+step:664/2245 train_time:40058ms step_avg:60.33ms
+step:665/2245 train_time:40119ms step_avg:60.33ms
+step:666/2245 train_time:40178ms step_avg:60.33ms
+step:667/2245 train_time:40239ms step_avg:60.33ms
+step:668/2245 train_time:40298ms step_avg:60.33ms
+step:669/2245 train_time:40359ms step_avg:60.33ms
+step:670/2245 train_time:40418ms step_avg:60.33ms
+step:671/2245 train_time:40479ms step_avg:60.33ms
+step:672/2245 train_time:40538ms step_avg:60.32ms
+step:673/2245 train_time:40599ms step_avg:60.33ms
+step:674/2245 train_time:40659ms step_avg:60.32ms
+step:675/2245 train_time:40721ms step_avg:60.33ms
+step:676/2245 train_time:40780ms step_avg:60.33ms
+step:677/2245 train_time:40841ms step_avg:60.33ms
+step:678/2245 train_time:40900ms step_avg:60.32ms
+step:679/2245 train_time:40962ms step_avg:60.33ms
+step:680/2245 train_time:41021ms step_avg:60.32ms
+step:681/2245 train_time:41082ms step_avg:60.33ms
+step:682/2245 train_time:41141ms step_avg:60.32ms
+step:683/2245 train_time:41203ms step_avg:60.33ms
+step:684/2245 train_time:41263ms step_avg:60.33ms
+step:685/2245 train_time:41324ms step_avg:60.33ms
+step:686/2245 train_time:41383ms step_avg:60.33ms
+step:687/2245 train_time:41445ms step_avg:60.33ms
+step:688/2245 train_time:41505ms step_avg:60.33ms
+step:689/2245 train_time:41567ms step_avg:60.33ms
+step:690/2245 train_time:41626ms step_avg:60.33ms
+step:691/2245 train_time:41688ms step_avg:60.33ms
+step:692/2245 train_time:41748ms step_avg:60.33ms
+step:693/2245 train_time:41810ms step_avg:60.33ms
+step:694/2245 train_time:41869ms step_avg:60.33ms
+step:695/2245 train_time:41930ms step_avg:60.33ms
+step:696/2245 train_time:41989ms step_avg:60.33ms
+step:697/2245 train_time:42051ms step_avg:60.33ms
+step:698/2245 train_time:42109ms step_avg:60.33ms
+step:699/2245 train_time:42171ms step_avg:60.33ms
+step:700/2245 train_time:42229ms step_avg:60.33ms
+step:701/2245 train_time:42291ms step_avg:60.33ms
+step:702/2245 train_time:42351ms step_avg:60.33ms
+step:703/2245 train_time:42412ms step_avg:60.33ms
+step:704/2245 train_time:42471ms step_avg:60.33ms
+step:705/2245 train_time:42533ms step_avg:60.33ms
+step:706/2245 train_time:42595ms step_avg:60.33ms
+step:707/2245 train_time:42653ms step_avg:60.33ms
+step:708/2245 train_time:42713ms step_avg:60.33ms
+step:709/2245 train_time:42774ms step_avg:60.33ms
+step:710/2245 train_time:42832ms step_avg:60.33ms
+step:711/2245 train_time:42893ms step_avg:60.33ms
+step:712/2245 train_time:42952ms step_avg:60.33ms
+step:713/2245 train_time:43013ms step_avg:60.33ms
+step:714/2245 train_time:43072ms step_avg:60.32ms
+step:715/2245 train_time:43133ms step_avg:60.33ms
+step:716/2245 train_time:43192ms step_avg:60.32ms
+step:717/2245 train_time:43254ms step_avg:60.33ms
+step:718/2245 train_time:43314ms step_avg:60.33ms
+step:719/2245 train_time:43375ms step_avg:60.33ms
+step:720/2245 train_time:43434ms step_avg:60.32ms
+step:721/2245 train_time:44026ms step_avg:61.06ms
+step:722/2245 train_time:44083ms step_avg:61.06ms
+step:723/2245 train_time:44143ms step_avg:61.06ms
+step:724/2245 train_time:44202ms step_avg:61.05ms
+step:725/2245 train_time:44262ms step_avg:61.05ms
+step:726/2245 train_time:44320ms step_avg:61.05ms
+step:727/2245 train_time:44381ms step_avg:61.05ms
+step:728/2245 train_time:44439ms step_avg:61.04ms
+step:729/2245 train_time:44500ms step_avg:61.04ms
+step:730/2245 train_time:44558ms step_avg:61.04ms
+step:731/2245 train_time:44618ms step_avg:61.04ms
+step:732/2245 train_time:44676ms step_avg:61.03ms
+step:733/2245 train_time:44737ms step_avg:61.03ms
+step:734/2245 train_time:44795ms step_avg:61.03ms
+step:735/2245 train_time:44857ms step_avg:61.03ms
+step:736/2245 train_time:44923ms step_avg:61.04ms
+step:737/2245 train_time:44990ms step_avg:61.04ms
+step:738/2245 train_time:45051ms step_avg:61.05ms
+step:739/2245 train_time:45113ms step_avg:61.05ms
+step:740/2245 train_time:45173ms step_avg:61.04ms
+step:741/2245 train_time:45234ms step_avg:61.04ms
+step:742/2245 train_time:45293ms step_avg:61.04ms
+step:743/2245 train_time:45355ms step_avg:61.04ms
+step:744/2245 train_time:45414ms step_avg:61.04ms
+step:745/2245 train_time:45476ms step_avg:61.04ms
+step:746/2245 train_time:45535ms step_avg:61.04ms
+step:747/2245 train_time:45597ms step_avg:61.04ms
+step:748/2245 train_time:45656ms step_avg:61.04ms
+step:749/2245 train_time:45717ms step_avg:61.04ms
+step:750/2245 train_time:45776ms step_avg:61.03ms
+step:750/2245 val_loss:3.6687 train_time:45839ms step_avg:61.12ms
+step:751/2245 train_time:45860ms step_avg:61.07ms
+step:752/2245 train_time:45903ms step_avg:61.04ms
+step:753/2245 train_time:45964ms step_avg:61.04ms
+step:754/2245 train_time:46024ms step_avg:61.04ms
+step:755/2245 train_time:46087ms step_avg:61.04ms
+step:756/2245 train_time:46147ms step_avg:61.04ms
+step:757/2245 train_time:46208ms step_avg:61.04ms
+step:758/2245 train_time:46267ms step_avg:61.04ms
+step:759/2245 train_time:46329ms step_avg:61.04ms
+step:760/2245 train_time:46387ms step_avg:61.04ms
+step:761/2245 train_time:46448ms step_avg:61.04ms
+step:762/2245 train_time:46507ms step_avg:61.03ms
+step:763/2245 train_time:46568ms step_avg:61.03ms
+step:764/2245 train_time:46627ms step_avg:61.03ms
+step:765/2245 train_time:46689ms step_avg:61.03ms
+step:766/2245 train_time:46755ms step_avg:61.04ms
+step:767/2245 train_time:46823ms step_avg:61.05ms
+step:768/2245 train_time:46883ms step_avg:61.05ms
+step:769/2245 train_time:46945ms step_avg:61.05ms
+step:770/2245 train_time:47005ms step_avg:61.05ms
+step:771/2245 train_time:47067ms step_avg:61.05ms
+step:772/2245 train_time:47126ms step_avg:61.04ms
+step:773/2245 train_time:47187ms step_avg:61.04ms
+step:774/2245 train_time:47246ms step_avg:61.04ms
+step:775/2245 train_time:47307ms step_avg:61.04ms
+step:776/2245 train_time:47366ms step_avg:61.04ms
+step:777/2245 train_time:47429ms step_avg:61.04ms
+step:778/2245 train_time:47487ms step_avg:61.04ms
+step:779/2245 train_time:47547ms step_avg:61.04ms
+step:780/2245 train_time:47606ms step_avg:61.03ms
+step:781/2245 train_time:47670ms step_avg:61.04ms
+step:782/2245 train_time:47733ms step_avg:61.04ms
+step:783/2245 train_time:47796ms step_avg:61.04ms
+step:784/2245 train_time:47857ms step_avg:61.04ms
+step:785/2245 train_time:47921ms step_avg:61.05ms
+step:786/2245 train_time:47981ms step_avg:61.04ms
+step:787/2245 train_time:48044ms step_avg:61.05ms
+step:788/2245 train_time:48104ms step_avg:61.05ms
+step:789/2245 train_time:48165ms step_avg:61.05ms
+step:790/2245 train_time:48225ms step_avg:61.04ms
+step:791/2245 train_time:48286ms step_avg:61.04ms
+step:792/2245 train_time:48345ms step_avg:61.04ms
+step:793/2245 train_time:48406ms step_avg:61.04ms
+step:794/2245 train_time:48465ms step_avg:61.04ms
+step:795/2245 train_time:48526ms step_avg:61.04ms
+step:796/2245 train_time:48586ms step_avg:61.04ms
+step:797/2245 train_time:48649ms step_avg:61.04ms
+step:798/2245 train_time:48709ms step_avg:61.04ms
+step:799/2245 train_time:48773ms step_avg:61.04ms
+step:800/2245 train_time:48834ms step_avg:61.04ms
+step:801/2245 train_time:48897ms step_avg:61.04ms
+step:802/2245 train_time:48957ms step_avg:61.04ms
+step:803/2245 train_time:49021ms step_avg:61.05ms
+step:804/2245 train_time:49081ms step_avg:61.05ms
+step:805/2245 train_time:49143ms step_avg:61.05ms
+step:806/2245 train_time:49203ms step_avg:61.05ms
+step:807/2245 train_time:49265ms step_avg:61.05ms
+step:808/2245 train_time:49325ms step_avg:61.05ms
+step:809/2245 train_time:49386ms step_avg:61.05ms
+step:810/2245 train_time:49445ms step_avg:61.04ms
+step:811/2245 train_time:49506ms step_avg:61.04ms
+step:812/2245 train_time:49565ms step_avg:61.04ms
+step:813/2245 train_time:49628ms step_avg:61.04ms
+step:814/2245 train_time:49687ms step_avg:61.04ms
+step:815/2245 train_time:49750ms step_avg:61.04ms
+step:816/2245 train_time:49810ms step_avg:61.04ms
+step:817/2245 train_time:49873ms step_avg:61.04ms
+step:818/2245 train_time:49933ms step_avg:61.04ms
+step:819/2245 train_time:49996ms step_avg:61.05ms
+step:820/2245 train_time:50057ms step_avg:61.05ms
+step:821/2245 train_time:50120ms step_avg:61.05ms
+step:822/2245 train_time:50180ms step_avg:61.05ms
+step:823/2245 train_time:50242ms step_avg:61.05ms
+step:824/2245 train_time:50302ms step_avg:61.05ms
+step:825/2245 train_time:50364ms step_avg:61.05ms
+step:826/2245 train_time:50423ms step_avg:61.05ms
+step:827/2245 train_time:50485ms step_avg:61.05ms
+step:828/2245 train_time:50545ms step_avg:61.04ms
+step:829/2245 train_time:50607ms step_avg:61.05ms
+step:830/2245 train_time:50666ms step_avg:61.04ms
+step:831/2245 train_time:50728ms step_avg:61.04ms
+step:832/2245 train_time:50788ms step_avg:61.04ms
+step:833/2245 train_time:50851ms step_avg:61.05ms
+step:834/2245 train_time:50911ms step_avg:61.04ms
+step:835/2245 train_time:50975ms step_avg:61.05ms
+step:836/2245 train_time:51034ms step_avg:61.05ms
+step:837/2245 train_time:51097ms step_avg:61.05ms
+step:838/2245 train_time:51158ms step_avg:61.05ms
+step:839/2245 train_time:51221ms step_avg:61.05ms
+step:840/2245 train_time:51281ms step_avg:61.05ms
+step:841/2245 train_time:51343ms step_avg:61.05ms
+step:842/2245 train_time:51403ms step_avg:61.05ms
+step:843/2245 train_time:51465ms step_avg:61.05ms
+step:844/2245 train_time:51525ms step_avg:61.05ms
+step:845/2245 train_time:51587ms step_avg:61.05ms
+step:846/2245 train_time:51646ms step_avg:61.05ms
+step:847/2245 train_time:51708ms step_avg:61.05ms
+step:848/2245 train_time:51768ms step_avg:61.05ms
+step:849/2245 train_time:51830ms step_avg:61.05ms
+step:850/2245 train_time:51891ms step_avg:61.05ms
+step:851/2245 train_time:51954ms step_avg:61.05ms
+step:852/2245 train_time:52014ms step_avg:61.05ms
+step:853/2245 train_time:52076ms step_avg:61.05ms
+step:854/2245 train_time:52137ms step_avg:61.05ms
+step:855/2245 train_time:52200ms step_avg:61.05ms
+step:856/2245 train_time:52263ms step_avg:61.05ms
+step:857/2245 train_time:52321ms step_avg:61.05ms
+step:858/2245 train_time:52381ms step_avg:61.05ms
+step:859/2245 train_time:52443ms step_avg:61.05ms
+step:860/2245 train_time:52503ms step_avg:61.05ms
+step:861/2245 train_time:52567ms step_avg:61.05ms
+step:862/2245 train_time:52626ms step_avg:61.05ms
+step:863/2245 train_time:52687ms step_avg:61.05ms
+step:864/2245 train_time:52747ms step_avg:61.05ms
+step:865/2245 train_time:52809ms step_avg:61.05ms
+step:866/2245 train_time:52868ms step_avg:61.05ms
+step:867/2245 train_time:52931ms step_avg:61.05ms
+step:868/2245 train_time:52991ms step_avg:61.05ms
+step:869/2245 train_time:53054ms step_avg:61.05ms
+step:870/2245 train_time:53115ms step_avg:61.05ms
+step:871/2245 train_time:53178ms step_avg:61.05ms
+step:872/2245 train_time:53238ms step_avg:61.05ms
+step:873/2245 train_time:53301ms step_avg:61.05ms
+step:874/2245 train_time:53361ms step_avg:61.05ms
+step:875/2245 train_time:53423ms step_avg:61.05ms
+step:876/2245 train_time:53483ms step_avg:61.05ms
+step:877/2245 train_time:53545ms step_avg:61.05ms
+step:878/2245 train_time:53604ms step_avg:61.05ms
+step:879/2245 train_time:53666ms step_avg:61.05ms
+step:880/2245 train_time:53726ms step_avg:61.05ms
+step:881/2245 train_time:53788ms step_avg:61.05ms
+step:882/2245 train_time:53848ms step_avg:61.05ms
+step:883/2245 train_time:53911ms step_avg:61.05ms
+step:884/2245 train_time:53971ms step_avg:61.05ms
+step:885/2245 train_time:54034ms step_avg:61.06ms
+step:886/2245 train_time:54093ms step_avg:61.05ms
+step:887/2245 train_time:54156ms step_avg:61.06ms
+step:888/2245 train_time:54218ms step_avg:61.06ms
+step:889/2245 train_time:54280ms step_avg:61.06ms
+step:890/2245 train_time:54340ms step_avg:61.06ms
+step:891/2245 train_time:54402ms step_avg:61.06ms
+step:892/2245 train_time:54461ms step_avg:61.06ms
+step:893/2245 train_time:54523ms step_avg:61.06ms
+step:894/2245 train_time:54583ms step_avg:61.05ms
+step:895/2245 train_time:54644ms step_avg:61.06ms
+step:896/2245 train_time:54704ms step_avg:61.05ms
+step:897/2245 train_time:54766ms step_avg:61.05ms
+step:898/2245 train_time:54826ms step_avg:61.05ms
+step:899/2245 train_time:54889ms step_avg:61.06ms
+step:900/2245 train_time:54948ms step_avg:61.05ms
+step:901/2245 train_time:55012ms step_avg:61.06ms
+step:902/2245 train_time:55071ms step_avg:61.05ms
+step:903/2245 train_time:55134ms step_avg:61.06ms
+step:904/2245 train_time:55193ms step_avg:61.05ms
+step:905/2245 train_time:55256ms step_avg:61.06ms
+step:906/2245 train_time:55317ms step_avg:61.06ms
+step:907/2245 train_time:55380ms step_avg:61.06ms
+step:908/2245 train_time:55440ms step_avg:61.06ms
+step:909/2245 train_time:55502ms step_avg:61.06ms
+step:910/2245 train_time:55562ms step_avg:61.06ms
+step:911/2245 train_time:55624ms step_avg:61.06ms
+step:912/2245 train_time:55684ms step_avg:61.06ms
+step:913/2245 train_time:55746ms step_avg:61.06ms
+step:914/2245 train_time:55805ms step_avg:61.06ms
+step:915/2245 train_time:55868ms step_avg:61.06ms
+step:916/2245 train_time:55927ms step_avg:61.06ms
+step:917/2245 train_time:55990ms step_avg:61.06ms
+step:918/2245 train_time:56050ms step_avg:61.06ms
+step:919/2245 train_time:56112ms step_avg:61.06ms
+step:920/2245 train_time:56172ms step_avg:61.06ms
+step:921/2245 train_time:56235ms step_avg:61.06ms
+step:922/2245 train_time:56295ms step_avg:61.06ms
+step:923/2245 train_time:56358ms step_avg:61.06ms
+step:924/2245 train_time:56419ms step_avg:61.06ms
+step:925/2245 train_time:56482ms step_avg:61.06ms
+step:926/2245 train_time:56541ms step_avg:61.06ms
+step:927/2245 train_time:56603ms step_avg:61.06ms
+step:928/2245 train_time:56662ms step_avg:61.06ms
+step:929/2245 train_time:56725ms step_avg:61.06ms
+step:930/2245 train_time:56785ms step_avg:61.06ms
+step:931/2245 train_time:56846ms step_avg:61.06ms
+step:932/2245 train_time:56905ms step_avg:61.06ms
+step:933/2245 train_time:56968ms step_avg:61.06ms
+step:934/2245 train_time:57027ms step_avg:61.06ms
+step:935/2245 train_time:57090ms step_avg:61.06ms
+step:936/2245 train_time:57150ms step_avg:61.06ms
+step:937/2245 train_time:57212ms step_avg:61.06ms
+step:938/2245 train_time:57272ms step_avg:61.06ms
+step:939/2245 train_time:57335ms step_avg:61.06ms
+step:940/2245 train_time:57395ms step_avg:61.06ms
+step:941/2245 train_time:57458ms step_avg:61.06ms
+step:942/2245 train_time:57519ms step_avg:61.06ms
+step:943/2245 train_time:57582ms step_avg:61.06ms
+step:944/2245 train_time:57642ms step_avg:61.06ms
+step:945/2245 train_time:57704ms step_avg:61.06ms
+step:946/2245 train_time:57764ms step_avg:61.06ms
+step:947/2245 train_time:57826ms step_avg:61.06ms
+step:948/2245 train_time:57885ms step_avg:61.06ms
+step:949/2245 train_time:57947ms step_avg:61.06ms
+step:950/2245 train_time:58006ms step_avg:61.06ms
+step:951/2245 train_time:58068ms step_avg:61.06ms
+step:952/2245 train_time:58128ms step_avg:61.06ms
+step:953/2245 train_time:58190ms step_avg:61.06ms
+step:954/2245 train_time:58250ms step_avg:61.06ms
+step:955/2245 train_time:58313ms step_avg:61.06ms
+step:956/2245 train_time:58373ms step_avg:61.06ms
+step:957/2245 train_time:58436ms step_avg:61.06ms
+step:958/2245 train_time:58497ms step_avg:61.06ms
+step:959/2245 train_time:58560ms step_avg:61.06ms
+step:960/2245 train_time:58621ms step_avg:61.06ms
+step:961/2245 train_time:58683ms step_avg:61.06ms
+step:962/2245 train_time:58743ms step_avg:61.06ms
+step:963/2245 train_time:58805ms step_avg:61.06ms
+step:964/2245 train_time:58865ms step_avg:61.06ms
+step:965/2245 train_time:58927ms step_avg:61.06ms
+step:966/2245 train_time:58986ms step_avg:61.06ms
+step:967/2245 train_time:59048ms step_avg:61.06ms
+step:968/2245 train_time:59107ms step_avg:61.06ms
+step:969/2245 train_time:59169ms step_avg:61.06ms
+step:970/2245 train_time:59230ms step_avg:61.06ms
+step:971/2245 train_time:59293ms step_avg:61.06ms
+step:972/2245 train_time:59353ms step_avg:61.06ms
+step:973/2245 train_time:59416ms step_avg:61.06ms
+step:974/2245 train_time:59477ms step_avg:61.06ms
+step:975/2245 train_time:59541ms step_avg:61.07ms
+step:976/2245 train_time:59601ms step_avg:61.07ms
+step:977/2245 train_time:59663ms step_avg:61.07ms
+step:978/2245 train_time:59724ms step_avg:61.07ms
+step:979/2245 train_time:59785ms step_avg:61.07ms
+step:980/2245 train_time:59844ms step_avg:61.07ms
+step:981/2245 train_time:59906ms step_avg:61.07ms
+step:982/2245 train_time:59966ms step_avg:61.07ms
+step:983/2245 train_time:60028ms step_avg:61.07ms
+step:984/2245 train_time:60087ms step_avg:61.06ms
+step:985/2245 train_time:60150ms step_avg:61.07ms
+step:986/2245 train_time:60210ms step_avg:61.06ms
+step:987/2245 train_time:60273ms step_avg:61.07ms
+step:988/2245 train_time:60333ms step_avg:61.07ms
+step:989/2245 train_time:60395ms step_avg:61.07ms
+step:990/2245 train_time:60456ms step_avg:61.07ms
+step:991/2245 train_time:60520ms step_avg:61.07ms
+step:992/2245 train_time:60580ms step_avg:61.07ms
+step:993/2245 train_time:60643ms step_avg:61.07ms
+step:994/2245 train_time:60703ms step_avg:61.07ms
+step:995/2245 train_time:60765ms step_avg:61.07ms
+step:996/2245 train_time:60826ms step_avg:61.07ms
+step:997/2245 train_time:60887ms step_avg:61.07ms
+step:998/2245 train_time:60947ms step_avg:61.07ms
+step:999/2245 train_time:61009ms step_avg:61.07ms
+step:1000/2245 train_time:61068ms step_avg:61.07ms
+step:1000/2245 val_loss:3.5916 train_time:61132ms step_avg:61.13ms
+step:1001/2245 train_time:61151ms step_avg:61.09ms
+step:1002/2245 train_time:61196ms step_avg:61.07ms
+step:1003/2245 train_time:61262ms step_avg:61.08ms
+step:1004/2245 train_time:61321ms step_avg:61.08ms
+step:1005/2245 train_time:61383ms step_avg:61.08ms
+step:1006/2245 train_time:61442ms step_avg:61.08ms
+step:1007/2245 train_time:61503ms step_avg:61.08ms
+step:1008/2245 train_time:61563ms step_avg:61.07ms
+step:1009/2245 train_time:61626ms step_avg:61.08ms
+step:1010/2245 train_time:61685ms step_avg:61.07ms
+step:1011/2245 train_time:61747ms step_avg:61.08ms
+step:1012/2245 train_time:61806ms step_avg:61.07ms
+step:1013/2245 train_time:61868ms step_avg:61.07ms
+step:1014/2245 train_time:61927ms step_avg:61.07ms
+step:1015/2245 train_time:61989ms step_avg:61.07ms
+step:1016/2245 train_time:62049ms step_avg:61.07ms
+step:1017/2245 train_time:62113ms step_avg:61.07ms
+step:1018/2245 train_time:62175ms step_avg:61.08ms
+step:1019/2245 train_time:62239ms step_avg:61.08ms
+step:1020/2245 train_time:62299ms step_avg:61.08ms
+step:1021/2245 train_time:62362ms step_avg:61.08ms
+step:1022/2245 train_time:62421ms step_avg:61.08ms
+step:1023/2245 train_time:62484ms step_avg:61.08ms
+step:1024/2245 train_time:62543ms step_avg:61.08ms
+step:1025/2245 train_time:62605ms step_avg:61.08ms
+step:1026/2245 train_time:62666ms step_avg:61.08ms
+step:1027/2245 train_time:62727ms step_avg:61.08ms
+step:1028/2245 train_time:62787ms step_avg:61.08ms
+step:1029/2245 train_time:62848ms step_avg:61.08ms
+step:1030/2245 train_time:62908ms step_avg:61.08ms
+step:1031/2245 train_time:62970ms step_avg:61.08ms
+step:1032/2245 train_time:63031ms step_avg:61.08ms
+step:1033/2245 train_time:63094ms step_avg:61.08ms
+step:1034/2245 train_time:63156ms step_avg:61.08ms
+step:1035/2245 train_time:63219ms step_avg:61.08ms
+step:1036/2245 train_time:63280ms step_avg:61.08ms
+step:1037/2245 train_time:63342ms step_avg:61.08ms
+step:1038/2245 train_time:63402ms step_avg:61.08ms
+step:1039/2245 train_time:63464ms step_avg:61.08ms
+step:1040/2245 train_time:63524ms step_avg:61.08ms
+step:1041/2245 train_time:63586ms step_avg:61.08ms
+step:1042/2245 train_time:63645ms step_avg:61.08ms
+step:1043/2245 train_time:63707ms step_avg:61.08ms
+step:1044/2245 train_time:63767ms step_avg:61.08ms
+step:1045/2245 train_time:63829ms step_avg:61.08ms
+step:1046/2245 train_time:63888ms step_avg:61.08ms
+step:1047/2245 train_time:63950ms step_avg:61.08ms
+step:1048/2245 train_time:64011ms step_avg:61.08ms
+step:1049/2245 train_time:64072ms step_avg:61.08ms
+step:1050/2245 train_time:64133ms step_avg:61.08ms
+step:1051/2245 train_time:64196ms step_avg:61.08ms
+step:1052/2245 train_time:64257ms step_avg:61.08ms
+step:1053/2245 train_time:64319ms step_avg:61.08ms
+step:1054/2245 train_time:64378ms step_avg:61.08ms
+step:1055/2245 train_time:64441ms step_avg:61.08ms
+step:1056/2245 train_time:64501ms step_avg:61.08ms
+step:1057/2245 train_time:64563ms step_avg:61.08ms
+step:1058/2245 train_time:64622ms step_avg:61.08ms
+step:1059/2245 train_time:64684ms step_avg:61.08ms
+step:1060/2245 train_time:64743ms step_avg:61.08ms
+step:1061/2245 train_time:64807ms step_avg:61.08ms
+step:1062/2245 train_time:64866ms step_avg:61.08ms
+step:1063/2245 train_time:64928ms step_avg:61.08ms
+step:1064/2245 train_time:64988ms step_avg:61.08ms
+step:1065/2245 train_time:65051ms step_avg:61.08ms
+step:1066/2245 train_time:65111ms step_avg:61.08ms
+step:1067/2245 train_time:65174ms step_avg:61.08ms
+step:1068/2245 train_time:65234ms step_avg:61.08ms
+step:1069/2245 train_time:65298ms step_avg:61.08ms
+step:1070/2245 train_time:65359ms step_avg:61.08ms
+step:1071/2245 train_time:65421ms step_avg:61.08ms
+step:1072/2245 train_time:65480ms step_avg:61.08ms
+step:1073/2245 train_time:65542ms step_avg:61.08ms
+step:1074/2245 train_time:65602ms step_avg:61.08ms
+step:1075/2245 train_time:65664ms step_avg:61.08ms
+step:1076/2245 train_time:65723ms step_avg:61.08ms
+step:1077/2245 train_time:65785ms step_avg:61.08ms
+step:1078/2245 train_time:65844ms step_avg:61.08ms
+step:1079/2245 train_time:65906ms step_avg:61.08ms
+step:1080/2245 train_time:65966ms step_avg:61.08ms
+step:1081/2245 train_time:66029ms step_avg:61.08ms
+step:1082/2245 train_time:66089ms step_avg:61.08ms
+step:1083/2245 train_time:66152ms step_avg:61.08ms
+step:1084/2245 train_time:66212ms step_avg:61.08ms
+step:1085/2245 train_time:66275ms step_avg:61.08ms
+step:1086/2245 train_time:66336ms step_avg:61.08ms
+step:1087/2245 train_time:66399ms step_avg:61.08ms
+step:1088/2245 train_time:66459ms step_avg:61.08ms
+step:1089/2245 train_time:66521ms step_avg:61.08ms
+step:1090/2245 train_time:66580ms step_avg:61.08ms
+step:1091/2245 train_time:66643ms step_avg:61.08ms
+step:1092/2245 train_time:66703ms step_avg:61.08ms
+step:1093/2245 train_time:66764ms step_avg:61.08ms
+step:1094/2245 train_time:66824ms step_avg:61.08ms
+step:1095/2245 train_time:66885ms step_avg:61.08ms
+step:1096/2245 train_time:66945ms step_avg:61.08ms
+step:1097/2245 train_time:67007ms step_avg:61.08ms
+step:1098/2245 train_time:67067ms step_avg:61.08ms
+step:1099/2245 train_time:67130ms step_avg:61.08ms
+step:1100/2245 train_time:67190ms step_avg:61.08ms
+step:1101/2245 train_time:67253ms step_avg:61.08ms
+step:1102/2245 train_time:67316ms step_avg:61.09ms
+step:1103/2245 train_time:67378ms step_avg:61.09ms
+step:1104/2245 train_time:67438ms step_avg:61.08ms
+step:1105/2245 train_time:67500ms step_avg:61.09ms
+step:1106/2245 train_time:67560ms step_avg:61.09ms
+step:1107/2245 train_time:67622ms step_avg:61.09ms
+step:1108/2245 train_time:67682ms step_avg:61.08ms
+step:1109/2245 train_time:67743ms step_avg:61.09ms
+step:1110/2245 train_time:67803ms step_avg:61.08ms
+step:1111/2245 train_time:67866ms step_avg:61.09ms
+step:1112/2245 train_time:67925ms step_avg:61.08ms
+step:1113/2245 train_time:67987ms step_avg:61.08ms
+step:1114/2245 train_time:68046ms step_avg:61.08ms
+step:1115/2245 train_time:68109ms step_avg:61.08ms
+step:1116/2245 train_time:68169ms step_avg:61.08ms
+step:1117/2245 train_time:68232ms step_avg:61.09ms
+step:1118/2245 train_time:68293ms step_avg:61.08ms
+step:1119/2245 train_time:68356ms step_avg:61.09ms
+step:1120/2245 train_time:68417ms step_avg:61.09ms
+step:1121/2245 train_time:68480ms step_avg:61.09ms
+step:1122/2245 train_time:68539ms step_avg:61.09ms
+step:1123/2245 train_time:68601ms step_avg:61.09ms
+step:1124/2245 train_time:68661ms step_avg:61.09ms
+step:1125/2245 train_time:68723ms step_avg:61.09ms
+step:1126/2245 train_time:68783ms step_avg:61.09ms
+step:1127/2245 train_time:68845ms step_avg:61.09ms
+step:1128/2245 train_time:68905ms step_avg:61.09ms
+step:1129/2245 train_time:68967ms step_avg:61.09ms
+step:1130/2245 train_time:69027ms step_avg:61.09ms
+step:1131/2245 train_time:69089ms step_avg:61.09ms
+step:1132/2245 train_time:69149ms step_avg:61.09ms
+step:1133/2245 train_time:69213ms step_avg:61.09ms
+step:1134/2245 train_time:69272ms step_avg:61.09ms
+step:1135/2245 train_time:69335ms step_avg:61.09ms
+step:1136/2245 train_time:69396ms step_avg:61.09ms
+step:1137/2245 train_time:69459ms step_avg:61.09ms
+step:1138/2245 train_time:69518ms step_avg:61.09ms
+step:1139/2245 train_time:69580ms step_avg:61.09ms
+step:1140/2245 train_time:69640ms step_avg:61.09ms
+step:1141/2245 train_time:69702ms step_avg:61.09ms
+step:1142/2245 train_time:69763ms step_avg:61.09ms
+step:1143/2245 train_time:69824ms step_avg:61.09ms
+step:1144/2245 train_time:69884ms step_avg:61.09ms
+step:1145/2245 train_time:69945ms step_avg:61.09ms
+step:1146/2245 train_time:70005ms step_avg:61.09ms
+step:1147/2245 train_time:70067ms step_avg:61.09ms
+step:1148/2245 train_time:70127ms step_avg:61.09ms
+step:1149/2245 train_time:70189ms step_avg:61.09ms
+step:1150/2245 train_time:70250ms step_avg:61.09ms
+step:1151/2245 train_time:70313ms step_avg:61.09ms
+step:1152/2245 train_time:70373ms step_avg:61.09ms
+step:1153/2245 train_time:70436ms step_avg:61.09ms
+step:1154/2245 train_time:70496ms step_avg:61.09ms
+step:1155/2245 train_time:70559ms step_avg:61.09ms
+step:1156/2245 train_time:70618ms step_avg:61.09ms
+step:1157/2245 train_time:70681ms step_avg:61.09ms
+step:1158/2245 train_time:70740ms step_avg:61.09ms
+step:1159/2245 train_time:70802ms step_avg:61.09ms
+step:1160/2245 train_time:70862ms step_avg:61.09ms
+step:1161/2245 train_time:70925ms step_avg:61.09ms
+step:1162/2245 train_time:70984ms step_avg:61.09ms
+step:1163/2245 train_time:71046ms step_avg:61.09ms
+step:1164/2245 train_time:71106ms step_avg:61.09ms
+step:1165/2245 train_time:71170ms step_avg:61.09ms
+step:1166/2245 train_time:71230ms step_avg:61.09ms
+step:1167/2245 train_time:71292ms step_avg:61.09ms
+step:1168/2245 train_time:71353ms step_avg:61.09ms
+step:1169/2245 train_time:71415ms step_avg:61.09ms
+step:1170/2245 train_time:71475ms step_avg:61.09ms
+step:1171/2245 train_time:71538ms step_avg:61.09ms
+step:1172/2245 train_time:71598ms step_avg:61.09ms
+step:1173/2245 train_time:71660ms step_avg:61.09ms
+step:1174/2245 train_time:71720ms step_avg:61.09ms
+step:1175/2245 train_time:71782ms step_avg:61.09ms
+step:1176/2245 train_time:71842ms step_avg:61.09ms
+step:1177/2245 train_time:71904ms step_avg:61.09ms
+step:1178/2245 train_time:71965ms step_avg:61.09ms
+step:1179/2245 train_time:72027ms step_avg:61.09ms
+step:1180/2245 train_time:72086ms step_avg:61.09ms
+step:1181/2245 train_time:72148ms step_avg:61.09ms
+step:1182/2245 train_time:72208ms step_avg:61.09ms
+step:1183/2245 train_time:72271ms step_avg:61.09ms
+step:1184/2245 train_time:72331ms step_avg:61.09ms
+step:1185/2245 train_time:72393ms step_avg:61.09ms
+step:1186/2245 train_time:72455ms step_avg:61.09ms
+step:1187/2245 train_time:72519ms step_avg:61.09ms
+step:1188/2245 train_time:72578ms step_avg:61.09ms
+step:1189/2245 train_time:72641ms step_avg:61.09ms
+step:1190/2245 train_time:72700ms step_avg:61.09ms
+step:1191/2245 train_time:72763ms step_avg:61.09ms
+step:1192/2245 train_time:72822ms step_avg:61.09ms
+step:1193/2245 train_time:72885ms step_avg:61.09ms
+step:1194/2245 train_time:72945ms step_avg:61.09ms
+step:1195/2245 train_time:73006ms step_avg:61.09ms
+step:1196/2245 train_time:73066ms step_avg:61.09ms
+step:1197/2245 train_time:73128ms step_avg:61.09ms
+step:1198/2245 train_time:73188ms step_avg:61.09ms
+step:1199/2245 train_time:73251ms step_avg:61.09ms
+step:1200/2245 train_time:73311ms step_avg:61.09ms
+step:1201/2245 train_time:73374ms step_avg:61.09ms
+step:1202/2245 train_time:73434ms step_avg:61.09ms
+step:1203/2245 train_time:73498ms step_avg:61.10ms
+step:1204/2245 train_time:73557ms step_avg:61.09ms
+step:1205/2245 train_time:73619ms step_avg:61.09ms
+step:1206/2245 train_time:73679ms step_avg:61.09ms
+step:1207/2245 train_time:73741ms step_avg:61.09ms
+step:1208/2245 train_time:73800ms step_avg:61.09ms
+step:1209/2245 train_time:73863ms step_avg:61.09ms
+step:1210/2245 train_time:73923ms step_avg:61.09ms
+step:1211/2245 train_time:73984ms step_avg:61.09ms
+step:1212/2245 train_time:74044ms step_avg:61.09ms
+step:1213/2245 train_time:74106ms step_avg:61.09ms
+step:1214/2245 train_time:74166ms step_avg:61.09ms
+step:1215/2245 train_time:74229ms step_avg:61.09ms
+step:1216/2245 train_time:74290ms step_avg:61.09ms
+step:1217/2245 train_time:74353ms step_avg:61.09ms
+step:1218/2245 train_time:74413ms step_avg:61.09ms
+step:1219/2245 train_time:74476ms step_avg:61.10ms
+step:1220/2245 train_time:74535ms step_avg:61.09ms
+step:1221/2245 train_time:74598ms step_avg:61.10ms
+step:1222/2245 train_time:74658ms step_avg:61.10ms
+step:1223/2245 train_time:74720ms step_avg:61.10ms
+step:1224/2245 train_time:74781ms step_avg:61.10ms
+step:1225/2245 train_time:74843ms step_avg:61.10ms
+step:1226/2245 train_time:74902ms step_avg:61.09ms
+step:1227/2245 train_time:74964ms step_avg:61.10ms
+step:1228/2245 train_time:75023ms step_avg:61.09ms
+step:1229/2245 train_time:75085ms step_avg:61.09ms
+step:1230/2245 train_time:75145ms step_avg:61.09ms
+step:1231/2245 train_time:75208ms step_avg:61.09ms
+step:1232/2245 train_time:75268ms step_avg:61.09ms
+step:1233/2245 train_time:75331ms step_avg:61.10ms
+step:1234/2245 train_time:75391ms step_avg:61.09ms
+step:1235/2245 train_time:75454ms step_avg:61.10ms
+step:1236/2245 train_time:75513ms step_avg:61.09ms
+step:1237/2245 train_time:75577ms step_avg:61.10ms
+step:1238/2245 train_time:75636ms step_avg:61.10ms
+step:1239/2245 train_time:75699ms step_avg:61.10ms
+step:1240/2245 train_time:75759ms step_avg:61.10ms
+step:1241/2245 train_time:75822ms step_avg:61.10ms
+step:1242/2245 train_time:75881ms step_avg:61.10ms
+step:1243/2245 train_time:75943ms step_avg:61.10ms
+step:1244/2245 train_time:76003ms step_avg:61.10ms
+step:1245/2245 train_time:76065ms step_avg:61.10ms
+step:1246/2245 train_time:76125ms step_avg:61.10ms
+step:1247/2245 train_time:76187ms step_avg:61.10ms
+step:1248/2245 train_time:76246ms step_avg:61.09ms
+step:1249/2245 train_time:76310ms step_avg:61.10ms
+step:1250/2245 train_time:76370ms step_avg:61.10ms
+step:1250/2245 val_loss:3.5215 train_time:76434ms step_avg:61.15ms
+step:1251/2245 train_time:76453ms step_avg:61.11ms
+step:1252/2245 train_time:76496ms step_avg:61.10ms
+step:1253/2245 train_time:76561ms step_avg:61.10ms
+step:1254/2245 train_time:76620ms step_avg:61.10ms
+step:1255/2245 train_time:76683ms step_avg:61.10ms
+step:1256/2245 train_time:76744ms step_avg:61.10ms
+step:1257/2245 train_time:76805ms step_avg:61.10ms
+step:1258/2245 train_time:76866ms step_avg:61.10ms
+step:1259/2245 train_time:76928ms step_avg:61.10ms
+step:1260/2245 train_time:76988ms step_avg:61.10ms
+step:1261/2245 train_time:77051ms step_avg:61.10ms
+step:1262/2245 train_time:77110ms step_avg:61.10ms
+step:1263/2245 train_time:77171ms step_avg:61.10ms
+step:1264/2245 train_time:77231ms step_avg:61.10ms
+step:1265/2245 train_time:77292ms step_avg:61.10ms
+step:1266/2245 train_time:77352ms step_avg:61.10ms
+step:1267/2245 train_time:77416ms step_avg:61.10ms
+step:1268/2245 train_time:77477ms step_avg:61.10ms
+step:1269/2245 train_time:77540ms step_avg:61.10ms
+step:1270/2245 train_time:77601ms step_avg:61.10ms
+step:1271/2245 train_time:77663ms step_avg:61.10ms
+step:1272/2245 train_time:77723ms step_avg:61.10ms
+step:1273/2245 train_time:77785ms step_avg:61.10ms
+step:1274/2245 train_time:77845ms step_avg:61.10ms
+step:1275/2245 train_time:77907ms step_avg:61.10ms
+step:1276/2245 train_time:77966ms step_avg:61.10ms
+step:1277/2245 train_time:78029ms step_avg:61.10ms
+step:1278/2245 train_time:78089ms step_avg:61.10ms
+step:1279/2245 train_time:78151ms step_avg:61.10ms
+step:1280/2245 train_time:78211ms step_avg:61.10ms
+step:1281/2245 train_time:78272ms step_avg:61.10ms
+step:1282/2245 train_time:78333ms step_avg:61.10ms
+step:1283/2245 train_time:78396ms step_avg:61.10ms
+step:1284/2245 train_time:78456ms step_avg:61.10ms
+step:1285/2245 train_time:78518ms step_avg:61.10ms
+step:1286/2245 train_time:78578ms step_avg:61.10ms
+step:1287/2245 train_time:78641ms step_avg:61.10ms
+step:1288/2245 train_time:78701ms step_avg:61.10ms
+step:1289/2245 train_time:78764ms step_avg:61.10ms
+step:1290/2245 train_time:78824ms step_avg:61.10ms
+step:1291/2245 train_time:78887ms step_avg:61.11ms
+step:1292/2245 train_time:78946ms step_avg:61.10ms
+step:1293/2245 train_time:79008ms step_avg:61.10ms
+step:1294/2245 train_time:79068ms step_avg:61.10ms
+step:1295/2245 train_time:79130ms step_avg:61.10ms
+step:1296/2245 train_time:79190ms step_avg:61.10ms
+step:1297/2245 train_time:79253ms step_avg:61.11ms
+step:1298/2245 train_time:79312ms step_avg:61.10ms
+step:1299/2245 train_time:79374ms step_avg:61.10ms
+step:1300/2245 train_time:79435ms step_avg:61.10ms
+step:1301/2245 train_time:79498ms step_avg:61.11ms
+step:1302/2245 train_time:79557ms step_avg:61.10ms
+step:1303/2245 train_time:79619ms step_avg:61.10ms
+step:1304/2245 train_time:79679ms step_avg:61.10ms
+step:1305/2245 train_time:79741ms step_avg:61.10ms
+step:1306/2245 train_time:79802ms step_avg:61.10ms
+step:1307/2245 train_time:79865ms step_avg:61.11ms
+step:1308/2245 train_time:79925ms step_avg:61.10ms
+step:1309/2245 train_time:79986ms step_avg:61.10ms
+step:1310/2245 train_time:80047ms step_avg:61.10ms
+step:1311/2245 train_time:80110ms step_avg:61.11ms
+step:1312/2245 train_time:80170ms step_avg:61.10ms
+step:1313/2245 train_time:80232ms step_avg:61.11ms
+step:1314/2245 train_time:80292ms step_avg:61.11ms
+step:1315/2245 train_time:80354ms step_avg:61.11ms
+step:1316/2245 train_time:80414ms step_avg:61.11ms
+step:1317/2245 train_time:80476ms step_avg:61.11ms
+step:1318/2245 train_time:80536ms step_avg:61.10ms
+step:1319/2245 train_time:80598ms step_avg:61.11ms
+step:1320/2245 train_time:80658ms step_avg:61.10ms
+step:1321/2245 train_time:80720ms step_avg:61.11ms
+step:1322/2245 train_time:80780ms step_avg:61.10ms
+step:1323/2245 train_time:80843ms step_avg:61.11ms
+step:1324/2245 train_time:80904ms step_avg:61.11ms
+step:1325/2245 train_time:80966ms step_avg:61.11ms
+step:1326/2245 train_time:81026ms step_avg:61.11ms
+step:1327/2245 train_time:81089ms step_avg:61.11ms
+step:1328/2245 train_time:81149ms step_avg:61.11ms
+step:1329/2245 train_time:81212ms step_avg:61.11ms
+step:1330/2245 train_time:81271ms step_avg:61.11ms
+step:1331/2245 train_time:81334ms step_avg:61.11ms
+step:1332/2245 train_time:81395ms step_avg:61.11ms
+step:1333/2245 train_time:81456ms step_avg:61.11ms
+step:1334/2245 train_time:81516ms step_avg:61.11ms
+step:1335/2245 train_time:81577ms step_avg:61.11ms
+step:1336/2245 train_time:81637ms step_avg:61.11ms
+step:1337/2245 train_time:81699ms step_avg:61.11ms
+step:1338/2245 train_time:81759ms step_avg:61.11ms
+step:1339/2245 train_time:81821ms step_avg:61.11ms
+step:1340/2245 train_time:81881ms step_avg:61.11ms
+step:1341/2245 train_time:81945ms step_avg:61.11ms
+step:1342/2245 train_time:82005ms step_avg:61.11ms
+step:1343/2245 train_time:82068ms step_avg:61.11ms
+step:1344/2245 train_time:82128ms step_avg:61.11ms
+step:1345/2245 train_time:82191ms step_avg:61.11ms
+step:1346/2245 train_time:82251ms step_avg:61.11ms
+step:1347/2245 train_time:82314ms step_avg:61.11ms
+step:1348/2245 train_time:82374ms step_avg:61.11ms
+step:1349/2245 train_time:82437ms step_avg:61.11ms
+step:1350/2245 train_time:82498ms step_avg:61.11ms
+step:1351/2245 train_time:82560ms step_avg:61.11ms
+step:1352/2245 train_time:82619ms step_avg:61.11ms
+step:1353/2245 train_time:82680ms step_avg:61.11ms
+step:1354/2245 train_time:82740ms step_avg:61.11ms
+step:1355/2245 train_time:82802ms step_avg:61.11ms
+step:1356/2245 train_time:82862ms step_avg:61.11ms
+step:1357/2245 train_time:82925ms step_avg:61.11ms
+step:1358/2245 train_time:82985ms step_avg:61.11ms
+step:1359/2245 train_time:83048ms step_avg:61.11ms
+step:1360/2245 train_time:83108ms step_avg:61.11ms
+step:1361/2245 train_time:83171ms step_avg:61.11ms
+step:1362/2245 train_time:83231ms step_avg:61.11ms
+step:1363/2245 train_time:83293ms step_avg:61.11ms
+step:1364/2245 train_time:83353ms step_avg:61.11ms
+step:1365/2245 train_time:83415ms step_avg:61.11ms
+step:1366/2245 train_time:83476ms step_avg:61.11ms
+step:1367/2245 train_time:83538ms step_avg:61.11ms
+step:1368/2245 train_time:83597ms step_avg:61.11ms
+step:1369/2245 train_time:83659ms step_avg:61.11ms
+step:1370/2245 train_time:83719ms step_avg:61.11ms
+step:1371/2245 train_time:83781ms step_avg:61.11ms
+step:1372/2245 train_time:83841ms step_avg:61.11ms
+step:1373/2245 train_time:83904ms step_avg:61.11ms
+step:1374/2245 train_time:83964ms step_avg:61.11ms
+step:1375/2245 train_time:84026ms step_avg:61.11ms
+step:1376/2245 train_time:84087ms step_avg:61.11ms
+step:1377/2245 train_time:84150ms step_avg:61.11ms
+step:1378/2245 train_time:84210ms step_avg:61.11ms
+step:1379/2245 train_time:84272ms step_avg:61.11ms
+step:1380/2245 train_time:84331ms step_avg:61.11ms
+step:1381/2245 train_time:84393ms step_avg:61.11ms
+step:1382/2245 train_time:84453ms step_avg:61.11ms
+step:1383/2245 train_time:84515ms step_avg:61.11ms
+step:1384/2245 train_time:84574ms step_avg:61.11ms
+step:1385/2245 train_time:84637ms step_avg:61.11ms
+step:1386/2245 train_time:84696ms step_avg:61.11ms
+step:1387/2245 train_time:84758ms step_avg:61.11ms
+step:1388/2245 train_time:84818ms step_avg:61.11ms
+step:1389/2245 train_time:84880ms step_avg:61.11ms
+step:1390/2245 train_time:84940ms step_avg:61.11ms
+step:1391/2245 train_time:85004ms step_avg:61.11ms
+step:1392/2245 train_time:85064ms step_avg:61.11ms
+step:1393/2245 train_time:85126ms step_avg:61.11ms
+step:1394/2245 train_time:85187ms step_avg:61.11ms
+step:1395/2245 train_time:85249ms step_avg:61.11ms
+step:1396/2245 train_time:85310ms step_avg:61.11ms
+step:1397/2245 train_time:85373ms
step_avg:61.11ms +step:1398/2245 train_time:85432ms step_avg:61.11ms +step:1399/2245 train_time:85494ms step_avg:61.11ms +step:1400/2245 train_time:85554ms step_avg:61.11ms +step:1401/2245 train_time:85616ms step_avg:61.11ms +step:1402/2245 train_time:85675ms step_avg:61.11ms +step:1403/2245 train_time:85737ms step_avg:61.11ms +step:1404/2245 train_time:85798ms step_avg:61.11ms +step:1405/2245 train_time:85859ms step_avg:61.11ms +step:1406/2245 train_time:85919ms step_avg:61.11ms +step:1407/2245 train_time:85982ms step_avg:61.11ms +step:1408/2245 train_time:86042ms step_avg:61.11ms +step:1409/2245 train_time:86106ms step_avg:61.11ms +step:1410/2245 train_time:86165ms step_avg:61.11ms +step:1411/2245 train_time:86228ms step_avg:61.11ms +step:1412/2245 train_time:86290ms step_avg:61.11ms +step:1413/2245 train_time:86353ms step_avg:61.11ms +step:1414/2245 train_time:86412ms step_avg:61.11ms +step:1415/2245 train_time:86474ms step_avg:61.11ms +step:1416/2245 train_time:86534ms step_avg:61.11ms +step:1417/2245 train_time:86596ms step_avg:61.11ms +step:1418/2245 train_time:86656ms step_avg:61.11ms +step:1419/2245 train_time:86718ms step_avg:61.11ms +step:1420/2245 train_time:86777ms step_avg:61.11ms +step:1421/2245 train_time:86839ms step_avg:61.11ms +step:1422/2245 train_time:86900ms step_avg:61.11ms +step:1423/2245 train_time:86962ms step_avg:61.11ms +step:1424/2245 train_time:87022ms step_avg:61.11ms +step:1425/2245 train_time:87085ms step_avg:61.11ms +step:1426/2245 train_time:87145ms step_avg:61.11ms +step:1427/2245 train_time:87208ms step_avg:61.11ms +step:1428/2245 train_time:87269ms step_avg:61.11ms +step:1429/2245 train_time:87331ms step_avg:61.11ms +step:1430/2245 train_time:87391ms step_avg:61.11ms +step:1431/2245 train_time:87453ms step_avg:61.11ms +step:1432/2245 train_time:87513ms step_avg:61.11ms +step:1433/2245 train_time:87575ms step_avg:61.11ms +step:1434/2245 train_time:87634ms step_avg:61.11ms +step:1435/2245 train_time:87697ms step_avg:61.11ms +step:1436/2245 train_time:87757ms step_avg:61.11ms +step:1437/2245 train_time:87819ms step_avg:61.11ms +step:1438/2245 train_time:87878ms step_avg:61.11ms +step:1439/2245 train_time:87941ms step_avg:61.11ms +step:1440/2245 train_time:88001ms step_avg:61.11ms +step:1441/2245 train_time:88064ms step_avg:61.11ms +step:1442/2245 train_time:88124ms step_avg:61.11ms +step:1443/2245 train_time:88187ms step_avg:61.11ms +step:1444/2245 train_time:88248ms step_avg:61.11ms +step:1445/2245 train_time:88310ms step_avg:61.11ms +step:1446/2245 train_time:88370ms step_avg:61.11ms +step:1447/2245 train_time:88433ms step_avg:61.11ms +step:1448/2245 train_time:88493ms step_avg:61.11ms +step:1449/2245 train_time:88555ms step_avg:61.11ms +step:1450/2245 train_time:88615ms step_avg:61.11ms +step:1451/2245 train_time:88677ms step_avg:61.11ms +step:1452/2245 train_time:88737ms step_avg:61.11ms +step:1453/2245 train_time:88799ms step_avg:61.11ms +step:1454/2245 train_time:88859ms step_avg:61.11ms +step:1455/2245 train_time:88921ms step_avg:61.11ms +step:1456/2245 train_time:88981ms step_avg:61.11ms +step:1457/2245 train_time:89043ms step_avg:61.11ms +step:1458/2245 train_time:89104ms step_avg:61.11ms +step:1459/2245 train_time:89167ms step_avg:61.11ms +step:1460/2245 train_time:89227ms step_avg:61.11ms +step:1461/2245 train_time:89289ms step_avg:61.12ms +step:1462/2245 train_time:89350ms step_avg:61.11ms +step:1463/2245 train_time:89413ms step_avg:61.12ms +step:1464/2245 train_time:89473ms step_avg:61.12ms +step:1465/2245 train_time:89535ms step_avg:61.12ms 
+step:1466/2245 train_time:89595ms step_avg:61.12ms +step:1467/2245 train_time:89657ms step_avg:61.12ms +step:1468/2245 train_time:89717ms step_avg:61.12ms +step:1469/2245 train_time:89779ms step_avg:61.12ms +step:1470/2245 train_time:89838ms step_avg:61.11ms +step:1471/2245 train_time:89900ms step_avg:61.11ms +step:1472/2245 train_time:89960ms step_avg:61.11ms +step:1473/2245 train_time:90023ms step_avg:61.12ms +step:1474/2245 train_time:90083ms step_avg:61.11ms +step:1475/2245 train_time:90147ms step_avg:61.12ms +step:1476/2245 train_time:90208ms step_avg:61.12ms +step:1477/2245 train_time:90270ms step_avg:61.12ms +step:1478/2245 train_time:90330ms step_avg:61.12ms +step:1479/2245 train_time:90393ms step_avg:61.12ms +step:1480/2245 train_time:90453ms step_avg:61.12ms +step:1481/2245 train_time:90516ms step_avg:61.12ms +step:1482/2245 train_time:90576ms step_avg:61.12ms +step:1483/2245 train_time:90638ms step_avg:61.12ms +step:1484/2245 train_time:90699ms step_avg:61.12ms +step:1485/2245 train_time:90761ms step_avg:61.12ms +step:1486/2245 train_time:90821ms step_avg:61.12ms +step:1487/2245 train_time:90883ms step_avg:61.12ms +step:1488/2245 train_time:90944ms step_avg:61.12ms +step:1489/2245 train_time:91007ms step_avg:61.12ms +step:1490/2245 train_time:91067ms step_avg:61.12ms +step:1491/2245 train_time:91130ms step_avg:61.12ms +step:1492/2245 train_time:91190ms step_avg:61.12ms +step:1493/2245 train_time:91253ms step_avg:61.12ms +step:1494/2245 train_time:91312ms step_avg:61.12ms +step:1495/2245 train_time:91375ms step_avg:61.12ms +step:1496/2245 train_time:91436ms step_avg:61.12ms +step:1497/2245 train_time:91499ms step_avg:61.12ms +step:1498/2245 train_time:91560ms step_avg:61.12ms +step:1499/2245 train_time:91623ms step_avg:61.12ms +step:1500/2245 train_time:91684ms step_avg:61.12ms +step:1500/2245 val_loss:3.4415 train_time:91748ms step_avg:61.17ms +step:1501/2245 train_time:91768ms step_avg:61.14ms +step:1502/2245 train_time:91812ms step_avg:61.13ms +step:1503/2245 train_time:91874ms step_avg:61.13ms +step:1504/2245 train_time:91933ms step_avg:61.13ms +step:1505/2245 train_time:91996ms step_avg:61.13ms +step:1506/2245 train_time:92057ms step_avg:61.13ms +step:1507/2245 train_time:92119ms step_avg:61.13ms +step:1508/2245 train_time:92179ms step_avg:61.13ms +step:1509/2245 train_time:92240ms step_avg:61.13ms +step:1510/2245 train_time:92299ms step_avg:61.13ms +step:1511/2245 train_time:92361ms step_avg:61.13ms +step:1512/2245 train_time:92421ms step_avg:61.12ms +step:1513/2245 train_time:92483ms step_avg:61.13ms +step:1514/2245 train_time:92543ms step_avg:61.12ms +step:1515/2245 train_time:92606ms step_avg:61.13ms +step:1516/2245 train_time:92672ms step_avg:61.13ms +step:1517/2245 train_time:92739ms step_avg:61.13ms +step:1518/2245 train_time:92801ms step_avg:61.13ms +step:1519/2245 train_time:92864ms step_avg:61.13ms +step:1520/2245 train_time:92924ms step_avg:61.13ms +step:1521/2245 train_time:92987ms step_avg:61.14ms +step:1522/2245 train_time:93048ms step_avg:61.14ms +step:1523/2245 train_time:93110ms step_avg:61.14ms +step:1524/2245 train_time:93170ms step_avg:61.13ms +step:1525/2245 train_time:93232ms step_avg:61.14ms +step:1526/2245 train_time:93292ms step_avg:61.14ms +step:1527/2245 train_time:93354ms step_avg:61.14ms +step:1528/2245 train_time:93414ms step_avg:61.13ms +step:1529/2245 train_time:93476ms step_avg:61.14ms +step:1530/2245 train_time:93535ms step_avg:61.13ms +step:1531/2245 train_time:93598ms step_avg:61.14ms +step:1532/2245 train_time:93659ms step_avg:61.14ms 
+step:1533/2245 train_time:93724ms step_avg:61.14ms +step:1534/2245 train_time:93786ms step_avg:61.14ms +step:1535/2245 train_time:93850ms step_avg:61.14ms +step:1536/2245 train_time:93910ms step_avg:61.14ms +step:1537/2245 train_time:93973ms step_avg:61.14ms +step:1538/2245 train_time:94033ms step_avg:61.14ms +step:1539/2245 train_time:94095ms step_avg:61.14ms +step:1540/2245 train_time:94156ms step_avg:61.14ms +step:1541/2245 train_time:94218ms step_avg:61.14ms +step:1542/2245 train_time:94279ms step_avg:61.14ms +step:1543/2245 train_time:94340ms step_avg:61.14ms +step:1544/2245 train_time:94400ms step_avg:61.14ms +step:1545/2245 train_time:94462ms step_avg:61.14ms +step:1546/2245 train_time:94523ms step_avg:61.14ms +step:1547/2245 train_time:94586ms step_avg:61.14ms +step:1548/2245 train_time:94646ms step_avg:61.14ms +step:1549/2245 train_time:94710ms step_avg:61.14ms +step:1550/2245 train_time:94771ms step_avg:61.14ms +step:1551/2245 train_time:94834ms step_avg:61.14ms +step:1552/2245 train_time:94895ms step_avg:61.14ms +step:1553/2245 train_time:94957ms step_avg:61.14ms +step:1554/2245 train_time:95018ms step_avg:61.14ms +step:1555/2245 train_time:95080ms step_avg:61.14ms +step:1556/2245 train_time:95141ms step_avg:61.14ms +step:1557/2245 train_time:95204ms step_avg:61.15ms +step:1558/2245 train_time:95264ms step_avg:61.15ms +step:1559/2245 train_time:95326ms step_avg:61.15ms +step:1560/2245 train_time:95386ms step_avg:61.15ms +step:1561/2245 train_time:95449ms step_avg:61.15ms +step:1562/2245 train_time:95510ms step_avg:61.15ms +step:1563/2245 train_time:95573ms step_avg:61.15ms +step:1564/2245 train_time:95633ms step_avg:61.15ms +step:1565/2245 train_time:95696ms step_avg:61.15ms +step:1566/2245 train_time:95756ms step_avg:61.15ms +step:1567/2245 train_time:95819ms step_avg:61.15ms +step:1568/2245 train_time:95881ms step_avg:61.15ms +step:1569/2245 train_time:95944ms step_avg:61.15ms +step:1570/2245 train_time:96005ms step_avg:61.15ms +step:1571/2245 train_time:96069ms step_avg:61.15ms +step:1572/2245 train_time:96129ms step_avg:61.15ms +step:1573/2245 train_time:96192ms step_avg:61.15ms +step:1574/2245 train_time:96251ms step_avg:61.15ms +step:1575/2245 train_time:96313ms step_avg:61.15ms +step:1576/2245 train_time:96373ms step_avg:61.15ms +step:1577/2245 train_time:96436ms step_avg:61.15ms +step:1578/2245 train_time:96496ms step_avg:61.15ms +step:1579/2245 train_time:96558ms step_avg:61.15ms +step:1580/2245 train_time:96618ms step_avg:61.15ms +step:1581/2245 train_time:96681ms step_avg:61.15ms +step:1582/2245 train_time:96742ms step_avg:61.15ms +step:1583/2245 train_time:96805ms step_avg:61.15ms +step:1584/2245 train_time:96866ms step_avg:61.15ms +step:1585/2245 train_time:96929ms step_avg:61.15ms +step:1586/2245 train_time:96989ms step_avg:61.15ms +step:1587/2245 train_time:97054ms step_avg:61.16ms +step:1588/2245 train_time:97113ms step_avg:61.15ms +step:1589/2245 train_time:97175ms step_avg:61.15ms +step:1590/2245 train_time:97235ms step_avg:61.15ms +step:1591/2245 train_time:97297ms step_avg:61.15ms +step:1592/2245 train_time:97357ms step_avg:61.15ms +step:1593/2245 train_time:97420ms step_avg:61.15ms +step:1594/2245 train_time:97481ms step_avg:61.15ms +step:1595/2245 train_time:97544ms step_avg:61.16ms +step:1596/2245 train_time:97604ms step_avg:61.16ms +step:1597/2245 train_time:97666ms step_avg:61.16ms +step:1598/2245 train_time:97727ms step_avg:61.16ms +step:1599/2245 train_time:97790ms step_avg:61.16ms +step:1600/2245 train_time:97850ms step_avg:61.16ms +step:1601/2245 
train_time:97913ms step_avg:61.16ms +step:1602/2245 train_time:97974ms step_avg:61.16ms +step:1603/2245 train_time:98037ms step_avg:61.16ms +step:1604/2245 train_time:98097ms step_avg:61.16ms +step:1605/2245 train_time:98160ms step_avg:61.16ms +step:1606/2245 train_time:98220ms step_avg:61.16ms +step:1607/2245 train_time:98282ms step_avg:61.16ms +step:1608/2245 train_time:98342ms step_avg:61.16ms +step:1609/2245 train_time:98405ms step_avg:61.16ms +step:1610/2245 train_time:98466ms step_avg:61.16ms +step:1611/2245 train_time:98529ms step_avg:61.16ms +step:1612/2245 train_time:98589ms step_avg:61.16ms +step:1613/2245 train_time:98652ms step_avg:61.16ms +step:1614/2245 train_time:98712ms step_avg:61.16ms +step:1615/2245 train_time:98775ms step_avg:61.16ms +step:1616/2245 train_time:98834ms step_avg:61.16ms +step:1617/2245 train_time:98897ms step_avg:61.16ms +step:1618/2245 train_time:98958ms step_avg:61.16ms +step:1619/2245 train_time:99020ms step_avg:61.16ms +step:1620/2245 train_time:99080ms step_avg:61.16ms +step:1621/2245 train_time:99143ms step_avg:61.16ms +step:1622/2245 train_time:99204ms step_avg:61.16ms +step:1623/2245 train_time:99266ms step_avg:61.16ms +step:1624/2245 train_time:99327ms step_avg:61.16ms +step:1625/2245 train_time:99389ms step_avg:61.16ms +step:1626/2245 train_time:99450ms step_avg:61.16ms +step:1627/2245 train_time:99512ms step_avg:61.16ms +step:1628/2245 train_time:99572ms step_avg:61.16ms +step:1629/2245 train_time:99635ms step_avg:61.16ms +step:1630/2245 train_time:99696ms step_avg:61.16ms +step:1631/2245 train_time:99758ms step_avg:61.16ms +step:1632/2245 train_time:99818ms step_avg:61.16ms +step:1633/2245 train_time:99881ms step_avg:61.16ms +step:1634/2245 train_time:99941ms step_avg:61.16ms +step:1635/2245 train_time:100004ms step_avg:61.16ms +step:1636/2245 train_time:100065ms step_avg:61.16ms +step:1637/2245 train_time:100129ms step_avg:61.17ms +step:1638/2245 train_time:100189ms step_avg:61.17ms +step:1639/2245 train_time:100251ms step_avg:61.17ms +step:1640/2245 train_time:100311ms step_avg:61.17ms +step:1641/2245 train_time:100373ms step_avg:61.17ms +step:1642/2245 train_time:100433ms step_avg:61.17ms +step:1643/2245 train_time:100496ms step_avg:61.17ms +step:1644/2245 train_time:100557ms step_avg:61.17ms +step:1645/2245 train_time:100619ms step_avg:61.17ms +step:1646/2245 train_time:100680ms step_avg:61.17ms +step:1647/2245 train_time:100743ms step_avg:61.17ms +step:1648/2245 train_time:100804ms step_avg:61.17ms +step:1649/2245 train_time:100868ms step_avg:61.17ms +step:1650/2245 train_time:100928ms step_avg:61.17ms +step:1651/2245 train_time:100991ms step_avg:61.17ms +step:1652/2245 train_time:101051ms step_avg:61.17ms +step:1653/2245 train_time:101113ms step_avg:61.17ms +step:1654/2245 train_time:101174ms step_avg:61.17ms +step:1655/2245 train_time:101236ms step_avg:61.17ms +step:1656/2245 train_time:101297ms step_avg:61.17ms +step:1657/2245 train_time:101359ms step_avg:61.17ms +step:1658/2245 train_time:101419ms step_avg:61.17ms +step:1659/2245 train_time:101483ms step_avg:61.17ms +step:1660/2245 train_time:101547ms step_avg:61.17ms +step:1661/2245 train_time:101607ms step_avg:61.17ms +step:1662/2245 train_time:101667ms step_avg:61.17ms +step:1663/2245 train_time:101730ms step_avg:61.17ms +step:1664/2245 train_time:101790ms step_avg:61.17ms +step:1665/2245 train_time:101853ms step_avg:61.17ms +step:1666/2245 train_time:101913ms step_avg:61.17ms +step:1667/2245 train_time:101977ms step_avg:61.17ms +step:1668/2245 train_time:102037ms step_avg:61.17ms 
+step:1669/2245 train_time:102100ms step_avg:61.17ms +step:1670/2245 train_time:102160ms step_avg:61.17ms +step:1671/2245 train_time:102224ms step_avg:61.18ms +step:1672/2245 train_time:102285ms step_avg:61.18ms +step:1673/2245 train_time:102348ms step_avg:61.18ms +step:1674/2245 train_time:102408ms step_avg:61.18ms +step:1675/2245 train_time:102471ms step_avg:61.18ms +step:1676/2245 train_time:102531ms step_avg:61.18ms +step:1677/2245 train_time:102594ms step_avg:61.18ms +step:1678/2245 train_time:102654ms step_avg:61.18ms +step:1679/2245 train_time:102716ms step_avg:61.18ms +step:1680/2245 train_time:102778ms step_avg:61.18ms +step:1681/2245 train_time:102840ms step_avg:61.18ms +step:1682/2245 train_time:102901ms step_avg:61.18ms +step:1683/2245 train_time:102964ms step_avg:61.18ms +step:1684/2245 train_time:103025ms step_avg:61.18ms +step:1685/2245 train_time:103089ms step_avg:61.18ms +step:1686/2245 train_time:103149ms step_avg:61.18ms +step:1687/2245 train_time:103212ms step_avg:61.18ms +step:1688/2245 train_time:103272ms step_avg:61.18ms +step:1689/2245 train_time:103335ms step_avg:61.18ms +step:1690/2245 train_time:103395ms step_avg:61.18ms +step:1691/2245 train_time:103457ms step_avg:61.18ms +step:1692/2245 train_time:103518ms step_avg:61.18ms +step:1693/2245 train_time:103581ms step_avg:61.18ms +step:1694/2245 train_time:103641ms step_avg:61.18ms +step:1695/2245 train_time:103705ms step_avg:61.18ms +step:1696/2245 train_time:103766ms step_avg:61.18ms +step:1697/2245 train_time:103829ms step_avg:61.18ms +step:1698/2245 train_time:103889ms step_avg:61.18ms +step:1699/2245 train_time:103952ms step_avg:61.18ms +step:1700/2245 train_time:104012ms step_avg:61.18ms +step:1701/2245 train_time:104074ms step_avg:61.18ms +step:1702/2245 train_time:104135ms step_avg:61.18ms +step:1703/2245 train_time:104198ms step_avg:61.18ms +step:1704/2245 train_time:104258ms step_avg:61.18ms +step:1705/2245 train_time:104321ms step_avg:61.19ms +step:1706/2245 train_time:104382ms step_avg:61.18ms +step:1707/2245 train_time:104444ms step_avg:61.19ms +step:1708/2245 train_time:104505ms step_avg:61.19ms +step:1709/2245 train_time:104569ms step_avg:61.19ms +step:1710/2245 train_time:104630ms step_avg:61.19ms +step:1711/2245 train_time:104693ms step_avg:61.19ms +step:1712/2245 train_time:104753ms step_avg:61.19ms +step:1713/2245 train_time:104816ms step_avg:61.19ms +step:1714/2245 train_time:104876ms step_avg:61.19ms +step:1715/2245 train_time:104939ms step_avg:61.19ms +step:1716/2245 train_time:105000ms step_avg:61.19ms +step:1717/2245 train_time:105063ms step_avg:61.19ms +step:1718/2245 train_time:105123ms step_avg:61.19ms +step:1719/2245 train_time:105186ms step_avg:61.19ms +step:1720/2245 train_time:105247ms step_avg:61.19ms +step:1721/2245 train_time:105310ms step_avg:61.19ms +step:1722/2245 train_time:105371ms step_avg:61.19ms +step:1723/2245 train_time:105433ms step_avg:61.19ms +step:1724/2245 train_time:105494ms step_avg:61.19ms +step:1725/2245 train_time:105557ms step_avg:61.19ms +step:1726/2245 train_time:105617ms step_avg:61.19ms +step:1727/2245 train_time:105680ms step_avg:61.19ms +step:1728/2245 train_time:105740ms step_avg:61.19ms +step:1729/2245 train_time:105803ms step_avg:61.19ms +step:1730/2245 train_time:105865ms step_avg:61.19ms +step:1731/2245 train_time:105927ms step_avg:61.19ms +step:1732/2245 train_time:105988ms step_avg:61.19ms +step:1733/2245 train_time:106050ms step_avg:61.19ms +step:1734/2245 train_time:106110ms step_avg:61.19ms +step:1735/2245 train_time:106173ms step_avg:61.19ms 
+step:1736/2245 train_time:106233ms step_avg:61.19ms +step:1737/2245 train_time:106295ms step_avg:61.19ms +step:1738/2245 train_time:106355ms step_avg:61.19ms +step:1739/2245 train_time:106418ms step_avg:61.19ms +step:1740/2245 train_time:106478ms step_avg:61.19ms +step:1741/2245 train_time:106542ms step_avg:61.20ms +step:1742/2245 train_time:106603ms step_avg:61.20ms +step:1743/2245 train_time:106666ms step_avg:61.20ms +step:1744/2245 train_time:106727ms step_avg:61.20ms +step:1745/2245 train_time:106790ms step_avg:61.20ms +step:1746/2245 train_time:106851ms step_avg:61.20ms +step:1747/2245 train_time:106913ms step_avg:61.20ms +step:1748/2245 train_time:106973ms step_avg:61.20ms +step:1749/2245 train_time:107036ms step_avg:61.20ms +step:1750/2245 train_time:107097ms step_avg:61.20ms +step:1750/2245 val_loss:3.3772 train_time:107160ms step_avg:61.23ms +step:1751/2245 train_time:107179ms step_avg:61.21ms +step:1752/2245 train_time:107222ms step_avg:61.20ms +step:1753/2245 train_time:107288ms step_avg:61.20ms +step:1754/2245 train_time:107349ms step_avg:61.20ms +step:1755/2245 train_time:107412ms step_avg:61.20ms +step:1756/2245 train_time:107474ms step_avg:61.20ms +step:1757/2245 train_time:107534ms step_avg:61.20ms +step:1758/2245 train_time:107594ms step_avg:61.20ms +step:1759/2245 train_time:107656ms step_avg:61.20ms +step:1760/2245 train_time:107715ms step_avg:61.20ms +step:1761/2245 train_time:107778ms step_avg:61.20ms +step:1762/2245 train_time:107837ms step_avg:61.20ms +step:1763/2245 train_time:107899ms step_avg:61.20ms +step:1764/2245 train_time:107959ms step_avg:61.20ms +step:1765/2245 train_time:108021ms step_avg:61.20ms +step:1766/2245 train_time:108081ms step_avg:61.20ms +step:1767/2245 train_time:108146ms step_avg:61.20ms +step:1768/2245 train_time:108207ms step_avg:61.20ms +step:1769/2245 train_time:108271ms step_avg:61.20ms +step:1770/2245 train_time:108332ms step_avg:61.20ms +step:1771/2245 train_time:108396ms step_avg:61.21ms +step:1772/2245 train_time:108456ms step_avg:61.21ms +step:1773/2245 train_time:108519ms step_avg:61.21ms +step:1774/2245 train_time:108579ms step_avg:61.21ms +step:1775/2245 train_time:108642ms step_avg:61.21ms +step:1776/2245 train_time:108702ms step_avg:61.21ms +step:1777/2245 train_time:108765ms step_avg:61.21ms +step:1778/2245 train_time:108825ms step_avg:61.21ms +step:1779/2245 train_time:108888ms step_avg:61.21ms +step:1780/2245 train_time:108949ms step_avg:61.21ms +step:1781/2245 train_time:109011ms step_avg:61.21ms +step:1782/2245 train_time:109073ms step_avg:61.21ms +step:1783/2245 train_time:109137ms step_avg:61.21ms +step:1784/2245 train_time:109197ms step_avg:61.21ms +step:1785/2245 train_time:109261ms step_avg:61.21ms +step:1786/2245 train_time:109321ms step_avg:61.21ms +step:1787/2245 train_time:109383ms step_avg:61.21ms +step:1788/2245 train_time:109444ms step_avg:61.21ms +step:1789/2245 train_time:109507ms step_avg:61.21ms +step:1790/2245 train_time:109571ms step_avg:61.21ms +step:1791/2245 train_time:109630ms step_avg:61.21ms +step:1792/2245 train_time:109691ms step_avg:61.21ms +step:1793/2245 train_time:109754ms step_avg:61.21ms +step:1794/2245 train_time:109814ms step_avg:61.21ms +step:1795/2245 train_time:109876ms step_avg:61.21ms +step:1796/2245 train_time:109935ms step_avg:61.21ms +step:1797/2245 train_time:109998ms step_avg:61.21ms +step:1798/2245 train_time:110059ms step_avg:61.21ms +step:1799/2245 train_time:110121ms step_avg:61.21ms +step:1800/2245 train_time:110182ms step_avg:61.21ms +step:1801/2245 train_time:110245ms 
step_avg:61.21ms +step:1802/2245 train_time:110305ms step_avg:61.21ms +step:1803/2245 train_time:110370ms step_avg:61.21ms +step:1804/2245 train_time:110429ms step_avg:61.21ms +step:1805/2245 train_time:110492ms step_avg:61.21ms +step:1806/2245 train_time:110552ms step_avg:61.21ms +step:1807/2245 train_time:110615ms step_avg:61.21ms +step:1808/2245 train_time:110675ms step_avg:61.21ms +step:1809/2245 train_time:110738ms step_avg:61.22ms +step:1810/2245 train_time:110799ms step_avg:61.21ms +step:1811/2245 train_time:110861ms step_avg:61.22ms +step:1812/2245 train_time:110921ms step_avg:61.21ms +step:1813/2245 train_time:110983ms step_avg:61.22ms +step:1814/2245 train_time:111043ms step_avg:61.21ms +step:1815/2245 train_time:111107ms step_avg:61.22ms +step:1816/2245 train_time:111168ms step_avg:61.22ms +step:1817/2245 train_time:111232ms step_avg:61.22ms +step:1818/2245 train_time:111293ms step_avg:61.22ms +step:1819/2245 train_time:111356ms step_avg:61.22ms +step:1820/2245 train_time:111415ms step_avg:61.22ms +step:1821/2245 train_time:111479ms step_avg:61.22ms +step:1822/2245 train_time:111539ms step_avg:61.22ms +step:1823/2245 train_time:111602ms step_avg:61.22ms +step:1824/2245 train_time:111662ms step_avg:61.22ms +step:1825/2245 train_time:111725ms step_avg:61.22ms +step:1826/2245 train_time:111785ms step_avg:61.22ms +step:1827/2245 train_time:111848ms step_avg:61.22ms +step:1828/2245 train_time:111909ms step_avg:61.22ms +step:1829/2245 train_time:111971ms step_avg:61.22ms +step:1830/2245 train_time:112032ms step_avg:61.22ms +step:1831/2245 train_time:112095ms step_avg:61.22ms +step:1832/2245 train_time:112155ms step_avg:61.22ms +step:1833/2245 train_time:112219ms step_avg:61.22ms +step:1834/2245 train_time:112279ms step_avg:61.22ms +step:1835/2245 train_time:112341ms step_avg:61.22ms +step:1836/2245 train_time:112402ms step_avg:61.22ms +step:1837/2245 train_time:112465ms step_avg:61.22ms +step:1838/2245 train_time:112525ms step_avg:61.22ms +step:1839/2245 train_time:112588ms step_avg:61.22ms +step:1840/2245 train_time:112649ms step_avg:61.22ms +step:1841/2245 train_time:112711ms step_avg:61.22ms +step:1842/2245 train_time:112772ms step_avg:61.22ms +step:1843/2245 train_time:112835ms step_avg:61.22ms +step:1844/2245 train_time:112895ms step_avg:61.22ms +step:1845/2245 train_time:112957ms step_avg:61.22ms +step:1846/2245 train_time:113018ms step_avg:61.22ms +step:1847/2245 train_time:113080ms step_avg:61.22ms +step:1848/2245 train_time:113141ms step_avg:61.22ms +step:1849/2245 train_time:113203ms step_avg:61.22ms +step:1850/2245 train_time:113264ms step_avg:61.22ms +step:1851/2245 train_time:113327ms step_avg:61.22ms +step:1852/2245 train_time:113387ms step_avg:61.22ms +step:1853/2245 train_time:113450ms step_avg:61.23ms +step:1854/2245 train_time:113511ms step_avg:61.22ms +step:1855/2245 train_time:113573ms step_avg:61.23ms +step:1856/2245 train_time:113633ms step_avg:61.22ms +step:1857/2245 train_time:113696ms step_avg:61.23ms +step:1858/2245 train_time:113756ms step_avg:61.23ms +step:1859/2245 train_time:113819ms step_avg:61.23ms +step:1860/2245 train_time:113880ms step_avg:61.23ms +step:1861/2245 train_time:113942ms step_avg:61.23ms +step:1862/2245 train_time:114003ms step_avg:61.23ms +step:1863/2245 train_time:114066ms step_avg:61.23ms +step:1864/2245 train_time:114127ms step_avg:61.23ms +step:1865/2245 train_time:114190ms step_avg:61.23ms +step:1866/2245 train_time:114251ms step_avg:61.23ms +step:1867/2245 train_time:114313ms step_avg:61.23ms +step:1868/2245 train_time:114374ms 
step_avg:61.23ms +step:1869/2245 train_time:114436ms step_avg:61.23ms +step:1870/2245 train_time:114496ms step_avg:61.23ms +step:1871/2245 train_time:114559ms step_avg:61.23ms +step:1872/2245 train_time:114619ms step_avg:61.23ms +step:1873/2245 train_time:114681ms step_avg:61.23ms +step:1874/2245 train_time:114742ms step_avg:61.23ms +step:1875/2245 train_time:114804ms step_avg:61.23ms +step:1876/2245 train_time:114864ms step_avg:61.23ms +step:1877/2245 train_time:114927ms step_avg:61.23ms +step:1878/2245 train_time:114988ms step_avg:61.23ms +step:1879/2245 train_time:115051ms step_avg:61.23ms +step:1880/2245 train_time:115112ms step_avg:61.23ms +step:1881/2245 train_time:115176ms step_avg:61.23ms +step:1882/2245 train_time:115236ms step_avg:61.23ms +step:1883/2245 train_time:115298ms step_avg:61.23ms +step:1884/2245 train_time:115359ms step_avg:61.23ms +step:1885/2245 train_time:115422ms step_avg:61.23ms +step:1886/2245 train_time:115481ms step_avg:61.23ms +step:1887/2245 train_time:115544ms step_avg:61.23ms +step:1888/2245 train_time:115604ms step_avg:61.23ms +step:1889/2245 train_time:115667ms step_avg:61.23ms +step:1890/2245 train_time:115728ms step_avg:61.23ms +step:1891/2245 train_time:115790ms step_avg:61.23ms +step:1892/2245 train_time:115851ms step_avg:61.23ms +step:1893/2245 train_time:115914ms step_avg:61.23ms +step:1894/2245 train_time:115974ms step_avg:61.23ms +step:1895/2245 train_time:116036ms step_avg:61.23ms +step:1896/2245 train_time:116097ms step_avg:61.23ms +step:1897/2245 train_time:116159ms step_avg:61.23ms +step:1898/2245 train_time:116219ms step_avg:61.23ms +step:1899/2245 train_time:116281ms step_avg:61.23ms +step:1900/2245 train_time:116342ms step_avg:61.23ms +step:1901/2245 train_time:116405ms step_avg:61.23ms +step:1902/2245 train_time:116466ms step_avg:61.23ms +step:1903/2245 train_time:116528ms step_avg:61.23ms +step:1904/2245 train_time:116589ms step_avg:61.23ms +step:1905/2245 train_time:116651ms step_avg:61.23ms +step:1906/2245 train_time:116711ms step_avg:61.23ms +step:1907/2245 train_time:116773ms step_avg:61.23ms +step:1908/2245 train_time:116834ms step_avg:61.23ms +step:1909/2245 train_time:116896ms step_avg:61.23ms +step:1910/2245 train_time:116956ms step_avg:61.23ms +step:1911/2245 train_time:117019ms step_avg:61.23ms +step:1912/2245 train_time:117079ms step_avg:61.23ms +step:1913/2245 train_time:117142ms step_avg:61.23ms +step:1914/2245 train_time:117203ms step_avg:61.23ms +step:1915/2245 train_time:117266ms step_avg:61.24ms +step:1916/2245 train_time:117326ms step_avg:61.23ms +step:1917/2245 train_time:117389ms step_avg:61.24ms +step:1918/2245 train_time:117450ms step_avg:61.24ms +step:1919/2245 train_time:117512ms step_avg:61.24ms +step:1920/2245 train_time:117573ms step_avg:61.24ms +step:1921/2245 train_time:117636ms step_avg:61.24ms +step:1922/2245 train_time:117696ms step_avg:61.24ms +step:1923/2245 train_time:117759ms step_avg:61.24ms +step:1924/2245 train_time:117819ms step_avg:61.24ms +step:1925/2245 train_time:117882ms step_avg:61.24ms +step:1926/2245 train_time:117941ms step_avg:61.24ms +step:1927/2245 train_time:118005ms step_avg:61.24ms +step:1928/2245 train_time:118065ms step_avg:61.24ms +step:1929/2245 train_time:118129ms step_avg:61.24ms +step:1930/2245 train_time:118189ms step_avg:61.24ms +step:1931/2245 train_time:118252ms step_avg:61.24ms +step:1932/2245 train_time:118312ms step_avg:61.24ms +step:1933/2245 train_time:118375ms step_avg:61.24ms +step:1934/2245 train_time:118435ms step_avg:61.24ms +step:1935/2245 train_time:118498ms 
step_avg:61.24ms +step:1936/2245 train_time:118558ms step_avg:61.24ms +step:1937/2245 train_time:118620ms step_avg:61.24ms +step:1938/2245 train_time:118681ms step_avg:61.24ms +step:1939/2245 train_time:118744ms step_avg:61.24ms +step:1940/2245 train_time:118804ms step_avg:61.24ms +step:1941/2245 train_time:118867ms step_avg:61.24ms +step:1942/2245 train_time:118927ms step_avg:61.24ms +step:1943/2245 train_time:118990ms step_avg:61.24ms +step:1944/2245 train_time:119051ms step_avg:61.24ms +step:1945/2245 train_time:119114ms step_avg:61.24ms +step:1946/2245 train_time:119175ms step_avg:61.24ms +step:1947/2245 train_time:119237ms step_avg:61.24ms +step:1948/2245 train_time:119297ms step_avg:61.24ms +step:1949/2245 train_time:119360ms step_avg:61.24ms +step:1950/2245 train_time:119420ms step_avg:61.24ms +step:1951/2245 train_time:119482ms step_avg:61.24ms +step:1952/2245 train_time:119542ms step_avg:61.24ms +step:1953/2245 train_time:119605ms step_avg:61.24ms +step:1954/2245 train_time:119665ms step_avg:61.24ms +step:1955/2245 train_time:119728ms step_avg:61.24ms +step:1956/2245 train_time:119789ms step_avg:61.24ms +step:1957/2245 train_time:119852ms step_avg:61.24ms +step:1958/2245 train_time:119912ms step_avg:61.24ms +step:1959/2245 train_time:119975ms step_avg:61.24ms +step:1960/2245 train_time:120035ms step_avg:61.24ms +step:1961/2245 train_time:120098ms step_avg:61.24ms +step:1962/2245 train_time:120158ms step_avg:61.24ms +step:1963/2245 train_time:120220ms step_avg:61.24ms +step:1964/2245 train_time:120281ms step_avg:61.24ms +step:1965/2245 train_time:120343ms step_avg:61.24ms +step:1966/2245 train_time:120404ms step_avg:61.24ms +step:1967/2245 train_time:120467ms step_avg:61.24ms +step:1968/2245 train_time:120528ms step_avg:61.24ms +step:1969/2245 train_time:120591ms step_avg:61.24ms +step:1970/2245 train_time:120652ms step_avg:61.24ms +step:1971/2245 train_time:120714ms step_avg:61.25ms +step:1972/2245 train_time:120774ms step_avg:61.24ms +step:1973/2245 train_time:120837ms step_avg:61.25ms +step:1974/2245 train_time:120898ms step_avg:61.24ms +step:1975/2245 train_time:120960ms step_avg:61.25ms +step:1976/2245 train_time:121021ms step_avg:61.25ms +step:1977/2245 train_time:121083ms step_avg:61.25ms +step:1978/2245 train_time:121144ms step_avg:61.25ms +step:1979/2245 train_time:121207ms step_avg:61.25ms +step:1980/2245 train_time:121268ms step_avg:61.25ms +step:1981/2245 train_time:121331ms step_avg:61.25ms +step:1982/2245 train_time:121393ms step_avg:61.25ms +step:1983/2245 train_time:121455ms step_avg:61.25ms +step:1984/2245 train_time:121515ms step_avg:61.25ms +step:1985/2245 train_time:121578ms step_avg:61.25ms +step:1986/2245 train_time:121638ms step_avg:61.25ms +step:1987/2245 train_time:121701ms step_avg:61.25ms +step:1988/2245 train_time:121761ms step_avg:61.25ms +step:1989/2245 train_time:121824ms step_avg:61.25ms +step:1990/2245 train_time:121884ms step_avg:61.25ms +step:1991/2245 train_time:121947ms step_avg:61.25ms +step:1992/2245 train_time:122008ms step_avg:61.25ms +step:1993/2245 train_time:122071ms step_avg:61.25ms +step:1994/2245 train_time:122131ms step_avg:61.25ms +step:1995/2245 train_time:122193ms step_avg:61.25ms +step:1996/2245 train_time:122254ms step_avg:61.25ms +step:1997/2245 train_time:122317ms step_avg:61.25ms +step:1998/2245 train_time:122377ms step_avg:61.25ms +step:1999/2245 train_time:122440ms step_avg:61.25ms +step:2000/2245 train_time:122500ms step_avg:61.25ms +step:2000/2245 val_loss:3.3229 train_time:122564ms step_avg:61.28ms +step:2001/2245 
train_time:122583ms step_avg:61.26ms +step:2002/2245 train_time:122627ms step_avg:61.25ms +step:2003/2245 train_time:122692ms step_avg:61.25ms +step:2004/2245 train_time:122753ms step_avg:61.25ms +step:2005/2245 train_time:122816ms step_avg:61.25ms +step:2006/2245 train_time:122876ms step_avg:61.25ms +step:2007/2245 train_time:122938ms step_avg:61.25ms +step:2008/2245 train_time:122998ms step_avg:61.25ms +step:2009/2245 train_time:123060ms step_avg:61.25ms +step:2010/2245 train_time:123121ms step_avg:61.25ms +step:2011/2245 train_time:123182ms step_avg:61.25ms +step:2012/2245 train_time:123242ms step_avg:61.25ms +step:2013/2245 train_time:123304ms step_avg:61.25ms +step:2014/2245 train_time:123364ms step_avg:61.25ms +step:2015/2245 train_time:123426ms step_avg:61.25ms +step:2016/2245 train_time:123487ms step_avg:61.25ms +step:2017/2245 train_time:123552ms step_avg:61.26ms +step:2018/2245 train_time:123613ms step_avg:61.26ms +step:2019/2245 train_time:123677ms step_avg:61.26ms +step:2020/2245 train_time:123739ms step_avg:61.26ms +step:2021/2245 train_time:123803ms step_avg:61.26ms +step:2022/2245 train_time:123863ms step_avg:61.26ms +step:2023/2245 train_time:123925ms step_avg:61.26ms +step:2024/2245 train_time:123985ms step_avg:61.26ms +step:2025/2245 train_time:124048ms step_avg:61.26ms +step:2026/2245 train_time:124108ms step_avg:61.26ms +step:2027/2245 train_time:124170ms step_avg:61.26ms +step:2028/2245 train_time:124230ms step_avg:61.26ms +step:2029/2245 train_time:124292ms step_avg:61.26ms +step:2030/2245 train_time:124351ms step_avg:61.26ms +step:2031/2245 train_time:124414ms step_avg:61.26ms +step:2032/2245 train_time:124475ms step_avg:61.26ms +step:2033/2245 train_time:124538ms step_avg:61.26ms +step:2034/2245 train_time:124600ms step_avg:61.26ms +step:2035/2245 train_time:124663ms step_avg:61.26ms +step:2036/2245 train_time:124725ms step_avg:61.26ms +step:2037/2245 train_time:124788ms step_avg:61.26ms +step:2038/2245 train_time:124847ms step_avg:61.26ms +step:2039/2245 train_time:124910ms step_avg:61.26ms +step:2040/2245 train_time:124970ms step_avg:61.26ms +step:2041/2245 train_time:125034ms step_avg:61.26ms +step:2042/2245 train_time:125094ms step_avg:61.26ms +step:2043/2245 train_time:125157ms step_avg:61.26ms +step:2044/2245 train_time:125218ms step_avg:61.26ms +step:2045/2245 train_time:125281ms step_avg:61.26ms +step:2046/2245 train_time:125341ms step_avg:61.26ms +step:2047/2245 train_time:125404ms step_avg:61.26ms +step:2048/2245 train_time:125464ms step_avg:61.26ms +step:2049/2245 train_time:125528ms step_avg:61.26ms +step:2050/2245 train_time:125589ms step_avg:61.26ms +step:2051/2245 train_time:125652ms step_avg:61.26ms +step:2052/2245 train_time:125713ms step_avg:61.26ms +step:2053/2245 train_time:125776ms step_avg:61.26ms +step:2054/2245 train_time:125837ms step_avg:61.26ms +step:2055/2245 train_time:125900ms step_avg:61.27ms +step:2056/2245 train_time:125961ms step_avg:61.26ms +step:2057/2245 train_time:126023ms step_avg:61.27ms +step:2058/2245 train_time:126083ms step_avg:61.26ms +step:2059/2245 train_time:126145ms step_avg:61.27ms +step:2060/2245 train_time:126205ms step_avg:61.26ms +step:2061/2245 train_time:126268ms step_avg:61.27ms +step:2062/2245 train_time:126328ms step_avg:61.26ms +step:2063/2245 train_time:126391ms step_avg:61.27ms +step:2064/2245 train_time:126451ms step_avg:61.27ms +step:2065/2245 train_time:126514ms step_avg:61.27ms +step:2066/2245 train_time:126576ms step_avg:61.27ms +step:2067/2245 train_time:126639ms step_avg:61.27ms +step:2068/2245 
train_time:126700ms step_avg:61.27ms +step:2069/2245 train_time:126763ms step_avg:61.27ms +step:2070/2245 train_time:126824ms step_avg:61.27ms +step:2071/2245 train_time:126887ms step_avg:61.27ms +step:2072/2245 train_time:126947ms step_avg:61.27ms +step:2073/2245 train_time:127011ms step_avg:61.27ms +step:2074/2245 train_time:127071ms step_avg:61.27ms +step:2075/2245 train_time:127134ms step_avg:61.27ms +step:2076/2245 train_time:127195ms step_avg:61.27ms +step:2077/2245 train_time:127257ms step_avg:61.27ms +step:2078/2245 train_time:127319ms step_avg:61.27ms +step:2079/2245 train_time:127381ms step_avg:61.27ms +step:2080/2245 train_time:127441ms step_avg:61.27ms +step:2081/2245 train_time:127504ms step_avg:61.27ms +step:2082/2245 train_time:127563ms step_avg:61.27ms +step:2083/2245 train_time:127626ms step_avg:61.27ms +step:2084/2245 train_time:127686ms step_avg:61.27ms +step:2085/2245 train_time:127749ms step_avg:61.27ms +step:2086/2245 train_time:127809ms step_avg:61.27ms +step:2087/2245 train_time:127872ms step_avg:61.27ms +step:2088/2245 train_time:127933ms step_avg:61.27ms +step:2089/2245 train_time:127997ms step_avg:61.27ms +step:2090/2245 train_time:128057ms step_avg:61.27ms +step:2091/2245 train_time:128120ms step_avg:61.27ms +step:2092/2245 train_time:128180ms step_avg:61.27ms +step:2093/2245 train_time:128243ms step_avg:61.27ms +step:2094/2245 train_time:128304ms step_avg:61.27ms +step:2095/2245 train_time:128366ms step_avg:61.27ms +step:2096/2245 train_time:128426ms step_avg:61.27ms +step:2097/2245 train_time:128488ms step_avg:61.27ms +step:2098/2245 train_time:128548ms step_avg:61.27ms +step:2099/2245 train_time:128611ms step_avg:61.27ms +step:2100/2245 train_time:128672ms step_avg:61.27ms +step:2101/2245 train_time:128735ms step_avg:61.27ms +step:2102/2245 train_time:128797ms step_avg:61.27ms +step:2103/2245 train_time:128860ms step_avg:61.27ms +step:2104/2245 train_time:128921ms step_avg:61.27ms +step:2105/2245 train_time:128984ms step_avg:61.28ms +step:2106/2245 train_time:129044ms step_avg:61.27ms +step:2107/2245 train_time:129107ms step_avg:61.28ms +step:2108/2245 train_time:129167ms step_avg:61.27ms +step:2109/2245 train_time:129229ms step_avg:61.28ms +step:2110/2245 train_time:129289ms step_avg:61.27ms +step:2111/2245 train_time:129352ms step_avg:61.28ms +step:2112/2245 train_time:129413ms step_avg:61.27ms +step:2113/2245 train_time:129476ms step_avg:61.28ms +step:2114/2245 train_time:129537ms step_avg:61.28ms +step:2115/2245 train_time:129599ms step_avg:61.28ms +step:2116/2245 train_time:129659ms step_avg:61.28ms +step:2117/2245 train_time:129722ms step_avg:61.28ms +step:2118/2245 train_time:129782ms step_avg:61.28ms +step:2119/2245 train_time:129845ms step_avg:61.28ms +step:2120/2245 train_time:129905ms step_avg:61.28ms +step:2121/2245 train_time:129968ms step_avg:61.28ms +step:2122/2245 train_time:130029ms step_avg:61.28ms +step:2123/2245 train_time:130091ms step_avg:61.28ms +step:2124/2245 train_time:130152ms step_avg:61.28ms +step:2125/2245 train_time:130215ms step_avg:61.28ms +step:2126/2245 train_time:130275ms step_avg:61.28ms +step:2127/2245 train_time:130337ms step_avg:61.28ms +step:2128/2245 train_time:130397ms step_avg:61.28ms +step:2129/2245 train_time:130460ms step_avg:61.28ms +step:2130/2245 train_time:130521ms step_avg:61.28ms +step:2131/2245 train_time:130583ms step_avg:61.28ms +step:2132/2245 train_time:130644ms step_avg:61.28ms +step:2133/2245 train_time:130707ms step_avg:61.28ms +step:2134/2245 train_time:130766ms step_avg:61.28ms +step:2135/2245 
train_time:130829ms step_avg:61.28ms +step:2136/2245 train_time:130889ms step_avg:61.28ms +step:2137/2245 train_time:130952ms step_avg:61.28ms +step:2138/2245 train_time:131012ms step_avg:61.28ms +step:2139/2245 train_time:131075ms step_avg:61.28ms +step:2140/2245 train_time:131136ms step_avg:61.28ms +step:2141/2245 train_time:131199ms step_avg:61.28ms +step:2142/2245 train_time:131260ms step_avg:61.28ms +step:2143/2245 train_time:131323ms step_avg:61.28ms +step:2144/2245 train_time:131383ms step_avg:61.28ms +step:2145/2245 train_time:131446ms step_avg:61.28ms +step:2146/2245 train_time:131506ms step_avg:61.28ms +step:2147/2245 train_time:131568ms step_avg:61.28ms +step:2148/2245 train_time:131629ms step_avg:61.28ms +step:2149/2245 train_time:131691ms step_avg:61.28ms +step:2150/2245 train_time:131751ms step_avg:61.28ms +step:2151/2245 train_time:131814ms step_avg:61.28ms +step:2152/2245 train_time:131875ms step_avg:61.28ms +step:2153/2245 train_time:131939ms step_avg:61.28ms +step:2154/2245 train_time:131999ms step_avg:61.28ms +step:2155/2245 train_time:132062ms step_avg:61.28ms +step:2156/2245 train_time:132123ms step_avg:61.28ms +step:2157/2245 train_time:132185ms step_avg:61.28ms +step:2158/2245 train_time:132246ms step_avg:61.28ms +step:2159/2245 train_time:132309ms step_avg:61.28ms +step:2160/2245 train_time:132369ms step_avg:61.28ms +step:2161/2245 train_time:132432ms step_avg:61.28ms +step:2162/2245 train_time:132492ms step_avg:61.28ms +step:2163/2245 train_time:132555ms step_avg:61.28ms +step:2164/2245 train_time:132616ms step_avg:61.28ms +step:2165/2245 train_time:132679ms step_avg:61.28ms +step:2166/2245 train_time:132739ms step_avg:61.28ms +step:2167/2245 train_time:132802ms step_avg:61.28ms +step:2168/2245 train_time:132862ms step_avg:61.28ms +step:2169/2245 train_time:132925ms step_avg:61.28ms +step:2170/2245 train_time:132985ms step_avg:61.28ms +step:2171/2245 train_time:133048ms step_avg:61.28ms +step:2172/2245 train_time:133108ms step_avg:61.28ms +step:2173/2245 train_time:133171ms step_avg:61.28ms +step:2174/2245 train_time:133231ms step_avg:61.28ms +step:2175/2245 train_time:133294ms step_avg:61.28ms +step:2176/2245 train_time:133354ms step_avg:61.28ms +step:2177/2245 train_time:133417ms step_avg:61.28ms +step:2178/2245 train_time:133478ms step_avg:61.28ms +step:2179/2245 train_time:133541ms step_avg:61.29ms +step:2180/2245 train_time:133601ms step_avg:61.28ms +step:2181/2245 train_time:133664ms step_avg:61.29ms +step:2182/2245 train_time:133725ms step_avg:61.29ms +step:2183/2245 train_time:133787ms step_avg:61.29ms +step:2184/2245 train_time:133847ms step_avg:61.29ms +step:2185/2245 train_time:133909ms step_avg:61.29ms +step:2186/2245 train_time:133969ms step_avg:61.29ms +step:2187/2245 train_time:134032ms step_avg:61.29ms +step:2188/2245 train_time:134093ms step_avg:61.29ms +step:2189/2245 train_time:134156ms step_avg:61.29ms +step:2190/2245 train_time:134217ms step_avg:61.29ms +step:2191/2245 train_time:134280ms step_avg:61.29ms +step:2192/2245 train_time:134340ms step_avg:61.29ms +step:2193/2245 train_time:134402ms step_avg:61.29ms +step:2194/2245 train_time:134463ms step_avg:61.29ms +step:2195/2245 train_time:134525ms step_avg:61.29ms +step:2196/2245 train_time:134585ms step_avg:61.29ms +step:2197/2245 train_time:134648ms step_avg:61.29ms +step:2198/2245 train_time:134708ms step_avg:61.29ms +step:2199/2245 train_time:134771ms step_avg:61.29ms +step:2200/2245 train_time:134832ms step_avg:61.29ms +step:2201/2245 train_time:134895ms step_avg:61.29ms +step:2202/2245 
train_time:134955ms step_avg:61.29ms +step:2203/2245 train_time:135018ms step_avg:61.29ms +step:2204/2245 train_time:135079ms step_avg:61.29ms +step:2205/2245 train_time:135142ms step_avg:61.29ms +step:2206/2245 train_time:135203ms step_avg:61.29ms +step:2207/2245 train_time:135266ms step_avg:61.29ms +step:2208/2245 train_time:135326ms step_avg:61.29ms +step:2209/2245 train_time:135389ms step_avg:61.29ms +step:2210/2245 train_time:135450ms step_avg:61.29ms +step:2211/2245 train_time:135513ms step_avg:61.29ms +step:2212/2245 train_time:135574ms step_avg:61.29ms +step:2213/2245 train_time:135637ms step_avg:61.29ms +step:2214/2245 train_time:135698ms step_avg:61.29ms +step:2215/2245 train_time:135762ms step_avg:61.29ms +step:2216/2245 train_time:135823ms step_avg:61.29ms +step:2217/2245 train_time:135885ms step_avg:61.29ms +step:2218/2245 train_time:135946ms step_avg:61.29ms +step:2219/2245 train_time:136008ms step_avg:61.29ms +step:2220/2245 train_time:136068ms step_avg:61.29ms +step:2221/2245 train_time:136132ms step_avg:61.29ms +step:2222/2245 train_time:136193ms step_avg:61.29ms +step:2223/2245 train_time:136257ms step_avg:61.29ms +step:2224/2245 train_time:136318ms step_avg:61.29ms +step:2225/2245 train_time:136381ms step_avg:61.29ms +step:2226/2245 train_time:136441ms step_avg:61.29ms +step:2227/2245 train_time:136503ms step_avg:61.29ms +step:2228/2245 train_time:136564ms step_avg:61.29ms +step:2229/2245 train_time:136626ms step_avg:61.29ms +step:2230/2245 train_time:136686ms step_avg:61.29ms +step:2231/2245 train_time:136749ms step_avg:61.29ms +step:2232/2245 train_time:136809ms step_avg:61.29ms +step:2233/2245 train_time:136872ms step_avg:61.30ms +step:2234/2245 train_time:136933ms step_avg:61.29ms +step:2235/2245 train_time:136996ms step_avg:61.30ms +step:2236/2245 train_time:137057ms step_avg:61.30ms +step:2237/2245 train_time:137120ms step_avg:61.30ms +step:2238/2245 train_time:137182ms step_avg:61.30ms +step:2239/2245 train_time:137244ms step_avg:61.30ms +step:2240/2245 train_time:137304ms step_avg:61.30ms +step:2241/2245 train_time:137368ms step_avg:61.30ms +step:2242/2245 train_time:137429ms step_avg:61.30ms +step:2243/2245 train_time:137492ms step_avg:61.30ms +step:2244/2245 train_time:137553ms step_avg:61.30ms +step:2245/2245 train_time:137617ms step_avg:61.30ms +step:2245/2245 val_loss:3.2772 train_time:137677ms step_avg:61.33ms +peak memory allocated: 29249 MiB reserved: 50528 MiB diff --git a/records/track_1_short/2025-11-10_CautiousWD/cdef87a8-cc95-4916-bc6a-83c9615d24c2.txt b/records/track_1_short/2025-11-10_CautiousWD/cdef87a8-cc95-4916-bc6a-83c9615d24c2.txt new file mode 100644 index 000000000..f4c36b51b --- /dev/null +++ b/records/track_1_short/2025-11-10_CautiousWD/cdef87a8-cc95-4916-bc6a-83c9615d24c2.txt @@ -0,0 +1,3772 @@ +import os +import sys + +with open(sys.argv[0]) as f: + code = f.read() # read the code of this file ASAP, for logging +import copy +import glob +import math +import threading +import time +import uuid +from dataclasses import dataclass +from collections import defaultdict +from itertools import accumulate +from pathlib import Path + +os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" +import torch + +torch.empty( + 1, device="cuda", requires_grad=True +).backward() # prevents a bug on some systems +import torch._dynamo as dynamo +import torch.distributed as dist +import torch.nn.functional as F + +# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 
30min
+import triton
+import triton.language as tl
+from kernels import get_kernel
+from torch import Tensor, nn
+
+dynamo.config.recompile_limit = 64
+
+# -----------------------------------------------------------------------------
+# Custom operators: FP8 matmul by @YouJiacheng
+
+
+@torch.library.custom_op("nanogpt::mm", mutates_args=())
+def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
+    @torch.compile
+    def impl(x: Tensor, w: Tensor):
+        assert x.is_contiguous() and w.is_contiguous()
+        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
+        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
+        out = torch._scaled_mm(
+            x_f8,
+            w_f8.T,
+            out_dtype=torch.bfloat16,
+            scale_a=x.new_tensor(x_s, dtype=torch.float32),
+            scale_b=x.new_tensor(w_s, dtype=torch.float32),
+            use_fast_accum=True,
+        )
+        return out, x_f8, w_f8
+
+    return impl(x, w)
+
+@mm_op.register_fake
+def _(x: Tensor, w: Tensor, *_):
+    assert x.ndim == w.ndim == 2
+    assert x.shape[1] == w.shape[1]
+    assert x.device == w.device
+    assert x.is_contiguous() and w.is_contiguous()
+    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
+
+@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
+def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
+    @torch.compile
+    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
+        assert grad.is_contiguous()
+        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
+        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
+        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
+        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
+        grad_x = torch._scaled_mm(
+            grad_f8,
+            w_f8.T.contiguous().T,
+            out_dtype=torch.bfloat16,
+            scale_a=grad_inv_s,
+            scale_b=w_inv_s,
+            use_fast_accum=False,
+        )
+        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
+        grad_w = torch._scaled_mm(
+            x_f8.T.contiguous(),
+            grad_f8.T.contiguous().T,
+            out_dtype=torch.float32,
+            scale_a=x_inv_s,
+            scale_b=grad_inv_s,
+            use_fast_accum=False,
+        ).T
+        return grad_x, grad_w
+
+    return impl(g, x_f8, w_f8)
+
+@mm_backward_op.register_fake
+def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
+    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
+
+def backward(ctx, grad_out: Tensor, *_):
+    x_f8, w_f8 = ctx.saved_tensors
+    x_s, w_s, grad_s = ctx.scales
+    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
+        grad_out, x_f8, w_f8, x_s, w_s, grad_s
+    )
+    return grad_x, grad_w, None, None, None
+
+def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
+    *_, x_s, w_s, grad_s = inputs
+    _, x_f8, w_f8 = output
+    ctx.save_for_backward(x_f8, w_f8)
+    ctx.scales = x_s, w_s, grad_s
+    ctx.set_materialize_grads(False)
+
+mm_op.register_autograd(backward, setup_context=setup_context)
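+
+# Hedged usage sketch (comments only, not part of the training loop): the op
+# behaves like a bf16 matmul x @ w.T with FP8 storage, where x_s/w_s/grad_s are
+# pre-chosen scale factors bounding the FP8 dynamic range. The scale values
+# below are placeholders for illustration, not the ones used by the model:
+#
+#   x = torch.randn(128, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   w = torch.randn(50304, 768, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   out, _, _ = torch.ops.nanogpt.mm(x, w, 2.0, 32.0, 2.0**16)
+#   out.sum().backward()  # routed through mm_backward_op via register_autograd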
+
+# -----------------------------------------------------------------------------
+# Triton kernel for symmetric matrix multiplication by @byronxu99
+
+def _get_autotune_configs():
+    return [
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": bm,
+                "BLOCK_SIZE_N": bn,
+                "BLOCK_SIZE_K": bk,
+                "GROUP_SIZE_M": 8,
+                "LOWER_UPPER": 1,
+            },
+            num_stages=stages,
+            num_warps=warps,
+        )
+        for bm in [64, 128]
+        for bn in [64, 128, 256]
+        for bk in [64, 128]
+        for stages, warps in [(3, 4), (3, 8), (4, 4)]
+        if bm // bn <= 2 and bn // bm <= 2
+    ]
+
+@triton.jit
+def _pid_to_block(
+    pid,
+    M,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)
+
+    # Map PID to a single matrix in batch
+    batch_idx = pid // (num_pid_m * num_pid_n)
+    pid = pid % (num_pid_m * num_pid_n)
+
+    # Map PID to 2D grid of blocks
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
+
+    m_idx = pid_m * BLOCK_SIZE_M
+    n_idx = pid_n * BLOCK_SIZE_N
+    return batch_idx, m_idx, n_idx
+
+@triton.autotune(
+    configs=_get_autotune_configs(),
+    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
+)
+@triton.jit
+def XXT_kernel(
+    A_ptr, C_ptr,
+    M, K,
+    a_stride_b, a_stride_r, a_stride_c,
+    c_stride_b, c_stride_r, c_stride_c,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    LOWER_UPPER: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    batch_idx, m_idx, n_idx = _pid_to_block(
+        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
+    )
+
+    # Skip blocks that don't need to be computed
+    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
+    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
+    if skip_block_below_diag or skip_block_above_diag:
+        return
+
+    # Index into one matrix of batch
+    A_ptr += batch_idx * a_stride_b
+    C_ptr += batch_idx * c_stride_b
+
+    # Create pointer arrays for A and A.T
+    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
+    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Accumulate over blocks of K
+    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        accumulator = tl.dot(a, at, accumulator)
+        a_ptrs += BLOCK_SIZE_K * a_stride_c
+        at_ptrs += BLOCK_SIZE_K * a_stride_c
+
+    out_dtype = C_ptr.dtype.element_ty
+    output = accumulator.to(out_dtype)
+
+    # Store block of C
+    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, output, mask=c_mask)
+
+    # Store block of C mirrored across the diagonal
+    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
+    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
+
+def XXT(A: torch.Tensor, out: torch.Tensor):
+    """
+    Launch Triton kernel to compute C = A @ A.T
+    """
+    assert A.ndim == 2 or A.ndim == 3
+    M, K = A.shape[-2:]
+    assert out.size(-2) == M, "Output matrix has incorrect shape"
+    assert out.size(-1) == M, "Output matrix has incorrect shape"
+
+    batch_size = A.size(0) if A.ndim == 3 else 1
+    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
+    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    XXT_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        K=K,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+    )
+    return out
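+
+# Hedged sanity check (comments only): XXT should agree with plain PyTorch,
+# filling `out` with A @ A.mT and mirroring each block across the diagonal:
+#
+#   A = torch.randn(4, 256, 512, device="cuda", dtype=torch.bfloat16)
+#   out = torch.empty(4, 256, 256, device="cuda", dtype=torch.bfloat16)
+#   torch.testing.assert_close(XXT(A, out), A @ A.mT, rtol=1e-2, atol=1e-2)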
+
+@triton.autotune(
+    configs=_get_autotune_configs(),
+    key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
+)
+@triton.jit
+def ba_plus_cAA_kernel(
+    A_ptr, C_ptr,
+    M,
+    a_stride_b, a_stride_r, a_stride_c,
+    c_stride_b, c_stride_r, c_stride_c,
+    alpha, beta,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    LOWER_UPPER: tl.constexpr,
+):
+    # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A
+    # Performance is slightly slower than XXT_kernel, so we use two separate kernels
+    pid = tl.program_id(axis=0)
+    batch_idx, m_idx, n_idx = _pid_to_block(
+        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
+    )
+
+    # Skip blocks that don't need to be computed
+    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
+    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
+    if skip_block_below_diag or skip_block_above_diag:
+        return
+
+    # Index into one matrix of batch
+    A_ptr += batch_idx * a_stride_b
+    C_ptr += batch_idx * c_stride_b
+
+    # Create pointer arrays for A and A.T
+    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
+    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Accumulate over blocks of K
+    for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0)
+        at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0)
+        accumulator = tl.dot(a, at, accumulator)
+        a_ptrs += BLOCK_SIZE_K * a_stride_c
+        at_ptrs += BLOCK_SIZE_K * a_stride_c
+
+    # Load block of A to add (corresponds to the current block of C)
+    offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c)
+    a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M)
+    a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32)
+
+    # Apply alpha and beta
+    accumulator *= alpha
+    accumulator += a_add * beta
+
+    out_dtype = C_ptr.dtype.element_ty
+    output = accumulator.to(out_dtype)
+
+    # Store block of C
+    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, output, mask=c_mask)
+
+    # Store block of C mirrored across the diagonal
+    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
+    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+    tl.store(c_ptrs_t, output.T, mask=c_mask_t)
+
+def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor):
+    """
+    Launch Triton kernel to compute C = alpha * A @ A.T + beta * A
+    """
+    assert A.ndim == 2 or A.ndim == 3
+    M, K = A.shape[-2:]
+    assert M == K, "Input matrix must be square"
+    assert out.size(-2) == M
+    assert out.size(-1) == M
+
+    batch_size = A.size(0) if A.ndim == 3 else 1
+    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
+    output_batch_stride = out.stride(0) if out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    ba_plus_cAA_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+        alpha=alpha,
+        beta=beta,
+    )
+    return out
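+
+# Hedged reference implementation (defined for clarity, never called in this
+# script): what the fused kernel above computes, spelled out in plain PyTorch.
+def _ba_plus_cAA_reference(A: Tensor, alpha: float, beta: float) -> Tensor:
+    # alpha * A @ A.T + beta * A, for a square (optionally batched) input
+    return alpha * (A @ A.mT) + beta * A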
out.ndim == 3 else 0
+
+    grid = lambda meta: (
+        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
+    )
+    ba_plus_cAA_kernel[grid](
+        A_ptr=A,
+        C_ptr=out,
+        M=M,
+        a_stride_b=input_batch_stride,
+        a_stride_r=A.stride(-2),
+        a_stride_c=A.stride(-1),
+        c_stride_b=output_batch_stride,
+        c_stride_r=out.stride(-2),
+        c_stride_c=out.stride(-1),
+        alpha=alpha,
+        beta=beta,
+    )
+    return out
+
+# Computed for num_iters=5, safety_factor=2e-2, cushion=2
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
+]
+
+@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower
+def polar_express(G: torch.Tensor):
+    """
+    Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
+    by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
+    """
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)
+
+    # Allocate buffers
+    X = X.contiguous()
+    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
+    B = torch.empty_like(A)
+    C = torch.empty_like(X)
+
+    aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm
+
+    # Perform the iterations
+    for a, b, c in polar_express_coeffs:
+        XXT(X, out=A) # A = X @ X.mT
+        ba_plus_cAA(A, alpha=c, beta=b, out=B) # B = b * A + c * A @ A
+        aX_plus_BX(X, B, X, beta=a, out=C) # C = a * X + B @ X
+        X, C = C, X # Swap references to avoid unnecessary copies
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+    return X
+
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class NorMuon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor.
+    - small 1D parameters are handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers + n_mlp_layers*2) % 8 == 0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+      1. reduce scatter smear_gate (1 param, 7 padding params)
+      2. reduce scatter attn_gate (10 params, 6 padding params)
+      3. 
reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+      4. reduce scatter attn/mlp round 2 (16 mlp params)
+      5. wait on step 1, then compute update of 1 and schedule all gather
+      6. wait on step 2, then compute update of 2 and schedule all gather
+      7. wait on step 3, then compute update of 3 and schedule all gather
+         GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+         GPUs that receive params of type attn reshape before computing update
+      8. wait on step 4, then compute update of 4 and schedule all gather
+      9. wait for each all gather to complete and update params
+      Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and dist.get_world_size() == 8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
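+        # Communication sketch (comments only; the real flow follows): per param
+        # group, grads are stacked and reduce-scattered so each rank owns
+        # `chunk_size` whole parameters, e.g. with world_size=8:
+        #     stacked grads: [chunk_size * 8, *param_shape]  (zero-padded tail)
+        #     local chunk:   [chunk_size, *param_shape]      (this rank's share)
+        # Each rank then computes Muon updates only for its chunk, and an
+        # all_gather broadcasts the updated parameters back to every rank.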
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape attn params from [hdim, dim*4] to [4, hdim, dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine effective LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
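+            # polar_express(updated_grads) approximates the orthogonal factor of
+            # the polar decomposition for every matrix in the chunk: G ~ U S V^T -> U V^T.
+            # A rough (unbatched) reference, for intuition only:
+            #     U, S, Vh = torch.linalg.svd(G.float(), full_matrices=False)
+            #     ortho = U @ Vh
+            # The batched iteration used here reproduces this in bfloat16 without an SVD.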
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
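+        # sin is refreshed below from the same interpolated frequencies; the
+        # attn_scale update that follows is a YaRN-style temperature correction,
+        # growing with the log of the window-extension ratio.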
+ self.sin.copy_(theta.sin()) + self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1 + +def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor): + assert cos.size(0) >= x_BTHD.size(-3) + cos, sin = ( + cos[None, : x_BTHD.size(-3), None, :], + sin[None, : x_BTHD.size(-3), None, :], + ) + x1, x2 = x_BTHD.chunk(2, dim=-1) + y1 = x1 * cos + x2 * sin + y2 = x1 * (-sin) + x2 * cos + return torch.cat((y1, y2), 3) + +@dataclass +class AttnArgs: + ve: torch.Tensor + sa_lambdas: torch.Tensor + seqlens: torch.Tensor + bm_size: int + cos: torch.Tensor + sin: torch.Tensor + attn_scale: float + +flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int): + super().__init__() + self.num_heads = num_heads + self.head_dim = head_dim + self.dim = dim + self.hdim = num_heads * head_dim + + assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim" + std = 0.5 * (self.dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng + # https://x.com/hi_tysam/status/1879699187107033311 + # make matrices the same shape as MLP to enable batched call in optimizer + self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4)) + # label module to enable custom optimizer sizing + self.qkvo_w.label='attn' + + with torch.no_grad(): + self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights + self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero + + # sparse gated attention to enable context based no-op by @classiclarryd + self.attn_gate = CastedLinear(12, num_heads) + # label module to enable custom optimizer sizing + self.attn_gate.weight.label = 'attn_gate' + + def forward(self, x: Tensor, attn_args: AttnArgs): + B, T = x.size(0), x.size(1) # batch size, sequence length + assert B == 1, "varlen sequences requires B == 1" + assert T % 16 == 0 + # unpack attention args + cos, sin = attn_args.cos, attn_args.sin + ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas + seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size + + q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2) + q, k = norm(q), norm(k) # QK norm @Grad62304977 + q, k = rotary(q, cos, sin), rotary(k, cos, sin) + if ve is not None: + v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @ KoszarskyB & @Grad62304977 + else: # skip mid-layers token value embeddings by @YouJiacheng + v = sa_lambdas[0] * v + + max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size)) + + # use flash_attn over flex_attn @varunneal. 
flash_attn_varlen suggested by @YouJiacheng + y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens, + max_seqlen_q=max_len, max_seqlen_k=max_len, + causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0)) + y = y.view(B, T, self.num_heads, self.head_dim) + y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1) + y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side + y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y)) + return y + + +class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + hdim = 4 * dim + # make matrices the same shape to enable batched call in optimizer + self.c_fc = nn.Parameter(torch.empty(dim, hdim)) + self.c_proj = nn.Parameter(torch.empty(dim, hdim)) + # label modules to enable custom optimizer sizing + self.c_fc.label = 'mlp' + self.c_proj.label = 'mlp' + # corrective factor to account for transpose + self.c_fc.lr_mul = 2. + + std = 0.5 * (dim ** -0.5) + bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng + with torch.no_grad(): + self.c_fc.uniform_(-bound, bound) + self.c_proj.zero_() # zero init suggested by @Grad62304977 + + def forward(self, x: Tensor): + x = F.linear(x, self.c_fc.T.type_as(x)) + x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977 + x = F.linear(x, self.c_proj.type_as(x)) + return x + +class Block(nn.Module): + def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int): + super().__init__() + # skip attention of blocks.7 (the 8th layer) by @YouJiacheng + self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None + # skip MLP blocks for first MLP layer by @EmelyanenkoK + self.mlp = MLP(dim) if layer_idx != 0 else None + + def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs): + x = lambdas[0] * x + lambdas[1] * x0 + if self.attn is not None: + x = x + self.attn(norm(x), attn_args) + if self.mlp is not None: + x = x + self.mlp(norm(x)) + return x + +# ----------------------------------------------------------------------------- +# The main model + +def next_multiple_of_n(v: float | int, *, n: int): + return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) + +class GPT(nn.Module): + def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int): + super().__init__() + vocab_size = next_multiple_of_n(vocab_size, n=128) + self.embed = nn.Embedding(vocab_size, model_dim) + self.smear_gate = CastedLinear(12, 1) + # label modules to enable custom optimizer sizing + self.smear_gate.weight.label = 'smear_gate' + # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897 + # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78 + self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)]) + self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)]) + self.yarn = Yarn(head_dim, max_seq_len) + # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. + # suggested to me by @Grad62304977. this originates from Karpathy's experiments. 
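+        # Scale notes for the FP8 head below (a hedged reading, not stated in the code):
+        # float8_e4m3fn tops out near 448, so x_s = sqrt(model_dim)/448 keeps RMS-normed
+        # activations in range, w_s = 2**-9 lifts the small zero-init head weights onto
+        # the representable grid, and grad_s = 1/448 bounds the backward-pass cast.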
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with Beginning of Sequence token, sequences truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "Batch size must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0 # for unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted, load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1 # last document was too long to account for _targets offset
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens): # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local, )
+            _targets = buf[1:].view(num_tokens_local, )
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "Num tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens // new_grad_accum_steps
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
+
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
+    val_tokens: int = 10485760 # how many tokens of validation data?
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +# learning rate schedule: flat, then linear decay, then flat +def get_lr(step: int): + x = min(0.9999, step / args.num_scheduled_iterations) + assert 0 <= x < 1 + lr = 1.0 + if x >= 1 - args.cooldown_frac: + w = (1 - x) / args.cooldown_frac + lr = w * 1.0 + (1 - w) * 0.1 + return lr + +def get_ws(step: int): + # set short window size to half of long window size + # Higher ws on "extension" steps + if step >= args.num_scheduled_iterations: + return args.ws_final // 2, args.ws_final + x = step / args.num_scheduled_iterations + assert 0 <= x < 1 + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +ws_schedule = list(args.ws_schedule) + [args.ws_final] +ws_long = ws_schedule[0] +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = ws_schedule[0] + else: + new_ws_long = ws_schedule[ws_idx] + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt 
in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # muon momentum buffers not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Mon Nov 10 22:05:46 2025 
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 42C P0 132W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 35C P0 121W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 34C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 39C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 41C P0 131W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 34C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 40C P0 123W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 34C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:118ms step_avg:118.22ms +step:2/2245 train_time:140ms step_avg:70.03ms +step:3/2245 train_time:178ms step_avg:59.27ms +step:4/2245 train_time:234ms step_avg:58.52ms +step:5/2245 train_time:293ms step_avg:58.65ms +step:6/2245 train_time:352ms step_avg:58.63ms +step:7/2245 train_time:412ms step_avg:58.92ms +step:8/2245 train_time:471ms step_avg:58.88ms +step:9/2245 train_time:532ms step_avg:59.12ms +step:10/2245 train_time:591ms step_avg:59.07ms +step:11/2245 train_time:651ms step_avg:59.22ms +step:12/2245 train_time:710ms step_avg:59.17ms +step:13/2245 train_time:771ms step_avg:59.32ms +step:14/2245 train_time:830ms step_avg:59.27ms 
+step:15/2245 train_time:891ms step_avg:59.41ms +step:16/2245 train_time:950ms step_avg:59.39ms +step:17/2245 train_time:1013ms step_avg:59.58ms +step:18/2245 train_time:1076ms step_avg:59.77ms +step:19/2245 train_time:1140ms step_avg:59.99ms +step:20/2245 train_time:1200ms step_avg:59.98ms +step:21/2245 train_time:1262ms step_avg:60.08ms +step:22/2245 train_time:1321ms step_avg:60.05ms +step:23/2245 train_time:1383ms step_avg:60.14ms +step:24/2245 train_time:1442ms step_avg:60.10ms +step:25/2245 train_time:1504ms step_avg:60.16ms +step:26/2245 train_time:1564ms step_avg:60.14ms +step:27/2245 train_time:1625ms step_avg:60.19ms +step:28/2245 train_time:1684ms step_avg:60.16ms +step:29/2245 train_time:1746ms step_avg:60.22ms +step:30/2245 train_time:1806ms step_avg:60.19ms +step:31/2245 train_time:1868ms step_avg:60.24ms +step:32/2245 train_time:1927ms step_avg:60.23ms +step:33/2245 train_time:1991ms step_avg:60.33ms +step:34/2245 train_time:2052ms step_avg:60.35ms +step:35/2245 train_time:2115ms step_avg:60.42ms +step:36/2245 train_time:2174ms step_avg:60.40ms +step:37/2245 train_time:2236ms step_avg:60.44ms +step:38/2245 train_time:2295ms step_avg:60.40ms +step:39/2245 train_time:2357ms step_avg:60.44ms +step:40/2245 train_time:2416ms step_avg:60.40ms +step:41/2245 train_time:2478ms step_avg:60.43ms +step:42/2245 train_time:2537ms step_avg:60.40ms +step:43/2245 train_time:2598ms step_avg:60.43ms +step:44/2245 train_time:2658ms step_avg:60.41ms +step:45/2245 train_time:2720ms step_avg:60.44ms +step:46/2245 train_time:2779ms step_avg:60.41ms +step:47/2245 train_time:2841ms step_avg:60.45ms +step:48/2245 train_time:2901ms step_avg:60.43ms +step:49/2245 train_time:2964ms step_avg:60.48ms +step:50/2245 train_time:3024ms step_avg:60.48ms +step:51/2245 train_time:3087ms step_avg:60.53ms +step:52/2245 train_time:3146ms step_avg:60.50ms +step:53/2245 train_time:3208ms step_avg:60.52ms +step:54/2245 train_time:3268ms step_avg:60.52ms +step:55/2245 train_time:3330ms step_avg:60.55ms +step:56/2245 train_time:3390ms step_avg:60.54ms +step:57/2245 train_time:3452ms step_avg:60.56ms +step:58/2245 train_time:3511ms step_avg:60.54ms +step:59/2245 train_time:3573ms step_avg:60.57ms +step:60/2245 train_time:3632ms step_avg:60.54ms +step:61/2245 train_time:3694ms step_avg:60.56ms +step:62/2245 train_time:3754ms step_avg:60.54ms +step:63/2245 train_time:3815ms step_avg:60.56ms +step:64/2245 train_time:3874ms step_avg:60.53ms +step:65/2245 train_time:3936ms step_avg:60.55ms +step:66/2245 train_time:3995ms step_avg:60.53ms +step:67/2245 train_time:4058ms step_avg:60.57ms +step:68/2245 train_time:4117ms step_avg:60.54ms +step:69/2245 train_time:4180ms step_avg:60.58ms +step:70/2245 train_time:4239ms step_avg:60.56ms +step:71/2245 train_time:4302ms step_avg:60.59ms +step:72/2245 train_time:4361ms step_avg:60.57ms +step:73/2245 train_time:4423ms step_avg:60.59ms +step:74/2245 train_time:4483ms step_avg:60.58ms +step:75/2245 train_time:4545ms step_avg:60.60ms +step:76/2245 train_time:4604ms step_avg:60.58ms +step:77/2245 train_time:4666ms step_avg:60.60ms +step:78/2245 train_time:4726ms step_avg:60.58ms +step:79/2245 train_time:4787ms step_avg:60.60ms +step:80/2245 train_time:4847ms step_avg:60.58ms +step:81/2245 train_time:4909ms step_avg:60.60ms +step:82/2245 train_time:4969ms step_avg:60.59ms +step:83/2245 train_time:5031ms step_avg:60.61ms +step:84/2245 train_time:5090ms step_avg:60.60ms +step:85/2245 train_time:5153ms step_avg:60.62ms +step:86/2245 train_time:5212ms step_avg:60.60ms +step:87/2245 
train_time:5274ms step_avg:60.62ms +step:88/2245 train_time:5332ms step_avg:60.59ms +step:89/2245 train_time:5394ms step_avg:60.60ms +step:90/2245 train_time:5452ms step_avg:60.58ms +step:91/2245 train_time:5514ms step_avg:60.60ms +step:92/2245 train_time:5573ms step_avg:60.58ms +step:93/2245 train_time:5634ms step_avg:60.59ms +step:94/2245 train_time:5693ms step_avg:60.56ms +step:95/2245 train_time:5755ms step_avg:60.58ms +step:96/2245 train_time:5814ms step_avg:60.56ms +step:97/2245 train_time:5875ms step_avg:60.57ms +step:98/2245 train_time:5934ms step_avg:60.55ms +step:99/2245 train_time:5996ms step_avg:60.57ms +step:100/2245 train_time:6056ms step_avg:60.56ms +step:101/2245 train_time:6117ms step_avg:60.57ms +step:102/2245 train_time:6176ms step_avg:60.55ms +step:103/2245 train_time:6238ms step_avg:60.56ms +step:104/2245 train_time:6297ms step_avg:60.55ms +step:105/2245 train_time:6360ms step_avg:60.57ms +step:106/2245 train_time:6419ms step_avg:60.55ms +step:107/2245 train_time:6480ms step_avg:60.56ms +step:108/2245 train_time:6539ms step_avg:60.55ms +step:109/2245 train_time:6601ms step_avg:60.56ms +step:110/2245 train_time:6661ms step_avg:60.55ms +step:111/2245 train_time:6722ms step_avg:60.56ms +step:112/2245 train_time:6781ms step_avg:60.54ms +step:113/2245 train_time:6843ms step_avg:60.56ms +step:114/2245 train_time:6902ms step_avg:60.54ms +step:115/2245 train_time:6964ms step_avg:60.55ms +step:116/2245 train_time:7023ms step_avg:60.54ms +step:117/2245 train_time:7085ms step_avg:60.56ms +step:118/2245 train_time:7145ms step_avg:60.55ms +step:119/2245 train_time:7207ms step_avg:60.56ms +step:120/2245 train_time:7266ms step_avg:60.55ms +step:121/2245 train_time:7328ms step_avg:60.56ms +step:122/2245 train_time:7388ms step_avg:60.56ms +step:123/2245 train_time:7449ms step_avg:60.56ms +step:124/2245 train_time:7509ms step_avg:60.56ms +step:125/2245 train_time:7571ms step_avg:60.57ms +step:126/2245 train_time:7631ms step_avg:60.56ms +step:127/2245 train_time:7693ms step_avg:60.57ms +step:128/2245 train_time:7751ms step_avg:60.56ms +step:129/2245 train_time:7813ms step_avg:60.56ms +step:130/2245 train_time:7871ms step_avg:60.55ms +step:131/2245 train_time:7933ms step_avg:60.55ms +step:132/2245 train_time:7991ms step_avg:60.54ms +step:133/2245 train_time:8053ms step_avg:60.55ms +step:134/2245 train_time:8112ms step_avg:60.54ms +step:135/2245 train_time:8174ms step_avg:60.55ms +step:136/2245 train_time:8233ms step_avg:60.54ms +step:137/2245 train_time:8294ms step_avg:60.54ms +step:138/2245 train_time:8353ms step_avg:60.53ms +step:139/2245 train_time:8415ms step_avg:60.54ms +step:140/2245 train_time:8474ms step_avg:60.53ms +step:141/2245 train_time:8535ms step_avg:60.53ms +step:142/2245 train_time:8593ms step_avg:60.52ms +step:143/2245 train_time:8655ms step_avg:60.52ms +step:144/2245 train_time:8713ms step_avg:60.51ms +step:145/2245 train_time:8775ms step_avg:60.52ms +step:146/2245 train_time:8834ms step_avg:60.50ms +step:147/2245 train_time:8895ms step_avg:60.51ms +step:148/2245 train_time:8953ms step_avg:60.49ms +step:149/2245 train_time:9015ms step_avg:60.50ms +step:150/2245 train_time:9073ms step_avg:60.49ms +step:151/2245 train_time:9134ms step_avg:60.49ms +step:152/2245 train_time:9192ms step_avg:60.48ms +step:153/2245 train_time:9254ms step_avg:60.48ms +step:154/2245 train_time:9312ms step_avg:60.47ms +step:155/2245 train_time:9374ms step_avg:60.48ms +step:156/2245 train_time:9433ms step_avg:60.46ms +step:157/2245 train_time:9494ms step_avg:60.47ms +step:158/2245 
train_time:9552ms step_avg:60.46ms +step:159/2245 train_time:9614ms step_avg:60.46ms +step:160/2245 train_time:9673ms step_avg:60.45ms +step:161/2245 train_time:9733ms step_avg:60.46ms +step:162/2245 train_time:9792ms step_avg:60.44ms +step:163/2245 train_time:9853ms step_avg:60.45ms +step:164/2245 train_time:9912ms step_avg:60.44ms +step:165/2245 train_time:9973ms step_avg:60.45ms +step:166/2245 train_time:10032ms step_avg:60.43ms +step:167/2245 train_time:10094ms step_avg:60.44ms +step:168/2245 train_time:10152ms step_avg:60.43ms +step:169/2245 train_time:10214ms step_avg:60.44ms +step:170/2245 train_time:10274ms step_avg:60.43ms +step:171/2245 train_time:10335ms step_avg:60.44ms +step:172/2245 train_time:10393ms step_avg:60.43ms +step:173/2245 train_time:10455ms step_avg:60.43ms +step:174/2245 train_time:10514ms step_avg:60.42ms +step:175/2245 train_time:10576ms step_avg:60.43ms +step:176/2245 train_time:10634ms step_avg:60.42ms +step:177/2245 train_time:10695ms step_avg:60.43ms +step:178/2245 train_time:10754ms step_avg:60.42ms +step:179/2245 train_time:10815ms step_avg:60.42ms +step:180/2245 train_time:10874ms step_avg:60.41ms +step:181/2245 train_time:10935ms step_avg:60.42ms +step:182/2245 train_time:10994ms step_avg:60.41ms +step:183/2245 train_time:11055ms step_avg:60.41ms +step:184/2245 train_time:11114ms step_avg:60.40ms +step:185/2245 train_time:11176ms step_avg:60.41ms +step:186/2245 train_time:11234ms step_avg:60.40ms +step:187/2245 train_time:11296ms step_avg:60.41ms +step:188/2245 train_time:11355ms step_avg:60.40ms +step:189/2245 train_time:11416ms step_avg:60.40ms +step:190/2245 train_time:11475ms step_avg:60.39ms +step:191/2245 train_time:11536ms step_avg:60.40ms +step:192/2245 train_time:11595ms step_avg:60.39ms +step:193/2245 train_time:11657ms step_avg:60.40ms +step:194/2245 train_time:11716ms step_avg:60.39ms +step:195/2245 train_time:11777ms step_avg:60.39ms +step:196/2245 train_time:11835ms step_avg:60.38ms +step:197/2245 train_time:11897ms step_avg:60.39ms +step:198/2245 train_time:11956ms step_avg:60.38ms +step:199/2245 train_time:12017ms step_avg:60.39ms +step:200/2245 train_time:12076ms step_avg:60.38ms +step:201/2245 train_time:12138ms step_avg:60.39ms +step:202/2245 train_time:12196ms step_avg:60.38ms +step:203/2245 train_time:12257ms step_avg:60.38ms +step:204/2245 train_time:12316ms step_avg:60.37ms +step:205/2245 train_time:12377ms step_avg:60.38ms +step:206/2245 train_time:12436ms step_avg:60.37ms +step:207/2245 train_time:12497ms step_avg:60.37ms +step:208/2245 train_time:12557ms step_avg:60.37ms +step:209/2245 train_time:12618ms step_avg:60.37ms +step:210/2245 train_time:12678ms step_avg:60.37ms +step:211/2245 train_time:12739ms step_avg:60.37ms +step:212/2245 train_time:12798ms step_avg:60.37ms +step:213/2245 train_time:12859ms step_avg:60.37ms +step:214/2245 train_time:12918ms step_avg:60.36ms +step:215/2245 train_time:12979ms step_avg:60.37ms +step:216/2245 train_time:13037ms step_avg:60.36ms +step:217/2245 train_time:13099ms step_avg:60.36ms +step:218/2245 train_time:13158ms step_avg:60.36ms +step:219/2245 train_time:13219ms step_avg:60.36ms +step:220/2245 train_time:13278ms step_avg:60.36ms +step:221/2245 train_time:13340ms step_avg:60.36ms +step:222/2245 train_time:13398ms step_avg:60.35ms +step:223/2245 train_time:13461ms step_avg:60.36ms +step:224/2245 train_time:13519ms step_avg:60.35ms +step:225/2245 train_time:13581ms step_avg:60.36ms +step:226/2245 train_time:13640ms step_avg:60.35ms +step:227/2245 train_time:13702ms step_avg:60.36ms 
+step:228/2245 train_time:13761ms step_avg:60.35ms
+step:229/2245 train_time:13822ms step_avg:60.36ms
+step:230/2245 train_time:13881ms step_avg:60.35ms
+step:231/2245 train_time:13943ms step_avg:60.36ms
+step:232/2245 train_time:14001ms step_avg:60.35ms
+step:233/2245 train_time:14063ms step_avg:60.35ms
+step:234/2245 train_time:14122ms step_avg:60.35ms
+step:235/2245 train_time:14184ms step_avg:60.36ms
+step:236/2245 train_time:14243ms step_avg:60.35ms
+step:237/2245 train_time:14305ms step_avg:60.36ms
+step:238/2245 train_time:14365ms step_avg:60.36ms
+step:239/2245 train_time:14427ms step_avg:60.37ms
+step:240/2245 train_time:14487ms step_avg:60.36ms
+step:241/2245 train_time:14548ms step_avg:60.37ms
+step:242/2245 train_time:14609ms step_avg:60.37ms
+step:243/2245 train_time:14671ms step_avg:60.38ms
+step:244/2245 train_time:14730ms step_avg:60.37ms
+step:245/2245 train_time:14792ms step_avg:60.37ms
+step:246/2245 train_time:14851ms step_avg:60.37ms
+step:247/2245 train_time:14912ms step_avg:60.37ms
+step:248/2245 train_time:14972ms step_avg:60.37ms
+step:249/2245 train_time:15033ms step_avg:60.37ms
+step:250/2245 train_time:15092ms step_avg:60.37ms
+step:250/2245 val_loss:4.0984 train_time:15153ms step_avg:60.61ms
+step:251/2245 train_time:15172ms step_avg:60.45ms
+step:252/2245 train_time:15217ms step_avg:60.38ms
+step:253/2245 train_time:15281ms step_avg:60.40ms
+step:254/2245 train_time:15346ms step_avg:60.42ms
+step:255/2245 train_time:15408ms step_avg:60.42ms
+step:256/2245 train_time:15467ms step_avg:60.42ms
+step:257/2245 train_time:15528ms step_avg:60.42ms
+step:258/2245 train_time:15586ms step_avg:60.41ms
+step:259/2245 train_time:15647ms step_avg:60.41ms
+step:260/2245 train_time:15705ms step_avg:60.40ms
+step:261/2245 train_time:15765ms step_avg:60.40ms
+step:262/2245 train_time:15823ms step_avg:60.39ms
+step:263/2245 train_time:15884ms step_avg:60.40ms
+step:264/2245 train_time:15942ms step_avg:60.39ms
+step:265/2245 train_time:16002ms step_avg:60.39ms
+step:266/2245 train_time:16060ms step_avg:60.38ms
+step:267/2245 train_time:16122ms step_avg:60.38ms
+step:268/2245 train_time:16181ms step_avg:60.38ms
+step:269/2245 train_time:16244ms step_avg:60.39ms
+step:270/2245 train_time:16305ms step_avg:60.39ms
+step:271/2245 train_time:16367ms step_avg:60.40ms
+step:272/2245 train_time:16427ms step_avg:60.39ms
+step:273/2245 train_time:16489ms step_avg:60.40ms
+step:274/2245 train_time:16547ms step_avg:60.39ms
+step:275/2245 train_time:16609ms step_avg:60.40ms
+step:276/2245 train_time:16667ms step_avg:60.39ms
+step:277/2245 train_time:16728ms step_avg:60.39ms
+step:278/2245 train_time:16787ms step_avg:60.38ms
+step:279/2245 train_time:16848ms step_avg:60.39ms
+step:280/2245 train_time:16906ms step_avg:60.38ms
+step:281/2245 train_time:16966ms step_avg:60.38ms
+step:282/2245 train_time:17024ms step_avg:60.37ms
+step:283/2245 train_time:17085ms step_avg:60.37ms
+step:284/2245 train_time:17144ms step_avg:60.36ms
+step:285/2245 train_time:17205ms step_avg:60.37ms
+step:286/2245 train_time:17265ms step_avg:60.37ms
+step:287/2245 train_time:17327ms step_avg:60.37ms
+step:288/2245 train_time:17387ms step_avg:60.37ms
+step:289/2245 train_time:17449ms step_avg:60.38ms
+step:290/2245 train_time:17508ms step_avg:60.37ms
+step:291/2245 train_time:17569ms step_avg:60.38ms
+step:292/2245 train_time:17628ms step_avg:60.37ms
+step:293/2245 train_time:17689ms step_avg:60.37ms
+step:294/2245 train_time:17747ms step_avg:60.36ms
+step:295/2245 train_time:17809ms step_avg:60.37ms
+step:296/2245 train_time:17867ms step_avg:60.36ms
+step:297/2245 train_time:17928ms step_avg:60.36ms
+step:298/2245 train_time:17986ms step_avg:60.36ms
+step:299/2245 train_time:18047ms step_avg:60.36ms
+step:300/2245 train_time:18106ms step_avg:60.35ms
+step:301/2245 train_time:18167ms step_avg:60.36ms
+step:302/2245 train_time:18227ms step_avg:60.35ms
+step:303/2245 train_time:18288ms step_avg:60.36ms
+step:304/2245 train_time:18348ms step_avg:60.36ms
+step:305/2245 train_time:18411ms step_avg:60.36ms
+step:306/2245 train_time:18470ms step_avg:60.36ms
+step:307/2245 train_time:18531ms step_avg:60.36ms
+step:308/2245 train_time:18589ms step_avg:60.35ms
+step:309/2245 train_time:18650ms step_avg:60.36ms
+step:310/2245 train_time:18709ms step_avg:60.35ms
+step:311/2245 train_time:18770ms step_avg:60.36ms
+step:312/2245 train_time:18830ms step_avg:60.35ms
+step:313/2245 train_time:18891ms step_avg:60.36ms
+step:314/2245 train_time:18950ms step_avg:60.35ms
+step:315/2245 train_time:19011ms step_avg:60.35ms
+step:316/2245 train_time:19070ms step_avg:60.35ms
+step:317/2245 train_time:19133ms step_avg:60.36ms
+step:318/2245 train_time:19193ms step_avg:60.35ms
+step:319/2245 train_time:19254ms step_avg:60.36ms
+step:320/2245 train_time:19314ms step_avg:60.36ms
+step:321/2245 train_time:19376ms step_avg:60.36ms
+step:322/2245 train_time:19434ms step_avg:60.35ms
+step:323/2245 train_time:19495ms step_avg:60.36ms
+step:324/2245 train_time:19554ms step_avg:60.35ms
+step:325/2245 train_time:19616ms step_avg:60.36ms
+step:326/2245 train_time:19675ms step_avg:60.35ms
+step:327/2245 train_time:19736ms step_avg:60.35ms
+step:328/2245 train_time:19795ms step_avg:60.35ms
+step:329/2245 train_time:19856ms step_avg:60.35ms
+step:330/2245 train_time:19916ms step_avg:60.35ms
+step:331/2245 train_time:19978ms step_avg:60.36ms
+step:332/2245 train_time:20036ms step_avg:60.35ms
+step:333/2245 train_time:20098ms step_avg:60.35ms
+step:334/2245 train_time:20156ms step_avg:60.35ms
+step:335/2245 train_time:20218ms step_avg:60.35ms
+step:336/2245 train_time:20278ms step_avg:60.35ms
+step:337/2245 train_time:20339ms step_avg:60.35ms
+step:338/2245 train_time:20398ms step_avg:60.35ms
+step:339/2245 train_time:20459ms step_avg:60.35ms
+step:340/2245 train_time:20518ms step_avg:60.35ms
+step:341/2245 train_time:20579ms step_avg:60.35ms
+step:342/2245 train_time:20637ms step_avg:60.34ms
+step:343/2245 train_time:20699ms step_avg:60.35ms
+step:344/2245 train_time:20757ms step_avg:60.34ms
+step:345/2245 train_time:20819ms step_avg:60.34ms
+step:346/2245 train_time:20878ms step_avg:60.34ms
+step:347/2245 train_time:20939ms step_avg:60.34ms
+step:348/2245 train_time:20998ms step_avg:60.34ms
+step:349/2245 train_time:21060ms step_avg:60.34ms
+step:350/2245 train_time:21118ms step_avg:60.34ms
+step:351/2245 train_time:21180ms step_avg:60.34ms
+step:352/2245 train_time:21239ms step_avg:60.34ms
+step:353/2245 train_time:21300ms step_avg:60.34ms
+step:354/2245 train_time:21358ms step_avg:60.33ms
+step:355/2245 train_time:21420ms step_avg:60.34ms
+step:356/2245 train_time:21478ms step_avg:60.33ms
+step:357/2245 train_time:21540ms step_avg:60.33ms
+step:358/2245 train_time:21598ms step_avg:60.33ms
+step:359/2245 train_time:21659ms step_avg:60.33ms
+step:360/2245 train_time:21718ms step_avg:60.33ms
+step:361/2245 train_time:21779ms step_avg:60.33ms
+step:362/2245 train_time:21837ms step_avg:60.32ms
+step:363/2245 train_time:21898ms step_avg:60.33ms
+step:364/2245 train_time:21957ms step_avg:60.32ms
+step:365/2245 train_time:22019ms step_avg:60.32ms
+step:366/2245 train_time:22078ms step_avg:60.32ms
+step:367/2245 train_time:22139ms step_avg:60.33ms
+step:368/2245 train_time:22198ms step_avg:60.32ms
+step:369/2245 train_time:22259ms step_avg:60.32ms
+step:370/2245 train_time:22318ms step_avg:60.32ms
+step:371/2245 train_time:22380ms step_avg:60.32ms
+step:372/2245 train_time:22438ms step_avg:60.32ms
+step:373/2245 train_time:22499ms step_avg:60.32ms
+step:374/2245 train_time:22558ms step_avg:60.32ms
+step:375/2245 train_time:22619ms step_avg:60.32ms
+step:376/2245 train_time:22678ms step_avg:60.31ms
+step:377/2245 train_time:22739ms step_avg:60.31ms
+step:378/2245 train_time:22798ms step_avg:60.31ms
+step:379/2245 train_time:22859ms step_avg:60.31ms
+step:380/2245 train_time:22918ms step_avg:60.31ms
+step:381/2245 train_time:22979ms step_avg:60.31ms
+step:382/2245 train_time:23038ms step_avg:60.31ms
+step:383/2245 train_time:23099ms step_avg:60.31ms
+step:384/2245 train_time:23158ms step_avg:60.31ms
+step:385/2245 train_time:23220ms step_avg:60.31ms
+step:386/2245 train_time:23278ms step_avg:60.31ms
+step:387/2245 train_time:23340ms step_avg:60.31ms
+step:388/2245 train_time:23399ms step_avg:60.31ms
+step:389/2245 train_time:23460ms step_avg:60.31ms
+step:390/2245 train_time:23518ms step_avg:60.30ms
+step:391/2245 train_time:23580ms step_avg:60.31ms
+step:392/2245 train_time:23638ms step_avg:60.30ms
+step:393/2245 train_time:23699ms step_avg:60.30ms
+step:394/2245 train_time:23758ms step_avg:60.30ms
+step:395/2245 train_time:23819ms step_avg:60.30ms
+step:396/2245 train_time:23878ms step_avg:60.30ms
+step:397/2245 train_time:23939ms step_avg:60.30ms
+step:398/2245 train_time:23998ms step_avg:60.30ms
+step:399/2245 train_time:24060ms step_avg:60.30ms
+step:400/2245 train_time:24119ms step_avg:60.30ms
+step:401/2245 train_time:24180ms step_avg:60.30ms
+step:402/2245 train_time:24239ms step_avg:60.30ms
+step:403/2245 train_time:24301ms step_avg:60.30ms
+step:404/2245 train_time:24359ms step_avg:60.30ms
+step:405/2245 train_time:24420ms step_avg:60.30ms
+step:406/2245 train_time:24479ms step_avg:60.29ms
+step:407/2245 train_time:24540ms step_avg:60.29ms
+step:408/2245 train_time:24599ms step_avg:60.29ms
+step:409/2245 train_time:24660ms step_avg:60.29ms
+step:410/2245 train_time:24719ms step_avg:60.29ms
+step:411/2245 train_time:24780ms step_avg:60.29ms
+step:412/2245 train_time:24838ms step_avg:60.29ms
+step:413/2245 train_time:24900ms step_avg:60.29ms
+step:414/2245 train_time:24959ms step_avg:60.29ms
+step:415/2245 train_time:25020ms step_avg:60.29ms
+step:416/2245 train_time:25079ms step_avg:60.29ms
+step:417/2245 train_time:25140ms step_avg:60.29ms
+step:418/2245 train_time:25199ms step_avg:60.29ms
+step:419/2245 train_time:25261ms step_avg:60.29ms
+step:420/2245 train_time:25321ms step_avg:60.29ms
+step:421/2245 train_time:25382ms step_avg:60.29ms
+step:422/2245 train_time:25440ms step_avg:60.29ms
+step:423/2245 train_time:25502ms step_avg:60.29ms
+step:424/2245 train_time:25560ms step_avg:60.28ms
+step:425/2245 train_time:25621ms step_avg:60.28ms
+step:426/2245 train_time:25679ms step_avg:60.28ms
+step:427/2245 train_time:25740ms step_avg:60.28ms
+step:428/2245 train_time:25799ms step_avg:60.28ms
+step:429/2245 train_time:25860ms step_avg:60.28ms
+step:430/2245 train_time:25919ms step_avg:60.28ms
+step:431/2245 train_time:25980ms step_avg:60.28ms
+step:432/2245 train_time:26039ms step_avg:60.28ms
+step:433/2245 train_time:26100ms step_avg:60.28ms
+step:434/2245 train_time:26159ms step_avg:60.27ms
+step:435/2245 train_time:26220ms step_avg:60.28ms
+step:436/2245 train_time:26279ms step_avg:60.27ms
+step:437/2245 train_time:26340ms step_avg:60.28ms
+step:438/2245 train_time:26400ms step_avg:60.27ms
+step:439/2245 train_time:26461ms step_avg:60.28ms
+step:440/2245 train_time:26520ms step_avg:60.27ms
+step:441/2245 train_time:26580ms step_avg:60.27ms
+step:442/2245 train_time:26639ms step_avg:60.27ms
+step:443/2245 train_time:26700ms step_avg:60.27ms
+step:444/2245 train_time:26758ms step_avg:60.27ms
+step:445/2245 train_time:26820ms step_avg:60.27ms
+step:446/2245 train_time:26878ms step_avg:60.26ms
+step:447/2245 train_time:26939ms step_avg:60.27ms
+step:448/2245 train_time:26998ms step_avg:60.26ms
+step:449/2245 train_time:27060ms step_avg:60.27ms
+step:450/2245 train_time:27118ms step_avg:60.26ms
+step:451/2245 train_time:27179ms step_avg:60.26ms
+step:452/2245 train_time:27238ms step_avg:60.26ms
+step:453/2245 train_time:27299ms step_avg:60.26ms
+step:454/2245 train_time:27358ms step_avg:60.26ms
+step:455/2245 train_time:27420ms step_avg:60.26ms
+step:456/2245 train_time:27479ms step_avg:60.26ms
+step:457/2245 train_time:27540ms step_avg:60.26ms
+step:458/2245 train_time:27599ms step_avg:60.26ms
+step:459/2245 train_time:27660ms step_avg:60.26ms
+step:460/2245 train_time:27719ms step_avg:60.26ms
+step:461/2245 train_time:27780ms step_avg:60.26ms
+step:462/2245 train_time:27838ms step_avg:60.26ms
+step:463/2245 train_time:27900ms step_avg:60.26ms
+step:464/2245 train_time:27958ms step_avg:60.25ms
+step:465/2245 train_time:28020ms step_avg:60.26ms
+step:466/2245 train_time:28078ms step_avg:60.25ms
+step:467/2245 train_time:28139ms step_avg:60.26ms
+step:468/2245 train_time:28198ms step_avg:60.25ms
+step:469/2245 train_time:28259ms step_avg:60.25ms
+step:470/2245 train_time:28319ms step_avg:60.25ms
+step:471/2245 train_time:28380ms step_avg:60.26ms
+step:472/2245 train_time:28439ms step_avg:60.25ms
+step:473/2245 train_time:28500ms step_avg:60.25ms
+step:474/2245 train_time:28558ms step_avg:60.25ms
+step:475/2245 train_time:28620ms step_avg:60.25ms
+step:476/2245 train_time:28678ms step_avg:60.25ms
+step:477/2245 train_time:28740ms step_avg:60.25ms
+step:478/2245 train_time:28798ms step_avg:60.25ms
+step:479/2245 train_time:28860ms step_avg:60.25ms
+step:480/2245 train_time:28918ms step_avg:60.25ms
+step:481/2245 train_time:28980ms step_avg:60.25ms
+step:482/2245 train_time:29038ms step_avg:60.24ms
+step:483/2245 train_time:29099ms step_avg:60.25ms
+step:484/2245 train_time:29158ms step_avg:60.24ms
+step:485/2245 train_time:29220ms step_avg:60.25ms
+step:486/2245 train_time:29278ms step_avg:60.24ms
+step:487/2245 train_time:29339ms step_avg:60.25ms
+step:488/2245 train_time:29399ms step_avg:60.24ms
+step:489/2245 train_time:29460ms step_avg:60.25ms
+step:490/2245 train_time:29519ms step_avg:60.24ms
+step:491/2245 train_time:29580ms step_avg:60.24ms
+step:492/2245 train_time:29639ms step_avg:60.24ms
+step:493/2245 train_time:29700ms step_avg:60.24ms
+step:494/2245 train_time:29758ms step_avg:60.24ms
+step:495/2245 train_time:29820ms step_avg:60.24ms
+step:496/2245 train_time:29878ms step_avg:60.24ms
+step:497/2245 train_time:29940ms step_avg:60.24ms
+step:498/2245 train_time:29998ms step_avg:60.24ms
+step:499/2245 train_time:30060ms step_avg:60.24ms
+step:500/2245 train_time:30118ms step_avg:60.24ms
+step:500/2245 val_loss:3.8233 train_time:30181ms step_avg:60.36ms
+step:501/2245 train_time:30201ms step_avg:60.28ms
+step:502/2245 train_time:30242ms step_avg:60.24ms
+step:503/2245 train_time:30309ms step_avg:60.26ms
+step:504/2245 train_time:30370ms step_avg:60.26ms
+step:505/2245 train_time:30431ms step_avg:60.26ms
+step:506/2245 train_time:30490ms step_avg:60.26ms
+step:507/2245 train_time:30550ms step_avg:60.26ms
+step:508/2245 train_time:30609ms step_avg:60.25ms
+step:509/2245 train_time:30670ms step_avg:60.25ms
+step:510/2245 train_time:30727ms step_avg:60.25ms
+step:511/2245 train_time:30788ms step_avg:60.25ms
+step:512/2245 train_time:30846ms step_avg:60.25ms
+step:513/2245 train_time:30907ms step_avg:60.25ms
+step:514/2245 train_time:30966ms step_avg:60.25ms
+step:515/2245 train_time:31027ms step_avg:60.25ms
+step:516/2245 train_time:31085ms step_avg:60.24ms
+step:517/2245 train_time:31147ms step_avg:60.25ms
+step:518/2245 train_time:31207ms step_avg:60.24ms
+step:519/2245 train_time:31270ms step_avg:60.25ms
+step:520/2245 train_time:31329ms step_avg:60.25ms
+step:521/2245 train_time:31392ms step_avg:60.25ms
+step:522/2245 train_time:31451ms step_avg:60.25ms
+step:523/2245 train_time:31512ms step_avg:60.25ms
+step:524/2245 train_time:31571ms step_avg:60.25ms
+step:525/2245 train_time:31632ms step_avg:60.25ms
+step:526/2245 train_time:31690ms step_avg:60.25ms
+step:527/2245 train_time:31751ms step_avg:60.25ms
+step:528/2245 train_time:31809ms step_avg:60.25ms
+step:529/2245 train_time:31871ms step_avg:60.25ms
+step:530/2245 train_time:31929ms step_avg:60.24ms
+step:531/2245 train_time:31990ms step_avg:60.24ms
+step:532/2245 train_time:32048ms step_avg:60.24ms
+step:533/2245 train_time:32110ms step_avg:60.24ms
+step:534/2245 train_time:32170ms step_avg:60.24ms
+step:535/2245 train_time:32233ms step_avg:60.25ms
+step:536/2245 train_time:32292ms step_avg:60.25ms
+step:537/2245 train_time:32354ms step_avg:60.25ms
+step:538/2245 train_time:32413ms step_avg:60.25ms
+step:539/2245 train_time:32475ms step_avg:60.25ms
+step:540/2245 train_time:32534ms step_avg:60.25ms
+step:541/2245 train_time:32595ms step_avg:60.25ms
+step:542/2245 train_time:32654ms step_avg:60.25ms
+step:543/2245 train_time:32715ms step_avg:60.25ms
+step:544/2245 train_time:32773ms step_avg:60.25ms
+step:545/2245 train_time:32834ms step_avg:60.25ms
+step:546/2245 train_time:32893ms step_avg:60.24ms
+step:547/2245 train_time:32955ms step_avg:60.25ms
+step:548/2245 train_time:33014ms step_avg:60.24ms
+step:549/2245 train_time:33076ms step_avg:60.25ms
+step:550/2245 train_time:33135ms step_avg:60.25ms
+step:551/2245 train_time:33197ms step_avg:60.25ms
+step:552/2245 train_time:33257ms step_avg:60.25ms
+step:553/2245 train_time:33319ms step_avg:60.25ms
+step:554/2245 train_time:33378ms step_avg:60.25ms
+step:555/2245 train_time:33440ms step_avg:60.25ms
+step:556/2245 train_time:33500ms step_avg:60.25ms
+step:557/2245 train_time:33561ms step_avg:60.25ms
+step:558/2245 train_time:33621ms step_avg:60.25ms
+step:559/2245 train_time:33683ms step_avg:60.26ms
+step:560/2245 train_time:33742ms step_avg:60.25ms
+step:561/2245 train_time:33804ms step_avg:60.26ms
+step:562/2245 train_time:33864ms step_avg:60.26ms
+step:563/2245 train_time:33926ms step_avg:60.26ms
+step:564/2245 train_time:33985ms step_avg:60.26ms
+step:565/2245 train_time:34047ms step_avg:60.26ms
+step:566/2245 train_time:34106ms step_avg:60.26ms
+step:567/2245 train_time:34168ms step_avg:60.26ms
+step:568/2245 train_time:34226ms step_avg:60.26ms
+step:569/2245 train_time:34288ms step_avg:60.26ms
+step:570/2245 train_time:34347ms step_avg:60.26ms
+step:571/2245 train_time:34409ms step_avg:60.26ms
+step:572/2245 train_time:34468ms step_avg:60.26ms
+step:573/2245 train_time:34529ms step_avg:60.26ms
+step:574/2245 train_time:34588ms step_avg:60.26ms
+step:575/2245 train_time:34649ms step_avg:60.26ms
+step:576/2245 train_time:34707ms step_avg:60.26ms
+step:577/2245 train_time:34768ms step_avg:60.26ms
+step:578/2245 train_time:34827ms step_avg:60.25ms
+step:579/2245 train_time:34889ms step_avg:60.26ms
+step:580/2245 train_time:34948ms step_avg:60.25ms
+step:581/2245 train_time:35009ms step_avg:60.26ms
+step:582/2245 train_time:35068ms step_avg:60.25ms
+step:583/2245 train_time:35130ms step_avg:60.26ms
+step:584/2245 train_time:35188ms step_avg:60.25ms
+step:585/2245 train_time:35249ms step_avg:60.26ms
+step:586/2245 train_time:35308ms step_avg:60.25ms
+step:587/2245 train_time:35369ms step_avg:60.25ms
+step:588/2245 train_time:35427ms step_avg:60.25ms
+step:589/2245 train_time:35489ms step_avg:60.25ms
+step:590/2245 train_time:35547ms step_avg:60.25ms
+step:591/2245 train_time:35609ms step_avg:60.25ms
+step:592/2245 train_time:35667ms step_avg:60.25ms
+step:593/2245 train_time:35729ms step_avg:60.25ms
+step:594/2245 train_time:35787ms step_avg:60.25ms
+step:595/2245 train_time:35849ms step_avg:60.25ms
+step:596/2245 train_time:35908ms step_avg:60.25ms
+step:597/2245 train_time:35969ms step_avg:60.25ms
+step:598/2245 train_time:36028ms step_avg:60.25ms
+step:599/2245 train_time:36089ms step_avg:60.25ms
+step:600/2245 train_time:36148ms step_avg:60.25ms
+step:601/2245 train_time:36209ms step_avg:60.25ms
+step:602/2245 train_time:36269ms step_avg:60.25ms
+step:603/2245 train_time:36330ms step_avg:60.25ms
+step:604/2245 train_time:36388ms step_avg:60.25ms
+step:605/2245 train_time:36449ms step_avg:60.25ms
+step:606/2245 train_time:36507ms step_avg:60.24ms
+step:607/2245 train_time:36569ms step_avg:60.25ms
+step:608/2245 train_time:36627ms step_avg:60.24ms
+step:609/2245 train_time:36689ms step_avg:60.24ms
+step:610/2245 train_time:36748ms step_avg:60.24ms
+step:611/2245 train_time:36810ms step_avg:60.24ms
+step:612/2245 train_time:36869ms step_avg:60.24ms
+step:613/2245 train_time:36930ms step_avg:60.25ms
+step:614/2245 train_time:36989ms step_avg:60.24ms
+step:615/2245 train_time:37050ms step_avg:60.24ms
+step:616/2245 train_time:37108ms step_avg:60.24ms
+step:617/2245 train_time:37169ms step_avg:60.24ms
+step:618/2245 train_time:37228ms step_avg:60.24ms
+step:619/2245 train_time:37290ms step_avg:60.24ms
+step:620/2245 train_time:37348ms step_avg:60.24ms
+step:621/2245 train_time:37410ms step_avg:60.24ms
+step:622/2245 train_time:37468ms step_avg:60.24ms
+step:623/2245 train_time:37530ms step_avg:60.24ms
+step:624/2245 train_time:37588ms step_avg:60.24ms
+step:625/2245 train_time:37649ms step_avg:60.24ms
+step:626/2245 train_time:37708ms step_avg:60.24ms
+step:627/2245 train_time:37770ms step_avg:60.24ms
+step:628/2245 train_time:37828ms step_avg:60.24ms
+step:629/2245 train_time:37889ms step_avg:60.24ms
+step:630/2245 train_time:37948ms step_avg:60.24ms
+step:631/2245 train_time:38010ms step_avg:60.24ms
+step:632/2245 train_time:38069ms step_avg:60.24ms
+step:633/2245 train_time:38130ms step_avg:60.24ms
+step:634/2245 train_time:38190ms step_avg:60.24ms
+step:635/2245 train_time:38251ms step_avg:60.24ms
+step:636/2245 train_time:38310ms step_avg:60.24ms
+step:637/2245 train_time:38371ms step_avg:60.24ms
+step:638/2245 train_time:38430ms step_avg:60.23ms
+step:639/2245 train_time:38491ms step_avg:60.24ms
+step:640/2245 train_time:38550ms step_avg:60.23ms
+step:641/2245 train_time:38611ms step_avg:60.24ms
+step:642/2245 train_time:38670ms step_avg:60.23ms
+step:643/2245 train_time:38731ms step_avg:60.24ms
+step:644/2245 train_time:38790ms step_avg:60.23ms
+step:645/2245 train_time:38852ms step_avg:60.24ms
+step:646/2245 train_time:38910ms step_avg:60.23ms
+step:647/2245 train_time:38972ms step_avg:60.24ms
+step:648/2245 train_time:39031ms step_avg:60.23ms
+step:649/2245 train_time:39092ms step_avg:60.23ms
+step:650/2245 train_time:39151ms step_avg:60.23ms
+step:651/2245 train_time:39213ms step_avg:60.23ms
+step:652/2245 train_time:39272ms step_avg:60.23ms
+step:653/2245 train_time:39333ms step_avg:60.23ms
+step:654/2245 train_time:39392ms step_avg:60.23ms
+step:655/2245 train_time:39454ms step_avg:60.23ms
+step:656/2245 train_time:39513ms step_avg:60.23ms
+step:657/2245 train_time:39574ms step_avg:60.23ms
+step:658/2245 train_time:39632ms step_avg:60.23ms
+step:659/2245 train_time:39694ms step_avg:60.23ms
+step:660/2245 train_time:39753ms step_avg:60.23ms
+step:661/2245 train_time:39814ms step_avg:60.23ms
+step:662/2245 train_time:39873ms step_avg:60.23ms
+step:663/2245 train_time:39935ms step_avg:60.23ms
+step:664/2245 train_time:39994ms step_avg:60.23ms
+step:665/2245 train_time:40056ms step_avg:60.23ms
+step:666/2245 train_time:40115ms step_avg:60.23ms
+step:667/2245 train_time:40176ms step_avg:60.23ms
+step:668/2245 train_time:40235ms step_avg:60.23ms
+step:669/2245 train_time:40296ms step_avg:60.23ms
+step:670/2245 train_time:40355ms step_avg:60.23ms
+step:671/2245 train_time:40417ms step_avg:60.23ms
+step:672/2245 train_time:40476ms step_avg:60.23ms
+step:673/2245 train_time:40537ms step_avg:60.23ms
+step:674/2245 train_time:40596ms step_avg:60.23ms
+step:675/2245 train_time:40657ms step_avg:60.23ms
+step:676/2245 train_time:40716ms step_avg:60.23ms
+step:677/2245 train_time:40778ms step_avg:60.23ms
+step:678/2245 train_time:40837ms step_avg:60.23ms
+step:679/2245 train_time:40899ms step_avg:60.23ms
+step:680/2245 train_time:40959ms step_avg:60.23ms
+step:681/2245 train_time:41020ms step_avg:60.24ms
+step:682/2245 train_time:41080ms step_avg:60.23ms
+step:683/2245 train_time:41141ms step_avg:60.24ms
+step:684/2245 train_time:41201ms step_avg:60.24ms
+step:685/2245 train_time:41263ms step_avg:60.24ms
+step:686/2245 train_time:41323ms step_avg:60.24ms
+step:687/2245 train_time:41384ms step_avg:60.24ms
+step:688/2245 train_time:41443ms step_avg:60.24ms
+step:689/2245 train_time:41505ms step_avg:60.24ms
+step:690/2245 train_time:41565ms step_avg:60.24ms
+step:691/2245 train_time:41627ms step_avg:60.24ms
+step:692/2245 train_time:41686ms step_avg:60.24ms
+step:693/2245 train_time:41747ms step_avg:60.24ms
+step:694/2245 train_time:41806ms step_avg:60.24ms
+step:695/2245 train_time:41867ms step_avg:60.24ms
+step:696/2245 train_time:41926ms step_avg:60.24ms
+step:697/2245 train_time:41988ms step_avg:60.24ms
+step:698/2245 train_time:42046ms step_avg:60.24ms
+step:699/2245 train_time:42108ms step_avg:60.24ms
+step:700/2245 train_time:42167ms step_avg:60.24ms
+step:701/2245 train_time:42228ms step_avg:60.24ms
+step:702/2245 train_time:42287ms step_avg:60.24ms
+step:703/2245 train_time:42348ms step_avg:60.24ms
+step:704/2245 train_time:42407ms step_avg:60.24ms
+step:705/2245 train_time:42468ms step_avg:60.24ms
+step:706/2245 train_time:42527ms step_avg:60.24ms
+step:707/2245 train_time:42588ms step_avg:60.24ms
+step:708/2245 train_time:42647ms step_avg:60.24ms
+step:709/2245 train_time:42708ms step_avg:60.24ms
+step:710/2245 train_time:42767ms step_avg:60.23ms
+step:711/2245 train_time:42828ms step_avg:60.24ms
+step:712/2245 train_time:42887ms step_avg:60.23ms
+step:713/2245 train_time:42949ms step_avg:60.24ms
+step:714/2245 train_time:43007ms step_avg:60.23ms
+step:715/2245 train_time:43069ms step_avg:60.24ms
+step:716/2245 train_time:43128ms step_avg:60.23ms
+step:717/2245 train_time:43189ms step_avg:60.24ms
+step:718/2245 train_time:43248ms step_avg:60.23ms
+step:719/2245 train_time:43309ms step_avg:60.24ms
+step:720/2245 train_time:43368ms step_avg:60.23ms
+step:721/2245 train_time:43429ms step_avg:60.23ms
+step:722/2245 train_time:43871ms step_avg:60.76ms
+step:723/2245 train_time:43931ms step_avg:60.76ms
+step:724/2245 train_time:43989ms step_avg:60.76ms
+step:725/2245 train_time:44049ms step_avg:60.76ms
+step:726/2245 train_time:44107ms step_avg:60.75ms
+step:727/2245 train_time:44168ms step_avg:60.75ms
+step:728/2245 train_time:44226ms step_avg:60.75ms
+step:729/2245 train_time:44286ms step_avg:60.75ms
+step:730/2245 train_time:44345ms step_avg:60.75ms
+step:731/2245 train_time:44405ms step_avg:60.75ms
+step:732/2245 train_time:44463ms step_avg:60.74ms
+step:733/2245 train_time:44524ms step_avg:60.74ms
+step:734/2245 train_time:44582ms step_avg:60.74ms
+step:735/2245 train_time:44643ms step_avg:60.74ms
+step:736/2245 train_time:44702ms step_avg:60.74ms
+step:737/2245 train_time:44768ms step_avg:60.74ms
+step:738/2245 train_time:44832ms step_avg:60.75ms
+step:739/2245 train_time:44896ms step_avg:60.75ms
+step:740/2245 train_time:44957ms step_avg:60.75ms
+step:741/2245 train_time:45020ms step_avg:60.76ms
+step:742/2245 train_time:45081ms step_avg:60.76ms
+step:743/2245 train_time:45143ms step_avg:60.76ms
+step:744/2245 train_time:45202ms step_avg:60.76ms
+step:745/2245 train_time:45264ms step_avg:60.76ms
+step:746/2245 train_time:45324ms step_avg:60.76ms
+step:747/2245 train_time:45386ms step_avg:60.76ms
+step:748/2245 train_time:45445ms step_avg:60.75ms
+step:749/2245 train_time:45506ms step_avg:60.76ms
+step:750/2245 train_time:45566ms step_avg:60.75ms
+step:750/2245 val_loss:3.6688 train_time:45628ms step_avg:60.84ms
+step:751/2245 train_time:45648ms step_avg:60.78ms
+step:752/2245 train_time:45688ms step_avg:60.76ms
+step:753/2245 train_time:45750ms step_avg:60.76ms
+step:754/2245 train_time:45810ms step_avg:60.76ms
+step:755/2245 train_time:45873ms step_avg:60.76ms
+step:756/2245 train_time:45932ms step_avg:60.76ms
+step:757/2245 train_time:45994ms step_avg:60.76ms
+step:758/2245 train_time:46052ms step_avg:60.76ms
+step:759/2245 train_time:46113ms step_avg:60.76ms
+step:760/2245 train_time:46172ms step_avg:60.75ms
+step:761/2245 train_time:46233ms step_avg:60.75ms
+step:762/2245 train_time:46292ms step_avg:60.75ms
+step:763/2245 train_time:46353ms step_avg:60.75ms
+step:764/2245 train_time:46412ms step_avg:60.75ms
+step:765/2245 train_time:46472ms step_avg:60.75ms
+step:766/2245 train_time:46536ms step_avg:60.75ms
+step:767/2245 train_time:46604ms step_avg:60.76ms
+step:768/2245 train_time:46664ms step_avg:60.76ms
+step:769/2245 train_time:46727ms step_avg:60.76ms
+step:770/2245 train_time:46787ms step_avg:60.76ms
+step:771/2245 train_time:46849ms step_avg:60.76ms
+step:772/2245 train_time:46908ms step_avg:60.76ms
+step:773/2245 train_time:46971ms step_avg:60.76ms
+step:774/2245 train_time:47030ms step_avg:60.76ms
+step:775/2245 train_time:47092ms step_avg:60.76ms
+step:776/2245 train_time:47151ms step_avg:60.76ms
+step:777/2245 train_time:47212ms step_avg:60.76ms
+step:778/2245 train_time:47271ms step_avg:60.76ms
+step:779/2245 train_time:47331ms step_avg:60.76ms
+step:780/2245 train_time:47390ms step_avg:60.76ms
+step:781/2245 train_time:47452ms step_avg:60.76ms
+step:782/2245 train_time:47513ms step_avg:60.76ms
+step:783/2245 train_time:47576ms step_avg:60.76ms
+step:784/2245 train_time:47637ms step_avg:60.76ms
+step:785/2245 train_time:47700ms step_avg:60.76ms
+step:786/2245 train_time:47760ms step_avg:60.76ms
+step:787/2245 train_time:47823ms step_avg:60.77ms
+step:788/2245 train_time:47883ms step_avg:60.76ms
+step:789/2245 train_time:47945ms step_avg:60.77ms
+step:790/2245 train_time:48005ms step_avg:60.77ms
+step:791/2245 train_time:48068ms step_avg:60.77ms
+step:792/2245 train_time:48128ms step_avg:60.77ms
+step:793/2245 train_time:48189ms step_avg:60.77ms
+step:794/2245 train_time:48249ms step_avg:60.77ms
+step:795/2245 train_time:48311ms step_avg:60.77ms
+step:796/2245 train_time:48370ms step_avg:60.77ms
+step:797/2245 train_time:48432ms step_avg:60.77ms
+step:798/2245 train_time:48491ms step_avg:60.77ms
+step:799/2245 train_time:48554ms step_avg:60.77ms
+step:800/2245 train_time:48614ms step_avg:60.77ms
+step:801/2245 train_time:48676ms step_avg:60.77ms
+step:802/2245 train_time:48736ms step_avg:60.77ms
+step:803/2245 train_time:48798ms step_avg:60.77ms
+step:804/2245 train_time:48859ms step_avg:60.77ms
+step:805/2245 train_time:48921ms step_avg:60.77ms
+step:806/2245 train_time:48981ms step_avg:60.77ms
+step:807/2245 train_time:49044ms step_avg:60.77ms
+step:808/2245 train_time:49104ms step_avg:60.77ms
+step:809/2245 train_time:49166ms step_avg:60.77ms
+step:810/2245 train_time:49226ms step_avg:60.77ms
+step:811/2245 train_time:49288ms step_avg:60.77ms
+step:812/2245 train_time:49348ms step_avg:60.77ms
+step:813/2245 train_time:49411ms step_avg:60.78ms
+step:814/2245 train_time:49470ms step_avg:60.77ms
+step:815/2245 train_time:49532ms step_avg:60.78ms
+step:816/2245 train_time:49591ms step_avg:60.77ms
+step:817/2245 train_time:49654ms step_avg:60.78ms
+step:818/2245 train_time:49714ms step_avg:60.77ms
+step:819/2245 train_time:49775ms step_avg:60.78ms
+step:820/2245 train_time:49835ms step_avg:60.77ms
+step:821/2245 train_time:49898ms step_avg:60.78ms
+step:822/2245 train_time:49958ms step_avg:60.78ms
+step:823/2245 train_time:50021ms step_avg:60.78ms
+step:824/2245 train_time:50081ms step_avg:60.78ms
+step:825/2245 train_time:50144ms step_avg:60.78ms
+step:826/2245 train_time:50203ms step_avg:60.78ms
+step:827/2245 train_time:50266ms step_avg:60.78ms
+step:828/2245 train_time:50326ms step_avg:60.78ms
+step:829/2245 train_time:50388ms step_avg:60.78ms
+step:830/2245 train_time:50449ms step_avg:60.78ms
+step:831/2245 train_time:50512ms step_avg:60.78ms
+step:832/2245 train_time:50571ms step_avg:60.78ms
+step:833/2245 train_time:50634ms step_avg:60.78ms
+step:834/2245 train_time:50693ms step_avg:60.78ms
+step:835/2245 train_time:50755ms step_avg:60.78ms
+step:836/2245 train_time:50815ms step_avg:60.78ms
+step:837/2245 train_time:50877ms step_avg:60.78ms
+step:838/2245 train_time:50937ms step_avg:60.78ms
+step:839/2245 train_time:51000ms step_avg:60.79ms
+step:840/2245 train_time:51060ms step_avg:60.79ms
+step:841/2245 train_time:51123ms step_avg:60.79ms
+step:842/2245 train_time:51182ms step_avg:60.79ms
+step:843/2245 train_time:51244ms step_avg:60.79ms
+step:844/2245 train_time:51305ms step_avg:60.79ms
+step:845/2245 train_time:51367ms step_avg:60.79ms
+step:846/2245 train_time:51427ms step_avg:60.79ms
+step:847/2245 train_time:51490ms step_avg:60.79ms
+step:848/2245 train_time:51550ms step_avg:60.79ms
+step:849/2245 train_time:51613ms step_avg:60.79ms
+step:850/2245 train_time:51673ms step_avg:60.79ms
+step:851/2245 train_time:51735ms step_avg:60.79ms
+step:852/2245 train_time:51795ms step_avg:60.79ms
+step:853/2245 train_time:51857ms step_avg:60.79ms
+step:854/2245 train_time:51916ms step_avg:60.79ms
+step:855/2245 train_time:51979ms step_avg:60.79ms
+step:856/2245 train_time:52038ms step_avg:60.79ms
+step:857/2245 train_time:52102ms step_avg:60.80ms
+step:858/2245 train_time:52162ms step_avg:60.79ms
+step:859/2245 train_time:52224ms step_avg:60.80ms
+step:860/2245 train_time:52284ms step_avg:60.79ms
+step:861/2245 train_time:52346ms step_avg:60.80ms
+step:862/2245 train_time:52406ms step_avg:60.80ms
+step:863/2245 train_time:52469ms step_avg:60.80ms
+step:864/2245 train_time:52529ms step_avg:60.80ms
+step:865/2245 train_time:52592ms step_avg:60.80ms
+step:866/2245 train_time:52651ms step_avg:60.80ms
+step:867/2245 train_time:52715ms step_avg:60.80ms
+step:868/2245 train_time:52774ms step_avg:60.80ms
+step:869/2245 train_time:52836ms step_avg:60.80ms
+step:870/2245 train_time:52896ms step_avg:60.80ms
+step:871/2245 train_time:52957ms step_avg:60.80ms
+step:872/2245 train_time:53017ms step_avg:60.80ms
+step:873/2245 train_time:53079ms step_avg:60.80ms
+step:874/2245 train_time:53139ms step_avg:60.80ms
+step:875/2245 train_time:53203ms step_avg:60.80ms
+step:876/2245 train_time:53263ms step_avg:60.80ms
+step:877/2245 train_time:53325ms step_avg:60.80ms
+step:878/2245 train_time:53385ms step_avg:60.80ms
+step:879/2245 train_time:53448ms step_avg:60.81ms
+step:880/2245 train_time:53509ms step_avg:60.81ms
+step:881/2245 train_time:53572ms step_avg:60.81ms
+step:882/2245 train_time:53632ms step_avg:60.81ms
+step:883/2245 train_time:53695ms step_avg:60.81ms
+step:884/2245 train_time:53755ms step_avg:60.81ms
+step:885/2245 train_time:53817ms step_avg:60.81ms
+step:886/2245 train_time:53877ms step_avg:60.81ms
+step:887/2245 train_time:53939ms step_avg:60.81ms
+step:888/2245 train_time:53998ms step_avg:60.81ms
+step:889/2245 train_time:54061ms step_avg:60.81ms
+step:890/2245 train_time:54120ms step_avg:60.81ms
+step:891/2245 train_time:54182ms step_avg:60.81ms
+step:892/2245 train_time:54241ms step_avg:60.81ms
+step:893/2245 train_time:54304ms step_avg:60.81ms
+step:894/2245 train_time:54364ms step_avg:60.81ms
+step:895/2245 train_time:54426ms step_avg:60.81ms
+step:896/2245 train_time:54487ms step_avg:60.81ms
+step:897/2245 train_time:54550ms step_avg:60.81ms
+step:898/2245 train_time:54610ms step_avg:60.81ms
+step:899/2245 train_time:54673ms step_avg:60.82ms
+step:900/2245 train_time:54733ms step_avg:60.81ms
+step:901/2245 train_time:54795ms step_avg:60.82ms
+step:902/2245 train_time:54855ms step_avg:60.81ms
+step:903/2245 train_time:54917ms step_avg:60.82ms
+step:904/2245 train_time:54977ms step_avg:60.81ms
+step:905/2245 train_time:55038ms step_avg:60.82ms
+step:906/2245 train_time:55098ms step_avg:60.81ms
+step:907/2245 train_time:55161ms step_avg:60.82ms
+step:908/2245 train_time:55221ms step_avg:60.82ms
+step:909/2245 train_time:55284ms step_avg:60.82ms
+step:910/2245 train_time:55344ms step_avg:60.82ms
+step:911/2245 train_time:55406ms step_avg:60.82ms
+step:912/2245 train_time:55465ms step_avg:60.82ms
+step:913/2245 train_time:55528ms step_avg:60.82ms
+step:914/2245 train_time:55588ms step_avg:60.82ms
+step:915/2245 train_time:55652ms step_avg:60.82ms
+step:916/2245 train_time:55712ms step_avg:60.82ms
+step:917/2245 train_time:55774ms step_avg:60.82ms
+step:918/2245 train_time:55834ms step_avg:60.82ms
+step:919/2245 train_time:55896ms step_avg:60.82ms
+step:920/2245 train_time:55956ms step_avg:60.82ms
+step:921/2245 train_time:56018ms step_avg:60.82ms
+step:922/2245 train_time:56077ms step_avg:60.82ms
+step:923/2245 train_time:56139ms step_avg:60.82ms
+step:924/2245 train_time:56199ms step_avg:60.82ms
+step:925/2245 train_time:56262ms step_avg:60.82ms
+step:926/2245 train_time:56321ms step_avg:60.82ms
+step:927/2245 train_time:56384ms step_avg:60.82ms
+step:928/2245 train_time:56443ms step_avg:60.82ms
+step:929/2245 train_time:56506ms step_avg:60.82ms
+step:930/2245 train_time:56567ms step_avg:60.82ms
+step:931/2245 train_time:56630ms step_avg:60.83ms
+step:932/2245 train_time:56690ms step_avg:60.83ms
+step:933/2245 train_time:56752ms step_avg:60.83ms
+step:934/2245 train_time:56813ms step_avg:60.83ms
+step:935/2245 train_time:56875ms step_avg:60.83ms
+step:936/2245 train_time:56936ms step_avg:60.83ms
+step:937/2245 train_time:56998ms step_avg:60.83ms
+step:938/2245 train_time:57057ms step_avg:60.83ms
+step:939/2245 train_time:57119ms step_avg:60.83ms
+step:940/2245 train_time:57178ms step_avg:60.83ms
+step:941/2245 train_time:57241ms step_avg:60.83ms
+step:942/2245 train_time:57300ms step_avg:60.83ms
+step:943/2245 train_time:57363ms step_avg:60.83ms
+step:944/2245 train_time:57423ms step_avg:60.83ms
+step:945/2245 train_time:57485ms step_avg:60.83ms
+step:946/2245 train_time:57546ms step_avg:60.83ms
+step:947/2245 train_time:57609ms step_avg:60.83ms
+step:948/2245 train_time:57669ms step_avg:60.83ms
+step:949/2245 train_time:57731ms step_avg:60.83ms
+step:950/2245 train_time:57791ms step_avg:60.83ms
+step:951/2245 train_time:57854ms step_avg:60.84ms
+step:952/2245 train_time:57915ms step_avg:60.83ms
+step:953/2245 train_time:57976ms step_avg:60.84ms
+step:954/2245 train_time:58036ms step_avg:60.83ms
+step:955/2245 train_time:58097ms step_avg:60.83ms
+step:956/2245 train_time:58156ms step_avg:60.83ms
+step:957/2245 train_time:58218ms step_avg:60.83ms
+step:958/2245 train_time:58277ms step_avg:60.83ms
+step:959/2245 train_time:58340ms step_avg:60.83ms
+step:960/2245 train_time:58400ms step_avg:60.83ms
+step:961/2245 train_time:58463ms step_avg:60.84ms
+step:962/2245 train_time:58523ms step_avg:60.84ms
+step:963/2245 train_time:58586ms step_avg:60.84ms
+step:964/2245 train_time:58647ms step_avg:60.84ms
+step:965/2245 train_time:58710ms step_avg:60.84ms
+step:966/2245 train_time:58770ms step_avg:60.84ms
+step:967/2245 train_time:58833ms step_avg:60.84ms
+step:968/2245 train_time:58893ms step_avg:60.84ms
+step:969/2245 train_time:58955ms step_avg:60.84ms
+step:970/2245 train_time:59014ms step_avg:60.84ms
+step:971/2245 train_time:59076ms step_avg:60.84ms
+step:972/2245 train_time:59135ms step_avg:60.84ms
+step:973/2245 train_time:59197ms step_avg:60.84ms
+step:974/2245 train_time:59256ms step_avg:60.84ms
+step:975/2245 train_time:59318ms step_avg:60.84ms
+step:976/2245 train_time:59378ms step_avg:60.84ms
+step:977/2245 train_time:59441ms step_avg:60.84ms
+step:978/2245 train_time:59501ms step_avg:60.84ms
+step:979/2245 train_time:59564ms step_avg:60.84ms
+step:980/2245 train_time:59624ms step_avg:60.84ms
+step:981/2245 train_time:59687ms step_avg:60.84ms
+step:982/2245 train_time:59748ms step_avg:60.84ms
+step:983/2245 train_time:59811ms step_avg:60.85ms
+step:984/2245 train_time:59871ms step_avg:60.84ms
+step:985/2245 train_time:59933ms step_avg:60.85ms
+step:986/2245 train_time:59993ms step_avg:60.85ms
+step:987/2245 train_time:60055ms step_avg:60.85ms
+step:988/2245 train_time:60115ms step_avg:60.85ms
+step:989/2245 train_time:60177ms step_avg:60.85ms
+step:990/2245 train_time:60237ms step_avg:60.85ms
+step:991/2245 train_time:60299ms step_avg:60.85ms
+step:992/2245 train_time:60359ms step_avg:60.85ms
+step:993/2245 train_time:60421ms step_avg:60.85ms
+step:994/2245 train_time:60481ms step_avg:60.85ms
+step:995/2245 train_time:60544ms step_avg:60.85ms
+step:996/2245 train_time:60604ms step_avg:60.85ms
+step:997/2245 train_time:60666ms step_avg:60.85ms
+step:998/2245 train_time:60728ms step_avg:60.85ms
+step:999/2245 train_time:60792ms step_avg:60.85ms
+step:1000/2245 train_time:60851ms step_avg:60.85ms
+step:1000/2245 val_loss:3.5937 train_time:60914ms step_avg:60.91ms
+step:1001/2245 train_time:60934ms step_avg:60.87ms
+step:1002/2245 train_time:60976ms step_avg:60.85ms
+step:1003/2245 train_time:61041ms step_avg:60.86ms
+step:1004/2245 train_time:61105ms step_avg:60.86ms
+step:1005/2245 train_time:61169ms step_avg:60.86ms
+step:1006/2245 train_time:61228ms step_avg:60.86ms
+step:1007/2245 train_time:61290ms step_avg:60.86ms
+step:1008/2245 train_time:61349ms step_avg:60.86ms
+step:1009/2245 train_time:61411ms step_avg:60.86ms
+step:1010/2245 train_time:61470ms step_avg:60.86ms
+step:1011/2245 train_time:61531ms step_avg:60.86ms
+step:1012/2245 train_time:61590ms step_avg:60.86ms
+step:1013/2245 train_time:61652ms step_avg:60.86ms
+step:1014/2245 train_time:61711ms step_avg:60.86ms
+step:1015/2245 train_time:61773ms step_avg:60.86ms
+step:1016/2245 train_time:61832ms step_avg:60.86ms
+step:1017/2245 train_time:61895ms step_avg:60.86ms
+step:1018/2245 train_time:61955ms step_avg:60.86ms
+step:1019/2245 train_time:62020ms step_avg:60.86ms
+step:1020/2245 train_time:62081ms step_avg:60.86ms
+step:1021/2245 train_time:62145ms step_avg:60.87ms
+step:1022/2245 train_time:62206ms step_avg:60.87ms
+step:1023/2245 train_time:62268ms step_avg:60.87ms
+step:1024/2245 train_time:62327ms step_avg:60.87ms
+step:1025/2245 train_time:62390ms step_avg:60.87ms
+step:1026/2245 train_time:62450ms step_avg:60.87ms
+step:1027/2245 train_time:62511ms step_avg:60.87ms
+step:1028/2245 train_time:62570ms step_avg:60.87ms
+step:1029/2245 train_time:62631ms step_avg:60.87ms
+step:1030/2245 train_time:62690ms step_avg:60.86ms
+step:1031/2245 train_time:62752ms step_avg:60.86ms
+step:1032/2245 train_time:62811ms step_avg:60.86ms
+step:1033/2245 train_time:62873ms step_avg:60.86ms
+step:1034/2245 train_time:62933ms step_avg:60.86ms
+step:1035/2245 train_time:62997ms step_avg:60.87ms
+step:1036/2245 train_time:63058ms step_avg:60.87ms
+step:1037/2245 train_time:63121ms step_avg:60.87ms
+step:1038/2245 train_time:63181ms step_avg:60.87ms
+step:1039/2245 train_time:63244ms step_avg:60.87ms
+step:1040/2245 train_time:63304ms step_avg:60.87ms
+step:1041/2245 train_time:63367ms step_avg:60.87ms
+step:1042/2245 train_time:63426ms step_avg:60.87ms
+step:1043/2245 train_time:63489ms step_avg:60.87ms
+step:1044/2245 train_time:63548ms step_avg:60.87ms
+step:1045/2245 train_time:63610ms step_avg:60.87ms
+step:1046/2245 train_time:63669ms step_avg:60.87ms
+step:1047/2245 train_time:63732ms step_avg:60.87ms
+step:1048/2245 train_time:63791ms step_avg:60.87ms
+step:1049/2245 train_time:63853ms step_avg:60.87ms
+step:1050/2245 train_time:63914ms step_avg:60.87ms
+step:1051/2245 train_time:63976ms step_avg:60.87ms
+step:1052/2245 train_time:64036ms step_avg:60.87ms
+step:1053/2245 train_time:64099ms step_avg:60.87ms
+step:1054/2245 train_time:64159ms step_avg:60.87ms
+step:1055/2245 train_time:64223ms step_avg:60.87ms
+step:1056/2245 train_time:64282ms step_avg:60.87ms
+step:1057/2245 train_time:64345ms step_avg:60.88ms
+step:1058/2245 train_time:64405ms step_avg:60.87ms
+step:1059/2245 train_time:64467ms step_avg:60.88ms
+step:1060/2245 train_time:64527ms step_avg:60.87ms
+step:1061/2245 train_time:64589ms step_avg:60.88ms
+step:1062/2245 train_time:64649ms step_avg:60.87ms
+step:1063/2245 train_time:64711ms step_avg:60.88ms
+step:1064/2245 train_time:64770ms step_avg:60.87ms
+step:1065/2245 train_time:64833ms step_avg:60.88ms
+step:1066/2245 train_time:64893ms step_avg:60.87ms
+step:1067/2245 train_time:64955ms step_avg:60.88ms
+step:1068/2245 train_time:65014ms step_avg:60.87ms
+step:1069/2245 train_time:65076ms step_avg:60.88ms
+step:1070/2245 train_time:65137ms step_avg:60.88ms
+step:1071/2245 train_time:65200ms step_avg:60.88ms
+step:1072/2245 train_time:65260ms step_avg:60.88ms
+step:1073/2245 train_time:65323ms step_avg:60.88ms
+step:1074/2245 train_time:65382ms step_avg:60.88ms
+step:1075/2245 train_time:65445ms step_avg:60.88ms
+step:1076/2245 train_time:65506ms step_avg:60.88ms
+step:1077/2245 train_time:65568ms step_avg:60.88ms
+step:1078/2245 train_time:65628ms step_avg:60.88ms
+step:1079/2245 train_time:65690ms step_avg:60.88ms
+step:1080/2245 train_time:65751ms step_avg:60.88ms
+step:1081/2245 train_time:65812ms step_avg:60.88ms
+step:1082/2245 train_time:65872ms step_avg:60.88ms
+step:1083/2245 train_time:65936ms step_avg:60.88ms
+step:1084/2245 train_time:65995ms step_avg:60.88ms
+step:1085/2245 train_time:66057ms step_avg:60.88ms
+step:1086/2245 train_time:66117ms step_avg:60.88ms
+step:1087/2245 train_time:66179ms step_avg:60.88ms
+step:1088/2245 train_time:66239ms step_avg:60.88ms
+step:1089/2245 train_time:66301ms step_avg:60.88ms
+step:1090/2245 train_time:66362ms step_avg:60.88ms
+step:1091/2245 train_time:66424ms step_avg:60.88ms
+step:1092/2245 train_time:66484ms step_avg:60.88ms
+step:1093/2245 train_time:66547ms step_avg:60.88ms
+step:1094/2245 train_time:66607ms step_avg:60.88ms
+step:1095/2245 train_time:66670ms step_avg:60.89ms
+step:1096/2245 train_time:66730ms step_avg:60.89ms
+step:1097/2245 train_time:66793ms step_avg:60.89ms
+step:1098/2245 train_time:66852ms step_avg:60.89ms
+step:1099/2245 train_time:66914ms step_avg:60.89ms
+step:1100/2245 train_time:66974ms step_avg:60.89ms
+step:1101/2245 train_time:67037ms step_avg:60.89ms
+step:1102/2245 train_time:67097ms step_avg:60.89ms
+step:1103/2245 train_time:67159ms step_avg:60.89ms
+step:1104/2245 train_time:67219ms step_avg:60.89ms
+step:1105/2245 train_time:67281ms step_avg:60.89ms
+step:1106/2245 train_time:67341ms step_avg:60.89ms
+step:1107/2245 train_time:67404ms step_avg:60.89ms
+step:1108/2245 train_time:67464ms step_avg:60.89ms
+step:1109/2245 train_time:67527ms step_avg:60.89ms
+step:1110/2245 train_time:67588ms step_avg:60.89ms
+step:1111/2245 train_time:67651ms step_avg:60.89ms
+step:1112/2245 train_time:67711ms step_avg:60.89ms
+step:1113/2245 train_time:67773ms step_avg:60.89ms
+step:1114/2245 train_time:67832ms step_avg:60.89ms
+step:1115/2245 train_time:67895ms step_avg:60.89ms
+step:1116/2245 train_time:67955ms step_avg:60.89ms
+step:1117/2245 train_time:68017ms step_avg:60.89ms
+step:1118/2245 train_time:68077ms step_avg:60.89ms
+step:1119/2245 train_time:68139ms step_avg:60.89ms
+step:1120/2245 train_time:68198ms step_avg:60.89ms
+step:1121/2245 train_time:68261ms step_avg:60.89ms
+step:1122/2245 train_time:68321ms step_avg:60.89ms
+step:1123/2245 train_time:68383ms step_avg:60.89ms
+step:1124/2245 train_time:68444ms step_avg:60.89ms
+step:1125/2245 train_time:68506ms step_avg:60.89ms
+step:1126/2245 train_time:68566ms step_avg:60.89ms
+step:1127/2245 train_time:68629ms step_avg:60.90ms
+step:1128/2245 train_time:68689ms step_avg:60.89ms
+step:1129/2245 train_time:68752ms step_avg:60.90ms
+step:1130/2245 train_time:68811ms step_avg:60.89ms
+step:1131/2245 train_time:68874ms step_avg:60.90ms
+step:1132/2245 train_time:68933ms step_avg:60.90ms
+step:1133/2245 train_time:68995ms step_avg:60.90ms
+step:1134/2245 train_time:69055ms step_avg:60.90ms
+step:1135/2245 train_time:69117ms step_avg:60.90ms
+step:1136/2245 train_time:69176ms step_avg:60.89ms
+step:1137/2245 train_time:69239ms step_avg:60.90ms
+step:1138/2245 train_time:69299ms step_avg:60.90ms
+step:1139/2245 train_time:69361ms step_avg:60.90ms
+step:1140/2245 train_time:69421ms step_avg:60.90ms
+step:1141/2245 train_time:69484ms step_avg:60.90ms
+step:1142/2245 train_time:69544ms step_avg:60.90ms
+step:1143/2245 train_time:69606ms step_avg:60.90ms
+step:1144/2245 train_time:69667ms step_avg:60.90ms
+step:1145/2245 train_time:69729ms step_avg:60.90ms
+step:1146/2245 train_time:69789ms step_avg:60.90ms
+step:1147/2245 train_time:69852ms step_avg:60.90ms
+step:1148/2245 train_time:69912ms step_avg:60.90ms
+step:1149/2245 train_time:69975ms step_avg:60.90ms
+step:1150/2245 train_time:70034ms step_avg:60.90ms
+step:1151/2245 train_time:70096ms step_avg:60.90ms
+step:1152/2245 train_time:70156ms step_avg:60.90ms
+step:1153/2245 train_time:70218ms step_avg:60.90ms
+step:1154/2245 train_time:70278ms step_avg:60.90ms
+step:1155/2245 train_time:70340ms step_avg:60.90ms
+step:1156/2245 train_time:70400ms step_avg:60.90ms
+step:1157/2245 train_time:70463ms step_avg:60.90ms
+step:1158/2245 train_time:70522ms step_avg:60.90ms
+step:1159/2245 train_time:70585ms step_avg:60.90ms
+step:1160/2245 train_time:70645ms step_avg:60.90ms
+step:1161/2245 train_time:70708ms step_avg:60.90ms
+step:1162/2245 train_time:70768ms step_avg:60.90ms
+step:1163/2245 train_time:70831ms step_avg:60.90ms
+step:1164/2245 train_time:70891ms step_avg:60.90ms
+step:1165/2245 train_time:70954ms step_avg:60.90ms
+step:1166/2245 train_time:71013ms step_avg:60.90ms
+step:1167/2245 train_time:71075ms step_avg:60.90ms
+step:1168/2245 train_time:71135ms step_avg:60.90ms
+step:1169/2245 train_time:71197ms step_avg:60.90ms
+step:1170/2245 train_time:71257ms step_avg:60.90ms
+step:1171/2245 train_time:71319ms step_avg:60.90ms
+step:1172/2245 train_time:71378ms step_avg:60.90ms
+step:1173/2245 train_time:71440ms step_avg:60.90ms
+step:1174/2245 train_time:71501ms step_avg:60.90ms
+step:1175/2245 train_time:71563ms step_avg:60.90ms
+step:1176/2245 train_time:71623ms step_avg:60.90ms
+step:1177/2245 train_time:71686ms step_avg:60.91ms
+step:1178/2245 train_time:71746ms step_avg:60.91ms
+step:1179/2245 train_time:71809ms step_avg:60.91ms
+step:1180/2245 train_time:71870ms step_avg:60.91ms
+step:1181/2245 train_time:71933ms step_avg:60.91ms
+step:1182/2245 train_time:71993ms step_avg:60.91ms
+step:1183/2245 train_time:72056ms step_avg:60.91ms
+step:1184/2245 train_time:72115ms step_avg:60.91ms
+step:1185/2245 train_time:72177ms step_avg:60.91ms
+step:1186/2245 train_time:72236ms step_avg:60.91ms
+step:1187/2245 train_time:72298ms step_avg:60.91ms
+step:1188/2245 train_time:72358ms step_avg:60.91ms
+step:1189/2245 train_time:72420ms step_avg:60.91ms
+step:1190/2245 train_time:72480ms step_avg:60.91ms
+step:1191/2245 train_time:72542ms step_avg:60.91ms
+step:1192/2245 train_time:72602ms step_avg:60.91ms
+step:1193/2245 train_time:72665ms step_avg:60.91ms
+step:1194/2245 train_time:72725ms step_avg:60.91ms
+step:1195/2245 train_time:72789ms step_avg:60.91ms
+step:1196/2245 train_time:72850ms step_avg:60.91ms
+step:1197/2245 train_time:72913ms step_avg:60.91ms
+step:1198/2245 train_time:72973ms step_avg:60.91ms
+step:1199/2245 train_time:73036ms step_avg:60.91ms
+step:1200/2245 train_time:73095ms step_avg:60.91ms
+step:1201/2245 train_time:73157ms step_avg:60.91ms
+step:1202/2245 train_time:73217ms step_avg:60.91ms
+step:1203/2245 train_time:73279ms step_avg:60.91ms
+step:1204/2245 train_time:73338ms step_avg:60.91ms
+step:1205/2245 train_time:73401ms step_avg:60.91ms
+step:1206/2245 train_time:73460ms step_avg:60.91ms
+step:1207/2245 train_time:73523ms step_avg:60.91ms
+step:1208/2245 train_time:73583ms step_avg:60.91ms
+step:1209/2245 train_time:73646ms step_avg:60.91ms
+step:1210/2245 train_time:73706ms step_avg:60.91ms
+step:1211/2245 train_time:73769ms step_avg:60.92ms
+step:1212/2245 train_time:73829ms step_avg:60.92ms
+step:1213/2245 train_time:73891ms step_avg:60.92ms
+step:1214/2245 train_time:73952ms step_avg:60.92ms
+step:1215/2245 train_time:74015ms step_avg:60.92ms
+step:1216/2245 train_time:74074ms step_avg:60.92ms
+step:1217/2245 train_time:74136ms step_avg:60.92ms
+step:1218/2245 train_time:74196ms step_avg:60.92ms
+step:1219/2245 train_time:74258ms step_avg:60.92ms
+step:1220/2245 train_time:74317ms step_avg:60.92ms
+step:1221/2245 train_time:74379ms step_avg:60.92ms
+step:1222/2245 train_time:74439ms step_avg:60.92ms
+step:1223/2245 train_time:74501ms step_avg:60.92ms
+step:1224/2245 train_time:74560ms step_avg:60.92ms
+step:1225/2245 train_time:74623ms step_avg:60.92ms
+step:1226/2245 train_time:74683ms step_avg:60.92ms
+step:1227/2245 train_time:74747ms step_avg:60.92ms
+step:1228/2245 train_time:74807ms step_avg:60.92ms
+step:1229/2245 train_time:74870ms step_avg:60.92ms
+step:1230/2245 train_time:74930ms step_avg:60.92ms
+step:1231/2245 train_time:74992ms step_avg:60.92ms
+step:1232/2245 train_time:75052ms step_avg:60.92ms
+step:1233/2245 train_time:75114ms step_avg:60.92ms
+step:1234/2245 train_time:75174ms step_avg:60.92ms
+step:1235/2245 train_time:75235ms step_avg:60.92ms
+step:1236/2245 train_time:75295ms step_avg:60.92ms
+step:1237/2245 train_time:75357ms step_avg:60.92ms
+step:1238/2245 train_time:75416ms step_avg:60.92ms
+step:1239/2245 train_time:75478ms step_avg:60.92ms
+step:1240/2245 train_time:75538ms step_avg:60.92ms
+step:1241/2245 train_time:75601ms step_avg:60.92ms
+step:1242/2245 train_time:75661ms step_avg:60.92ms
+step:1243/2245 train_time:75724ms step_avg:60.92ms
+step:1244/2245 train_time:75785ms step_avg:60.92ms
+step:1245/2245 train_time:75849ms step_avg:60.92ms
+step:1246/2245 train_time:75909ms step_avg:60.92ms
+step:1247/2245 train_time:75972ms step_avg:60.92ms
+step:1248/2245 train_time:76031ms step_avg:60.92ms
+step:1249/2245 train_time:76093ms step_avg:60.92ms
+step:1250/2245 train_time:76153ms step_avg:60.92ms
+step:1250/2245 val_loss:3.5225 train_time:76216ms step_avg:60.97ms
+step:1251/2245 train_time:76240ms step_avg:60.94ms
+step:1252/2245 train_time:76279ms step_avg:60.93ms
+step:1253/2245 train_time:76346ms step_avg:60.93ms
+step:1254/2245 train_time:76406ms step_avg:60.93ms
+step:1255/2245 train_time:76468ms step_avg:60.93ms
+step:1256/2245 train_time:76528ms step_avg:60.93ms
+step:1257/2245 train_time:76589ms step_avg:60.93ms
+step:1258/2245 train_time:76648ms step_avg:60.93ms
+step:1259/2245 train_time:76709ms step_avg:60.93ms
+step:1260/2245 train_time:76768ms step_avg:60.93ms
+step:1261/2245 train_time:76830ms step_avg:60.93ms
+step:1262/2245 train_time:76889ms step_avg:60.93ms
+step:1263/2245 train_time:76951ms step_avg:60.93ms
+step:1264/2245 train_time:77011ms step_avg:60.93ms
+step:1265/2245 train_time:77073ms step_avg:60.93ms
+step:1266/2245 train_time:77133ms step_avg:60.93ms
+step:1267/2245 train_time:77198ms step_avg:60.93ms
+step:1268/2245 train_time:77261ms step_avg:60.93ms
+step:1269/2245 train_time:77325ms step_avg:60.93ms
+step:1270/2245 train_time:77386ms step_avg:60.93ms
+step:1271/2245 train_time:77449ms step_avg:60.94ms
+step:1272/2245 train_time:77508ms step_avg:60.93ms
+step:1273/2245 train_time:77570ms step_avg:60.93ms
+step:1274/2245 train_time:77629ms step_avg:60.93ms
+step:1275/2245 train_time:77691ms step_avg:60.93ms
+step:1276/2245 train_time:77750ms step_avg:60.93ms
+step:1277/2245 train_time:77812ms step_avg:60.93ms
+step:1278/2245 train_time:77872ms step_avg:60.93ms
+step:1279/2245 train_time:77934ms step_avg:60.93ms
+step:1280/2245 train_time:77994ms step_avg:60.93ms
+step:1281/2245 train_time:78057ms step_avg:60.93ms
+step:1282/2245 train_time:78118ms step_avg:60.93ms
+step:1283/2245 train_time:78181ms step_avg:60.94ms
+step:1284/2245 train_time:78241ms step_avg:60.94ms
+step:1285/2245 train_time:78304ms step_avg:60.94ms
+step:1286/2245 train_time:78364ms step_avg:60.94ms
+step:1287/2245 train_time:78426ms step_avg:60.94ms
+step:1288/2245 train_time:78486ms step_avg:60.94ms
+step:1289/2245 train_time:78548ms step_avg:60.94ms
+step:1290/2245 train_time:78607ms step_avg:60.94ms
+step:1291/2245 train_time:78669ms step_avg:60.94ms
+step:1292/2245 train_time:78728ms step_avg:60.93ms
+step:1293/2245 train_time:78790ms step_avg:60.94ms
+step:1294/2245 train_time:78850ms step_avg:60.93ms
+step:1295/2245 train_time:78912ms step_avg:60.94ms
+step:1296/2245 train_time:78972ms step_avg:60.94ms
+step:1297/2245 train_time:79034ms step_avg:60.94ms
+step:1298/2245 train_time:79095ms step_avg:60.94ms
+step:1299/2245 train_time:79159ms step_avg:60.94ms
+step:1300/2245 train_time:79220ms step_avg:60.94ms
+step:1301/2245 train_time:79283ms step_avg:60.94ms
+step:1302/2245 train_time:79343ms step_avg:60.94ms
+step:1303/2245 train_time:79405ms step_avg:60.94ms
+step:1304/2245 train_time:79465ms step_avg:60.94ms
+step:1305/2245 train_time:79527ms step_avg:60.94ms
+step:1306/2245 train_time:79587ms step_avg:60.94ms
+step:1307/2245 train_time:79649ms step_avg:60.94ms
+step:1308/2245 train_time:79708ms step_avg:60.94ms
+step:1309/2245 train_time:79770ms step_avg:60.94ms
+step:1310/2245 train_time:79830ms step_avg:60.94ms
+step:1311/2245 train_time:79893ms step_avg:60.94ms
+step:1312/2245 train_time:79952ms step_avg:60.94ms
+step:1313/2245 train_time:80014ms step_avg:60.94ms
+step:1314/2245 train_time:80074ms step_avg:60.94ms
+step:1315/2245 train_time:80137ms step_avg:60.94ms
+step:1316/2245 train_time:80198ms step_avg:60.94ms
+step:1317/2245 train_time:80261ms step_avg:60.94ms
+step:1318/2245 train_time:80321ms step_avg:60.94ms
+step:1319/2245 train_time:80383ms step_avg:60.94ms
+step:1320/2245 train_time:80443ms step_avg:60.94ms
+step:1321/2245 train_time:80505ms step_avg:60.94ms
+step:1322/2245 train_time:80565ms step_avg:60.94ms
+step:1323/2245 train_time:80627ms step_avg:60.94ms
+step:1324/2245 train_time:80687ms step_avg:60.94ms
+step:1325/2245 train_time:80750ms step_avg:60.94ms
+step:1326/2245 train_time:80809ms step_avg:60.94ms
+step:1327/2245 train_time:80871ms step_avg:60.94ms
+step:1328/2245 train_time:80931ms step_avg:60.94ms
+step:1329/2245 train_time:80994ms step_avg:60.94ms
+step:1330/2245 train_time:81055ms step_avg:60.94ms
+step:1331/2245 train_time:81118ms step_avg:60.95ms
+step:1332/2245 train_time:81179ms step_avg:60.95ms
+step:1333/2245 train_time:81242ms step_avg:60.95ms
+step:1334/2245 train_time:81302ms step_avg:60.95ms
+step:1335/2245 train_time:81365ms step_avg:60.95ms
+step:1336/2245 train_time:81425ms step_avg:60.95ms
+step:1337/2245 train_time:81486ms step_avg:60.95ms
+step:1338/2245 train_time:81546ms step_avg:60.95ms
+step:1339/2245 train_time:81608ms step_avg:60.95ms
+step:1340/2245 train_time:81667ms step_avg:60.95ms
+step:1341/2245 train_time:81729ms step_avg:60.95ms
+step:1342/2245 train_time:81789ms step_avg:60.95ms
+step:1343/2245 train_time:81851ms step_avg:60.95ms
+step:1344/2245 train_time:81911ms step_avg:60.95ms
+step:1345/2245 train_time:81973ms step_avg:60.95ms
+step:1346/2245 train_time:82033ms step_avg:60.95ms
+step:1347/2245 train_time:82096ms step_avg:60.95ms
+step:1348/2245 train_time:82156ms step_avg:60.95ms
+step:1349/2245 train_time:82219ms step_avg:60.95ms
+step:1350/2245 train_time:82281ms step_avg:60.95ms
+step:1351/2245 train_time:82343ms step_avg:60.95ms
+step:1352/2245 train_time:82404ms step_avg:60.95ms
+step:1353/2245 train_time:82465ms step_avg:60.95ms
+step:1354/2245 train_time:82525ms step_avg:60.95ms
+step:1355/2245 train_time:82587ms step_avg:60.95ms
+step:1356/2245 train_time:82647ms step_avg:60.95ms
+step:1357/2245 train_time:82709ms step_avg:60.95ms
+step:1358/2245 train_time:82768ms step_avg:60.95ms
+step:1359/2245 train_time:82830ms step_avg:60.95ms
+step:1360/2245 train_time:82890ms step_avg:60.95ms
+step:1361/2245 train_time:82952ms step_avg:60.95ms
+step:1362/2245 train_time:83012ms step_avg:60.95ms
+step:1363/2245 train_time:83075ms step_avg:60.95ms
+step:1364/2245 train_time:83135ms step_avg:60.95ms
+step:1365/2245 train_time:83199ms step_avg:60.95ms
+step:1366/2245 train_time:83260ms step_avg:60.95ms
+step:1367/2245 train_time:83323ms step_avg:60.95ms
+step:1368/2245 train_time:83384ms step_avg:60.95ms
+step:1369/2245 train_time:83446ms step_avg:60.95ms
+step:1370/2245 train_time:83506ms step_avg:60.95ms
+step:1371/2245 train_time:83567ms step_avg:60.95ms
+step:1372/2245 train_time:83626ms step_avg:60.95ms
+step:1373/2245 train_time:83689ms step_avg:60.95ms
+step:1374/2245 train_time:83748ms step_avg:60.95ms
+step:1375/2245 train_time:83811ms step_avg:60.95ms
+step:1376/2245 train_time:83870ms step_avg:60.95ms
+step:1377/2245 train_time:83933ms step_avg:60.95ms
+step:1378/2245 train_time:83993ms step_avg:60.95ms
+step:1379/2245 train_time:84055ms step_avg:60.95ms
+step:1380/2245 train_time:84115ms step_avg:60.95ms
+step:1381/2245 train_time:84178ms step_avg:60.95ms
+step:1382/2245 train_time:84239ms step_avg:60.95ms
+step:1383/2245 train_time:84302ms step_avg:60.96ms
+step:1384/2245 train_time:84362ms step_avg:60.96ms
+step:1385/2245 train_time:84425ms step_avg:60.96ms
+step:1386/2245 train_time:84485ms step_avg:60.96ms
+step:1387/2245 train_time:84548ms step_avg:60.96ms
+step:1388/2245 train_time:84607ms step_avg:60.96ms
+step:1389/2245 train_time:84669ms step_avg:60.96ms
+step:1390/2245 train_time:84729ms step_avg:60.96ms
+step:1391/2245 train_time:84791ms step_avg:60.96ms
+step:1392/2245 train_time:84851ms step_avg:60.96ms
+step:1393/2245 train_time:84913ms step_avg:60.96ms
+step:1394/2245 train_time:84973ms step_avg:60.96ms
+step:1395/2245 train_time:85036ms step_avg:60.96ms
+step:1396/2245 train_time:85096ms step_avg:60.96ms
+step:1397/2245 train_time:85158ms step_avg:60.96ms
+step:1398/2245 train_time:85219ms step_avg:60.96ms
+step:1399/2245 train_time:85282ms step_avg:60.96ms
+step:1400/2245 train_time:85342ms step_avg:60.96ms
+step:1401/2245 train_time:85404ms step_avg:60.96ms
+step:1402/2245 train_time:85464ms step_avg:60.96ms
+step:1403/2245 train_time:85526ms step_avg:60.96ms
+step:1404/2245 train_time:85586ms step_avg:60.96ms
+step:1405/2245 train_time:85648ms step_avg:60.96ms
+step:1406/2245 train_time:85707ms step_avg:60.96ms
+step:1407/2245 train_time:85769ms step_avg:60.96ms
+step:1408/2245 train_time:85829ms step_avg:60.96ms
+step:1409/2245 train_time:85892ms step_avg:60.96ms
+step:1410/2245 train_time:85952ms step_avg:60.96ms
+step:1411/2245 train_time:86015ms step_avg:60.96ms
+step:1412/2245 train_time:86075ms step_avg:60.96ms
+step:1413/2245 train_time:86138ms step_avg:60.96ms
+step:1414/2245 train_time:86198ms step_avg:60.96ms
+step:1415/2245 train_time:86261ms step_avg:60.96ms
+step:1416/2245 train_time:86321ms step_avg:60.96ms
+step:1417/2245 train_time:86383ms step_avg:60.96ms
+step:1418/2245 train_time:86443ms step_avg:60.96ms
+step:1419/2245 train_time:86506ms step_avg:60.96ms
+step:1420/2245 train_time:86565ms step_avg:60.96ms
+step:1421/2245 train_time:86627ms step_avg:60.96ms
+step:1422/2245 train_time:86688ms step_avg:60.96ms
+step:1423/2245 train_time:86749ms step_avg:60.96ms
+step:1424/2245 train_time:86809ms step_avg:60.96ms
+step:1425/2245 train_time:86872ms step_avg:60.96ms
+step:1426/2245 train_time:86932ms step_avg:60.96ms
+step:1427/2245 train_time:86995ms step_avg:60.96ms
+step:1428/2245 train_time:87054ms step_avg:60.96ms
+step:1429/2245 train_time:87117ms step_avg:60.96ms
+step:1430/2245 train_time:87177ms step_avg:60.96ms
+step:1431/2245 train_time:87241ms step_avg:60.96ms
+step:1432/2245 train_time:87301ms step_avg:60.96ms
+step:1433/2245 train_time:87364ms step_avg:60.97ms
+step:1434/2245 train_time:87423ms step_avg:60.96ms
+step:1435/2245 train_time:87485ms step_avg:60.97ms
+step:1436/2245 train_time:87545ms step_avg:60.96ms
+step:1437/2245 train_time:87607ms step_avg:60.96ms
+step:1438/2245 train_time:87666ms step_avg:60.96ms
+step:1439/2245 train_time:87728ms step_avg:60.96ms
+step:1440/2245 train_time:87788ms step_avg:60.96ms
+step:1441/2245 train_time:87850ms step_avg:60.96ms
+step:1442/2245 train_time:87910ms step_avg:60.96ms
+step:1443/2245 train_time:87972ms step_avg:60.96ms
+step:1444/2245 train_time:88032ms step_avg:60.96ms
+step:1445/2245 train_time:88095ms step_avg:60.97ms
+step:1446/2245 train_time:88156ms step_avg:60.97ms
+step:1447/2245 train_time:88219ms step_avg:60.97ms
+step:1448/2245 train_time:88279ms step_avg:60.97ms
+step:1449/2245 train_time:88342ms step_avg:60.97ms
+step:1450/2245 train_time:88402ms step_avg:60.97ms
+step:1451/2245 train_time:88464ms step_avg:60.97ms
+step:1452/2245 train_time:88524ms step_avg:60.97ms
+step:1453/2245 train_time:88587ms step_avg:60.97ms
+step:1454/2245 train_time:88646ms step_avg:60.97ms
+step:1455/2245 train_time:88708ms step_avg:60.97ms
+step:1456/2245 train_time:88768ms step_avg:60.97ms
+step:1457/2245 train_time:88830ms step_avg:60.97ms
+step:1458/2245 train_time:88890ms step_avg:60.97ms
+step:1459/2245 train_time:88953ms step_avg:60.97ms
+step:1460/2245 train_time:89012ms step_avg:60.97ms
+step:1461/2245 train_time:89075ms step_avg:60.97ms
+step:1462/2245 train_time:89136ms step_avg:60.97ms
+step:1463/2245 train_time:89200ms step_avg:60.97ms
+step:1464/2245 train_time:89259ms step_avg:60.97ms
+step:1465/2245 train_time:89322ms step_avg:60.97ms
+step:1466/2245 train_time:89381ms step_avg:60.97ms +step:1467/2245 train_time:89444ms step_avg:60.97ms +step:1468/2245 train_time:89503ms step_avg:60.97ms +step:1469/2245 train_time:89566ms step_avg:60.97ms +step:1470/2245 train_time:89625ms step_avg:60.97ms +step:1471/2245 train_time:89687ms step_avg:60.97ms +step:1472/2245 train_time:89747ms step_avg:60.97ms +step:1473/2245 train_time:89810ms step_avg:60.97ms +step:1474/2245 train_time:89870ms step_avg:60.97ms +step:1475/2245 train_time:89933ms step_avg:60.97ms +step:1476/2245 train_time:89993ms step_avg:60.97ms +step:1477/2245 train_time:90056ms step_avg:60.97ms +step:1478/2245 train_time:90116ms step_avg:60.97ms +step:1479/2245 train_time:90180ms step_avg:60.97ms +step:1480/2245 train_time:90240ms step_avg:60.97ms +step:1481/2245 train_time:90303ms step_avg:60.97ms +step:1482/2245 train_time:90363ms step_avg:60.97ms +step:1483/2245 train_time:90426ms step_avg:60.98ms +step:1484/2245 train_time:90487ms step_avg:60.98ms +step:1485/2245 train_time:90549ms step_avg:60.98ms +step:1486/2245 train_time:90609ms step_avg:60.98ms +step:1487/2245 train_time:90672ms step_avg:60.98ms +step:1488/2245 train_time:90733ms step_avg:60.98ms +step:1489/2245 train_time:90795ms step_avg:60.98ms +step:1490/2245 train_time:90855ms step_avg:60.98ms +step:1491/2245 train_time:90918ms step_avg:60.98ms +step:1492/2245 train_time:90978ms step_avg:60.98ms +step:1493/2245 train_time:91040ms step_avg:60.98ms +step:1494/2245 train_time:91101ms step_avg:60.98ms +step:1495/2245 train_time:91164ms step_avg:60.98ms +step:1496/2245 train_time:91224ms step_avg:60.98ms +step:1497/2245 train_time:91287ms step_avg:60.98ms +step:1498/2245 train_time:91348ms step_avg:60.98ms +step:1499/2245 train_time:91411ms step_avg:60.98ms +step:1500/2245 train_time:91473ms step_avg:60.98ms +step:1500/2245 val_loss:3.4421 train_time:91537ms step_avg:61.02ms +step:1501/2245 train_time:91558ms step_avg:61.00ms +step:1502/2245 train_time:91600ms step_avg:60.99ms +step:1503/2245 train_time:91662ms step_avg:60.99ms +step:1504/2245 train_time:91724ms step_avg:60.99ms +step:1505/2245 train_time:91787ms step_avg:60.99ms +step:1506/2245 train_time:91848ms step_avg:60.99ms +step:1507/2245 train_time:91910ms step_avg:60.99ms +step:1508/2245 train_time:91969ms step_avg:60.99ms +step:1509/2245 train_time:92030ms step_avg:60.99ms +step:1510/2245 train_time:92090ms step_avg:60.99ms +step:1511/2245 train_time:92151ms step_avg:60.99ms +step:1512/2245 train_time:92211ms step_avg:60.99ms +step:1513/2245 train_time:92273ms step_avg:60.99ms +step:1514/2245 train_time:92333ms step_avg:60.99ms +step:1515/2245 train_time:92395ms step_avg:60.99ms +step:1516/2245 train_time:92458ms step_avg:60.99ms +step:1517/2245 train_time:92523ms step_avg:60.99ms +step:1518/2245 train_time:92585ms step_avg:60.99ms +step:1519/2245 train_time:92648ms step_avg:60.99ms +step:1520/2245 train_time:92709ms step_avg:60.99ms +step:1521/2245 train_time:92772ms step_avg:60.99ms +step:1522/2245 train_time:92832ms step_avg:60.99ms +step:1523/2245 train_time:92895ms step_avg:60.99ms +step:1524/2245 train_time:92954ms step_avg:60.99ms +step:1525/2245 train_time:93016ms step_avg:60.99ms +step:1526/2245 train_time:93075ms step_avg:60.99ms +step:1527/2245 train_time:93137ms step_avg:60.99ms +step:1528/2245 train_time:93197ms step_avg:60.99ms +step:1529/2245 train_time:93259ms step_avg:60.99ms +step:1530/2245 train_time:93320ms step_avg:60.99ms +step:1531/2245 train_time:93384ms step_avg:61.00ms +step:1532/2245 train_time:93445ms step_avg:61.00ms 
+step:1533/2245 train_time:93510ms step_avg:61.00ms +step:1534/2245 train_time:93570ms step_avg:61.00ms +step:1535/2245 train_time:93633ms step_avg:61.00ms +step:1536/2245 train_time:93693ms step_avg:61.00ms +step:1537/2245 train_time:93757ms step_avg:61.00ms +step:1538/2245 train_time:93817ms step_avg:61.00ms +step:1539/2245 train_time:93881ms step_avg:61.00ms +step:1540/2245 train_time:93942ms step_avg:61.00ms +step:1541/2245 train_time:94004ms step_avg:61.00ms +step:1542/2245 train_time:94064ms step_avg:61.00ms +step:1543/2245 train_time:94126ms step_avg:61.00ms +step:1544/2245 train_time:94185ms step_avg:61.00ms +step:1545/2245 train_time:94248ms step_avg:61.00ms +step:1546/2245 train_time:94308ms step_avg:61.00ms +step:1547/2245 train_time:94370ms step_avg:61.00ms +step:1548/2245 train_time:94431ms step_avg:61.00ms +step:1549/2245 train_time:94494ms step_avg:61.00ms +step:1550/2245 train_time:94555ms step_avg:61.00ms +step:1551/2245 train_time:94618ms step_avg:61.00ms +step:1552/2245 train_time:94679ms step_avg:61.00ms +step:1553/2245 train_time:94742ms step_avg:61.01ms +step:1554/2245 train_time:94804ms step_avg:61.01ms +step:1555/2245 train_time:94867ms step_avg:61.01ms +step:1556/2245 train_time:94927ms step_avg:61.01ms +step:1557/2245 train_time:94989ms step_avg:61.01ms +step:1558/2245 train_time:95049ms step_avg:61.01ms +step:1559/2245 train_time:95111ms step_avg:61.01ms +step:1560/2245 train_time:95171ms step_avg:61.01ms +step:1561/2245 train_time:95233ms step_avg:61.01ms +step:1562/2245 train_time:95293ms step_avg:61.01ms +step:1563/2245 train_time:95356ms step_avg:61.01ms +step:1564/2245 train_time:95416ms step_avg:61.01ms +step:1565/2245 train_time:95479ms step_avg:61.01ms +step:1566/2245 train_time:95540ms step_avg:61.01ms +step:1567/2245 train_time:95604ms step_avg:61.01ms +step:1568/2245 train_time:95665ms step_avg:61.01ms +step:1569/2245 train_time:95727ms step_avg:61.01ms +step:1570/2245 train_time:95788ms step_avg:61.01ms +step:1571/2245 train_time:95851ms step_avg:61.01ms +step:1572/2245 train_time:95911ms step_avg:61.01ms +step:1573/2245 train_time:95974ms step_avg:61.01ms +step:1574/2245 train_time:96034ms step_avg:61.01ms +step:1575/2245 train_time:96096ms step_avg:61.01ms +step:1576/2245 train_time:96157ms step_avg:61.01ms +step:1577/2245 train_time:96220ms step_avg:61.01ms +step:1578/2245 train_time:96281ms step_avg:61.01ms +step:1579/2245 train_time:96344ms step_avg:61.02ms +step:1580/2245 train_time:96405ms step_avg:61.02ms +step:1581/2245 train_time:96468ms step_avg:61.02ms +step:1582/2245 train_time:96528ms step_avg:61.02ms +step:1583/2245 train_time:96590ms step_avg:61.02ms +step:1584/2245 train_time:96651ms step_avg:61.02ms +step:1585/2245 train_time:96714ms step_avg:61.02ms +step:1586/2245 train_time:96774ms step_avg:61.02ms +step:1587/2245 train_time:96836ms step_avg:61.02ms +step:1588/2245 train_time:96896ms step_avg:61.02ms +step:1589/2245 train_time:96959ms step_avg:61.02ms +step:1590/2245 train_time:97019ms step_avg:61.02ms +step:1591/2245 train_time:97082ms step_avg:61.02ms +step:1592/2245 train_time:97142ms step_avg:61.02ms +step:1593/2245 train_time:97205ms step_avg:61.02ms +step:1594/2245 train_time:97266ms step_avg:61.02ms +step:1595/2245 train_time:97329ms step_avg:61.02ms +step:1596/2245 train_time:97389ms step_avg:61.02ms +step:1597/2245 train_time:97452ms step_avg:61.02ms +step:1598/2245 train_time:97512ms step_avg:61.02ms +step:1599/2245 train_time:97574ms step_avg:61.02ms +step:1600/2245 train_time:97635ms step_avg:61.02ms +step:1601/2245 
train_time:97697ms step_avg:61.02ms +step:1602/2245 train_time:97758ms step_avg:61.02ms +step:1603/2245 train_time:97821ms step_avg:61.02ms +step:1604/2245 train_time:97883ms step_avg:61.02ms +step:1605/2245 train_time:97946ms step_avg:61.03ms +step:1606/2245 train_time:98007ms step_avg:61.03ms +step:1607/2245 train_time:98069ms step_avg:61.03ms +step:1608/2245 train_time:98128ms step_avg:61.03ms +step:1609/2245 train_time:98191ms step_avg:61.03ms +step:1610/2245 train_time:98252ms step_avg:61.03ms +step:1611/2245 train_time:98315ms step_avg:61.03ms +step:1612/2245 train_time:98375ms step_avg:61.03ms +step:1613/2245 train_time:98438ms step_avg:61.03ms +step:1614/2245 train_time:98498ms step_avg:61.03ms +step:1615/2245 train_time:98561ms step_avg:61.03ms +step:1616/2245 train_time:98621ms step_avg:61.03ms +step:1617/2245 train_time:98684ms step_avg:61.03ms +step:1618/2245 train_time:98745ms step_avg:61.03ms +step:1619/2245 train_time:98808ms step_avg:61.03ms +step:1620/2245 train_time:98867ms step_avg:61.03ms +step:1621/2245 train_time:98930ms step_avg:61.03ms +step:1622/2245 train_time:98990ms step_avg:61.03ms +step:1623/2245 train_time:99053ms step_avg:61.03ms +step:1624/2245 train_time:99113ms step_avg:61.03ms +step:1625/2245 train_time:99175ms step_avg:61.03ms +step:1626/2245 train_time:99235ms step_avg:61.03ms +step:1627/2245 train_time:99299ms step_avg:61.03ms +step:1628/2245 train_time:99359ms step_avg:61.03ms +step:1629/2245 train_time:99422ms step_avg:61.03ms +step:1630/2245 train_time:99483ms step_avg:61.03ms +step:1631/2245 train_time:99545ms step_avg:61.03ms +step:1632/2245 train_time:99605ms step_avg:61.03ms +step:1633/2245 train_time:99669ms step_avg:61.03ms +step:1634/2245 train_time:99729ms step_avg:61.03ms +step:1635/2245 train_time:99790ms step_avg:61.03ms +step:1636/2245 train_time:99851ms step_avg:61.03ms +step:1637/2245 train_time:99913ms step_avg:61.03ms +step:1638/2245 train_time:99973ms step_avg:61.03ms +step:1639/2245 train_time:100036ms step_avg:61.03ms +step:1640/2245 train_time:100097ms step_avg:61.03ms +step:1641/2245 train_time:100159ms step_avg:61.04ms +step:1642/2245 train_time:100220ms step_avg:61.04ms +step:1643/2245 train_time:100283ms step_avg:61.04ms +step:1644/2245 train_time:100343ms step_avg:61.04ms +step:1645/2245 train_time:100407ms step_avg:61.04ms +step:1646/2245 train_time:100467ms step_avg:61.04ms +step:1647/2245 train_time:100529ms step_avg:61.04ms +step:1648/2245 train_time:100589ms step_avg:61.04ms +step:1649/2245 train_time:100652ms step_avg:61.04ms +step:1650/2245 train_time:100712ms step_avg:61.04ms +step:1651/2245 train_time:100775ms step_avg:61.04ms +step:1652/2245 train_time:100835ms step_avg:61.04ms +step:1653/2245 train_time:100897ms step_avg:61.04ms +step:1654/2245 train_time:100957ms step_avg:61.04ms +step:1655/2245 train_time:101020ms step_avg:61.04ms +step:1656/2245 train_time:101080ms step_avg:61.04ms +step:1657/2245 train_time:101143ms step_avg:61.04ms +step:1658/2245 train_time:101204ms step_avg:61.04ms +step:1659/2245 train_time:101268ms step_avg:61.04ms +step:1660/2245 train_time:101327ms step_avg:61.04ms +step:1661/2245 train_time:101391ms step_avg:61.04ms +step:1662/2245 train_time:101451ms step_avg:61.04ms +step:1663/2245 train_time:101514ms step_avg:61.04ms +step:1664/2245 train_time:101574ms step_avg:61.04ms +step:1665/2245 train_time:101637ms step_avg:61.04ms +step:1666/2245 train_time:101697ms step_avg:61.04ms +step:1667/2245 train_time:101760ms step_avg:61.04ms +step:1668/2245 train_time:101821ms step_avg:61.04ms 
+step:1669/2245 train_time:101883ms step_avg:61.04ms +step:1670/2245 train_time:101945ms step_avg:61.04ms +step:1671/2245 train_time:102008ms step_avg:61.05ms +step:1672/2245 train_time:102068ms step_avg:61.05ms +step:1673/2245 train_time:102130ms step_avg:61.05ms +step:1674/2245 train_time:102190ms step_avg:61.05ms +step:1675/2245 train_time:102253ms step_avg:61.05ms +step:1676/2245 train_time:102313ms step_avg:61.05ms +step:1677/2245 train_time:102376ms step_avg:61.05ms +step:1678/2245 train_time:102437ms step_avg:61.05ms +step:1679/2245 train_time:102499ms step_avg:61.05ms +step:1680/2245 train_time:102559ms step_avg:61.05ms +step:1681/2245 train_time:102622ms step_avg:61.05ms +step:1682/2245 train_time:102682ms step_avg:61.05ms +step:1683/2245 train_time:102745ms step_avg:61.05ms +step:1684/2245 train_time:102807ms step_avg:61.05ms +step:1685/2245 train_time:102869ms step_avg:61.05ms +step:1686/2245 train_time:102929ms step_avg:61.05ms +step:1687/2245 train_time:102991ms step_avg:61.05ms +step:1688/2245 train_time:103051ms step_avg:61.05ms +step:1689/2245 train_time:103114ms step_avg:61.05ms +step:1690/2245 train_time:103174ms step_avg:61.05ms +step:1691/2245 train_time:103237ms step_avg:61.05ms +step:1692/2245 train_time:103297ms step_avg:61.05ms +step:1693/2245 train_time:103360ms step_avg:61.05ms +step:1694/2245 train_time:103421ms step_avg:61.05ms +step:1695/2245 train_time:103484ms step_avg:61.05ms +step:1696/2245 train_time:103544ms step_avg:61.05ms +step:1697/2245 train_time:103607ms step_avg:61.05ms +step:1698/2245 train_time:103667ms step_avg:61.05ms +step:1699/2245 train_time:103730ms step_avg:61.05ms +step:1700/2245 train_time:103790ms step_avg:61.05ms +step:1701/2245 train_time:103853ms step_avg:61.05ms +step:1702/2245 train_time:103914ms step_avg:61.05ms +step:1703/2245 train_time:103976ms step_avg:61.05ms +step:1704/2245 train_time:104037ms step_avg:61.05ms +step:1705/2245 train_time:104100ms step_avg:61.06ms +step:1706/2245 train_time:104160ms step_avg:61.06ms +step:1707/2245 train_time:104224ms step_avg:61.06ms +step:1708/2245 train_time:104284ms step_avg:61.06ms +step:1709/2245 train_time:104347ms step_avg:61.06ms +step:1710/2245 train_time:104407ms step_avg:61.06ms +step:1711/2245 train_time:104470ms step_avg:61.06ms +step:1712/2245 train_time:104529ms step_avg:61.06ms +step:1713/2245 train_time:104592ms step_avg:61.06ms +step:1714/2245 train_time:104653ms step_avg:61.06ms +step:1715/2245 train_time:104715ms step_avg:61.06ms +step:1716/2245 train_time:104775ms step_avg:61.06ms +step:1717/2245 train_time:104838ms step_avg:61.06ms +step:1718/2245 train_time:104897ms step_avg:61.06ms +step:1719/2245 train_time:104961ms step_avg:61.06ms +step:1720/2245 train_time:105021ms step_avg:61.06ms +step:1721/2245 train_time:105084ms step_avg:61.06ms +step:1722/2245 train_time:105145ms step_avg:61.06ms +step:1723/2245 train_time:105208ms step_avg:61.06ms +step:1724/2245 train_time:105268ms step_avg:61.06ms +step:1725/2245 train_time:105330ms step_avg:61.06ms +step:1726/2245 train_time:105390ms step_avg:61.06ms +step:1727/2245 train_time:105453ms step_avg:61.06ms +step:1728/2245 train_time:105513ms step_avg:61.06ms +step:1729/2245 train_time:105576ms step_avg:61.06ms +step:1730/2245 train_time:105637ms step_avg:61.06ms +step:1731/2245 train_time:105700ms step_avg:61.06ms +step:1732/2245 train_time:105762ms step_avg:61.06ms +step:1733/2245 train_time:105824ms step_avg:61.06ms +step:1734/2245 train_time:105885ms step_avg:61.06ms +step:1735/2245 train_time:105948ms step_avg:61.07ms 
+step:1736/2245 train_time:106008ms step_avg:61.06ms +step:1737/2245 train_time:106071ms step_avg:61.07ms +step:1738/2245 train_time:106131ms step_avg:61.06ms +step:1739/2245 train_time:106193ms step_avg:61.07ms +step:1740/2245 train_time:106254ms step_avg:61.07ms +step:1741/2245 train_time:106316ms step_avg:61.07ms +step:1742/2245 train_time:106376ms step_avg:61.07ms +step:1743/2245 train_time:106439ms step_avg:61.07ms +step:1744/2245 train_time:106500ms step_avg:61.07ms +step:1745/2245 train_time:106564ms step_avg:61.07ms +step:1746/2245 train_time:106625ms step_avg:61.07ms +step:1747/2245 train_time:106688ms step_avg:61.07ms +step:1748/2245 train_time:106749ms step_avg:61.07ms +step:1749/2245 train_time:106812ms step_avg:61.07ms +step:1750/2245 train_time:106872ms step_avg:61.07ms +step:1750/2245 val_loss:3.3778 train_time:106935ms step_avg:61.11ms +step:1751/2245 train_time:106954ms step_avg:61.08ms +step:1752/2245 train_time:106997ms step_avg:61.07ms +step:1753/2245 train_time:107062ms step_avg:61.07ms +step:1754/2245 train_time:107124ms step_avg:61.07ms +step:1755/2245 train_time:107187ms step_avg:61.08ms +step:1756/2245 train_time:107248ms step_avg:61.08ms +step:1757/2245 train_time:107310ms step_avg:61.08ms +step:1758/2245 train_time:107370ms step_avg:61.08ms +step:1759/2245 train_time:107433ms step_avg:61.08ms +step:1760/2245 train_time:107492ms step_avg:61.08ms +step:1761/2245 train_time:107555ms step_avg:61.08ms +step:1762/2245 train_time:107614ms step_avg:61.07ms +step:1763/2245 train_time:107676ms step_avg:61.08ms +step:1764/2245 train_time:107737ms step_avg:61.08ms +step:1765/2245 train_time:107799ms step_avg:61.08ms +step:1766/2245 train_time:107859ms step_avg:61.08ms +step:1767/2245 train_time:107922ms step_avg:61.08ms +step:1768/2245 train_time:107983ms step_avg:61.08ms +step:1769/2245 train_time:108047ms step_avg:61.08ms +step:1770/2245 train_time:108109ms step_avg:61.08ms +step:1771/2245 train_time:108172ms step_avg:61.08ms +step:1772/2245 train_time:108233ms step_avg:61.08ms +step:1773/2245 train_time:108295ms step_avg:61.08ms +step:1774/2245 train_time:108355ms step_avg:61.08ms +step:1775/2245 train_time:108417ms step_avg:61.08ms +step:1776/2245 train_time:108477ms step_avg:61.08ms +step:1777/2245 train_time:108539ms step_avg:61.08ms +step:1778/2245 train_time:108599ms step_avg:61.08ms +step:1779/2245 train_time:108662ms step_avg:61.08ms +step:1780/2245 train_time:108722ms step_avg:61.08ms +step:1781/2245 train_time:108784ms step_avg:61.08ms +step:1782/2245 train_time:108844ms step_avg:61.08ms +step:1783/2245 train_time:108908ms step_avg:61.08ms +step:1784/2245 train_time:108970ms step_avg:61.08ms +step:1785/2245 train_time:109033ms step_avg:61.08ms +step:1786/2245 train_time:109094ms step_avg:61.08ms +step:1787/2245 train_time:109156ms step_avg:61.08ms +step:1788/2245 train_time:109217ms step_avg:61.08ms +step:1789/2245 train_time:109279ms step_avg:61.08ms +step:1790/2245 train_time:109339ms step_avg:61.08ms +step:1791/2245 train_time:109402ms step_avg:61.08ms +step:1792/2245 train_time:109462ms step_avg:61.08ms +step:1793/2245 train_time:109524ms step_avg:61.08ms +step:1794/2245 train_time:109584ms step_avg:61.08ms +step:1795/2245 train_time:109647ms step_avg:61.08ms +step:1796/2245 train_time:109707ms step_avg:61.08ms +step:1797/2245 train_time:109770ms step_avg:61.09ms +step:1798/2245 train_time:109831ms step_avg:61.08ms +step:1799/2245 train_time:109893ms step_avg:61.09ms +step:1800/2245 train_time:109954ms step_avg:61.09ms +step:1801/2245 train_time:110017ms 
step_avg:61.09ms +step:1802/2245 train_time:110079ms step_avg:61.09ms +step:1803/2245 train_time:110142ms step_avg:61.09ms +step:1804/2245 train_time:110202ms step_avg:61.09ms +step:1805/2245 train_time:110265ms step_avg:61.09ms +step:1806/2245 train_time:110326ms step_avg:61.09ms +step:1807/2245 train_time:110388ms step_avg:61.09ms +step:1808/2245 train_time:110449ms step_avg:61.09ms +step:1809/2245 train_time:110511ms step_avg:61.09ms +step:1810/2245 train_time:110572ms step_avg:61.09ms +step:1811/2245 train_time:110635ms step_avg:61.09ms +step:1812/2245 train_time:110694ms step_avg:61.09ms +step:1813/2245 train_time:110756ms step_avg:61.09ms +step:1814/2245 train_time:110816ms step_avg:61.09ms +step:1815/2245 train_time:110879ms step_avg:61.09ms +step:1816/2245 train_time:110939ms step_avg:61.09ms +step:1817/2245 train_time:111002ms step_avg:61.09ms +step:1818/2245 train_time:111063ms step_avg:61.09ms +step:1819/2245 train_time:111127ms step_avg:61.09ms +step:1820/2245 train_time:111187ms step_avg:61.09ms +step:1821/2245 train_time:111250ms step_avg:61.09ms +step:1822/2245 train_time:111311ms step_avg:61.09ms +step:1823/2245 train_time:111375ms step_avg:61.09ms +step:1824/2245 train_time:111435ms step_avg:61.09ms +step:1825/2245 train_time:111497ms step_avg:61.09ms +step:1826/2245 train_time:111557ms step_avg:61.09ms +step:1827/2245 train_time:111619ms step_avg:61.09ms +step:1828/2245 train_time:111680ms step_avg:61.09ms +step:1829/2245 train_time:111742ms step_avg:61.09ms +step:1830/2245 train_time:111803ms step_avg:61.09ms +step:1831/2245 train_time:111865ms step_avg:61.09ms +step:1832/2245 train_time:111925ms step_avg:61.09ms +step:1833/2245 train_time:111989ms step_avg:61.10ms +step:1834/2245 train_time:112050ms step_avg:61.10ms +step:1835/2245 train_time:112113ms step_avg:61.10ms +step:1836/2245 train_time:112174ms step_avg:61.10ms +step:1837/2245 train_time:112237ms step_avg:61.10ms +step:1838/2245 train_time:112296ms step_avg:61.10ms +step:1839/2245 train_time:112358ms step_avg:61.10ms +step:1840/2245 train_time:112419ms step_avg:61.10ms +step:1841/2245 train_time:112482ms step_avg:61.10ms +step:1842/2245 train_time:112542ms step_avg:61.10ms +step:1843/2245 train_time:112604ms step_avg:61.10ms +step:1844/2245 train_time:112665ms step_avg:61.10ms +step:1845/2245 train_time:112728ms step_avg:61.10ms +step:1846/2245 train_time:112788ms step_avg:61.10ms +step:1847/2245 train_time:112850ms step_avg:61.10ms +step:1848/2245 train_time:112911ms step_avg:61.10ms +step:1849/2245 train_time:112974ms step_avg:61.10ms +step:1850/2245 train_time:113035ms step_avg:61.10ms +step:1851/2245 train_time:113097ms step_avg:61.10ms +step:1852/2245 train_time:113158ms step_avg:61.10ms +step:1853/2245 train_time:113221ms step_avg:61.10ms +step:1854/2245 train_time:113281ms step_avg:61.10ms +step:1855/2245 train_time:113343ms step_avg:61.10ms +step:1856/2245 train_time:113403ms step_avg:61.10ms +step:1857/2245 train_time:113466ms step_avg:61.10ms +step:1858/2245 train_time:113526ms step_avg:61.10ms +step:1859/2245 train_time:113589ms step_avg:61.10ms +step:1860/2245 train_time:113650ms step_avg:61.10ms +step:1861/2245 train_time:113712ms step_avg:61.10ms +step:1862/2245 train_time:113773ms step_avg:61.10ms +step:1863/2245 train_time:113835ms step_avg:61.10ms +step:1864/2245 train_time:113895ms step_avg:61.10ms +step:1865/2245 train_time:113958ms step_avg:61.10ms +step:1866/2245 train_time:114018ms step_avg:61.10ms +step:1867/2245 train_time:114081ms step_avg:61.10ms +step:1868/2245 train_time:114141ms 
step_avg:61.10ms +step:1869/2245 train_time:114204ms step_avg:61.10ms +step:1870/2245 train_time:114265ms step_avg:61.10ms +step:1871/2245 train_time:114328ms step_avg:61.11ms +step:1872/2245 train_time:114389ms step_avg:61.10ms +step:1873/2245 train_time:114452ms step_avg:61.11ms +step:1874/2245 train_time:114512ms step_avg:61.11ms +step:1875/2245 train_time:114575ms step_avg:61.11ms +step:1876/2245 train_time:114635ms step_avg:61.11ms +step:1877/2245 train_time:114697ms step_avg:61.11ms +step:1878/2245 train_time:114758ms step_avg:61.11ms +step:1879/2245 train_time:114820ms step_avg:61.11ms +step:1880/2245 train_time:114880ms step_avg:61.11ms +step:1881/2245 train_time:114943ms step_avg:61.11ms +step:1882/2245 train_time:115003ms step_avg:61.11ms +step:1883/2245 train_time:115066ms step_avg:61.11ms +step:1884/2245 train_time:115126ms step_avg:61.11ms +step:1885/2245 train_time:115189ms step_avg:61.11ms +step:1886/2245 train_time:115249ms step_avg:61.11ms +step:1887/2245 train_time:115313ms step_avg:61.11ms +step:1888/2245 train_time:115373ms step_avg:61.11ms +step:1889/2245 train_time:115436ms step_avg:61.11ms +step:1890/2245 train_time:115496ms step_avg:61.11ms +step:1891/2245 train_time:115559ms step_avg:61.11ms +step:1892/2245 train_time:115619ms step_avg:61.11ms +step:1893/2245 train_time:115681ms step_avg:61.11ms +step:1894/2245 train_time:115742ms step_avg:61.11ms +step:1895/2245 train_time:115804ms step_avg:61.11ms +step:1896/2245 train_time:115865ms step_avg:61.11ms +step:1897/2245 train_time:115928ms step_avg:61.11ms +step:1898/2245 train_time:115989ms step_avg:61.11ms +step:1899/2245 train_time:116052ms step_avg:61.11ms +step:1900/2245 train_time:116112ms step_avg:61.11ms +step:1901/2245 train_time:116175ms step_avg:61.11ms +step:1902/2245 train_time:116236ms step_avg:61.11ms +step:1903/2245 train_time:116298ms step_avg:61.11ms +step:1904/2245 train_time:116358ms step_avg:61.11ms +step:1905/2245 train_time:116420ms step_avg:61.11ms +step:1906/2245 train_time:116481ms step_avg:61.11ms +step:1907/2245 train_time:116544ms step_avg:61.11ms +step:1908/2245 train_time:116603ms step_avg:61.11ms +step:1909/2245 train_time:116666ms step_avg:61.11ms +step:1910/2245 train_time:116727ms step_avg:61.11ms +step:1911/2245 train_time:116791ms step_avg:61.11ms +step:1912/2245 train_time:116851ms step_avg:61.11ms +step:1913/2245 train_time:116915ms step_avg:61.12ms +step:1914/2245 train_time:116975ms step_avg:61.12ms +step:1915/2245 train_time:117037ms step_avg:61.12ms +step:1916/2245 train_time:117097ms step_avg:61.12ms +step:1917/2245 train_time:117160ms step_avg:61.12ms +step:1918/2245 train_time:117220ms step_avg:61.12ms +step:1919/2245 train_time:117282ms step_avg:61.12ms +step:1920/2245 train_time:117343ms step_avg:61.12ms +step:1921/2245 train_time:117406ms step_avg:61.12ms +step:1922/2245 train_time:117467ms step_avg:61.12ms +step:1923/2245 train_time:117529ms step_avg:61.12ms +step:1924/2245 train_time:117590ms step_avg:61.12ms +step:1925/2245 train_time:117653ms step_avg:61.12ms +step:1926/2245 train_time:117713ms step_avg:61.12ms +step:1927/2245 train_time:117776ms step_avg:61.12ms +step:1928/2245 train_time:117836ms step_avg:61.12ms +step:1929/2245 train_time:117898ms step_avg:61.12ms +step:1930/2245 train_time:117958ms step_avg:61.12ms +step:1931/2245 train_time:118020ms step_avg:61.12ms +step:1932/2245 train_time:118080ms step_avg:61.12ms +step:1933/2245 train_time:118144ms step_avg:61.12ms +step:1934/2245 train_time:118204ms step_avg:61.12ms +step:1935/2245 train_time:118267ms 
step_avg:61.12ms +step:1936/2245 train_time:118329ms step_avg:61.12ms +step:1937/2245 train_time:118391ms step_avg:61.12ms +step:1938/2245 train_time:118452ms step_avg:61.12ms +step:1939/2245 train_time:118515ms step_avg:61.12ms +step:1940/2245 train_time:118574ms step_avg:61.12ms +step:1941/2245 train_time:118637ms step_avg:61.12ms +step:1942/2245 train_time:118697ms step_avg:61.12ms +step:1943/2245 train_time:118760ms step_avg:61.12ms +step:1944/2245 train_time:118820ms step_avg:61.12ms +step:1945/2245 train_time:118883ms step_avg:61.12ms +step:1946/2245 train_time:118943ms step_avg:61.12ms +step:1947/2245 train_time:119006ms step_avg:61.12ms +step:1948/2245 train_time:119067ms step_avg:61.12ms +step:1949/2245 train_time:119130ms step_avg:61.12ms +step:1950/2245 train_time:119190ms step_avg:61.12ms +step:1951/2245 train_time:119253ms step_avg:61.12ms +step:1952/2245 train_time:119313ms step_avg:61.12ms +step:1953/2245 train_time:119376ms step_avg:61.12ms +step:1954/2245 train_time:119437ms step_avg:61.12ms +step:1955/2245 train_time:119499ms step_avg:61.12ms +step:1956/2245 train_time:119559ms step_avg:61.12ms +step:1957/2245 train_time:119621ms step_avg:61.12ms +step:1958/2245 train_time:119682ms step_avg:61.12ms +step:1959/2245 train_time:119745ms step_avg:61.13ms +step:1960/2245 train_time:119805ms step_avg:61.12ms +step:1961/2245 train_time:119868ms step_avg:61.13ms +step:1962/2245 train_time:119929ms step_avg:61.13ms +step:1963/2245 train_time:119992ms step_avg:61.13ms +step:1964/2245 train_time:120053ms step_avg:61.13ms +step:1965/2245 train_time:120116ms step_avg:61.13ms +step:1966/2245 train_time:120176ms step_avg:61.13ms +step:1967/2245 train_time:120238ms step_avg:61.13ms +step:1968/2245 train_time:120299ms step_avg:61.13ms +step:1969/2245 train_time:120361ms step_avg:61.13ms +step:1970/2245 train_time:120421ms step_avg:61.13ms +step:1971/2245 train_time:120484ms step_avg:61.13ms +step:1972/2245 train_time:120545ms step_avg:61.13ms +step:1973/2245 train_time:120608ms step_avg:61.13ms +step:1974/2245 train_time:120669ms step_avg:61.13ms +step:1975/2245 train_time:120731ms step_avg:61.13ms +step:1976/2245 train_time:120791ms step_avg:61.13ms +step:1977/2245 train_time:120854ms step_avg:61.13ms +step:1978/2245 train_time:120914ms step_avg:61.13ms +step:1979/2245 train_time:120977ms step_avg:61.13ms +step:1980/2245 train_time:121037ms step_avg:61.13ms +step:1981/2245 train_time:121100ms step_avg:61.13ms +step:1982/2245 train_time:121160ms step_avg:61.13ms +step:1983/2245 train_time:121223ms step_avg:61.13ms +step:1984/2245 train_time:121283ms step_avg:61.13ms +step:1985/2245 train_time:121346ms step_avg:61.13ms +step:1986/2245 train_time:121406ms step_avg:61.13ms +step:1987/2245 train_time:121470ms step_avg:61.13ms +step:1988/2245 train_time:121530ms step_avg:61.13ms +step:1989/2245 train_time:121593ms step_avg:61.13ms +step:1990/2245 train_time:121653ms step_avg:61.13ms +step:1991/2245 train_time:121716ms step_avg:61.13ms +step:1992/2245 train_time:121776ms step_avg:61.13ms +step:1993/2245 train_time:121839ms step_avg:61.13ms +step:1994/2245 train_time:121899ms step_avg:61.13ms +step:1995/2245 train_time:121962ms step_avg:61.13ms +step:1996/2245 train_time:122023ms step_avg:61.13ms +step:1997/2245 train_time:122086ms step_avg:61.13ms +step:1998/2245 train_time:122147ms step_avg:61.13ms +step:1999/2245 train_time:122210ms step_avg:61.14ms +step:2000/2245 train_time:122270ms step_avg:61.14ms +step:2000/2245 val_loss:3.3231 train_time:122333ms step_avg:61.17ms +step:2001/2245 
train_time:122353ms step_avg:61.15ms +step:2002/2245 train_time:122397ms step_avg:61.14ms +step:2003/2245 train_time:122464ms step_avg:61.14ms +step:2004/2245 train_time:122524ms step_avg:61.14ms +step:2005/2245 train_time:122587ms step_avg:61.14ms +step:2006/2245 train_time:122648ms step_avg:61.14ms +step:2007/2245 train_time:122710ms step_avg:61.14ms +step:2008/2245 train_time:122770ms step_avg:61.14ms +step:2009/2245 train_time:122831ms step_avg:61.14ms +step:2010/2245 train_time:122891ms step_avg:61.14ms +step:2011/2245 train_time:122954ms step_avg:61.14ms +step:2012/2245 train_time:123014ms step_avg:61.14ms +step:2013/2245 train_time:123077ms step_avg:61.14ms +step:2014/2245 train_time:123138ms step_avg:61.14ms +step:2015/2245 train_time:123200ms step_avg:61.14ms +step:2016/2245 train_time:123261ms step_avg:61.14ms +step:2017/2245 train_time:123324ms step_avg:61.14ms +step:2018/2245 train_time:123386ms step_avg:61.14ms +step:2019/2245 train_time:123450ms step_avg:61.14ms +step:2020/2245 train_time:123512ms step_avg:61.14ms +step:2021/2245 train_time:123575ms step_avg:61.15ms +step:2022/2245 train_time:123635ms step_avg:61.15ms +step:2023/2245 train_time:123698ms step_avg:61.15ms +step:2024/2245 train_time:123758ms step_avg:61.15ms +step:2025/2245 train_time:123820ms step_avg:61.15ms +step:2026/2245 train_time:123880ms step_avg:61.15ms +step:2027/2245 train_time:123942ms step_avg:61.15ms +step:2028/2245 train_time:124002ms step_avg:61.15ms +step:2029/2245 train_time:124064ms step_avg:61.15ms +step:2030/2245 train_time:124124ms step_avg:61.14ms +step:2031/2245 train_time:124187ms step_avg:61.15ms +step:2032/2245 train_time:124248ms step_avg:61.15ms +step:2033/2245 train_time:124312ms step_avg:61.15ms +step:2034/2245 train_time:124373ms step_avg:61.15ms +step:2035/2245 train_time:124438ms step_avg:61.15ms +step:2036/2245 train_time:124499ms step_avg:61.15ms +step:2037/2245 train_time:124561ms step_avg:61.15ms +step:2038/2245 train_time:124621ms step_avg:61.15ms +step:2039/2245 train_time:124684ms step_avg:61.15ms +step:2040/2245 train_time:124745ms step_avg:61.15ms +step:2041/2245 train_time:124807ms step_avg:61.15ms +step:2042/2245 train_time:124867ms step_avg:61.15ms +step:2043/2245 train_time:124930ms step_avg:61.15ms +step:2044/2245 train_time:124990ms step_avg:61.15ms +step:2045/2245 train_time:125054ms step_avg:61.15ms +step:2046/2245 train_time:125115ms step_avg:61.15ms +step:2047/2245 train_time:125178ms step_avg:61.15ms +step:2048/2245 train_time:125238ms step_avg:61.15ms +step:2049/2245 train_time:125301ms step_avg:61.15ms +step:2050/2245 train_time:125361ms step_avg:61.15ms +step:2051/2245 train_time:125425ms step_avg:61.15ms +step:2052/2245 train_time:125486ms step_avg:61.15ms +step:2053/2245 train_time:125548ms step_avg:61.15ms +step:2054/2245 train_time:125608ms step_avg:61.15ms +step:2055/2245 train_time:125671ms step_avg:61.15ms +step:2056/2245 train_time:125732ms step_avg:61.15ms +step:2057/2245 train_time:125795ms step_avg:61.15ms +step:2058/2245 train_time:125855ms step_avg:61.15ms +step:2059/2245 train_time:125918ms step_avg:61.15ms +step:2060/2245 train_time:125977ms step_avg:61.15ms +step:2061/2245 train_time:126040ms step_avg:61.15ms +step:2062/2245 train_time:126100ms step_avg:61.15ms +step:2063/2245 train_time:126162ms step_avg:61.15ms +step:2064/2245 train_time:126222ms step_avg:61.15ms +step:2065/2245 train_time:126285ms step_avg:61.15ms +step:2066/2245 train_time:126346ms step_avg:61.15ms +step:2067/2245 train_time:126409ms step_avg:61.16ms +step:2068/2245 
train_time:126470ms step_avg:61.16ms +step:2069/2245 train_time:126533ms step_avg:61.16ms +step:2070/2245 train_time:126593ms step_avg:61.16ms +step:2071/2245 train_time:126656ms step_avg:61.16ms +step:2072/2245 train_time:126717ms step_avg:61.16ms +step:2073/2245 train_time:126780ms step_avg:61.16ms +step:2074/2245 train_time:126841ms step_avg:61.16ms +step:2075/2245 train_time:126903ms step_avg:61.16ms +step:2076/2245 train_time:126965ms step_avg:61.16ms +step:2077/2245 train_time:127026ms step_avg:61.16ms +step:2078/2245 train_time:127086ms step_avg:61.16ms +step:2079/2245 train_time:127149ms step_avg:61.16ms +step:2080/2245 train_time:127210ms step_avg:61.16ms +step:2081/2245 train_time:127273ms step_avg:61.16ms +step:2082/2245 train_time:127334ms step_avg:61.16ms +step:2083/2245 train_time:127398ms step_avg:61.16ms +step:2084/2245 train_time:127458ms step_avg:61.16ms +step:2085/2245 train_time:127521ms step_avg:61.16ms +step:2086/2245 train_time:127581ms step_avg:61.16ms +step:2087/2245 train_time:127644ms step_avg:61.16ms +step:2088/2245 train_time:127704ms step_avg:61.16ms +step:2089/2245 train_time:127767ms step_avg:61.16ms +step:2090/2245 train_time:127827ms step_avg:61.16ms +step:2091/2245 train_time:127891ms step_avg:61.16ms +step:2092/2245 train_time:127952ms step_avg:61.16ms +step:2093/2245 train_time:128015ms step_avg:61.16ms +step:2094/2245 train_time:128075ms step_avg:61.16ms +step:2095/2245 train_time:128138ms step_avg:61.16ms +step:2096/2245 train_time:128198ms step_avg:61.16ms +step:2097/2245 train_time:128261ms step_avg:61.16ms +step:2098/2245 train_time:128320ms step_avg:61.16ms +step:2099/2245 train_time:128384ms step_avg:61.16ms +step:2100/2245 train_time:128444ms step_avg:61.16ms +step:2101/2245 train_time:128508ms step_avg:61.16ms +step:2102/2245 train_time:128568ms step_avg:61.16ms +step:2103/2245 train_time:128631ms step_avg:61.17ms +step:2104/2245 train_time:128692ms step_avg:61.17ms +step:2105/2245 train_time:128755ms step_avg:61.17ms +step:2106/2245 train_time:128816ms step_avg:61.17ms +step:2107/2245 train_time:128879ms step_avg:61.17ms +step:2108/2245 train_time:128941ms step_avg:61.17ms +step:2109/2245 train_time:129002ms step_avg:61.17ms +step:2110/2245 train_time:129062ms step_avg:61.17ms +step:2111/2245 train_time:129125ms step_avg:61.17ms +step:2112/2245 train_time:129185ms step_avg:61.17ms +step:2113/2245 train_time:129248ms step_avg:61.17ms +step:2114/2245 train_time:129308ms step_avg:61.17ms +step:2115/2245 train_time:129372ms step_avg:61.17ms +step:2116/2245 train_time:129433ms step_avg:61.17ms +step:2117/2245 train_time:129497ms step_avg:61.17ms +step:2118/2245 train_time:129557ms step_avg:61.17ms +step:2119/2245 train_time:129620ms step_avg:61.17ms +step:2120/2245 train_time:129680ms step_avg:61.17ms +step:2121/2245 train_time:129742ms step_avg:61.17ms +step:2122/2245 train_time:129802ms step_avg:61.17ms +step:2123/2245 train_time:129865ms step_avg:61.17ms +step:2124/2245 train_time:129925ms step_avg:61.17ms +step:2125/2245 train_time:129988ms step_avg:61.17ms +step:2126/2245 train_time:130047ms step_avg:61.17ms +step:2127/2245 train_time:130111ms step_avg:61.17ms +step:2128/2245 train_time:130171ms step_avg:61.17ms +step:2129/2245 train_time:130235ms step_avg:61.17ms +step:2130/2245 train_time:130295ms step_avg:61.17ms +step:2131/2245 train_time:130358ms step_avg:61.17ms +step:2132/2245 train_time:130418ms step_avg:61.17ms +step:2133/2245 train_time:130481ms step_avg:61.17ms +step:2134/2245 train_time:130542ms step_avg:61.17ms +step:2135/2245 
train_time:130604ms step_avg:61.17ms +step:2136/2245 train_time:130666ms step_avg:61.17ms +step:2137/2245 train_time:130728ms step_avg:61.17ms +step:2138/2245 train_time:130788ms step_avg:61.17ms +step:2139/2245 train_time:130852ms step_avg:61.17ms +step:2140/2245 train_time:130914ms step_avg:61.17ms +step:2141/2245 train_time:130977ms step_avg:61.18ms +step:2142/2245 train_time:131038ms step_avg:61.18ms +step:2143/2245 train_time:131100ms step_avg:61.18ms +step:2144/2245 train_time:131160ms step_avg:61.18ms +step:2145/2245 train_time:131222ms step_avg:61.18ms +step:2146/2245 train_time:131283ms step_avg:61.18ms +step:2147/2245 train_time:131346ms step_avg:61.18ms +step:2148/2245 train_time:131406ms step_avg:61.18ms +step:2149/2245 train_time:131469ms step_avg:61.18ms +step:2150/2245 train_time:131530ms step_avg:61.18ms +step:2151/2245 train_time:131593ms step_avg:61.18ms +step:2152/2245 train_time:131655ms step_avg:61.18ms +step:2153/2245 train_time:131718ms step_avg:61.18ms +step:2154/2245 train_time:131778ms step_avg:61.18ms +step:2155/2245 train_time:131841ms step_avg:61.18ms +step:2156/2245 train_time:131901ms step_avg:61.18ms +step:2157/2245 train_time:131963ms step_avg:61.18ms +step:2158/2245 train_time:132024ms step_avg:61.18ms +step:2159/2245 train_time:132087ms step_avg:61.18ms +step:2160/2245 train_time:132147ms step_avg:61.18ms +step:2161/2245 train_time:132210ms step_avg:61.18ms +step:2162/2245 train_time:132271ms step_avg:61.18ms +step:2163/2245 train_time:132334ms step_avg:61.18ms +step:2164/2245 train_time:132395ms step_avg:61.18ms +step:2165/2245 train_time:132458ms step_avg:61.18ms +step:2166/2245 train_time:132518ms step_avg:61.18ms +step:2167/2245 train_time:132581ms step_avg:61.18ms +step:2168/2245 train_time:132641ms step_avg:61.18ms +step:2169/2245 train_time:132704ms step_avg:61.18ms +step:2170/2245 train_time:132764ms step_avg:61.18ms +step:2171/2245 train_time:132827ms step_avg:61.18ms +step:2172/2245 train_time:132888ms step_avg:61.18ms +step:2173/2245 train_time:132951ms step_avg:61.18ms +step:2174/2245 train_time:133012ms step_avg:61.18ms +step:2175/2245 train_time:133075ms step_avg:61.18ms +step:2176/2245 train_time:133135ms step_avg:61.18ms +step:2177/2245 train_time:133198ms step_avg:61.18ms +step:2178/2245 train_time:133258ms step_avg:61.18ms +step:2179/2245 train_time:133320ms step_avg:61.18ms +step:2180/2245 train_time:133381ms step_avg:61.18ms +step:2181/2245 train_time:133443ms step_avg:61.18ms +step:2182/2245 train_time:133504ms step_avg:61.18ms +step:2183/2245 train_time:133567ms step_avg:61.18ms +step:2184/2245 train_time:133627ms step_avg:61.18ms +step:2185/2245 train_time:133691ms step_avg:61.19ms +step:2186/2245 train_time:133751ms step_avg:61.19ms +step:2187/2245 train_time:133815ms step_avg:61.19ms +step:2188/2245 train_time:133876ms step_avg:61.19ms +step:2189/2245 train_time:133939ms step_avg:61.19ms +step:2190/2245 train_time:133999ms step_avg:61.19ms +step:2191/2245 train_time:134062ms step_avg:61.19ms +step:2192/2245 train_time:134122ms step_avg:61.19ms +step:2193/2245 train_time:134184ms step_avg:61.19ms +step:2194/2245 train_time:134245ms step_avg:61.19ms +step:2195/2245 train_time:134307ms step_avg:61.19ms +step:2196/2245 train_time:134368ms step_avg:61.19ms +step:2197/2245 train_time:134431ms step_avg:61.19ms +step:2198/2245 train_time:134491ms step_avg:61.19ms +step:2199/2245 train_time:134555ms step_avg:61.19ms +step:2200/2245 train_time:134616ms step_avg:61.19ms +step:2201/2245 train_time:134680ms step_avg:61.19ms +step:2202/2245 
train_time:134739ms step_avg:61.19ms +step:2203/2245 train_time:134802ms step_avg:61.19ms +step:2204/2245 train_time:134863ms step_avg:61.19ms +step:2205/2245 train_time:134925ms step_avg:61.19ms +step:2206/2245 train_time:134986ms step_avg:61.19ms +step:2207/2245 train_time:135048ms step_avg:61.19ms +step:2208/2245 train_time:135109ms step_avg:61.19ms +step:2209/2245 train_time:135173ms step_avg:61.19ms +step:2210/2245 train_time:135234ms step_avg:61.19ms +step:2211/2245 train_time:135297ms step_avg:61.19ms +step:2212/2245 train_time:135357ms step_avg:61.19ms +step:2213/2245 train_time:135420ms step_avg:61.19ms +step:2214/2245 train_time:135480ms step_avg:61.19ms +step:2215/2245 train_time:135543ms step_avg:61.19ms +step:2216/2245 train_time:135603ms step_avg:61.19ms +step:2217/2245 train_time:135667ms step_avg:61.19ms +step:2218/2245 train_time:135727ms step_avg:61.19ms +step:2219/2245 train_time:135790ms step_avg:61.19ms +step:2220/2245 train_time:135851ms step_avg:61.19ms +step:2221/2245 train_time:135915ms step_avg:61.20ms +step:2222/2245 train_time:135975ms step_avg:61.19ms +step:2223/2245 train_time:136037ms step_avg:61.20ms +step:2224/2245 train_time:136097ms step_avg:61.19ms +step:2225/2245 train_time:136160ms step_avg:61.20ms +step:2226/2245 train_time:136220ms step_avg:61.19ms +step:2227/2245 train_time:136283ms step_avg:61.20ms +step:2228/2245 train_time:136343ms step_avg:61.20ms +step:2229/2245 train_time:136406ms step_avg:61.20ms +step:2230/2245 train_time:136467ms step_avg:61.20ms +step:2231/2245 train_time:136530ms step_avg:61.20ms +step:2232/2245 train_time:136591ms step_avg:61.20ms +step:2233/2245 train_time:136654ms step_avg:61.20ms +step:2234/2245 train_time:136714ms step_avg:61.20ms +step:2235/2245 train_time:136777ms step_avg:61.20ms +step:2236/2245 train_time:136838ms step_avg:61.20ms +step:2237/2245 train_time:136900ms step_avg:61.20ms +step:2238/2245 train_time:136961ms step_avg:61.20ms +step:2239/2245 train_time:137023ms step_avg:61.20ms +step:2240/2245 train_time:137084ms step_avg:61.20ms +step:2241/2245 train_time:137147ms step_avg:61.20ms +step:2242/2245 train_time:137208ms step_avg:61.20ms +step:2243/2245 train_time:137271ms step_avg:61.20ms +step:2244/2245 train_time:137332ms step_avg:61.20ms +step:2245/2245 train_time:137396ms step_avg:61.20ms +step:2245/2245 val_loss:3.2785 train_time:137456ms step_avg:61.23ms +peak memory allocated: 29249 MiB reserved: 50528 MiB
diff --git a/records/track_1_short/2025-11-10_CautiousWD/f2223004-18ce-47d6-bff7-065ce3a78092.txt b/records/track_1_short/2025-11-10_CautiousWD/f2223004-18ce-47d6-bff7-065ce3a78092.txt
new file mode 100644
index 000000000..73f3a9a02
--- /dev/null
+++ b/records/track_1_short/2025-11-10_CautiousWD/f2223004-18ce-47d6-bff7-065ce3a78092.txt
@@ -0,0 +1,3772 @@
import os
import sys

with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import copy
import glob
import math
import threading
import time
import uuid
from dataclasses import dataclass
from collections import defaultdict
from itertools import accumulate
from pathlib import Path

os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
import torch

torch.empty(
    1, device="cuda", requires_grad=True
).backward() # prevents a bug on some systems
import torch._dynamo as dynamo
import torch.distributed as dist
import torch.nn.functional as F

# torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
import triton
import triton.language as tl
from kernels import get_kernel
from torch import Tensor, nn

dynamo.config.recompile_limit = 64
# -----------------------------------------------------------------------------
# Custom operators: FP8 matmul by @YouJiacheng


@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8

    return impl(x, w)


@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)


@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T
        return grad_x, grad_w

    return impl(g, x_f8, w_f8)


@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)


def backward(ctx, grad_out: Tensor, *_):
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    return grad_x, grad_w, None, None, None


def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)


mm_op.register_autograd(backward, setup_context=setup_context)
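# Usage sketch (not part of the training path; the shapes and scale values below
# are illustrative assumptions, not the scales chosen elsewhere in this file).
# Once registered above, the custom op is reachable under torch.ops.nanogpt.mm:
#
#   x = torch.randn(128, 768, device="cuda", dtype=torch.bfloat16)
#   w = torch.randn(50304, 768, device="cuda", dtype=torch.bfloat16)
#   out, x_f8, w_f8 = torch.ops.nanogpt.mm(x, w, 2.0, 32.0, 2.0**16)
#
# Returning x_f8 and w_f8 from the forward lets setup_context cache them, so the
# backward pass can run both fp8 matmuls in mm_backward without re-quantizing.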
# -----------------------------------------------------------------------------
# Triton kernel for symmetric matrix multiplication by @byronxu99

def _get_autotune_configs():
    return [
        triton.Config(
            {
                "BLOCK_SIZE_M": bm,
                "BLOCK_SIZE_N": bn,
                "BLOCK_SIZE_K": bk,
                "GROUP_SIZE_M": 8,
                "LOWER_UPPER": 1,
            },
            num_stages=stages,
            num_warps=warps,
        )
        for bm in [64, 128]
        for bn in [64, 128, 256]
        for bk in [64, 128]
        for stages, warps in [(3, 4), (3, 8), (4, 4)]
        if bm // bn <= 2 and bn // bm <= 2
    ]

@triton.jit
def _pid_to_block(
    pid,
    M,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Split output matrix into blocks of size (BLOCK_SIZE_M, BLOCK_SIZE_N)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(M, BLOCK_SIZE_N)

    # Map PID to a single matrix in batch
    batch_idx = pid // (num_pid_m * num_pid_n)
    pid = pid % (num_pid_m * num_pid_n)

    # Map PID to 2D grid of blocks
    pid_m = pid // num_pid_n
    pid_n = pid % num_pid_n
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    m_idx = pid_m * BLOCK_SIZE_M
    n_idx = pid_n * BLOCK_SIZE_N
    return batch_idx, m_idx, n_idx

@triton.autotune(
    configs=_get_autotune_configs(),
    key=["M", "K", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
)
@triton.jit
def XXT_kernel(
    A_ptr, C_ptr,
    M, K,
    a_stride_b, a_stride_r, a_stride_c,
    c_stride_b, c_stride_r, c_stride_c,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    LOWER_UPPER: tl.constexpr,
):
    pid = tl.program_id(axis=0)
    batch_idx, m_idx, n_idx = _pid_to_block(
        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
    )

    # Skip blocks that don't need to be computed
    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
    if skip_block_below_diag or skip_block_above_diag:
        return

    # Index into one matrix of batch
    A_ptr += batch_idx * a_stride_b
    C_ptr += batch_idx * c_stride_b

    # Create pointer arrays for A and A.T
    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Accumulate over blocks of K
    for k in tl.range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        at = tl.load(at_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        accumulator = tl.dot(a, at, accumulator)
        a_ptrs += BLOCK_SIZE_K * a_stride_c
        at_ptrs += BLOCK_SIZE_K * a_stride_c

    out_dtype = C_ptr.dtype.element_ty
    output = accumulator.to(out_dtype)

    # Store block of C
    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, output, mask=c_mask)

    # Store block of C mirrored across the diagonal
    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
    tl.store(c_ptrs_t, output.T, mask=c_mask_t)

def XXT(A: torch.Tensor, out: torch.Tensor):
    """
    Launch Triton kernel to compute C = A @ A.T
    """
    assert A.ndim == 2 or A.ndim == 3
    M, K = A.shape[-2:]
    assert out.size(-2) == M, "Output matrix has incorrect shape"
    assert out.size(-1) == M, "Output matrix has incorrect shape"

    batch_size = A.size(0) if A.ndim == 3 else 1
    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
    output_batch_stride = out.stride(0) if out.ndim == 3 else 0

    grid = lambda meta: (
        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
    )
    XXT_kernel[grid](
        A_ptr=A,
        C_ptr=out,
        M=M,
        K=K,
        a_stride_b=input_batch_stride,
        a_stride_r=A.stride(-2),
        a_stride_c=A.stride(-1),
        c_stride_b=output_batch_stride,
        c_stride_r=out.stride(-2),
        c_stride_c=out.stride(-1),
    )
    return out
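# Reference semantics, as a hedged sketch (shapes below are assumptions): XXT(A, out)
# fills `out` with A @ A.mT, computing only one triangle of each output matrix and
# mirroring it across the diagonal, so it does roughly half the tensor-core work of
# a generic batched matmul:
#
#   A = torch.randn(5, 1024, 1024, device="cuda", dtype=torch.bfloat16)
#   C = torch.empty_like(A)
#   XXT(A, out=C)  # C matches A @ A.mT up to bf16 rounding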
@triton.autotune(
    configs=_get_autotune_configs(),
    key=["M", "a_stride_r", "a_stride_c", "c_stride_r", "c_stride_c"],
)
@triton.jit
def ba_plus_cAA_kernel(
    A_ptr, C_ptr,
    M,
    a_stride_b, a_stride_r, a_stride_c,
    c_stride_b, c_stride_r, c_stride_c,
    alpha, beta,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    LOWER_UPPER: tl.constexpr,
):
    # This is mostly duplicated from XXT_kernel, but also loads and adds a block of A
    # Performance is slightly slower than XXT_kernel, so we use two separate kernels
    pid = tl.program_id(axis=0)
    batch_idx, m_idx, n_idx = _pid_to_block(
        pid, M, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
    )

    # Skip blocks that don't need to be computed
    skip_block_below_diag = (LOWER_UPPER == 0) and (n_idx + BLOCK_SIZE_N <= m_idx)
    skip_block_above_diag = (LOWER_UPPER != 0) and (m_idx + BLOCK_SIZE_M <= n_idx)
    if skip_block_below_diag or skip_block_above_diag:
        return

    # Index into one matrix of batch
    A_ptr += batch_idx * a_stride_b
    C_ptr += batch_idx * c_stride_b

    # Create pointer arrays for A and A.T
    offs_m = (m_idx + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (n_idx + tl.arange(0, BLOCK_SIZE_N)) % M
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A_ptr + (offs_m[:, None] * a_stride_r + offs_k[None, :] * a_stride_c)
    at_ptrs = A_ptr + (offs_k[:, None] * a_stride_c + offs_n[None, :] * a_stride_r)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Accumulate over blocks of K
    for k in tl.range(0, tl.cdiv(M, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < M - k * BLOCK_SIZE_K, other=0.0)
        at = tl.load(at_ptrs, mask=offs_k[:, None] < M - k * BLOCK_SIZE_K, other=0.0)
        accumulator = tl.dot(a, at, accumulator)
        a_ptrs += BLOCK_SIZE_K * a_stride_c
        at_ptrs += BLOCK_SIZE_K * a_stride_c

    # Load block of A to add (corresponds to the current block of C)
    offs_am = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_an = n_idx + tl.arange(0, BLOCK_SIZE_N)
    a_add_ptrs = A_ptr + (offs_am[:, None] * a_stride_r + offs_an[None, :] * a_stride_c)
    a_add_mask = (offs_am[:, None] < M) & (offs_an[None, :] < M)
    a_add = tl.load(a_add_ptrs, mask=a_add_mask, other=0.0).to(tl.float32)

    # Apply alpha and beta
    accumulator *= alpha
    accumulator += a_add * beta

    out_dtype = C_ptr.dtype.element_ty
    output = accumulator.to(out_dtype)

    # Store block of C
    offs_cm = m_idx + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = n_idx + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C_ptr + (offs_cm[:, None] * c_stride_r + offs_cn[None, :] * c_stride_c)
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
    tl.store(c_ptrs, output, mask=c_mask)

    # Store block of C mirrored across the diagonal
    c_ptrs_t = C_ptr + (offs_cn[:, None] * c_stride_r + offs_cm[None, :] * c_stride_c)
    c_mask_t = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
    tl.store(c_ptrs_t, output.T, mask=c_mask_t)

def ba_plus_cAA(A: torch.Tensor, alpha: float, beta: float, out: torch.Tensor):
    """
    Launch Triton kernel to compute C = alpha * A @ A.T + beta * A
    """
    assert A.ndim == 2 or A.ndim == 3
    M, K = A.shape[-2:]
    assert M == K, "Input matrix must be square"
    assert out.size(-2) == M
    assert out.size(-1) == M

    batch_size = A.size(0) if A.ndim == 3 else 1
    input_batch_stride = A.stride(0) if A.ndim == 3 else 0
    output_batch_stride = out.stride(0) if out.ndim == 3 else 0

    grid = lambda meta: (
        batch_size * triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(M, meta["BLOCK_SIZE_N"]),
    )
    ba_plus_cAA_kernel[grid](
        A_ptr=A,
        C_ptr=out,
        M=M,
        a_stride_b=input_batch_stride,
        a_stride_r=A.stride(-2),
        a_stride_c=A.stride(-1),
        c_stride_b=output_batch_stride,
        c_stride_r=out.stride(-2),
        c_stride_c=out.stride(-1),
        alpha=alpha,
        beta=beta,
    )
    return out
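# Hedged note on ba_plus_cAA: for the symmetric A = X @ X.mT produced by XXT,
# A @ A.mT equals A @ A, so ba_plus_cAA(A, alpha, beta, out=B) agrees with the
# eager reference torch.baddbmm(A, A, A, beta=beta, alpha=alpha) for batched
# input (torch.addmm for 2D). Fusing the "+ beta * A" term into the matmul
# kernel saves one extra elementwise pass over A per Polar Express iteration.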
# Computed for num_iters=5, safety_factor=2e-2, cushion=2
polar_express_coeffs = [
    (8.156554524902461, -22.48329292557795, 15.878769915207462),
    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323)
]

@torch.compile(dynamic=False, fullgraph=True) # Must use dynamic=False or else it's much slower
def polar_express(G: torch.Tensor):
    """
    Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
    by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
    """
    X = G.bfloat16()
    if G.size(-2) > G.size(-1):
        X = X.mT

    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) * (1 + 2e-2) + 1e-6)

    # Allocate buffers
    X = X.contiguous()
    A = torch.empty((*X.shape[:-1], X.size(-2)), device=X.device, dtype=X.dtype)
    B = torch.empty_like(A)
    C = torch.empty_like(X)

    aX_plus_BX = torch.baddbmm if X.ndim > 2 else torch.addmm

    # Perform the iterations
    for a, b, c in polar_express_coeffs:
        XXT(X, out=A)                           # A = X @ X.mT
        ba_plus_cAA(A, alpha=c, beta=b, out=B)  # B = b * A + c * A @ A
        aX_plus_BX(X, B, X, beta=a, out=C)      # C = a * X + B @ X
        X, C = C, X  # Swap references to avoid unnecessary copies

    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
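# A note on the math (hedged, not original to this record): with the SVD
# X = U S V^T we have X @ X.mT = U S^2 U^T, so one iteration
#   C = a*X + (b*A + c*A@A) @ X,  with A = X @ X.mT,
# maps every singular value s of X through the odd polynomial
# p(s) = a*s + b*s**3 + c*s**5. Five such passes drive all singular values in
# (0, 1] toward 1, yielding an approximately orthogonal matrix (the polar
# factor of G), e.g.:
#
#   G = torch.randn(1024, 1024, device="cuda")
#   O = polar_express(G)  # O @ O.mT is approximately the identity, to bf16 accuracy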
+# -----------------------------------------------------------------------------
+# Muon optimizer
+
+class NorMuon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    https://kellerjordan.github.io/posts/muon/
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
+    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
+
+    Differences from standard Muon:
+    - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+    - NorMuon adds a low-rank variance estimator similar to Adafactor.
+    - Small 1D parameters are handled here instead of in Adam
+    - Cautious weight decay, a gated version of decoupled weight decay
+    - Custom distributed sizing:
+      The model stores all attn and mlp weights in the same shape, and then updates the view as
+      needed on the forward pass. This enables attn and mlp weights to be contained within the same
+      dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
+      (n_attn_layers+n_mlp_layers*2)%8==0 for batching across 8 GPUs with zero padding on mlp and attn.
+      The scheduling is:
+      1. reduce scatter smear_gate (1 param, 7 padding params)
+      2. reduce scatter attn_gate (10 params, 6 padding params)
+      3. reduce scatter attn/mlp round 1 (10 attn params, 6 mlp params)
+      4. reduce scatter attn/mlp round 2 (16 mlp params)
+      5. wait on step 1, then compute update of 1 and schedule all gather
+      6. wait on step 2, then compute update of 2 and schedule all gather
+      7. wait on step 3, then compute update of 3 and schedule all gather
+         GPUs receive [2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 ATTN, 2 MLP, 2 MLP, 2 MLP]
+         GPUs that receive params of type attn reshape before computing update
+      8. wait on step 4, then compute update of 4 and schedule all gather
+      9. wait for each all gather to complete and update params
+      Empirically, leading with small params provides an additional 0.2s improvement.
+    """
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
+        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
+        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        # custom sizing requires 8 GPUs
+        if custom_sizing and self.world_size == 8:
+            param_groups = self.generate_custom_param_groups(params)
+        else:
+            param_groups = self.generate_standard_param_groups(params)
+        super().__init__(param_groups, defaults)
+
+    def reset(self):
+        # expose a reset for clearing buffers
+        for group in self.param_groups:
+            group["momentum_buffer"].zero_()
+            group["second_momentum_buffer"].zero_()
+
+    def generate_standard_param_groups(self, params):
+        """
+        Use this method if running on fewer than 8 GPUs or experimenting with additional attn or mlp modules.
+        Creates one param group per module.
+        """
+        groups = defaultdict(list)
+        for param in params:
+            groups[param.label].append(param)
+
+        param_groups = []
+        for module_name, group_params in groups.items():
+            chunk_size = (len(group_params) + self.world_size - 1) // self.world_size
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+
+        return param_groups
+
+    def generate_custom_param_groups(self, params):
+        """
+        Implementation requires that a single GPU does not receive both attn
+        and mlp params when a param group is split across GPUs.
+        """
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
+        params_list = list(params)
+        params_list.sort(key=lambda x: module_group_order.index(x.label))
+
+        idx = 0
+        group_sizes = [1, 10, 16, 16]
+        assert len(params_list) == sum(group_sizes)
+        param_groups = []
+        for size in group_sizes:
+            chunk_size = (size + self.world_size - 1) // self.world_size
+            group_params = params_list[idx: idx + size]
+            param_groups.append(dict(params=group_params, chunk_size=chunk_size))
+            idx += size
+
+        return param_groups
+
+    @torch.no_grad()
+    def step(self):
+        # Efficient systems-wise implementation of step developed by @YouJiacheng,
+        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
+        # @ryanyang0, @vagrawal, and @varunneal.
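+        # Shape of the step, as implemented in the three passes below:
+        #   1) per group: stack grads, zero-pad to chunk_size * world_size, and
+        #      kick off an async reduce_scatter so each rank averages one shard
+        #   2) per group: wait on the shard, apply momentum, polar_express, the
+        #      NorMuon rescale, and cautious weight decay, then async all_gather
+        #   3) wait on each all_gather and copy the results back into the params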
+        rank = dist.get_rank()
+        group_infos = []
+        for group in self.param_groups:
+            params: list[Tensor] = group["params"]
+            if not params:
+                continue
+
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            stacked_grads = torch.empty(
+                (padded_num_params, *params[0].shape),
+                dtype=params[0].dtype,
+                device=params[0].device
+            )
+            for i, p in enumerate(params):
+                stacked_grads[i].copy_(p.grad, non_blocking=True)
+            if len(params) < padded_num_params:
+                stacked_grads[len(params):].zero_()
+
+            grad_chunk = torch.empty_like(stacked_grads[:chunk_size])
+
+            reduce_future = dist.reduce_scatter_tensor(
+                grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True
+            ).get_future()
+
+            group_infos.append(dict(grad_chunk=grad_chunk, reduce_future=reduce_future))
+
+        all_gather_infos = []
+        # Second pass: wait for gradients, compute updates for the local shard of parameters,
+        # and launch all async all_gather operations.
+        for group, info in zip(self.param_groups, group_infos):
+            info["reduce_future"].wait()
+
+            params = group["params"]
+            grad_chunk = info["grad_chunk"]
+            chunk_size = group["chunk_size"]
+            padded_num_params = chunk_size * self.world_size
+
+            start_idx = rank * chunk_size
+            module_idx = start_idx if start_idx < len(params) else 0
+
+            num_params = min(chunk_size, max(0, len(params) - start_idx)) # num params for this rank
+
+            if "momentum_buffer" not in group:
+                group["momentum_buffer"] = torch.zeros_like(grad_chunk[:num_params])
+            momentum_buffer = group["momentum_buffer"]
+            # Apply momentum update to the persistent momentum buffer in-place
+            momentum_buffer.lerp_(grad_chunk[:num_params], 1 - group["momentum"])
+            updated_grads = grad_chunk[:num_params].lerp_(momentum_buffer, group["momentum"])
+
+            grad_shape = updated_grads.shape
+            if params[module_idx].label == 'attn':
+                # Reshape stacked attn grads from [num_params, hdim, dim*4] to [4*num_params, hdim, dim]
+                for p in params[module_idx:module_idx + num_params]:
+                    assert p.label == 'attn'
+                updated_grads = updated_grads.view(4 * grad_shape[0], grad_shape[1], grad_shape[2] // 4)
+            ref_param = params[module_idx]
+            param_shape = ref_param.shape
+
+            if "second_momentum_buffer" not in group:
+                group["second_momentum_buffer"] = (torch.zeros_like(updated_grads[..., :, :1])
+                    if param_shape[-2] >= param_shape[-1] else torch.zeros_like(updated_grads[..., :1, :])
+                )
+            second_momentum_buffer = group["second_momentum_buffer"]
+
+            if "param_lr" not in group:
+                group["param_lr"] = (
+                    max(1., param_shape[-2] / param_shape[-1]) ** 0.5
+                    * ref_param.new_tensor(
+                        [getattr(param, "lr_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                    ).view(-1, 1, 1)
+                )
+
+                group["param_wd"] = ref_param.new_tensor(
+                    [getattr(param, "wd_mul", 1.0) for param in params[module_idx:module_idx + num_params]]
+                ).view(-1, 1, 1)
+
+            # Determine LR and WD
+            eff_lr = group["lr"] * group["param_lr"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]
+
+            # Compute zeropower for the entire chunk in a single, batched call.
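+            # polar_express(U @ S @ V.mT) ≈ U @ V.mT: the orthogonalization keeps
+            # the smoothed gradient's singular vectors but flattens its singular
+            # values toward 1, so every direction gets roughly unit spectral scale.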
+ if num_params == 0: + v_chunk = updated_grads + else: + v_chunk = polar_express(updated_grads) + + # NorMuon: second_momentum_buffer tracks squared magnitude of gradients along one dim (https://arxiv.org/pdf/2510.05491) + v_norm = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_mean = v_chunk.square().mean(dim=-1 if param_shape[-2] >= param_shape[-1] else -2, keepdim=True) + second_momentum_buffer.lerp_(v_mean.to(dtype=ref_param.dtype), 1 - group["beta2"]) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt_() + v_chunk.mul_(step_size) + v_norm_new = v_chunk.norm(dim=(-2, -1), keepdim=True) + v_chunk.mul_(v_norm / v_norm_new.clamp_min_(1e-10)) + + v_chunk = v_chunk.view(grad_shape) + + updated_params = torch.empty_like(grad_chunk) + param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk) + + # "Cautious" weight decay (https://arxiv.org/abs/2510.12402) + mask = (v_chunk * param_chunk) >= 0 + v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype)) + + param_chunk.addcmul_(v_chunk, -eff_lr) + + updated_params[:num_params].copy_(param_chunk) + if num_params < chunk_size: + updated_params[num_params:].zero_() + + stacked_params = torch.empty( + (padded_num_params, *param_shape), + dtype=updated_params.dtype, + device=updated_params.device, + ) + + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_infos.append( + { + "gather_future": gather_future, + "stacked_params": stacked_params, + "orig_params": params, + } + ) + + # Final pass: wait for all_gather to complete and copy results back into original parameter tensors. + for info in all_gather_infos: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + orig_params = info["orig_params"] + + unstacked_params = torch.unbind(stacked_params) + for i, p in enumerate(orig_params): + p.copy_(unstacked_params[i], non_blocking=True) + + +class DistAdam(torch.optim.Optimizer): + def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + params = list(params) + sizes = {p.shape for p in params} + # create one buffer per unique parameter-size + param_groups = [] + for size in sizes: + group_params = [p for p in params if p.shape == size] + param_groups.append(dict(params=group_params)) + super().__init__(param_groups, defaults) + # init state + for p in params: + chunk_size = p.size(0) // self.world_size + exp_avg = torch.zeros_like(p[:chunk_size], dtype=torch.bfloat16, device=p[0].device) + exp_avg_sq = torch.zeros_like(exp_avg) + self.state[p] = dict(step=0, exp_avg=exp_avg, exp_avg_sq=exp_avg_sq) + # DistributedAdam implementation by @vagrawal + + @torch.compile + @torch.no_grad() + def step(self): + rank = dist.get_rank() + reduce_scatter_futures: list[torch.Future] = [] + all_gather_futures: list[torch.Future] = [] + grad_slices = [] + for group in self.param_groups: + params: list[Tensor] = group["params"] + for param in params: + grad = param.grad + rank_size = grad.shape[0] // self.world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) + + idx = 0 + for group in self.param_groups: + beta1, 
beta2 = group['betas'] + eps = group['eps'] + wd = group['weight_decay'] + params = group['params'] + for param in params: + reduce_scatter_futures[idx].wait() + rank_size = param.shape[0] // self.world_size + p_slice = param[rank * rank_size:(rank + 1) * rank_size] + lr = group['lr'] * getattr(param, "lr_mul", 1.0) + state = self.state[param] + g_slice = grad_slices[idx] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + state["step"] += 1 + t = state["step"] + # weight decay + if wd != 0: + eff_weight_decay = lr * wd * getattr(param, "wd_mul", 1.0) + p_slice.mul_(1 - eff_weight_decay) + # update running averages + exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) + # bias corrections + bias1 = 1 - beta1 ** t + bias2 = 1 - beta2 ** t + # compute step + denom = exp_avg_sq.sqrt().add_(eps) + step_size = lr * (bias2 ** 0.5 / bias1) + update = exp_avg.div(denom).mul_(step_size) + p_slice.add_(other=update, alpha=-1.0) + idx += 1 + all_gather_futures.append(dist.all_gather_into_tensor(param, p_slice, async_op=True).get_future()) + torch.futures.collect_all(all_gather_futures).wait() + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the model + +def norm(x: Tensor): + return F.rms_norm(x, (x.size(-1),)) + +class CastedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0): + super().__init__(in_features, out_features, bias=False) + self.use_fp8 = use_fp8 + self.x_s = x_s + self.w_s = w_s + self.grad_s = grad_s + + def reset_parameters(self) -> None: + with torch.no_grad(): + self.weight.zero_() # @Grad62304977 and others + + def forward(self, x: Tensor): + if self.use_fp8 and self.training: + _x = x.flatten(0, -2) + out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0] + return out.reshape(*x.shape[:-1], -1) + else: + return F.linear(x, self.weight.type_as(x)) + +# yarn implementation @classiclarryd +class Yarn(nn.Module): + def __init__(self, head_dim, max_seq_len): + super().__init__() + self.head_dim = head_dim + self.max_seq_len = max_seq_len + self.reset() + + def reset(self): + angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=self.head_dim//4, dtype=torch.float32, device=device) + # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) + angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(self.head_dim//4)]) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=device) + theta = torch.outer(t, angular_freq) + self.cos = nn.Buffer( + theta.cos().to(torch.bfloat16), persistent=False + ) + self.sin = nn.Buffer( + theta.sin().to(torch.bfloat16), persistent=False + ) + self.angular_freq = angular_freq + # start with 0.1, inspired by 0.12 from @leloykun and learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283 + self.attn_scale = 0.1 + + def apply(self, old_window: int, new_window: int, alpha: int=1, beta: int=32): + rotations = args.block_size * old_window * self.angular_freq / (2 * torch.pi) + scaling_factor = old_window / new_window + interpolation_weight = torch.clamp((rotations - alpha) / (beta - alpha), 0, 1) + self.angular_freq *= scaling_factor + interpolation_weight * (1 - scaling_factor) + t = torch.arange(self.max_seq_len, dtype=torch.float32, device=self.angular_freq.device) + theta = torch.outer(t, self.angular_freq) + self.cos.copy_(theta.cos()) 
+        self.sin.copy_(theta.sin())
+        self.attn_scale *= 0.2 * math.log(new_window / old_window) + 1
+
+def rotary(x_BTHD: Tensor, cos: Tensor, sin: Tensor):
+    assert cos.size(0) >= x_BTHD.size(-3)
+    cos, sin = (
+        cos[None, : x_BTHD.size(-3), None, :],
+        sin[None, : x_BTHD.size(-3), None, :],
+    )
+    x1, x2 = x_BTHD.chunk(2, dim=-1)
+    y1 = x1 * cos + x2 * sin
+    y2 = x1 * (-sin) + x2 * cos
+    return torch.cat((y1, y2), 3)
+
+@dataclass
+class AttnArgs:
+    ve: torch.Tensor
+    sa_lambdas: torch.Tensor
+    seqlens: torch.Tensor
+    bm_size: int
+    cos: torch.Tensor
+    sin: torch.Tensor
+    attn_scale: float
+
+flash_attn_interface = get_kernel('varunneal/flash-attention-3').flash_attn_interface
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, dim: int, head_dim: int, num_heads: int):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.dim = dim
+        self.hdim = num_heads * head_dim
+
+        assert self.hdim == self.dim, "num_heads * head_dim must equal model_dim"
+        std = 0.5 * (self.dim ** -0.5)
+        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
+        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
+        # https://x.com/hi_tysam/status/1879699187107033311
+        # make matrices the same shape as MLP to enable batched call in optimizer
+        self.qkvo_w = nn.Parameter(torch.empty(self.hdim, self.dim*4))
+        # label module to enable custom optimizer sizing
+        self.qkvo_w.label='attn'
+
+        with torch.no_grad():
+            self.qkvo_w.view(4,self.hdim, self.dim)[:3].uniform_(-bound, bound) # init QKV weights
+            self.qkvo_w.view(4,self.hdim, self.dim)[3].zero_() # init output weights to zero
+
+        # sparse gated attention to enable context based no-op by @classiclarryd
+        self.attn_gate = CastedLinear(12, num_heads)
+        # label module to enable custom optimizer sizing
+        self.attn_gate.weight.label = 'attn_gate'
+
+    def forward(self, x: Tensor, attn_args: AttnArgs):
+        B, T = x.size(0), x.size(1) # batch size, sequence length
+        assert B == 1, "varlen attention requires B == 1"
+        assert T % 16 == 0
+        # unpack attention args
+        cos, sin = attn_args.cos, attn_args.sin
+        ve, sa_lambdas = attn_args.ve, attn_args.sa_lambdas
+        seqlens, attn_scale, bm_size = attn_args.seqlens, attn_args.attn_scale, attn_args.bm_size
+
+        q, k, v = F.linear(x, self.qkvo_w.view(4, self.hdim, self.dim)[:3].flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
+        q, k = norm(q), norm(k) # QK norm @Grad62304977
+        q, k = rotary(q, cos, sin), rotary(k, cos, sin)
+        if ve is not None:
+            v = sa_lambdas[0] * v + sa_lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
+        else: # skip mid-layers token value embeddings by @YouJiacheng
+            v = sa_lambdas[0] * v
+
+        max_len = args.train_max_seq_len if self.training else (args.val_batch_size // (grad_accum_steps * world_size))
+
+        # use flash_attn over flex_attn @varunneal. flash_attn_varlen suggested by @YouJiacheng
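+        # varlen layout: the batch is a single packed row of documents and seqlens
+        # holds cumulative boundaries, e.g. doc lengths [5, 3, 8] -> [0, 5, 8, 16];
+        # window_size=(bm_size, 0) further caps each query's lookback to bm_size tokens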
+        y = flash_attn_interface.flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=seqlens, cu_seqlens_k=seqlens,
+                                                        max_seqlen_q=max_len, max_seqlen_k=max_len,
+                                                        causal=True, softmax_scale=attn_scale, window_size=(bm_size, 0))
+        y = y.view(B, T, self.num_heads, self.head_dim)
+        y = y * torch.sigmoid(self.attn_gate(x[..., :self.attn_gate.weight.size(-1)])).view(B, T, self.num_heads, 1)
+        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
+        y = F.linear(y, self.qkvo_w.view(4, self.hdim, self.dim)[3].type_as(y))
+        return y
+
+
+class MLP(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        hdim = 4 * dim
+        # make matrices the same shape to enable batched call in optimizer
+        self.c_fc = nn.Parameter(torch.empty(dim, hdim))
+        self.c_proj = nn.Parameter(torch.empty(dim, hdim))
+        # label modules to enable custom optimizer sizing
+        self.c_fc.label = 'mlp'
+        self.c_proj.label = 'mlp'
+        # corrective factor to account for transpose
+        self.c_fc.lr_mul = 2.
+
+        std = 0.5 * (dim ** -0.5)
+        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
+        with torch.no_grad():
+            self.c_fc.uniform_(-bound, bound)
+            self.c_proj.zero_() # zero init suggested by @Grad62304977
+
+    def forward(self, x: Tensor):
+        x = F.linear(x, self.c_fc.T.type_as(x))
+        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
+        x = F.linear(x, self.c_proj.type_as(x))
+        return x
+
+class Block(nn.Module):
+    def __init__(self, dim: int, head_dim: int, num_heads: int, layer_idx: int):
+        super().__init__()
+        # skip attention of layer 0 and of blocks.7 (the 8th layer); the blocks.7 skip is by @YouJiacheng
+        self.attn = CausalSelfAttention(dim, head_dim, num_heads) if layer_idx not in [0, 7] else None
+        # skip the MLP of the first layer by @EmelyanenkoK
+        self.mlp = MLP(dim) if layer_idx != 0 else None
+
+    def forward(self, x: Tensor, x0: Tensor, lambdas: Tensor, attn_args: AttnArgs):
+        x = lambdas[0] * x + lambdas[1] * x0
+        if self.attn is not None:
+            x = x + self.attn(norm(x), attn_args)
+        if self.mlp is not None:
+            x = x + self.mlp(norm(x))
+        return x
+
+# -----------------------------------------------------------------------------
+# The main model
+
+def next_multiple_of_n(v: float | int, *, n: int):
+    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
+
+class GPT(nn.Module):
+    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, head_dim: int, model_dim: int, max_seq_len: int):
+        super().__init__()
+        vocab_size = next_multiple_of_n(vocab_size, n=128)
+        self.embed = nn.Embedding(vocab_size, model_dim)
+        self.smear_gate = CastedLinear(12, 1)
+        # label modules to enable custom optimizer sizing
+        self.smear_gate.weight.label = 'smear_gate'
+        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
+        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
+        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
+        self.blocks = nn.ModuleList([Block(model_dim, head_dim, num_heads, i) for i in range(num_layers)])
+        self.yarn = Yarn(head_dim, max_seq_len)
+        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
+        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
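+        # e.g. next_multiple_of_n(50257, n=128) == 50304, i.e. 47 padding rows that never appear as targets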
+ use_fp8 = not os.environ.get("DISABLE_FP8", False) + self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=use_fp8, x_s=(model_dim**0.5)/448, w_s=2**-9, grad_s=1/448) + # Add learnable skip connection weights for decoder layers + assert num_layers % 2 == 0 + pad = (-num_layers * 5 - 2) % dist.get_world_size() + self.scalars = nn.Parameter( + torch.cat( + [ + -1.5 + * torch.ones(num_layers), # skip_weights -> σ(-1.5) ≈ 0.18 + *[ + torch.tensor([1.0, 0.0]) for _ in range(num_layers) + ], # block lambdas + *[ + torch.tensor([0.5, 0.5]) for _ in range(num_layers) + ], # SA lambdas + torch.zeros(1), # smear_lambda + 0.5*torch.ones(1), # backout_lambda + torch.ones(pad), + ] + ) + ) + # set learning rates + for param in self.embed.parameters(): + param.lr_mul = 75. + for param in self.value_embeds.parameters(): + param.lr_mul = 75. + self.lm_head.weight.lr_mul = 1.0 + self.scalars.lr_mul = 5.0 + + def forward(self, input_seq: Tensor, target_seq: Tensor, seqlens: Tensor, ws_short: int, ws_long: int): + assert input_seq.ndim == 1 + + ve = [value_embed(input_seq) for value_embed in self.value_embeds] + # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure + # dropping first layer updates this to .12 ... 012 + ve = [None, ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]] + assert len(ve) == len(self.blocks) + + short_bm = ws_short * args.block_size + long_bm = ws_long * args.block_size + bm_sizes = [None, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, None, short_bm, short_bm, short_bm, long_bm] + assert len(bm_sizes) == len(self.blocks) + + x = self.embed(input_seq) + + skip_weights = self.scalars[:(len(self.blocks) // 2)] + lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2) + sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2) + smear_lambda = self.scalars[5 * len(self.blocks)] + backout_lambda = self.scalars[5 * len(self.blocks)+1] + + # smear token embed forward 1 position @classiclarryd + smear_gate_out = smear_lambda * torch.sigmoid(self.smear_gate(x[1:, :self.smear_gate.weight.size(-1)])) + x = torch.cat([x[:1], x[1:] + smear_gate_out * x[:-1]]) + x = x0 = norm(x[None]) + + # U-net design by @brendanh0gan + skip_connections = [] + n = len(self.blocks) // 2 + + x_backout = None + backout_layer = 8 + # skip layer zero + for i in range(1,len(self.blocks)): + attn_args = AttnArgs( + ve=ve[i], + sa_lambdas=sa_lambdas[i], + seqlens=seqlens, + bm_size=bm_sizes[i], + cos=self.yarn.cos, + sin=self.yarn.sin, + attn_scale=self.yarn.attn_scale + ) + # since layer 0 is skipped, layer 11 does not have skip_connection + if i >= n and i<11: + gate = torch.sigmoid(skip_weights[i - n]) # in (0, 1) + x = x + gate * skip_connections.pop() + x = self.blocks[i](x, x0, lambdas[i], attn_args) + if i < n: + skip_connections.append(x) + if i == backout_layer: + x_backout = x + + # back out contributions from first 8 layers that are only required for downstream context and not direct prediction + x -= backout_lambda * x_backout + x = norm(x) + logits = self.lm_head(x) + # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1) + logits = 30 * torch.sigmoid(logits / 7.5) + logits_for_loss = logits.float() if not self.training else logits + loss = F.cross_entropy( + logits_for_loss.view(-1, logits_for_loss.size(-1)), + target_seq, + reduction="sum" if self.training else 
"mean", + ) + return loss + +# ----------------------------------------------------------------------------- +# Distributed data loader + +def _load_data_shard(file: Path): + header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32 + assert header[0] == 20240520, "magic number mismatch in the data .bin file" + assert header[1] == 1, "unsupported version" + num_tokens = int(header[2]) # number of tokens (claimed) + with file.open("rb", buffering=0) as f: + tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng + f.seek(256 * 4) + nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng + assert nbytes == 2 * num_tokens, "number of tokens read does not match header" + return tokens + +BOS_ID = 50256 + +class BOSFinder: + # Helper for getting sequences that start at the beginning of documents by @varunneal based on work by @classiclarryd + def __init__(self, tokens: Tensor, world_size: int = 1, quickload: bool = False): + # Precompute BOS positions once per shard + self.tokens=tokens + self.size = tokens.numel() + self.quickload = quickload + if quickload: + # only scan first 4 million tokens, then kickoff async thread to scan rest + self.bos_idx = (tokens[:4_000_000] == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.thread = None + self.ready = threading.Event() + self.start() + else: + self.bos_idx = (tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.i = 0 + self.world_size = world_size + self.batch_iter = 0 + + def _load(self): + self.bos_idx_async = (self.tokens == BOS_ID).nonzero(as_tuple=True)[0].to(torch.int64).cpu().numpy() + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + self.bos_idx = self.bos_idx_async + + def next_batch(self, num_tokens_local: int, max_seq_len: int): + # if quickload was used, repoint to the full dataset after 5 batches + if self.quickload and self.batch_iter==5: + self.get() + n = len(self.bos_idx) + starts = [[] for _ in range(self.world_size)] + ends = [[] for _ in range(self.world_size)] + + idx = self.i + for r in range(self.world_size): + cur_len = 0 + while cur_len <= num_tokens_local: + if idx >= n: + raise StopIteration(f"Insufficient BOS ahead of position {cur}; hit tail of shard.") + cur = self.bos_idx[idx] + starts[r].append(cur) + end = min(self.bos_idx[idx + 1] if idx + 1 < n else self.size, + cur + max_seq_len, + cur + num_tokens_local - cur_len + 1) + ends[r].append(end) + cur_len += end - cur + idx += 1 + + assert cur_len == num_tokens_local + 1 + self.i = idx + self.batch_iter+=1 + return starts, ends + +class DataPreloader: + # Helper for asynchronously loading next shard and indexing bos tokens + def __init__(self, file_iter, world_size: int = 1): + self.file_iter = file_iter + self.world_size = world_size + self.thread = None + self.data = None + self.ready = threading.Event() + + def _load(self): + tokens = _load_data_shard(next(self.file_iter)) + self.data = (tokens, BOSFinder(tokens, self.world_size)) + self.ready.set() + + def start(self): + self.ready.clear() + self.thread = threading.Thread(target=self._load) + self.thread.start() + + def get(self): + if self.thread: + self.ready.wait() + self.thread.join() + return self.data + +def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: 
+def distributed_data_generator(filename_pattern: str, num_tokens: int, max_seq_len: int, grad_accum_steps: int = 1, align_to_bos: bool = True):
+    # align_to_bos: each sequence begins with a Beginning of Sequence token; sequences are truncated to max_seq_len
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    assert num_tokens % (world_size * grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+    num_tokens = num_tokens // grad_accum_steps
+
+    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
+    if not files:
+        raise FileNotFoundError(f"No files found for pattern: {filename_pattern}")
+
+    file_iter = iter(files) # Use itertools.cycle(files) for multi-epoch training
+    tokens = _load_data_shard(next(file_iter))
+    if align_to_bos:
+        finder = BOSFinder(tokens, world_size=world_size, quickload=True)
+        preloader = DataPreloader(file_iter, world_size)
+        preloader.start()
+    else:
+        pos = 0 # for unaligned case
+
+    while True:
+        num_tokens_local = num_tokens // world_size
+        max_num_docs = next_multiple_of_n(num_tokens_local // 300, n=128) # median doc length is ~400
+
+        if align_to_bos:
+            try:
+                seq_starts, seq_ends = finder.next_batch(num_tokens_local, max_seq_len)
+                start_idxs, end_idxs = torch.tensor(seq_starts[rank]), torch.tensor(seq_ends[rank])
+            except StopIteration:
+                # This shard is exhausted, load the next one in the next loop iteration.
+                tokens, finder = preloader.get()
+                preloader.start()
+                continue
+
+            buf = torch.cat([tokens[i:j] for i, j in zip(start_idxs, end_idxs)])
+            _inputs = buf[:-1]
+            _targets = buf[1:]
+            end_idxs[-1] -= 1 # trim the last document by one token to account for the _targets offset
+            cum_lengths = (end_idxs - start_idxs).cumsum(0)
+
+        else:
+            if pos + num_tokens + 1 >= len(tokens): # should not occur for val data
+                tokens, pos = _load_data_shard(next(file_iter)), 0
+
+            pos_local = pos + rank * num_tokens_local
+            buf = tokens[pos_local: pos_local + num_tokens_local + 1]
+            _inputs = buf[:-1].view(num_tokens_local, )
+            _targets = buf[1:].view(num_tokens_local, )
+
+            cum_lengths = torch.nonzero(_inputs == BOS_ID)[:, 0]
+            pos += num_tokens
+
+        _cum_lengths = torch.full((max_num_docs,), num_tokens_local)
+        _cum_lengths[0] = 0
+        _cum_lengths[1:len(cum_lengths) + 1] = cum_lengths
+
+        new_params = yield (
+            _inputs.to(device="cuda", dtype=torch.int32, non_blocking=True),
+            _targets.to(device="cuda", dtype=torch.int64, non_blocking=True),
+            _cum_lengths.to(device="cuda", dtype=torch.int32, non_blocking=True)
+        )
+
+        if new_params is not None:
+            # makes it possible for generator to receive new (num_tokens, max_seq_len, grad_accum_steps) via .send()
+            new_num_tokens, new_max_seq_len, new_grad_accum_steps = new_params
+            assert new_num_tokens % (world_size * new_grad_accum_steps) == 0, "num_tokens must be divisible by world_size * grad_accum_steps"
+            num_tokens = new_num_tokens
+            max_seq_len = new_max_seq_len
+            grad_accum_steps = new_grad_accum_steps
+
+
+# -----------------------------------------------------------------------------
+# int main
+
+@dataclass
+class Hyperparameters:
+    # data
+    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
+    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
+    val_tokens: int = 10485760 # how many tokens of validation data? 
it's important to keep this fixed for consistent comparisons + train_batch_size: int = 2048 * 16 * 8 + train_max_seq_len: int = 128 * 16 + val_batch_size: int = 4 * 64 * 1024 * 8 + # optimization + num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule + num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws + num_iterations: int = num_scheduled_iterations + num_extension_iterations + cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate + # evaluation and logging + run_id: str = f"{uuid.uuid4()}" + val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end + save_checkpoint: bool = False + # attention masking + block_size: int = 128 + ws_schedule: tuple = (3, 7, 11) + ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd + ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN + +args = Hyperparameters() + +data_path = os.environ.get("DATA_PATH", ".") +args.train_files = os.path.join(data_path, args.train_files) +args.val_files = os.path.join(data_path, args.val_files) + +# torchrun sets these env variables +rank = int(os.environ["RANK"]) +world_size = int(os.environ["WORLD_SIZE"]) +assert 8 % world_size == 0, "world_size must be a divisor of 8" +grad_accum_steps = 8 // world_size +assert torch.cuda.is_available() +device = torch.device("cuda", int(os.environ["LOCAL_RANK"])) +torch.cuda.set_device(device) +dist.init_process_group(backend="nccl", device_id=device) +dist.barrier() +master_process = (rank == 0) # this process will do logging, checkpointing etc. + +# begin logging +logfile = None +if master_process: + run_id = args.run_id + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{run_id}.txt" + print(logfile) +def print0(s, console=False): + if master_process: + with open(logfile, "a") as f: + if console: + print(s) + print(s, file=f) + +# begin by printing this file (the Python code) +print0(code) +print0("="*100) +# log information about the hardware/software environment this is running on +print0(f"Running Python {sys.version}") +print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}") +print0(f"Running Triton version {triton.__version__}") + +def nvidia_smi(): + import subprocess # avoid top level import + return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout +print0(nvidia_smi()) +print0("="*100) + +model: nn.Module = GPT( + vocab_size=50257, + num_layers=12, + num_heads=6, + head_dim=128, + model_dim=768, + max_seq_len=max(args.train_batch_size, args.val_batch_size) // (grad_accum_steps * world_size) +).cuda() +for m in model.modules(): + if isinstance(m, (nn.Embedding, nn.Linear)): + m.bfloat16() +for param in model.parameters(): + dist.broadcast(param.detach(), 0) + +# collect the parameters to optimize +hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n and "gate" not in n] +embed_params = [p for n, p in model.named_parameters() if "embed" in n] +scalar_params = [p for p in model.parameters() if p.ndim < 2] +head_params = [model.lm_head.weight] +gate_params = [p for n, p in model.named_parameters() if "gate" in n] + +# init the optimizer(s) +# small adam epsilon by @YouJiacheng. 
this is an alternate method of fixing the world_size dependence +# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094 +optimizer1 = DistAdam( + scalar_params + head_params + embed_params, + lr=0.008, + betas=(0.65, 0.95), + eps=1e-8, + weight_decay=0.0, +) +optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2) +optimizers = [optimizer1, optimizer2] +for opt in optimizers: + for group in opt.param_groups: + group["initial_lr"] = group["lr"] + +# learning rate schedule: flat, then linear decay, then flat +def get_lr(step: int): + x = min(0.9999, step / args.num_scheduled_iterations) + assert 0 <= x < 1 + lr = 1.0 + if x >= 1 - args.cooldown_frac: + w = (1 - x) / args.cooldown_frac + lr = w * 1.0 + (1 - w) * 0.1 + return lr + +def get_ws(step: int): + # set short window size to half of long window size + # Higher ws on "extension" steps + if step >= args.num_scheduled_iterations: + return args.ws_final // 2, args.ws_final + x = step / args.num_scheduled_iterations + assert 0 <= x < 1 + ws_idx = int(len(args.ws_schedule) * x) + return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx] + +def get_muon_momentum(step: int, muon_warmup_steps=300, muon_cooldown_steps=50, momentum_min=0.85, momentum_max=0.95): + # warmup phase: linearly increase momentum from min to max + # cooldown phase: linearly decrease momentum from max to min + momentum_cd_start = args.num_iterations - muon_cooldown_steps + if step < muon_warmup_steps: + frac = step / muon_warmup_steps + momentum = momentum_min + frac * (momentum_max - momentum_min) + elif step > momentum_cd_start: + frac = (step - momentum_cd_start) / muon_cooldown_steps + momentum = momentum_max - frac * (momentum_max - momentum_min) + else: + momentum = momentum_max + return momentum + +def step_optimizers(step: int, optimizers, model): + # update lr + for optimizer in optimizers: + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * get_lr(step) + + # set muon momentum based on step + momentum = get_muon_momentum(step) + for group in optimizers[1].param_groups: + group["momentum"] = momentum + + # on even steps, only step Muon params + # on odd steps, step all params + if step%2==0: + optimizers[1].step() + optimizers[1].zero_grad(set_to_none=True) + else: + for optimizer in optimizers: + optimizer.step() + model.zero_grad(set_to_none=True) + +model: nn.Module = torch.compile(model, dynamic=False, fullgraph=True) + +######################################## +# Warmup kernels # +######################################## + +# Warmup the training kernels, then re-initialize the state so we aren't cheating +warmup_steps = 30 +initial_state = dict(model=copy.deepcopy(model.state_dict()), + optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +ws_schedule = list(args.ws_schedule) + [args.ws_final] +ws_long = ws_schedule[0] +for step in range(warmup_steps): + inputs, targets, cum_seqlens = next(train_loader) + # each window size is a new graph, need to warm up each with Yarn.attn_scale + ws_idx = step % len(ws_schedule) + if ws_idx==0: + model.yarn.reset() + ws_long = ws_schedule[0] + else: + new_ws_long = ws_schedule[ws_idx] + model.yarn.apply(ws_long, new_ws_long) + ws_long = new_ws_long + model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward() + for opt 
in optimizers: + opt.step() + model.zero_grad(set_to_none=True) +model.yarn.reset() # rotary buffer is not stored in state_dict +model.load_state_dict(initial_state["model"]) +optimizer2.reset() # muon momentum buffers not in state dict +for opt, opt_state in zip(optimizers, initial_state["optimizers"]): + opt.load_state_dict(opt_state) +del train_loader, initial_state + +######################################## +# Training and validation # +######################################## + +train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps) +training_time_ms = 0 +# start the clock +torch.cuda.synchronize() +t0 = time.perf_counter() +# begin training +train_steps = args.num_iterations +ws_short, ws_long = get_ws(0) +for step in range(train_steps + 1): + last_step = (step == train_steps) + ws_short, new_ws_long = get_ws(step) + if new_ws_long != ws_long: + model.yarn.apply(ws_long, new_ws_long) + ws_long=new_ws_long + + # --------------- VALIDATION SECTION ----------------- + if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): + if last_step: + ws_long = args.ws_validate_post_yarn_ext + # stop the clock + torch.cuda.synchronize() + training_time_ms += 1000 * (time.perf_counter() - t0) + model.eval() + assert args.val_tokens % args.val_batch_size == 0 + val_steps = grad_accum_steps * args.val_tokens // args.val_batch_size + val_loader = distributed_data_generator(args.val_files, args.val_batch_size, -1, grad_accum_steps=grad_accum_steps, align_to_bos=False) + val_loss = 0 + with torch.no_grad(): + for _ in range(val_steps): + inputs, targets, cum_seqlens = next(val_loader) + val_loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) + val_loss /= val_steps + del val_loader + dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) + print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) + model.train() + # start the clock again + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if master_process and args.save_checkpoint: + log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) + os.makedirs(f"logs/{run_id}", exist_ok=True) + torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") + # the last step only has the validation loop, so break to avoid training + break + + # --------------- TRAINING SECTION ----------------- + for _ in range(grad_accum_steps): + inputs, targets, cum_seqlens = next(train_loader) + (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward() + step_optimizers(step, optimizers, model) + + # logging + approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) + print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) + +print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) +dist.destroy_process_group() + +==================================================================================================== +Running Python 3.10.12 (main, Feb 4 2025, 14:57:36) [GCC 11.4.0] +Running PyTorch 2.10.0.dev20250926+cu126 compiled for CUDA 12.6 +Running Triton version 3.5.0 +Mon Nov 10 21:31:59 2025 
++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.127.08 Driver Version: 550.127.08 CUDA Version: 12.6 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +| N/A 36C P0 125W / 700W | 5858MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +| N/A 33C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +| N/A 32C P0 118W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +| N/A 34C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +| N/A 35C P0 126W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +| N/A 32C P0 122W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +| N/A 35C P0 120W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ +| 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +| N/A 31C P0 117W / 700W | 1520MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ + +==================================================================================================== +step:0/2245 val_loss:10.8258 train_time:0ms step_avg:0.02ms +step:1/2245 train_time:119ms step_avg:119.48ms +step:2/2245 train_time:141ms step_avg:70.58ms +step:3/2245 train_time:179ms step_avg:59.77ms +step:4/2245 train_time:236ms step_avg:58.96ms +step:5/2245 train_time:296ms step_avg:59.13ms +step:6/2245 train_time:354ms step_avg:59.02ms +step:7/2245 train_time:415ms step_avg:59.30ms +step:8/2245 train_time:474ms step_avg:59.22ms +step:9/2245 train_time:535ms step_avg:59.46ms +step:10/2245 train_time:594ms step_avg:59.41ms +step:11/2245 train_time:655ms step_avg:59.57ms +step:12/2245 train_time:714ms step_avg:59.51ms +step:13/2245 train_time:775ms step_avg:59.61ms +step:14/2245 train_time:834ms step_avg:59.55ms 
+step:15/2245 train_time:895ms step_avg:59.67ms +step:16/2245 train_time:954ms step_avg:59.60ms +step:17/2245 train_time:1017ms step_avg:59.83ms +step:18/2245 train_time:1080ms step_avg:60.00ms +step:19/2245 train_time:1144ms step_avg:60.23ms +step:20/2245 train_time:1204ms step_avg:60.20ms +step:21/2245 train_time:1267ms step_avg:60.31ms +step:22/2245 train_time:1327ms step_avg:60.31ms +step:23/2245 train_time:1389ms step_avg:60.41ms +step:24/2245 train_time:1449ms step_avg:60.36ms +step:25/2245 train_time:1511ms step_avg:60.43ms +step:26/2245 train_time:1570ms step_avg:60.40ms +step:27/2245 train_time:1631ms step_avg:60.42ms +step:28/2245 train_time:1691ms step_avg:60.38ms +step:29/2245 train_time:1753ms step_avg:60.44ms +step:30/2245 train_time:1812ms step_avg:60.39ms +step:31/2245 train_time:1873ms step_avg:60.41ms +step:32/2245 train_time:1932ms step_avg:60.38ms +step:33/2245 train_time:1995ms step_avg:60.44ms +step:34/2245 train_time:2055ms step_avg:60.45ms +step:35/2245 train_time:2118ms step_avg:60.51ms +step:36/2245 train_time:2178ms step_avg:60.51ms +step:37/2245 train_time:2241ms step_avg:60.57ms +step:38/2245 train_time:2301ms step_avg:60.55ms +step:39/2245 train_time:2363ms step_avg:60.59ms +step:40/2245 train_time:2422ms step_avg:60.56ms +step:41/2245 train_time:2484ms step_avg:60.60ms +step:42/2245 train_time:2544ms step_avg:60.57ms +step:43/2245 train_time:2606ms step_avg:60.60ms +step:44/2245 train_time:2666ms step_avg:60.58ms +step:45/2245 train_time:2728ms step_avg:60.62ms +step:46/2245 train_time:2788ms step_avg:60.60ms +step:47/2245 train_time:2849ms step_avg:60.62ms +step:48/2245 train_time:2908ms step_avg:60.59ms +step:49/2245 train_time:2970ms step_avg:60.61ms +step:50/2245 train_time:3029ms step_avg:60.59ms +step:51/2245 train_time:3092ms step_avg:60.62ms +step:52/2245 train_time:3152ms step_avg:60.61ms +step:53/2245 train_time:3214ms step_avg:60.63ms +step:54/2245 train_time:3273ms step_avg:60.61ms +step:55/2245 train_time:3335ms step_avg:60.64ms +step:56/2245 train_time:3394ms step_avg:60.61ms +step:57/2245 train_time:3456ms step_avg:60.63ms +step:58/2245 train_time:3515ms step_avg:60.61ms +step:59/2245 train_time:3578ms step_avg:60.65ms +step:60/2245 train_time:3638ms step_avg:60.63ms +step:61/2245 train_time:3701ms step_avg:60.67ms +step:62/2245 train_time:3760ms step_avg:60.64ms +step:63/2245 train_time:3822ms step_avg:60.66ms +step:64/2245 train_time:3881ms step_avg:60.64ms +step:65/2245 train_time:3943ms step_avg:60.66ms +step:66/2245 train_time:4003ms step_avg:60.65ms +step:67/2245 train_time:4065ms step_avg:60.68ms +step:68/2245 train_time:4125ms step_avg:60.66ms +step:69/2245 train_time:4187ms step_avg:60.68ms +step:70/2245 train_time:4247ms step_avg:60.67ms +step:71/2245 train_time:4309ms step_avg:60.69ms +step:72/2245 train_time:4368ms step_avg:60.67ms +step:73/2245 train_time:4430ms step_avg:60.68ms +step:74/2245 train_time:4489ms step_avg:60.67ms +step:75/2245 train_time:4551ms step_avg:60.68ms +step:76/2245 train_time:4610ms step_avg:60.65ms +step:77/2245 train_time:4671ms step_avg:60.67ms +step:78/2245 train_time:4730ms step_avg:60.65ms +step:79/2245 train_time:4792ms step_avg:60.66ms +step:80/2245 train_time:4852ms step_avg:60.65ms +step:81/2245 train_time:4914ms step_avg:60.67ms +step:82/2245 train_time:4973ms step_avg:60.65ms +step:83/2245 train_time:5035ms step_avg:60.67ms +step:84/2245 train_time:5095ms step_avg:60.65ms +step:85/2245 train_time:5156ms step_avg:60.66ms +step:86/2245 train_time:5216ms step_avg:60.65ms +step:87/2245 
train_time:5278ms step_avg:60.67ms +step:88/2245 train_time:5337ms step_avg:60.65ms +step:89/2245 train_time:5400ms step_avg:60.67ms +step:90/2245 train_time:5459ms step_avg:60.65ms +step:91/2245 train_time:5521ms step_avg:60.67ms +step:92/2245 train_time:5580ms step_avg:60.65ms +step:93/2245 train_time:5642ms step_avg:60.66ms +step:94/2245 train_time:5702ms step_avg:60.65ms +step:95/2245 train_time:5763ms step_avg:60.67ms +step:96/2245 train_time:5823ms step_avg:60.65ms +step:97/2245 train_time:5885ms step_avg:60.67ms +step:98/2245 train_time:5945ms step_avg:60.66ms +step:99/2245 train_time:6006ms step_avg:60.67ms +step:100/2245 train_time:6066ms step_avg:60.66ms +step:101/2245 train_time:6128ms step_avg:60.68ms +step:102/2245 train_time:6188ms step_avg:60.67ms +step:103/2245 train_time:6251ms step_avg:60.68ms +step:104/2245 train_time:6310ms step_avg:60.67ms +step:105/2245 train_time:6371ms step_avg:60.67ms +step:106/2245 train_time:6429ms step_avg:60.65ms +step:107/2245 train_time:6491ms step_avg:60.66ms +step:108/2245 train_time:6551ms step_avg:60.65ms +step:109/2245 train_time:6612ms step_avg:60.66ms +step:110/2245 train_time:6671ms step_avg:60.65ms +step:111/2245 train_time:6733ms step_avg:60.66ms +step:112/2245 train_time:6792ms step_avg:60.64ms +step:113/2245 train_time:6853ms step_avg:60.65ms +step:114/2245 train_time:6912ms step_avg:60.63ms +step:115/2245 train_time:6973ms step_avg:60.63ms +step:116/2245 train_time:7032ms step_avg:60.62ms +step:117/2245 train_time:7094ms step_avg:60.63ms +step:118/2245 train_time:7154ms step_avg:60.62ms +step:119/2245 train_time:7215ms step_avg:60.63ms +step:120/2245 train_time:7275ms step_avg:60.62ms +step:121/2245 train_time:7336ms step_avg:60.63ms +step:122/2245 train_time:7395ms step_avg:60.62ms +step:123/2245 train_time:7456ms step_avg:60.62ms +step:124/2245 train_time:7516ms step_avg:60.61ms +step:125/2245 train_time:7578ms step_avg:60.62ms +step:126/2245 train_time:7637ms step_avg:60.61ms +step:127/2245 train_time:7698ms step_avg:60.61ms +step:128/2245 train_time:7757ms step_avg:60.60ms +step:129/2245 train_time:7819ms step_avg:60.61ms +step:130/2245 train_time:7878ms step_avg:60.60ms +step:131/2245 train_time:7940ms step_avg:60.61ms +step:132/2245 train_time:7999ms step_avg:60.60ms +step:133/2245 train_time:8061ms step_avg:60.61ms +step:134/2245 train_time:8121ms step_avg:60.60ms +step:135/2245 train_time:8183ms step_avg:60.62ms +step:136/2245 train_time:8243ms step_avg:60.61ms +step:137/2245 train_time:8305ms step_avg:60.62ms +step:138/2245 train_time:8365ms step_avg:60.62ms +step:139/2245 train_time:8427ms step_avg:60.63ms +step:140/2245 train_time:8487ms step_avg:60.62ms +step:141/2245 train_time:8548ms step_avg:60.62ms +step:142/2245 train_time:8607ms step_avg:60.61ms +step:143/2245 train_time:8669ms step_avg:60.62ms +step:144/2245 train_time:8728ms step_avg:60.61ms +step:145/2245 train_time:8790ms step_avg:60.62ms +step:146/2245 train_time:8848ms step_avg:60.61ms +step:147/2245 train_time:8910ms step_avg:60.61ms +step:148/2245 train_time:8969ms step_avg:60.60ms +step:149/2245 train_time:9031ms step_avg:60.61ms +step:150/2245 train_time:9090ms step_avg:60.60ms +step:151/2245 train_time:9152ms step_avg:60.61ms +step:152/2245 train_time:9212ms step_avg:60.60ms +step:153/2245 train_time:9274ms step_avg:60.61ms +step:154/2245 train_time:9333ms step_avg:60.60ms +step:155/2245 train_time:9395ms step_avg:60.61ms +step:156/2245 train_time:9454ms step_avg:60.60ms +step:157/2245 train_time:9517ms step_avg:60.62ms +step:158/2245 
train_time:9576ms step_avg:60.61ms
[… steps 159–249: one "+step:N/2245 train_time:…ms step_avg:…ms" line per step; step_avg steady at ~60.5–60.6 ms …]
+step:250/2245 val_loss:4.0721 train_time:15196ms step_avg:60.78ms
[… steps 251–499: step_avg drifts from ~60.6 down to ~60.4 ms …]
+step:500/2245 val_loss:3.8156 train_time:30258ms step_avg:60.52ms
[… steps 501–719: step_avg holds at ~60.40–60.43 ms …]
+step:719/2245 train_time:43433ms step_avg:60.41ms
+step:720/2245 train_time:43876ms step_avg:60.94ms
[… a one-off ~443 ms stall between steps 719 and 720 lifts step_avg to ~60.9 ms; steps 721–749 continue at that rate …]
+step:750/2245 val_loss:3.6674 train_time:45755ms step_avg:61.01ms
[… steps 751–999: step_avg creeps from ~60.92 to ~60.99 ms …]
+step:1000/2245 val_loss:3.5961 train_time:61054ms step_avg:61.05ms
[… steps 1001–1249: step_avg ~61.00–61.05 ms …]
+step:1250/2245 val_loss:3.5218 train_time:76371ms step_avg:61.10ms
[… steps 1251–1499: step_avg ~61.05–61.09 ms …]
+step:1500/2245 val_loss:3.4418 train_time:91691ms step_avg:61.13ms
[… steps 1501–1600: step_avg ~61.09–61.13 ms …]
+step:1600/2245 train_time:97808ms step_avg:61.13ms
+step:1601/2245
train_time:97872ms step_avg:61.13ms +step:1602/2245 train_time:97932ms step_avg:61.13ms +step:1603/2245 train_time:97995ms step_avg:61.13ms +step:1604/2245 train_time:98055ms step_avg:61.13ms +step:1605/2245 train_time:98118ms step_avg:61.13ms +step:1606/2245 train_time:98179ms step_avg:61.13ms +step:1607/2245 train_time:98242ms step_avg:61.13ms +step:1608/2245 train_time:98303ms step_avg:61.13ms +step:1609/2245 train_time:98367ms step_avg:61.14ms +step:1610/2245 train_time:98427ms step_avg:61.13ms +step:1611/2245 train_time:98489ms step_avg:61.14ms +step:1612/2245 train_time:98549ms step_avg:61.13ms +step:1613/2245 train_time:98612ms step_avg:61.14ms +step:1614/2245 train_time:98673ms step_avg:61.14ms +step:1615/2245 train_time:98736ms step_avg:61.14ms +step:1616/2245 train_time:98796ms step_avg:61.14ms +step:1617/2245 train_time:98859ms step_avg:61.14ms +step:1618/2245 train_time:98919ms step_avg:61.14ms +step:1619/2245 train_time:98983ms step_avg:61.14ms +step:1620/2245 train_time:99043ms step_avg:61.14ms +step:1621/2245 train_time:99106ms step_avg:61.14ms +step:1622/2245 train_time:99167ms step_avg:61.14ms +step:1623/2245 train_time:99230ms step_avg:61.14ms +step:1624/2245 train_time:99291ms step_avg:61.14ms +step:1625/2245 train_time:99353ms step_avg:61.14ms +step:1626/2245 train_time:99413ms step_avg:61.14ms +step:1627/2245 train_time:99476ms step_avg:61.14ms +step:1628/2245 train_time:99536ms step_avg:61.14ms +step:1629/2245 train_time:99598ms step_avg:61.14ms +step:1630/2245 train_time:99658ms step_avg:61.14ms +step:1631/2245 train_time:99722ms step_avg:61.14ms +step:1632/2245 train_time:99783ms step_avg:61.14ms +step:1633/2245 train_time:99846ms step_avg:61.14ms +step:1634/2245 train_time:99907ms step_avg:61.14ms +step:1635/2245 train_time:99970ms step_avg:61.14ms +step:1636/2245 train_time:100031ms step_avg:61.14ms +step:1637/2245 train_time:100094ms step_avg:61.14ms +step:1638/2245 train_time:100154ms step_avg:61.14ms +step:1639/2245 train_time:100217ms step_avg:61.15ms +step:1640/2245 train_time:100278ms step_avg:61.14ms +step:1641/2245 train_time:100341ms step_avg:61.15ms +step:1642/2245 train_time:100401ms step_avg:61.15ms +step:1643/2245 train_time:100464ms step_avg:61.15ms +step:1644/2245 train_time:100525ms step_avg:61.15ms +step:1645/2245 train_time:100587ms step_avg:61.15ms +step:1646/2245 train_time:100648ms step_avg:61.15ms +step:1647/2245 train_time:100711ms step_avg:61.15ms +step:1648/2245 train_time:100772ms step_avg:61.15ms +step:1649/2245 train_time:100834ms step_avg:61.15ms +step:1650/2245 train_time:100894ms step_avg:61.15ms +step:1651/2245 train_time:100958ms step_avg:61.15ms +step:1652/2245 train_time:101018ms step_avg:61.15ms +step:1653/2245 train_time:101081ms step_avg:61.15ms +step:1654/2245 train_time:101142ms step_avg:61.15ms +step:1655/2245 train_time:101206ms step_avg:61.15ms +step:1656/2245 train_time:101266ms step_avg:61.15ms +step:1657/2245 train_time:101329ms step_avg:61.15ms +step:1658/2245 train_time:101390ms step_avg:61.15ms +step:1659/2245 train_time:101453ms step_avg:61.15ms +step:1660/2245 train_time:101513ms step_avg:61.15ms +step:1661/2245 train_time:101575ms step_avg:61.15ms +step:1662/2245 train_time:101635ms step_avg:61.15ms +step:1663/2245 train_time:101698ms step_avg:61.15ms +step:1664/2245 train_time:101758ms step_avg:61.15ms +step:1665/2245 train_time:101821ms step_avg:61.15ms +step:1666/2245 train_time:101881ms step_avg:61.15ms +step:1667/2245 train_time:101945ms step_avg:61.15ms +step:1668/2245 train_time:102005ms step_avg:61.15ms 
+step:1669/2245 train_time:102068ms step_avg:61.16ms +step:1670/2245 train_time:102129ms step_avg:61.16ms +step:1671/2245 train_time:102192ms step_avg:61.16ms +step:1672/2245 train_time:102252ms step_avg:61.16ms +step:1673/2245 train_time:102315ms step_avg:61.16ms +step:1674/2245 train_time:102376ms step_avg:61.16ms +step:1675/2245 train_time:102438ms step_avg:61.16ms +step:1676/2245 train_time:102498ms step_avg:61.16ms +step:1677/2245 train_time:102561ms step_avg:61.16ms +step:1678/2245 train_time:102621ms step_avg:61.16ms +step:1679/2245 train_time:102684ms step_avg:61.16ms +step:1680/2245 train_time:102745ms step_avg:61.16ms +step:1681/2245 train_time:102808ms step_avg:61.16ms +step:1682/2245 train_time:102869ms step_avg:61.16ms +step:1683/2245 train_time:102932ms step_avg:61.16ms +step:1684/2245 train_time:102992ms step_avg:61.16ms +step:1685/2245 train_time:103055ms step_avg:61.16ms +step:1686/2245 train_time:103115ms step_avg:61.16ms +step:1687/2245 train_time:103177ms step_avg:61.16ms +step:1688/2245 train_time:103238ms step_avg:61.16ms +step:1689/2245 train_time:103301ms step_avg:61.16ms +step:1690/2245 train_time:103362ms step_avg:61.16ms +step:1691/2245 train_time:103425ms step_avg:61.16ms +step:1692/2245 train_time:103486ms step_avg:61.16ms +step:1693/2245 train_time:103548ms step_avg:61.16ms +step:1694/2245 train_time:103609ms step_avg:61.16ms +step:1695/2245 train_time:103672ms step_avg:61.16ms +step:1696/2245 train_time:103732ms step_avg:61.16ms +step:1697/2245 train_time:103795ms step_avg:61.16ms +step:1698/2245 train_time:103856ms step_avg:61.16ms +step:1699/2245 train_time:103919ms step_avg:61.16ms +step:1700/2245 train_time:103980ms step_avg:61.16ms +step:1701/2245 train_time:104042ms step_avg:61.17ms +step:1702/2245 train_time:104103ms step_avg:61.17ms +step:1703/2245 train_time:104166ms step_avg:61.17ms +step:1704/2245 train_time:104226ms step_avg:61.17ms +step:1705/2245 train_time:104290ms step_avg:61.17ms +step:1706/2245 train_time:104350ms step_avg:61.17ms +step:1707/2245 train_time:104412ms step_avg:61.17ms +step:1708/2245 train_time:104473ms step_avg:61.17ms +step:1709/2245 train_time:104536ms step_avg:61.17ms +step:1710/2245 train_time:104596ms step_avg:61.17ms +step:1711/2245 train_time:104659ms step_avg:61.17ms +step:1712/2245 train_time:104719ms step_avg:61.17ms +step:1713/2245 train_time:104782ms step_avg:61.17ms +step:1714/2245 train_time:104842ms step_avg:61.17ms +step:1715/2245 train_time:104906ms step_avg:61.17ms +step:1716/2245 train_time:104966ms step_avg:61.17ms +step:1717/2245 train_time:105029ms step_avg:61.17ms +step:1718/2245 train_time:105089ms step_avg:61.17ms +step:1719/2245 train_time:105152ms step_avg:61.17ms +step:1720/2245 train_time:105212ms step_avg:61.17ms +step:1721/2245 train_time:105275ms step_avg:61.17ms +step:1722/2245 train_time:105336ms step_avg:61.17ms +step:1723/2245 train_time:105398ms step_avg:61.17ms +step:1724/2245 train_time:105459ms step_avg:61.17ms +step:1725/2245 train_time:105523ms step_avg:61.17ms +step:1726/2245 train_time:105583ms step_avg:61.17ms +step:1727/2245 train_time:105647ms step_avg:61.17ms +step:1728/2245 train_time:105708ms step_avg:61.17ms +step:1729/2245 train_time:105771ms step_avg:61.17ms +step:1730/2245 train_time:105831ms step_avg:61.17ms +step:1731/2245 train_time:105894ms step_avg:61.17ms +step:1732/2245 train_time:105954ms step_avg:61.17ms +step:1733/2245 train_time:106017ms step_avg:61.18ms +step:1734/2245 train_time:106077ms step_avg:61.17ms +step:1735/2245 train_time:106140ms step_avg:61.18ms 
+step:1736/2245 train_time:106200ms step_avg:61.18ms +step:1737/2245 train_time:106264ms step_avg:61.18ms +step:1738/2245 train_time:106325ms step_avg:61.18ms +step:1739/2245 train_time:106387ms step_avg:61.18ms +step:1740/2245 train_time:106448ms step_avg:61.18ms +step:1741/2245 train_time:106511ms step_avg:61.18ms +step:1742/2245 train_time:106571ms step_avg:61.18ms +step:1743/2245 train_time:106634ms step_avg:61.18ms +step:1744/2245 train_time:106694ms step_avg:61.18ms +step:1745/2245 train_time:106757ms step_avg:61.18ms +step:1746/2245 train_time:106818ms step_avg:61.18ms +step:1747/2245 train_time:106880ms step_avg:61.18ms +step:1748/2245 train_time:106941ms step_avg:61.18ms +step:1749/2245 train_time:107004ms step_avg:61.18ms +step:1750/2245 train_time:107065ms step_avg:61.18ms +step:1750/2245 val_loss:3.3773 train_time:107129ms step_avg:61.22ms +step:1751/2245 train_time:107149ms step_avg:61.19ms +step:1752/2245 train_time:107192ms step_avg:61.18ms +step:1753/2245 train_time:107260ms step_avg:61.19ms +step:1754/2245 train_time:107322ms step_avg:61.19ms +step:1755/2245 train_time:107384ms step_avg:61.19ms +step:1756/2245 train_time:107444ms step_avg:61.19ms +step:1757/2245 train_time:107506ms step_avg:61.19ms +step:1758/2245 train_time:107566ms step_avg:61.19ms +step:1759/2245 train_time:107627ms step_avg:61.19ms +step:1760/2245 train_time:107687ms step_avg:61.19ms +step:1761/2245 train_time:107749ms step_avg:61.19ms +step:1762/2245 train_time:107809ms step_avg:61.19ms +step:1763/2245 train_time:107871ms step_avg:61.19ms +step:1764/2245 train_time:107931ms step_avg:61.19ms +step:1765/2245 train_time:107993ms step_avg:61.19ms +step:1766/2245 train_time:108054ms step_avg:61.19ms +step:1767/2245 train_time:108118ms step_avg:61.19ms +step:1768/2245 train_time:108181ms step_avg:61.19ms +step:1769/2245 train_time:108246ms step_avg:61.19ms +step:1770/2245 train_time:108307ms step_avg:61.19ms +step:1771/2245 train_time:108370ms step_avg:61.19ms +step:1772/2245 train_time:108430ms step_avg:61.19ms +step:1773/2245 train_time:108493ms step_avg:61.19ms +step:1774/2245 train_time:108553ms step_avg:61.19ms +step:1775/2245 train_time:108616ms step_avg:61.19ms +step:1776/2245 train_time:108676ms step_avg:61.19ms +step:1777/2245 train_time:108740ms step_avg:61.19ms +step:1778/2245 train_time:108800ms step_avg:61.19ms +step:1779/2245 train_time:108862ms step_avg:61.19ms +step:1780/2245 train_time:108922ms step_avg:61.19ms +step:1781/2245 train_time:108984ms step_avg:61.19ms +step:1782/2245 train_time:109044ms step_avg:61.19ms +step:1783/2245 train_time:109107ms step_avg:61.19ms +step:1784/2245 train_time:109169ms step_avg:61.19ms +step:1785/2245 train_time:109232ms step_avg:61.19ms +step:1786/2245 train_time:109294ms step_avg:61.19ms +step:1787/2245 train_time:109358ms step_avg:61.20ms +step:1788/2245 train_time:109418ms step_avg:61.20ms +step:1789/2245 train_time:109481ms step_avg:61.20ms +step:1790/2245 train_time:109542ms step_avg:61.20ms +step:1791/2245 train_time:109603ms step_avg:61.20ms +step:1792/2245 train_time:109664ms step_avg:61.20ms +step:1793/2245 train_time:109726ms step_avg:61.20ms +step:1794/2245 train_time:109786ms step_avg:61.20ms +step:1795/2245 train_time:109848ms step_avg:61.20ms +step:1796/2245 train_time:109907ms step_avg:61.20ms +step:1797/2245 train_time:109970ms step_avg:61.20ms +step:1798/2245 train_time:110030ms step_avg:61.20ms +step:1799/2245 train_time:110093ms step_avg:61.20ms +step:1800/2245 train_time:110154ms step_avg:61.20ms +step:1801/2245 train_time:110217ms 
step_avg:61.20ms +step:1802/2245 train_time:110279ms step_avg:61.20ms +step:1803/2245 train_time:110342ms step_avg:61.20ms +step:1804/2245 train_time:110403ms step_avg:61.20ms +step:1805/2245 train_time:110466ms step_avg:61.20ms +step:1806/2245 train_time:110526ms step_avg:61.20ms +step:1807/2245 train_time:110589ms step_avg:61.20ms +step:1808/2245 train_time:110650ms step_avg:61.20ms +step:1809/2245 train_time:110712ms step_avg:61.20ms +step:1810/2245 train_time:110773ms step_avg:61.20ms +step:1811/2245 train_time:110837ms step_avg:61.20ms +step:1812/2245 train_time:110897ms step_avg:61.20ms +step:1813/2245 train_time:110959ms step_avg:61.20ms +step:1814/2245 train_time:111019ms step_avg:61.20ms +step:1815/2245 train_time:111082ms step_avg:61.20ms +step:1816/2245 train_time:111143ms step_avg:61.20ms +step:1817/2245 train_time:111206ms step_avg:61.20ms +step:1818/2245 train_time:111266ms step_avg:61.20ms +step:1819/2245 train_time:111329ms step_avg:61.20ms +step:1820/2245 train_time:111390ms step_avg:61.20ms +step:1821/2245 train_time:111453ms step_avg:61.20ms +step:1822/2245 train_time:111514ms step_avg:61.20ms +step:1823/2245 train_time:111578ms step_avg:61.21ms +step:1824/2245 train_time:111639ms step_avg:61.21ms +step:1825/2245 train_time:111702ms step_avg:61.21ms +step:1826/2245 train_time:111762ms step_avg:61.21ms +step:1827/2245 train_time:111825ms step_avg:61.21ms +step:1828/2245 train_time:111885ms step_avg:61.21ms +step:1829/2245 train_time:111948ms step_avg:61.21ms +step:1830/2245 train_time:112008ms step_avg:61.21ms +step:1831/2245 train_time:112070ms step_avg:61.21ms +step:1832/2245 train_time:112131ms step_avg:61.21ms +step:1833/2245 train_time:112194ms step_avg:61.21ms +step:1834/2245 train_time:112254ms step_avg:61.21ms +step:1835/2245 train_time:112318ms step_avg:61.21ms +step:1836/2245 train_time:112379ms step_avg:61.21ms +step:1837/2245 train_time:112442ms step_avg:61.21ms +step:1838/2245 train_time:112503ms step_avg:61.21ms +step:1839/2245 train_time:112565ms step_avg:61.21ms +step:1840/2245 train_time:112626ms step_avg:61.21ms +step:1841/2245 train_time:112688ms step_avg:61.21ms +step:1842/2245 train_time:112749ms step_avg:61.21ms +step:1843/2245 train_time:112812ms step_avg:61.21ms +step:1844/2245 train_time:112873ms step_avg:61.21ms +step:1845/2245 train_time:112937ms step_avg:61.21ms +step:1846/2245 train_time:112997ms step_avg:61.21ms +step:1847/2245 train_time:113060ms step_avg:61.21ms +step:1848/2245 train_time:113120ms step_avg:61.21ms +step:1849/2245 train_time:113183ms step_avg:61.21ms +step:1850/2245 train_time:113244ms step_avg:61.21ms +step:1851/2245 train_time:113308ms step_avg:61.21ms +step:1852/2245 train_time:113367ms step_avg:61.21ms +step:1853/2245 train_time:113430ms step_avg:61.21ms +step:1854/2245 train_time:113490ms step_avg:61.21ms +step:1855/2245 train_time:113552ms step_avg:61.21ms +step:1856/2245 train_time:113613ms step_avg:61.21ms +step:1857/2245 train_time:113676ms step_avg:61.22ms +step:1858/2245 train_time:113737ms step_avg:61.21ms +step:1859/2245 train_time:113800ms step_avg:61.22ms +step:1860/2245 train_time:113861ms step_avg:61.22ms +step:1861/2245 train_time:113924ms step_avg:61.22ms +step:1862/2245 train_time:113984ms step_avg:61.22ms +step:1863/2245 train_time:114047ms step_avg:61.22ms +step:1864/2245 train_time:114108ms step_avg:61.22ms +step:1865/2245 train_time:114170ms step_avg:61.22ms +step:1866/2245 train_time:114230ms step_avg:61.22ms +step:1867/2245 train_time:114293ms step_avg:61.22ms +step:1868/2245 train_time:114354ms 
step_avg:61.22ms +step:1869/2245 train_time:114417ms step_avg:61.22ms +step:1870/2245 train_time:114478ms step_avg:61.22ms +step:1871/2245 train_time:114540ms step_avg:61.22ms +step:1872/2245 train_time:114601ms step_avg:61.22ms +step:1873/2245 train_time:114663ms step_avg:61.22ms +step:1874/2245 train_time:114724ms step_avg:61.22ms +step:1875/2245 train_time:114787ms step_avg:61.22ms +step:1876/2245 train_time:114848ms step_avg:61.22ms +step:1877/2245 train_time:114910ms step_avg:61.22ms +step:1878/2245 train_time:114971ms step_avg:61.22ms +step:1879/2245 train_time:115035ms step_avg:61.22ms +step:1880/2245 train_time:115096ms step_avg:61.22ms +step:1881/2245 train_time:115160ms step_avg:61.22ms +step:1882/2245 train_time:115220ms step_avg:61.22ms +step:1883/2245 train_time:115282ms step_avg:61.22ms +step:1884/2245 train_time:115342ms step_avg:61.22ms +step:1885/2245 train_time:115405ms step_avg:61.22ms +step:1886/2245 train_time:115465ms step_avg:61.22ms +step:1887/2245 train_time:115528ms step_avg:61.22ms +step:1888/2245 train_time:115587ms step_avg:61.22ms +step:1889/2245 train_time:115651ms step_avg:61.22ms +step:1890/2245 train_time:115711ms step_avg:61.22ms +step:1891/2245 train_time:115775ms step_avg:61.22ms +step:1892/2245 train_time:115836ms step_avg:61.22ms +step:1893/2245 train_time:115899ms step_avg:61.22ms +step:1894/2245 train_time:115960ms step_avg:61.22ms +step:1895/2245 train_time:116022ms step_avg:61.23ms +step:1896/2245 train_time:116082ms step_avg:61.22ms +step:1897/2245 train_time:116145ms step_avg:61.23ms +step:1898/2245 train_time:116206ms step_avg:61.23ms +step:1899/2245 train_time:116268ms step_avg:61.23ms +step:1900/2245 train_time:116328ms step_avg:61.23ms +step:1901/2245 train_time:116391ms step_avg:61.23ms +step:1902/2245 train_time:116452ms step_avg:61.23ms +step:1903/2245 train_time:116514ms step_avg:61.23ms +step:1904/2245 train_time:116575ms step_avg:61.23ms +step:1905/2245 train_time:116638ms step_avg:61.23ms +step:1906/2245 train_time:116698ms step_avg:61.23ms +step:1907/2245 train_time:116761ms step_avg:61.23ms +step:1908/2245 train_time:116822ms step_avg:61.23ms +step:1909/2245 train_time:116884ms step_avg:61.23ms +step:1910/2245 train_time:116945ms step_avg:61.23ms +step:1911/2245 train_time:117007ms step_avg:61.23ms +step:1912/2245 train_time:117068ms step_avg:61.23ms +step:1913/2245 train_time:117131ms step_avg:61.23ms +step:1914/2245 train_time:117191ms step_avg:61.23ms +step:1915/2245 train_time:117254ms step_avg:61.23ms +step:1916/2245 train_time:117315ms step_avg:61.23ms +step:1917/2245 train_time:117378ms step_avg:61.23ms +step:1918/2245 train_time:117439ms step_avg:61.23ms +step:1919/2245 train_time:117502ms step_avg:61.23ms +step:1920/2245 train_time:117562ms step_avg:61.23ms +step:1921/2245 train_time:117625ms step_avg:61.23ms +step:1922/2245 train_time:117685ms step_avg:61.23ms +step:1923/2245 train_time:117748ms step_avg:61.23ms +step:1924/2245 train_time:117808ms step_avg:61.23ms +step:1925/2245 train_time:117871ms step_avg:61.23ms +step:1926/2245 train_time:117932ms step_avg:61.23ms +step:1927/2245 train_time:117995ms step_avg:61.23ms +step:1928/2245 train_time:118057ms step_avg:61.23ms +step:1929/2245 train_time:118120ms step_avg:61.23ms +step:1930/2245 train_time:118181ms step_avg:61.23ms +step:1931/2245 train_time:118243ms step_avg:61.23ms +step:1932/2245 train_time:118303ms step_avg:61.23ms +step:1933/2245 train_time:118366ms step_avg:61.23ms +step:1934/2245 train_time:118426ms step_avg:61.23ms +step:1935/2245 train_time:118488ms 
step_avg:61.23ms +step:1936/2245 train_time:118550ms step_avg:61.23ms +step:1937/2245 train_time:118612ms step_avg:61.23ms +step:1938/2245 train_time:118672ms step_avg:61.23ms +step:1939/2245 train_time:118735ms step_avg:61.24ms +step:1940/2245 train_time:118795ms step_avg:61.23ms +step:1941/2245 train_time:118859ms step_avg:61.24ms +step:1942/2245 train_time:118918ms step_avg:61.23ms +step:1943/2245 train_time:118981ms step_avg:61.24ms +step:1944/2245 train_time:119041ms step_avg:61.24ms +step:1945/2245 train_time:119104ms step_avg:61.24ms +step:1946/2245 train_time:119164ms step_avg:61.24ms +step:1947/2245 train_time:119227ms step_avg:61.24ms +step:1948/2245 train_time:119287ms step_avg:61.24ms +step:1949/2245 train_time:119350ms step_avg:61.24ms +step:1950/2245 train_time:119411ms step_avg:61.24ms +step:1951/2245 train_time:119474ms step_avg:61.24ms +step:1952/2245 train_time:119535ms step_avg:61.24ms +step:1953/2245 train_time:119598ms step_avg:61.24ms +step:1954/2245 train_time:119658ms step_avg:61.24ms +step:1955/2245 train_time:119720ms step_avg:61.24ms +step:1956/2245 train_time:119781ms step_avg:61.24ms +step:1957/2245 train_time:119843ms step_avg:61.24ms +step:1958/2245 train_time:119903ms step_avg:61.24ms +step:1959/2245 train_time:119966ms step_avg:61.24ms +step:1960/2245 train_time:120026ms step_avg:61.24ms +step:1961/2245 train_time:120090ms step_avg:61.24ms +step:1962/2245 train_time:120150ms step_avg:61.24ms +step:1963/2245 train_time:120213ms step_avg:61.24ms +step:1964/2245 train_time:120273ms step_avg:61.24ms +step:1965/2245 train_time:120336ms step_avg:61.24ms +step:1966/2245 train_time:120397ms step_avg:61.24ms +step:1967/2245 train_time:120460ms step_avg:61.24ms +step:1968/2245 train_time:120520ms step_avg:61.24ms +step:1969/2245 train_time:120582ms step_avg:61.24ms +step:1970/2245 train_time:120643ms step_avg:61.24ms +step:1971/2245 train_time:120706ms step_avg:61.24ms +step:1972/2245 train_time:120766ms step_avg:61.24ms +step:1973/2245 train_time:120828ms step_avg:61.24ms +step:1974/2245 train_time:120889ms step_avg:61.24ms +step:1975/2245 train_time:120951ms step_avg:61.24ms +step:1976/2245 train_time:121011ms step_avg:61.24ms +step:1977/2245 train_time:121074ms step_avg:61.24ms +step:1978/2245 train_time:121135ms step_avg:61.24ms +step:1979/2245 train_time:121198ms step_avg:61.24ms +step:1980/2245 train_time:121259ms step_avg:61.24ms +step:1981/2245 train_time:121322ms step_avg:61.24ms +step:1982/2245 train_time:121382ms step_avg:61.24ms +step:1983/2245 train_time:121444ms step_avg:61.24ms +step:1984/2245 train_time:121504ms step_avg:61.24ms +step:1985/2245 train_time:121567ms step_avg:61.24ms +step:1986/2245 train_time:121628ms step_avg:61.24ms +step:1987/2245 train_time:121691ms step_avg:61.24ms +step:1988/2245 train_time:121752ms step_avg:61.24ms +step:1989/2245 train_time:121815ms step_avg:61.24ms +step:1990/2245 train_time:121875ms step_avg:61.24ms +step:1991/2245 train_time:121938ms step_avg:61.24ms +step:1992/2245 train_time:121999ms step_avg:61.24ms +step:1993/2245 train_time:122062ms step_avg:61.25ms +step:1994/2245 train_time:122122ms step_avg:61.24ms +step:1995/2245 train_time:122185ms step_avg:61.25ms +step:1996/2245 train_time:122246ms step_avg:61.25ms +step:1997/2245 train_time:122308ms step_avg:61.25ms +step:1998/2245 train_time:122369ms step_avg:61.25ms +step:1999/2245 train_time:122432ms step_avg:61.25ms +step:2000/2245 train_time:122492ms step_avg:61.25ms +step:2000/2245 val_loss:3.3233 train_time:122557ms step_avg:61.28ms +step:2001/2245 
train_time:122575ms step_avg:61.26ms +step:2002/2245 train_time:122618ms step_avg:61.25ms +step:2003/2245 train_time:122682ms step_avg:61.25ms +step:2004/2245 train_time:122743ms step_avg:61.25ms +step:2005/2245 train_time:122806ms step_avg:61.25ms +step:2006/2245 train_time:122866ms step_avg:61.25ms +step:2007/2245 train_time:122928ms step_avg:61.25ms +step:2008/2245 train_time:122988ms step_avg:61.25ms +step:2009/2245 train_time:123050ms step_avg:61.25ms +step:2010/2245 train_time:123109ms step_avg:61.25ms +step:2011/2245 train_time:123173ms step_avg:61.25ms +step:2012/2245 train_time:123232ms step_avg:61.25ms +step:2013/2245 train_time:123295ms step_avg:61.25ms +step:2014/2245 train_time:123355ms step_avg:61.25ms +step:2015/2245 train_time:123417ms step_avg:61.25ms +step:2016/2245 train_time:123478ms step_avg:61.25ms +step:2017/2245 train_time:123542ms step_avg:61.25ms +step:2018/2245 train_time:123605ms step_avg:61.25ms +step:2019/2245 train_time:123668ms step_avg:61.25ms +step:2020/2245 train_time:123730ms step_avg:61.25ms +step:2021/2245 train_time:123793ms step_avg:61.25ms +step:2022/2245 train_time:123853ms step_avg:61.25ms +step:2023/2245 train_time:123916ms step_avg:61.25ms +step:2024/2245 train_time:123976ms step_avg:61.25ms +step:2025/2245 train_time:124039ms step_avg:61.25ms +step:2026/2245 train_time:124100ms step_avg:61.25ms +step:2027/2245 train_time:124163ms step_avg:61.25ms +step:2028/2245 train_time:124224ms step_avg:61.25ms +step:2029/2245 train_time:124287ms step_avg:61.26ms +step:2030/2245 train_time:124347ms step_avg:61.25ms +step:2031/2245 train_time:124409ms step_avg:61.25ms +step:2032/2245 train_time:124469ms step_avg:61.25ms +step:2033/2245 train_time:124532ms step_avg:61.26ms +step:2034/2245 train_time:124593ms step_avg:61.26ms +step:2035/2245 train_time:124656ms step_avg:61.26ms +step:2036/2245 train_time:124718ms step_avg:61.26ms +step:2037/2245 train_time:124781ms step_avg:61.26ms +step:2038/2245 train_time:124842ms step_avg:61.26ms +step:2039/2245 train_time:124905ms step_avg:61.26ms +step:2040/2245 train_time:124965ms step_avg:61.26ms +step:2041/2245 train_time:125028ms step_avg:61.26ms +step:2042/2245 train_time:125088ms step_avg:61.26ms +step:2043/2245 train_time:125151ms step_avg:61.26ms +step:2044/2245 train_time:125211ms step_avg:61.26ms +step:2045/2245 train_time:125274ms step_avg:61.26ms +step:2046/2245 train_time:125334ms step_avg:61.26ms +step:2047/2245 train_time:125398ms step_avg:61.26ms +step:2048/2245 train_time:125459ms step_avg:61.26ms +step:2049/2245 train_time:125521ms step_avg:61.26ms +step:2050/2245 train_time:125582ms step_avg:61.26ms +step:2051/2245 train_time:125647ms step_avg:61.26ms +step:2052/2245 train_time:125707ms step_avg:61.26ms +step:2053/2245 train_time:125770ms step_avg:61.26ms +step:2054/2245 train_time:125830ms step_avg:61.26ms +step:2055/2245 train_time:125893ms step_avg:61.26ms +step:2056/2245 train_time:125954ms step_avg:61.26ms +step:2057/2245 train_time:126017ms step_avg:61.26ms +step:2058/2245 train_time:126077ms step_avg:61.26ms +step:2059/2245 train_time:126140ms step_avg:61.26ms +step:2060/2245 train_time:126201ms step_avg:61.26ms +step:2061/2245 train_time:126264ms step_avg:61.26ms +step:2062/2245 train_time:126325ms step_avg:61.26ms +step:2063/2245 train_time:126387ms step_avg:61.26ms +step:2064/2245 train_time:126448ms step_avg:61.26ms +step:2065/2245 train_time:126511ms step_avg:61.26ms +step:2066/2245 train_time:126572ms step_avg:61.26ms +step:2067/2245 train_time:126634ms step_avg:61.26ms +step:2068/2245 
train_time:126695ms step_avg:61.26ms +step:2069/2245 train_time:126759ms step_avg:61.27ms +step:2070/2245 train_time:126820ms step_avg:61.27ms +step:2071/2245 train_time:126884ms step_avg:61.27ms +step:2072/2245 train_time:126945ms step_avg:61.27ms +step:2073/2245 train_time:127007ms step_avg:61.27ms +step:2074/2245 train_time:127067ms step_avg:61.27ms +step:2075/2245 train_time:127130ms step_avg:61.27ms +step:2076/2245 train_time:127190ms step_avg:61.27ms +step:2077/2245 train_time:127253ms step_avg:61.27ms +step:2078/2245 train_time:127313ms step_avg:61.27ms +step:2079/2245 train_time:127376ms step_avg:61.27ms +step:2080/2245 train_time:127437ms step_avg:61.27ms +step:2081/2245 train_time:127499ms step_avg:61.27ms +step:2082/2245 train_time:127561ms step_avg:61.27ms +step:2083/2245 train_time:127624ms step_avg:61.27ms +step:2084/2245 train_time:127684ms step_avg:61.27ms +step:2085/2245 train_time:127748ms step_avg:61.27ms +step:2086/2245 train_time:127808ms step_avg:61.27ms +step:2087/2245 train_time:127871ms step_avg:61.27ms +step:2088/2245 train_time:127931ms step_avg:61.27ms +step:2089/2245 train_time:127995ms step_avg:61.27ms +step:2090/2245 train_time:128055ms step_avg:61.27ms +step:2091/2245 train_time:128119ms step_avg:61.27ms +step:2092/2245 train_time:128180ms step_avg:61.27ms +step:2093/2245 train_time:128244ms step_avg:61.27ms +step:2094/2245 train_time:128305ms step_avg:61.27ms +step:2095/2245 train_time:128367ms step_avg:61.27ms +step:2096/2245 train_time:128427ms step_avg:61.27ms +step:2097/2245 train_time:128490ms step_avg:61.27ms +step:2098/2245 train_time:128550ms step_avg:61.27ms +step:2099/2245 train_time:128613ms step_avg:61.27ms +step:2100/2245 train_time:128674ms step_avg:61.27ms +step:2101/2245 train_time:128737ms step_avg:61.27ms +step:2102/2245 train_time:128799ms step_avg:61.27ms +step:2103/2245 train_time:128862ms step_avg:61.28ms +step:2104/2245 train_time:128923ms step_avg:61.28ms +step:2105/2245 train_time:128986ms step_avg:61.28ms +step:2106/2245 train_time:129046ms step_avg:61.28ms +step:2107/2245 train_time:129109ms step_avg:61.28ms +step:2108/2245 train_time:129170ms step_avg:61.28ms +step:2109/2245 train_time:129233ms step_avg:61.28ms +step:2110/2245 train_time:129294ms step_avg:61.28ms +step:2111/2245 train_time:129357ms step_avg:61.28ms +step:2112/2245 train_time:129417ms step_avg:61.28ms +step:2113/2245 train_time:129480ms step_avg:61.28ms +step:2114/2245 train_time:129541ms step_avg:61.28ms +step:2115/2245 train_time:129604ms step_avg:61.28ms +step:2116/2245 train_time:129665ms step_avg:61.28ms +step:2117/2245 train_time:129727ms step_avg:61.28ms +step:2118/2245 train_time:129788ms step_avg:61.28ms +step:2119/2245 train_time:129851ms step_avg:61.28ms +step:2120/2245 train_time:129911ms step_avg:61.28ms +step:2121/2245 train_time:129974ms step_avg:61.28ms +step:2122/2245 train_time:130035ms step_avg:61.28ms +step:2123/2245 train_time:130098ms step_avg:61.28ms +step:2124/2245 train_time:130159ms step_avg:61.28ms +step:2125/2245 train_time:130222ms step_avg:61.28ms +step:2126/2245 train_time:130283ms step_avg:61.28ms +step:2127/2245 train_time:130345ms step_avg:61.28ms +step:2128/2245 train_time:130406ms step_avg:61.28ms +step:2129/2245 train_time:130469ms step_avg:61.28ms +step:2130/2245 train_time:130529ms step_avg:61.28ms +step:2131/2245 train_time:130592ms step_avg:61.28ms +step:2132/2245 train_time:130652ms step_avg:61.28ms +step:2133/2245 train_time:130715ms step_avg:61.28ms +step:2134/2245 train_time:130776ms step_avg:61.28ms +step:2135/2245 
train_time:130839ms step_avg:61.28ms +step:2136/2245 train_time:130900ms step_avg:61.28ms +step:2137/2245 train_time:130963ms step_avg:61.28ms +step:2138/2245 train_time:131024ms step_avg:61.28ms +step:2139/2245 train_time:131086ms step_avg:61.28ms +step:2140/2245 train_time:131146ms step_avg:61.28ms +step:2141/2245 train_time:131209ms step_avg:61.28ms +step:2142/2245 train_time:131269ms step_avg:61.28ms +step:2143/2245 train_time:131332ms step_avg:61.28ms +step:2144/2245 train_time:131393ms step_avg:61.28ms +step:2145/2245 train_time:131456ms step_avg:61.28ms +step:2146/2245 train_time:131516ms step_avg:61.28ms +step:2147/2245 train_time:131579ms step_avg:61.28ms +step:2148/2245 train_time:131640ms step_avg:61.28ms +step:2149/2245 train_time:131703ms step_avg:61.29ms +step:2150/2245 train_time:131765ms step_avg:61.29ms +step:2151/2245 train_time:131827ms step_avg:61.29ms +step:2152/2245 train_time:131887ms step_avg:61.29ms +step:2153/2245 train_time:131951ms step_avg:61.29ms +step:2154/2245 train_time:132011ms step_avg:61.29ms +step:2155/2245 train_time:132075ms step_avg:61.29ms +step:2156/2245 train_time:132135ms step_avg:61.29ms +step:2157/2245 train_time:132199ms step_avg:61.29ms +step:2158/2245 train_time:132260ms step_avg:61.29ms +step:2159/2245 train_time:132323ms step_avg:61.29ms +step:2160/2245 train_time:132384ms step_avg:61.29ms +step:2161/2245 train_time:132447ms step_avg:61.29ms +step:2162/2245 train_time:132507ms step_avg:61.29ms +step:2163/2245 train_time:132571ms step_avg:61.29ms +step:2164/2245 train_time:132631ms step_avg:61.29ms +step:2165/2245 train_time:132694ms step_avg:61.29ms +step:2166/2245 train_time:132755ms step_avg:61.29ms +step:2167/2245 train_time:132818ms step_avg:61.29ms +step:2168/2245 train_time:132878ms step_avg:61.29ms +step:2169/2245 train_time:132942ms step_avg:61.29ms +step:2170/2245 train_time:133003ms step_avg:61.29ms +step:2171/2245 train_time:133067ms step_avg:61.29ms +step:2172/2245 train_time:133127ms step_avg:61.29ms +step:2173/2245 train_time:133189ms step_avg:61.29ms +step:2174/2245 train_time:133249ms step_avg:61.29ms +step:2175/2245 train_time:133312ms step_avg:61.29ms +step:2176/2245 train_time:133373ms step_avg:61.29ms +step:2177/2245 train_time:133436ms step_avg:61.29ms +step:2178/2245 train_time:133496ms step_avg:61.29ms +step:2179/2245 train_time:133559ms step_avg:61.29ms +step:2180/2245 train_time:133621ms step_avg:61.29ms +step:2181/2245 train_time:133684ms step_avg:61.29ms +step:2182/2245 train_time:133745ms step_avg:61.29ms +step:2183/2245 train_time:133808ms step_avg:61.30ms +step:2184/2245 train_time:133868ms step_avg:61.29ms +step:2185/2245 train_time:133931ms step_avg:61.30ms +step:2186/2245 train_time:133991ms step_avg:61.30ms +step:2187/2245 train_time:134054ms step_avg:61.30ms +step:2188/2245 train_time:134114ms step_avg:61.30ms +step:2189/2245 train_time:134177ms step_avg:61.30ms +step:2190/2245 train_time:134237ms step_avg:61.30ms +step:2191/2245 train_time:134300ms step_avg:61.30ms +step:2192/2245 train_time:134362ms step_avg:61.30ms +step:2193/2245 train_time:134425ms step_avg:61.30ms +step:2194/2245 train_time:134485ms step_avg:61.30ms +step:2195/2245 train_time:134547ms step_avg:61.30ms +step:2196/2245 train_time:134607ms step_avg:61.30ms +step:2197/2245 train_time:134669ms step_avg:61.30ms +step:2198/2245 train_time:134730ms step_avg:61.30ms +step:2199/2245 train_time:134793ms step_avg:61.30ms +step:2200/2245 train_time:134853ms step_avg:61.30ms +step:2201/2245 train_time:134917ms step_avg:61.30ms +step:2202/2245 
train_time:134978ms step_avg:61.30ms +step:2203/2245 train_time:135042ms step_avg:61.30ms +step:2204/2245 train_time:135103ms step_avg:61.30ms +step:2205/2245 train_time:135167ms step_avg:61.30ms +step:2206/2245 train_time:135228ms step_avg:61.30ms +step:2207/2245 train_time:135291ms step_avg:61.30ms +step:2208/2245 train_time:135351ms step_avg:61.30ms +step:2209/2245 train_time:135415ms step_avg:61.30ms +step:2210/2245 train_time:135475ms step_avg:61.30ms +step:2211/2245 train_time:135538ms step_avg:61.30ms +step:2212/2245 train_time:135599ms step_avg:61.30ms +step:2213/2245 train_time:135663ms step_avg:61.30ms +step:2214/2245 train_time:135723ms step_avg:61.30ms +step:2215/2245 train_time:135787ms step_avg:61.30ms +step:2216/2245 train_time:135847ms step_avg:61.30ms +step:2217/2245 train_time:135910ms step_avg:61.30ms +step:2218/2245 train_time:135971ms step_avg:61.30ms +step:2219/2245 train_time:136033ms step_avg:61.30ms +step:2220/2245 train_time:136094ms step_avg:61.30ms +step:2221/2245 train_time:136158ms step_avg:61.30ms +step:2222/2245 train_time:136219ms step_avg:61.30ms +step:2223/2245 train_time:136282ms step_avg:61.31ms +step:2224/2245 train_time:136343ms step_avg:61.31ms +step:2225/2245 train_time:136406ms step_avg:61.31ms +step:2226/2245 train_time:136467ms step_avg:61.31ms +step:2227/2245 train_time:136530ms step_avg:61.31ms +step:2228/2245 train_time:136590ms step_avg:61.31ms +step:2229/2245 train_time:136653ms step_avg:61.31ms +step:2230/2245 train_time:136714ms step_avg:61.31ms +step:2231/2245 train_time:136778ms step_avg:61.31ms +step:2232/2245 train_time:136838ms step_avg:61.31ms +step:2233/2245 train_time:136902ms step_avg:61.31ms +step:2234/2245 train_time:136962ms step_avg:61.31ms +step:2235/2245 train_time:137026ms step_avg:61.31ms +step:2236/2245 train_time:137086ms step_avg:61.31ms +step:2237/2245 train_time:137149ms step_avg:61.31ms +step:2238/2245 train_time:137209ms step_avg:61.31ms +step:2239/2245 train_time:137273ms step_avg:61.31ms +step:2240/2245 train_time:137333ms step_avg:61.31ms +step:2241/2245 train_time:137396ms step_avg:61.31ms +step:2242/2245 train_time:137456ms step_avg:61.31ms +step:2243/2245 train_time:137520ms step_avg:61.31ms +step:2244/2245 train_time:137581ms step_avg:61.31ms +step:2245/2245 train_time:137644ms step_avg:61.31ms +step:2245/2245 val_loss:3.2783 train_time:137705ms step_avg:61.34ms +peak memory allocated: 29249 MiB reserved: 50528 MiB
diff --git a/train_gpt.py b/train_gpt.py
index 7030c04f1..bc23355b2 100644
--- a/train_gpt.py
+++ b/train_gpt.py
@@ -378,7 +378,6 @@ def polar_express(G: torch.Tensor):
     """
     Polar Express Sign Method: https://arxiv.org/pdf/2505.16932
     by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
-    Code adapted from https://github.com/NoahAmsel/PolarExpress/tree/main by @varunneal.
     """
     X = G.bfloat16()
     if G.size(-2) > G.size(-1):
@@ -409,7 +408,7 @@ def polar_express(G: torch.Tensor):
 # -----------------------------------------------------------------------------
 # Muon optimizer

-class Muon(torch.optim.Optimizer):
+class NorMuon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz

@@ -419,15 +418,16 @@ class NorMuon(torch.optim.Optimizer):
     processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
     matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
     the advantage that it can be stably run in bfloat16 on the GPU.
-    Note: A later PR replaced Newton-Shulz with Polar Express for the orthogonalization step

     Warning: This optimizer should not be used for the embedding layer, the final fully connected
     layer, or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).

-    Though empirically small 1D params perform efficiently here:
-      - NS approximately performs a magnitude normalization of the grad
-      - This hyper-optimized class has faster execution time than the current impl of Adam for small params
-    Custom distributed sizing:
+    Differences from standard Muon:
+      - Newton-Schulz is replaced with Polar Express for the orthogonalization step
+      - NorMuon adds a low-rank variance estimator similar to Adafactor.
+      - small 1D parameters handled here instead of in Adam
+      - Cautious weight decay, a gated version of decoupled weight decay
+      - Custom distributed sizing:
     The model stores all attn and mlp weights in the same shape, and then updates the view as
     needed on the forward pass. This enables attn and mlp weights to be contained within the same
     dist.reduce_scatter_tensor() call. The model architecture has been customized to enable
@@ -446,7 +446,7 @@ class Muon(torch.optim.Optimizer):
     9. wait for each all gather to complete and update params
     Empirically, leading with small params provides an additional 0.2s improvement.
     """
-    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, eps=1e-8, beta2=0.95, custom_sizing=True):
+    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, beta2=0.95, custom_sizing=True):
         defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, beta2=beta2)
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
         # custom sizing requires 8 GPUs
@@ -483,7 +483,7 @@ def generate_custom_param_groups(self, params):
         Implementation requires that a single GPU does not receive both attn and mlp params
         when a param group is split across GPUs.
         """
-        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp_up', 'mlp_down']
+        module_group_order = ['smear_gate', 'attn_gate', 'attn', 'mlp']
         params_list = list(params)
         params_list.sort(key=lambda x: module_group_order.index(x.label))
@@ -584,14 +584,11 @@ def step(self):
             # Determine LR and WR
             eff_lr = group["lr"] * group["param_lr"]
-            eff_wd = group["weight_decay"] * group["param_wd"]
+            eff_wd = group["lr"] * group["weight_decay"] * group["param_wd"]

             # Compute zeropower for the entire chunk in a single, batched call.
             if num_params == 0:
                 v_chunk = updated_grads
-            elif params[module_idx].label == "smear_gate":
-                # dividing by magnitude is equivalent of SVN for 1d tensors
-                v_chunk = updated_grads / (updated_grads.norm(dim=(-2, -1), keepdim=True).clamp_min(1e-10))
             else:
                 v_chunk = polar_express(updated_grads)

@@ -608,10 +605,12 @@
             updated_params = torch.empty_like(grad_chunk)
             param_chunk = torch.stack(params[module_idx:module_idx + num_params]) if num_params > 0 else torch.zeros_like(v_chunk)

-            # Apply weight decay directly to the buffer.
-            param_chunk.mul_(1 - eff_wd)
-            param_chunk.add_(-eff_lr * v_chunk)
+            # "Cautious" weight decay (https://arxiv.org/abs/2510.12402)
+            mask = (v_chunk * param_chunk) >= 0
+            v_chunk.addcmul_(param_chunk, (eff_wd * mask).to(ref_param.dtype))
+
+            param_chunk.addcmul_(v_chunk, -eff_lr)
             updated_params[:num_params].copy_(param_chunk)

             if num_params < chunk_size:
@@ -868,8 +867,8 @@ def __init__(self, dim: int):
         self.c_fc = nn.Parameter(torch.empty(dim, hdim))
         self.c_proj = nn.Parameter(torch.empty(dim, hdim))
         # label modules to enable custom optimizer sizing
-        self.c_fc.label = 'mlp_up'
-        self.c_proj.label = 'mlp_down'
+        self.c_fc.label = 'mlp'
+        self.c_proj.label = 'mlp'

         # corrective factor to account for transpose
         self.c_fc.lr_mul = 2.
@@ -1209,16 +1208,18 @@ class Hyperparameters:
     train_max_seq_len: int = 128 * 16
     val_batch_size: int = 4 * 64 * 1024 * 8
     # optimization
-    num_iterations: int = 2285
-    lr_schedule = (0.5, 0.98) # breakpoints for 3-part schedule: (flat, linear decay, flat)
-    lr_min = 0.1
+    num_scheduled_iterations: int = 2205 # number of steps to complete lr and ws schedule
+    num_extension_iterations: int = 40 # number of steps to continue training at final lr and ws
+    num_iterations: int = num_scheduled_iterations + num_extension_iterations
+    cooldown_frac: float = 0.50 # fraction of num_scheduled_iterations spent cooling down the learning rate
     # evaluation and logging
     run_id: str = f"{uuid.uuid4()}"
     val_loss_every: int = 250 # every how many steps to evaluate val loss? 0 for only at the end
     save_checkpoint: bool = False
     # attention masking
     block_size: int = 128
-    ws_schedule: tuple = (3, 5, 7, 9, 11, 13)
+    ws_schedule: tuple = (3, 7, 11)
+    ws_final: int = 13 # increase final validation ws, used for YaRN extension and short window size @classiclarryd
     ws_validate_post_yarn_ext: int = 20 # extend long windows out even further after applying YaRN

 args = Hyperparameters()
@@ -1298,30 +1299,29 @@ def nvidia_smi():
     eps=1e-8,
     weight_decay=0.0,
 )
-optimizer2 = Muon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=0.0)
+optimizer2 = NorMuon(hidden_matrix_params + gate_params, lr=0.03, momentum=0.95, beta2=0.95, weight_decay=1.2)
 optimizers = [optimizer1, optimizer2]
 for opt in optimizers:
     for group in opt.param_groups:
         group["initial_lr"] = group["lr"]

+# learning rate schedule: flat, then linear decay, then flat
 def get_lr(step: int):
-    assert step < args.num_iterations
-    # Three part schedule: flat, linear decrease, flat
-    lr_schedule = args.lr_schedule
-    x = step / args.num_iterations
-
-    if x < lr_schedule[0]:
-        return 1.0
-    elif x < lr_schedule[1]:
-        progress = (x - lr_schedule[0]) / (lr_schedule[1] - lr_schedule[0])
-        lr = 1.0 - (1.0 - args.lr_min) * progress
-    else:
-        lr = args.lr_min
+    x = min(0.9999, step / args.num_scheduled_iterations)
+    assert 0 <= x < 1
+    lr = 1.0
+    if x >= 1 - args.cooldown_frac:
+        w = (1 - x) / args.cooldown_frac
+        lr = w * 1.0 + (1 - w) * 0.1
     return lr

 def get_ws(step: int):
-    assert step <= args.num_iterations
-    x = step / (args.num_iterations + 1)
+    # set short window size to half of long window size
+    # Higher ws on "extension" steps
+    if step >= args.num_scheduled_iterations:
+        return args.ws_final // 2, args.ws_final
+    x = step / args.num_scheduled_iterations
+    assert 0 <= x < 1
     ws_idx = int(len(args.ws_schedule) * x)
     return args.ws_schedule[ws_idx] // 2, args.ws_schedule[ws_idx]
@@ -1371,25 +1371,26 @@ def step_optimizers(step: int, optimizers, model):
 initial_state = dict(model=copy.deepcopy(model.state_dict()),
                      optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
 train_loader = distributed_data_generator(args.train_files, args.train_batch_size, args.train_max_seq_len, grad_accum_steps=grad_accum_steps)
+ws_schedule = list(args.ws_schedule) + [args.ws_final]
+ws_long = ws_schedule[0]
 for step in range(warmup_steps):
     inputs, targets, cum_seqlens = next(train_loader)
     # each window size is a new graph, need to warm up each with Yarn.attn_scale
-    ws_idx = step % len(args.ws_schedule)
+    ws_idx = step % len(ws_schedule)
     if ws_idx==0:
         model.yarn.reset()
-        ws_long = args.ws_schedule[0]
+        ws_long = ws_schedule[0]
     else:
-        new_ws_long = args.ws_schedule[ws_idx]
-        if new_ws_long > ws_long:
-            model.yarn.apply(ws_long, new_ws_long)
-            ws_long = new_ws_long
+        new_ws_long = ws_schedule[ws_idx]
+        model.yarn.apply(ws_long, new_ws_long)
+        ws_long = new_ws_long
     model(inputs, targets, cum_seqlens, ws_long//2, ws_long).backward()
     for opt in optimizers:
         opt.step()
     model.zero_grad(set_to_none=True)
 model.yarn.reset() # rotary buffer is not stored in state_dict
 model.load_state_dict(initial_state["model"])
-optimizer2.reset() # momentum buffer not in state dict
+optimizer2.reset() # muon momentum buffers not in state dict
 for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
     opt.load_state_dict(opt_state)
 del train_loader, initial_state
@@ -1447,11 +1448,9 @@ def step_optimizers(step: int, optimizers, model):
         break

     # --------------- TRAINING SECTION -----------------
-    loss = 0
     for _ in range(grad_accum_steps):
         inputs, targets, cum_seqlens = next(train_loader)
-        loss += model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps
-    loss.backward()
+        (model(inputs, targets, cum_seqlens, ws_short, ws_long) / grad_accum_steps).backward()
     step_optimizers(step, optimizers, model)

     # logging
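
Reviewer note: the NorMuon docstring above cites a low-rank variance estimator "similar to Adafactor", but that code sits outside the hunks shown in this patch. For readers unfamiliar with the idea, the sketch below shows a generic Adafactor-style factored second moment; it is illustrative only, not the implementation in this repo, and all names are assumed.

import torch

def factored_second_moment(row_ema: torch.Tensor, col_ema: torch.Tensor,
                           update: torch.Tensor, beta2: float = 0.95,
                           eps: float = 1e-8) -> torch.Tensor:
    # Keep O(m + n) statistics for an (m, n) matrix instead of a full
    # O(m * n) second moment: per-row and per-column EMAs of update**2.
    row_ema.mul_(beta2).add_(update.square().mean(dim=1), alpha=1 - beta2)  # shape (m,)
    col_ema.mul_(beta2).add_(update.square().mean(dim=0), alpha=1 - beta2)  # shape (n,)
    # Rank-1 reconstruction of the variance, normalized as in Adafactor;
    # the orthogonalized update would then be divided by v.sqrt().
    v = torch.outer(row_ema, col_ema) / row_ema.mean().clamp_min(eps)
    return v.clamp_min(eps)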
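
The cautious weight decay hunk in NorMuon.step() reduces to the single-tensor update below. This is a minimal sketch with illustrative names, not the repo's batched chunk implementation; note that in the patch the effective decay is also scaled by the learning rate (eff_wd = lr * weight_decay * param_wd), so the decay anneals together with the LR schedule.

import torch

def cautious_decay_step(param: torch.Tensor, v: torch.Tensor, lr: float, wd: float) -> None:
    # Gated ("cautious") decoupled weight decay: decay is applied only where
    # the orthogonalized update v and the weight agree in sign, so the decay
    # term never pushes against the direction of the update.
    mask = ((v * param) >= 0).to(param.dtype)
    v = v + wd * mask * param   # fold the gated decay into the update
    param.add_(v, alpha=-lr)    # then take the usual descent step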
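
With the Hyperparameters constants inlined, the rewritten get_lr is equivalent to the standalone function below (a sketch for illustration; the function and argument names are assumptions, not names from the repo).

def lr_multiplier(step: int, scheduled: int = 2205, cooldown_frac: float = 0.5) -> float:
    # Flat at 1.0 for the first half of the scheduled steps, then a linear
    # decay toward a floor of 0.1. The min(0.9999, ...) clamp makes the 40
    # extension steps past the schedule keep training at roughly 0.1.
    x = min(0.9999, step / scheduled)
    if x < 1 - cooldown_frac:
        return 1.0
    w = (1 - x) / cooldown_frac   # goes 1 -> 0 across the cooldown window
    return w * 1.0 + (1 - w) * 0.1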
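
Likewise, get_ws now walks the shortened window-size schedule in equal-length phases and jumps to ws_final on the extension steps; sketched below with the patch's constants (again, names are assumed for illustration).

def window_sizes(step: int, scheduled: int = 2205,
                 schedule: tuple = (3, 7, 11), final: int = 13) -> tuple[int, int]:
    # Extension steps past the scheduled horizon use the larger final window.
    if step >= scheduled:
        return final // 2, final
    idx = int(len(schedule) * step / scheduled)   # three equal-length phases
    ws_long = schedule[idx]
    return ws_long // 2, ws_long                  # short window is half the long one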
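
Finally, the training-section hunk swaps the summed-loss pattern for one backward() per microbatch. Both variants accumulate identical gradients, since backward() adds into .grad, but the per-microbatch form releases each microbatch's autograd graph immediately instead of holding all of them alive until a single final backward. A toy equivalence check, under an assumed scalar-loss setup:

import torch

w = torch.randn(8, requires_grad=True)
microbatches = [torch.randn(8) for _ in range(4)]

for x in microbatches:                     # backward once per microbatch
    ((w * x).sum() / len(microbatches)).backward()
g_per_microbatch = w.grad.clone()

w.grad = None
loss = sum((w * x).sum() / len(microbatches) for x in microbatches)
loss.backward()                            # single backward over the summed loss
assert torch.allclose(g_per_microbatch, w.grad)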

zu$zG_#vL!k*Lhl)V;Z|4%9y6|=m>|KBRzSq_pYbsKxmws8j;$-%qU+7)JKOr zV4V$mOe#h4juIOdpQEX!Eq%VD?sz>~cnz|Lbl#+Vs3hYFTGwy=BK>Q4OWBlx5#;;Tb!5Io)e=9tuB z#fTiKi?COnK}l#8zq^l~y|JA& zP1V`$Dj9us^4bYZRMu=Mw;2OX?g7r@bOXllXcD<)m)(b|_1EuD)2X6otxFPgmD#YHEj;X;eaTWtiSDuwF12TV^0T@ttK!=RoRsS2P^ya_# zUDQdc6w810UuyhkhR1_AUe9%=YhXaDo_l}$6J z|MVt*g$$EIh8bskk!!z_qg^1-nub@FGcfd&3+SKS|7Q?m!(a3)G=%GV{1V)G0{W|O zc?NRl6dpCRW!c{!J_gjcT#2;B zO5S2`?t>o7=C}ju#cRjIpR~p?EZ)GXwjV5K8Wg6I^^;@o;C^%8HyRcV zVG4{(b{qOygOL$2RLq<_Q%H+>&|SkMr)%|aw_?-dD`TZ^4oh<f(<~yE+E6=neBn_$z!c+2g$bxacLi}(LZlT>? zp9`A*O|}8Wa!OyX%=0)C!1X}^+hxt#S6|I_gB`b$=yr7NTgq(Y8PA8>2c!Ac{r9gC z$8`o4Fl5}}i@gn}Zr%n&0^OCYrBZDXt1;%l5|oT)9CQdi<_^^(M`K{h#8N}A67%Zu zl)!?GN^IH8i4;3p2arRLP~Ibr8+miz^34a`?zgPP-1ghDzJT_})kVZ?M3|$j{i-Wr zNl4Ng-d)!#?6%Bm8Qnssw3=6A^A>AlWhuSpH%bb3y-9PD`C95$6^Shyre50hxEa`6 z->2;h7u*=Y^6E*T=#tMlH5lvnN#Y=${)F3)MFET7<5Bn9JniN928G)}ltp3AuE-$U zRnr^ksNiyO)4}n^6bng@ip>DI;a%Z|jW+%UV57ZJi~ z(Tpf=%ZhFel}5iX5jX?43{n{GL)#`tPZv(_dkLL#LIfBxlue+4+!kaG`(73YGMGwk zV|K!*k^ENpE)F~Gy6zYlity9)Tmq3F_~x^E%Xc2g%eN6)``K-}SIRhLaizE~f7~L) z0Qb!E_2RqQy*$W?m`dT%#q2Ko`*|;)^c{;iyH+_`qAXpfFRSfxyQs_bcr~Tm3;AmDdFEgEZ(sg?Ho}nMBN8<;$oZH=)tN=*A$>#Q#e)TN z`8_Roi-BUwKqk!1&fyIY_j+?Z$kliK0gl&zOIfJqL^gJ3&m^L?8h|g6XeWW9EIGG( z%!`MbpHgs0*3}B=gUA5in9E}xN!m<00+F+02@lwDF6p1|4!DQ30hh#`vuwMiOmV`I z+kvatFU#s?{=&1d%2) zvmTJv8omxQWq&oOk?vq>(A!!Q9QVo)5_P~{Cqn?R;$ly1d8JXo%2zQ_LBvS7g-X^rK*gpz+Y>(t^+{ICeEx<0+ z^?Ks3vKOB@hW|H6rS3Romu6M_zPmvR636t4B5#T7`7sgDcyy!Wan&MMR{9bk zOc;&2P>J)t$v2{WKKEJnqN^YkR81AfS}hLmi@h;YROy|7?4BI&En=GokH}9Hj20Qp zEQS0sVLA&L{z7h5s`dW8hl1tp4?yXv*$K@?!yDU)27k-3P+#R4$B_^A@O@RZF8-Cg ziiQJM+kr&VTjB)nWQ|uBdoMVl9?IoV<0?16mYg1ivDpuo@W~@h*yfjkTZuaC1RqLc zoYWv8h*rONN?Puld2Y%gBMtOe9*G?0`3l=FQKJlBimP5aCqC`Of!+}C0DVWYS6!ij-7^41iPJmQ z4AKxE`InGv{c;8{J~2&VCo;`b;#h!Wp!5h;u>&SJ0hA2Yg*l%zrt4@GKIXr=| zNg5le{JFjQv9eYR(|7qoHR+0Bk<8v~A#7Ktce(?gHw3n(n3r4`UnxWT9gMHQCUwC4! z6|uH=_hD7FOY7TC==%CFWkz=$>d`E3b!j?sIxnvPmfL?vWbfIYTil-tId%i$q9`V# z=cW5xWX4n>INUN=?O_E{Rpf);0>Vky*#)UhajlgaFTxBAKAQXbv?m*Vdl#oTlMz9w zMP$7;Gn8nJmmN{8>(u#(JX?(Ij#2Tu@M`nr8jv=$PFGbrFHji{!nu3#3SYFf6M&K@_z^~aN3lbFpq3~qs+9HAnF z+Q2%4UJZH@W8b|(TA+WX{PC--f_^Mm%7%5LR_hd_f$!l4Ts|S|{f#4pnx|d2tw-28 zHxs=|&PAv~&oFEGCgvZTPHXI$0)3G}i6ePJ2J!w+_S0>Da(q$|th$cPI)W$Xm^1Wf zR@L#w8w+_O_LdFJJ3ZnAO|MmRTYs3m*AI!B)&l_&nqpAV2+n@5!!$9v{|lja1K?&%a)i`s0$qfWZ3jD8$E@9E}SBp5>+L9^RR=X3^=wLPO?h6V7ew>p$L{ z<-n|nSc99OM3q!Kx|~@G!*V_Z#u8JQr_>E?bMhoS#?97eNHCAEHnS5?cV#gQM`oYy zaGQHEUm3rk9wydt<;MC-^Y8_kconHc%2*OU?Sb6FPmagxx)SxKUWNTbGwXypt{MYh zXkYoBMtf8(dbh3e7U1@#HsH`2%z;cxh&~b1dL%Os_h}*AcC;S4XMO<{ANB#>&u~zY z?^_syY=YtG=%zw{vCU`i@>FmvW4o0i%F*sJqmBV`Dq@tulf$O%3q23>*9?RAY>0~c zxk*I!lYjv$3*SLYzH% zFh|oQTU37=e7!N%b%2Gx0l=?)ljQlBVkLhSvCo!GiJ1j2%=-^Tc5Wsroc*O(Yq12? 
zwdWK@gIz1aM-}rgy}AG9&2Q&VZrs~@0Ftm29Kkar2=h*vF;SNGf*PU7o6PSK&~}MP zm^M>H18P$)TGWVEl`bjR>_~y#sQEm!S1+aS!psDU;y&2xAYCbwg|nbYa-PF9oqhjk z@{!GDy9X($Ka}%wTC7{_%vn+^TbPHDZY+vqhYbv;ptyMLJ?@W+X&5}&CHuK7`;n5w ztvN4_3h_-_QR!(aTR9a4QBq=(+se9ljA= z>?jr|W1hN3f5EtCMKc&FBNuioZWBW79cU;VhRGH<7ym^E%*dIq1ptx3O`gd;=E$>bHez--l(Q}>x#lqVmauzAqAkfR!)gWvi{mtQ4DesPffvVUSW~rZdz8ym?d@GEs z8|kzU8U2wRaQOC3-SYfCPq2$9%S;rSouZpa-G{Fw1f0@2H;si@TS0M&57!Qo2_dgz zos@E4ZmGcY4zh`)=MRu2B`zT{<=B;pvQ>R9#g%B8jttIg(NUpuPuerML*qsE8eUzn zHT81I2$$2ne93~I{G*fe+sF66@A15<@ZeVoIXUUT)y?gDxy-(I=7mdK$??`TEehx# zIv5v=F}^H5M&_n{1SiL0U$H%gxfhLgcgo!GkTA+0{t0vqF@Chu?~;?_n)Xf#l&DU; zNHeoF^JW*{ykQ`kb>(D47;-wm^W_1(bPupGg(%tac8ICF~HN=00qN153>_1FpG zvYmiPe^aj30V*nd+H;B;if}WTM}}<9#4Cl17*oey+Asa+U#Ch-zIMpB0h^Mjz!?0w z$T5$u)`uhD>e2LAK|)l&1fA<~NBTHv8`6)>4#eNVQ0r@U?FIR?+c}5!A&}fYoH$BB z)`{1ADcH)A2w4^;3E*b;H@1LtyeDZVPTaHVv)R&vDSz9?jJvO9j4|>_?gN$OhulJ| zH!>2J4{-XIPD^k3A$>M8T3cx9^21N+tmEyJspPK$i63n@9HJif6i3r4w^~hjh#tbH zNp3Pbr+Gdr-gvOlQZWtJ263!d9Lb~DZ|*8P+-cvF+Ex3gKXv{c^Qg@w9(>RnamS5G zcqd_Fi=wjuIw0Uhy1=sLufOK3bAX!FAQfk8+gCz`T=mQ*WbDjcPU;@dOcQ;eoO9|HhpJtgi7!U)`k=OI_`wz( zGu9NN5(G3_?8tJyxa`~>Nt(%$s5jwa===d@aImfyJYYs`L@{LHxiU$fBRTKgBWhcy z33eV%pfbCl&6cUpsCi{C&6Ms5XX{JfcfZoq;VRI^gQ z2w(n@=nu~9=h3*D$MAP>UrE=k(M+AZ1K_nlq;?HUo27B-!Hz>PO~6jM0&Zcjr#fa& z!-?Kg%-?=jZg((a#4aP;vP{A2aD5N{vnEs+b&eIQJSI;fr;)zI5(rB5>RIe2wN(;5 z;3(2pA-@izSUY%+JAfvw;PYD(8#Tv?ErWsxw=Z+|Y*?rx<>f7Tz=}CjaKdNxzTZ*S z>=5f5(*)P7CdWM_LhJVND7<^z(@)%bG{FZQl)96$MgDm%wrThNhP{mYge0w2X27GP zUqvzgL%_r&XW~^9OtIf43b*KK4mz!uJ=$2CuVqTQ-)|%4K|#U?!<9qr3Ks|n!>kNbJqQuG_jV6R1a? zpi=;rK09Re)R|N`=A{)!T=!r=OVGz-zZImwDxMgYpy^_xVWyX&kx$?Yv8(mmq6)nv$&7mJyHvNcYT{jPkg~c-@T+>lKl+m%Gz%aL4QbDe_QR!f)&> z_@18G4+}Gv26bZ#>*UOys%m8OTr$pN#;EzkDNOhS_S8(oNxyO-VwrdX=%URtGrZaf z@;rOXSHH_FIgf%m+5p{&!Ks$9&EL)IUi!BKOZF}wP&=%TYtH%N;Y7K^}#x%U(Y07aDmwTPX(oMltE4*0Z z?MHD&G5u5cIgv;6+GUsFeIyh_eh+B=TGddTCM{zkX?y)b-pzu_XHz`iGs$qVIbE5pKYrBk`&jM=Sx#uM_*^V%hgitHb8-x!*~_W0Ux zM{%`=AEE=cgR*Ri>rO{F)0c5Ok?3z*i2Od$D~c>D7T}UR55@Ho_P*tv5kFPZ!LM+u z`07oSos{s_t-nW{{V`Fxe2JY0+a>O!IUZM&up5YD+%J1Ef?{oRK7_8usCUL_iQ0U!Gf z*Q2n<;a000is|+k&)(*Alb?0DF{MT@6k z_UUXQi?f1Ge7-s?W@LAY1=LU8|@DY_wrh8&8@ zdno_odz`?)=-ZyvDcGCzz?WGSHGdU$$53Aq&}z`l%?@79L?w2O1C$=$7)24RJJ(}B z9Hy8S$b8ZPB>Z?rFfZUXxfvH`19&m+Awr8?>I)dZrImNqq5=-WP^Ht{3+g;yzriJC zV1Fnq9$y&le&4EMKH9ThssHK_)5cYh5_r6-=GB+CTbH)yEyC@7JdlXDU+`zNj~SAD#Hr@$n805gTEQkkj6|LVdpNN7J6-dj{^S6|X=6njrUXQ*WJuI` zj(@(+aa`GtMpcWDaT}mVvELsQiK>4i(ZcqZgBjzttSc#~kSHrJb8Wcs1ME8b0Iw-< zbV+?a4Z(A(^J^hRIbT2boZGN-AL@$E?|)| zNpE?5hlhQ#XXw17V_Ocs`wO7SKL3RL5VXecyS(qIR%$;y*Bal%z{e7^l;`)7kTUmS>>_*r%|0^NjKP%|73PZcl?NN#uKe$W*0eI(E(^3j_yfxMFK_^Q{?Bj# zx)vO3qjvb?U7!IoJ@0Dn0$Kv<3NY8@6zpLltB zdY#bw!)nZM?&vj69)u|V%t%#caU>fun7Z*k$w24D%hvx$QSc95+cW|3Lt{dKQg)L> z*Cgr$vahi@J&3g^(483W`BndKfA^Ean52GT@2__l3{!Z=Cyv? 
zZVd!Ft$QP7KO(zy38MAZdc?8z66j%|-7Hh{L+~hv?-g2@({xV5$|Mi0_78NWSmD(UE{8V&eym`)+#F(YPC@QU3zsL=$y6=veSUEU@DHITCT)-l0m4} zFLp@p{rVMpqC|R`6xCXsuZ)>`M!r*CFJD(Jcu`<_u5H7cyER>3QF^X5*YL7i{4f0l z=YRI^fAmHF>G1sh(1G^d{}1Ym8kyFY&~z-V1mcdj$XMw;#g>`Q>{@o5_jaTc`+VzL z;m+M_E5Z1cOqFW(P-+qj51j@u-DeU2NhBCSw4i&X+tdTNi_jN1VRbb1qTQ`xs3w%K zn0Nq(wIfafafXCLj%>jPnT@_jz2k%MOq3eH;;(t9t=OJ=3A-f<5wjuet1y(3X4oOnSzm?dv>bHEEgc$X{kKJy*V|`6zE=2M>04^KgMlz3!N@f z=tQqYesC2b%McfGXne>HEbagZj+7kSnRR81L{s$!vaFbPj6*m= zX$c-d%zh(ulZZa;1f>zyVqAl2%Em#P4B*V!&CH`o4Wc8_)Efy& zj!gOKJrPmvA%m!nxgMzv%!C1JuXslboA^EsJ>wEuuz`w!>9%Z;Ha>6vF-eB}a{-hj z>sl$rWXfP5>LCXJr)QhhvZFg$>t#U_s%^ijJU&x1p3-OX;%>G0O# z$Apfd28lH3q(x!YclO09%J1GXBY?=PnmPd8A&F^)&FAL>HPYb4;r8-bVkF6-@ya4} z7e;Fx{Slm^EiK*(a#On5Vk10!6b^x2Rma0c-M}J?B(d(t_FL48n}^9ef(A-VA|NK? z1H_Ie{&_L`UoeWcq5n!;)cseEQ5VEw{(krV zFFY5&Np4x^nT1xVFz9?`NR*>m1BNVNg8^-b^n7gnFSDWk_@0)A;E4aG|7&X5gAr8p zEcknHYF2VX(^FQ3-ogt=I@0+cG}-A!jrz8 z2^d>l_Vo4$NC2upk*naggUxj6 zP?_5QrxMeD9t-RrpFx`;a)A<&P6*hzQ}9*LBqk~gntEq2(kw%LA|^vGKzK|(mJaV# z;Y;DIQFi@KT3fn|W*){#=WvSl%)b^ddfbnNnh5?*u9gp*KdoSW*dV2-6kI*^gn8Ee zc`a!?FlbU@icSYfqaB@pr_&0thQK)FPmLNOgynWki>L9|czQ%~K&pfk+>bXg>yCRh z;!c8|*R5=f|BUl^^u6(Q^jB*Ze>>|Yu3!9w$@ne1qgt$O%{!6gHW0S9bdqv_9RSy+ zwflCxEe`q46tEKvSU~e_pLRLYi)pKUlMWe@(AtQpJo`mDu6m84$^OjRe!>$R!3@F(YI72fsdn1N%g zS5(1zG~n^?SW&gAhIV;f_IPizu~AphW~Npx75(_cpi$xB6IVc2C*nf&)sD}r;cDJ8 zhQU$=g&Lw+U*AQx6&`DFKc=E{f%gf`aoQ1scO#8g zBV^*7v(C#+B_G)I?VQw7^Fi|Wi<#;7l(8=H+Hi$ks>ax|B|6=-Ag{6|mbn6P;C8XL zG2SsRxfiMMQi-}zS4+;G{IY)AA4R+JlfzXo0(mJ)#!MB|;Te)#b&&s?9lkXrRNj8p zji>yU4k@YT2X9a+wz6LKj*s6a}~(%Kwz<#VMUdy+QSeWF^$}YPgQ5_A%34^ zHcuX(M>_VxJGoa~L`;jnSc=__74A(_GFdg3zhr0N|7EDYudDke;G`qF%F$lGF`}sE zb&>8RBMYgnZ)eZQ7BuK*}X&Yr> zW~2n3IMM^!HA2s!yt^l7F<`A#sgAnsN4j$GYSwSxh<<8NT_RSj zXVv1&X9-jde2Cj}G<7k;r7;iTbGDN$f}X!}veaoj>nqLf-e%Fdq%-W?JL=(P<(CkE zy6!Tts^*gou40sRZV6AX_sQFT553gP6EuNdYXetzjFQ*FkjqaH{`y9Q;j-i$G&Fh5 zn9}+F;&AND`ott)dNcKFCg+9@P#dG_TU6U&86)#}6l)KY;@|ZZyO&X-k%Jv&o@{wk zO?@!hGI2EGROsP_KK9qlhl_THe{wvFvW2MTi+FkGM08dB&Z};FLjxSY>yR3Pz~(V5 z&c*U_*ShEhyZn$xIu+;WSn95cCX7@-_;_h{ly}(9s8kpw0iWn$P)`2 zKo!a^HhY{>TgJzvAom_m626m`i~>3!a*?H~cFt!O^yQ8GLbWFi-Sj?c9rIxt6tyf8 zIdhJrQD-LhW70po6tse??Ww=y>Co@8HH>J%HtbK31G{6egxN= zwWl8wyt0F$I%XFHw3j@svor_sA3X)hH>4+Z%>lZ!3A&_$0-)=|V~u}uaM zFd4wT`Hs(1Z0t+V?aO(P=x~}DG{$;=CkI`n%qXcNaFt7Yo$L1kHY`S$c2zLFU+4F! zxD36;<&}vaZJw4NHe`0-S*QyCH}{Qm(*sFgkvJR(d@FB(juDOlk6V&hrwfW`dF&E< z`G-~gjgMwj*0$!WS_QXm^iW8(z`_?Z57V?vo^-gbQp@A3ZEx5p1U*vyt^O3>imUiD zOq>@t`x(g=-G6BGTq$~FqGKgkA>7CWx|@Gs&B9mGcSzC+n+tCOn{_C5YpJO34MyL` zmeIyZ`?~&}eP)|_&s0x*)k<^aV&;Y2t2tNgR3Hga0jKTNo;S{6YduBiLd4n295g~L zS1i=ityeBX@?17;HfO$)yKepEwBCfU||8Ta! 
zTp!hvo#z-Zw5#Lvz{LGSa~CCxt`1jae=F*O`%HP}b&-S3GKq(SEC|q7UB6)#^6R3$ z0n_qcdHP32r6W7-o%^*}gN|)HSlg!$M=BT5FEttf#U$3YtJ^&k*%8US{~({xf2N#I zIq`kO{j8#Z+YGJJ-{8eT_>OX~al4#T{lZIJj6aoSPt7a#yo+2lbroCvsizUyfq2IjU}CwNJ0{$ZAcEI4 zp-o{ut#5X-iC`venlsq$zHF8gvS+KINYrT=KnzF$W!rP0Z6J_wFhuJ2rKd7lw6eS2 zTFgf*4V`PX7OE~$kXdx0Qy>koElshinN-O7FW*S^%T700sbS(~?AX=!~Y-qp;4 z>aW_5I4Q4n;V>uTb^_CRy8$Zn%770nh%1micE6b%A;W!vc1Q@c$UYtNZ5_mCgA%GF z?)9fNBwnwlw9QUWD2bL#v!aEyf|L3NY}S1!1ICC8gMC30skIa-ReB3b0 z89CMW3X@ZP;$5pGU+6y6w>VO*CGS%z@P0by;)$rsR)Vjg`FGe8h^(Q3&$4YZ zJR-IuLOtTm%Mc`iv;+mtipkd~y<+Z5bb6c~w$7WI;C(7r6!W&1wTrQuF>#ZpqL@9W zjeoQ2%*1cjoyWDrWSAva-a!@6vA8>)-1GQ=R;GL0k||q{Qso5Kd`5JJ(^$u}n94{S zI85s6t3juaT7(|sLTWx}u|T3mjn?~RfNiu^O|Wa_>ke+oqMTUHmJc^C5EqYD^3J?} zZ+At6aUn&1*L3}L(|cNeABpSkEo96Unspcp?ExaP9d?5KiZj&1If-N^yvs4zY+40N z3mmPll`*$-3?4etrAV_au@Wq}rYDTQcSqA5`0UYhC{SMl9cF|rDn+P>CD}gKUQ3Cu z<5SqsE^uefHnRkoJ8<97yH`zmyrjrC&zw)G>ys16Y_KZ!m}8=f$vwLaPzq_t?4BYV z92vKfV~MWHV4%OnRSx&@(SsQC<3CvP)Vy)_>uQCelNn`6-cr%YTH-#zD*~(f>0rLR zVCNl=H0#mqSg7PWB*TctGuOZ1^4NAB4_@|i)Tiht3Cy)oC%J@eOw*-x8@_q>CmJ)1 zc*?ZTv`}O$!EJwx^p}gpHI`BZ#u_{oKX>xU=Ea>auDtD#acqZ7E#TQ6c*#0VXwAWc0pVNRI0rZ{wOk`LM z_h%R^BEpP>EgjlLj^kpmGtXXP*?Dk(35p>Mz(UlqNr?%DnAXa=1@anV`V{=}c;_MF*RplAL zr{RN^1g86?=GJJ-H8?Dt>$!@$>%`-7{a24EXp=A>>>#%9EkaJ+en*U-8_x*n9IiE| z3T~uneC}h}x@#dj3YfO;t&-j1+uwBW1Djr4N3|XP(-nZW99CSZ5BzoF0PXLmID_3kttr!)G_r6L9dRvga6{7G!2glqpatUTV!$|<|4yb} zDOgS7lbo7+#!6EST3tdPQuUoS-zZ@1$*DHFxGk8rdvr*D!v5TS^othsGI99oFLTqp zLGf+h+Pm?iplic?fQC;O#hNKD5>#l(pV1Mr84B zp?vyXPUXfsDdu}J$QKl_9XM$_=||$%ko}6wx7ofvfHwiZVzq++vvM!)R7#R}RBgXX z6fk#{KAAePXS9^74o$rR56D;RH@&}ZCrax!>K3N zUv`ds=g$u>Lyc-#umeeUWt;DR=|n_l9U4wl^>+?Ru~&DmvhBVpg7AzWQhF#G&CAfZ z(Xmgy)M4(3-_f-OC+v`BtL84vTYltdQJ2xu{x_nx!MQeQ8X--M6=R)xfQYE=KpKa| z)YR-6-+3ugSDUE&mowegO%K~Go62pN8T zY*OdTF3wc1!01tJep*#fvGbeZFK>3Y(Cdh~a3kOaO=~|_iy4f0w2f<*4Te2dN@qrd zp?2OrYS3>mgnQ~;KOt2E)aXy<>?sWOHkZcXUB4wq9!YSirw%wx9u;-r*wkNsfM(6w zGijz|JG6tw+I!*qlf(0&5}L2Ac8P|Gt|I)zDNkV2W7#a_f3moJ(&gGp$CB!?BF)_? zMQO57X5fhYa}qglh$G8ig5g8SFt~aR@(?F%b-P8dlF-*8b9xH_Fa9m`v!PnbUT&Um zOQ0n-ePWrk80LXq=O!5+eIK`D{Hm$()ugCfrf=$?C+mdgZy>OTh`GxxoWzMR7M5xS z7i!b`>mNglg<%}H9hOET@{EJTkO9L%ieu2eM`j<%HuP+533R&3Qy5!CN*4aq*3C!$ z{sFr*yL>7wl6>zchg>_jYDiX146Zx?TP<##uy?Clt69o#-<{%=$0rseTO(7@=lgV` z3j_zVF8mbNh|sk3ZoX%4uP{V~%io@)tgpP37D^LUFw0q9s=? 
zs~XS5e-m>?wpTJGX9dv@a!P315|P@nxqDn}b^vG^&Giwk+=JuLasI$a$A*fjbQge$ z`XB?>)po;V#49tkR^2O?6&!2g_q1JWU8>SR|9}keCVxDz{p&<5>#n;n+IQ{G8ES3J6{$ z&7Q}KIdoCt&7de|s;AnZub!+`S-4j|Kuw^-C0N4XxPd@%!gJF6Z_Z$&u6W&3*mjel z61v4dPLW%|pmjRw`8zGf)hl)0;&l^8@{9b zaGbtR!g3#>y9^1Lzs|ic_buU@NJI4DQp^vpBJ`X!^Jtv{0JAC8FhkZCv}Q|hMe4EH zUR;k^XQ<8IE9s7i5>|(UOUDe`*K4r8iY@}DSJwj|n#x;~sLUYUTkOlOM^s9=My8zD zO=o_MT@3t#-rskh$ZtI7kDE5xLAWvdjeb)d)St~9eB zAwz-VYY`ARl9bgFTA-aY-5(fu_}u!+K;L<$g-3(J)LWG8m>1v?Y`w>hWGQ;al}jX8 z&~`UxSq>d{QtFwbfuN1CiYp*n@h7Nfa#g>wBjmcf4|lBee!JIeKHNtvTz^^#Wd+Wq zQ)dJMtA@6=AM8CrR4govh6l6@NV7LnCg%p3s8T!FX^XTwcAYE?hb2zkXsA$wK4ECJqMX}sEG1@j@(Cyg2-@ciX_>07$ zClA8UdqJ34?~9H3M7Ak_3MQ-M&N=KmZ-jlULL}E$f=rUd#js2DI>vxSBW;wiiDNsG zm$7S0rA@Iagv0^yE^CmPzIX*zzXe3c5VeV?I?pna?sBofVcut1*1-Qxhx@g_3scLW z`H(5k6#lM<9iJNhC_F)bVM&0kb!3UC1I;UdRGtTbfn6bOcyy;Tuf0Y6Z2dpPQvmL^ zN4HQG2l}y6^vkp)(0wM^YB^p3+cY`naThhOL_SMr58mkkGez-NwWsp3LwP+@l+kdz z|2(*IZ;Kk4bs9UkXMQ*j>oZcXr)`d`n<|2+elPsBWp^30lC&&-{7LhpTo!oi6m-va zCgIr!SSRKzW!muLw?m{05%tsEtufoS*{uA7QuJS-^Wt_iB}6@zv zJHM>8-|-Z>p}fb>*+Ok^qI*wFuIX~#irSX2Yzuw44DOfiL;a8hy@P(QO)HRO?YQNE z?K`#lw>Dqq#lqaCP)kHt2^CXZ!fpExH>y)_yp7n;f9UA`O~$uSRk`{KSP@IF)x(NE zm|>??6OU?SUtyVp>GIv{c(;l!HMJTNIQ=!eMr~hdyv)Z=QNQw~=U=Z(y?ejieP)yt zAAVo9A!k!~sW3uQJHU`pXGD?{jVdgq{Pa?1=-Foae(=_$t$0l_WBLJM410ka~5}RL`hBZtzL0%kBuj} zC1O7i9{!{H)&Cd`{2A;12S5KmX*96BHh)7LzK=&Mt-0*FgqSYaO^J7nzIFhC0-tKs+hVP6*2oeO7{nG&>P7v<^l|8uw zNyd^A9rvRciQiY*Ag0`HuhvAZuHz%DJ3e)Q7X2JSw=)H5k458&rRSij%P7vKy8r%u zIG!cM%)5L57^$Wc(*K^wvuC*^gn>c0F}Hx^8SuM^)WBZpuQL<>fek%yS&}`_9rbS5j0Sf z?<=Q*SR+^K+3H1wG51hiX#EtLfuXNCFezO9B>zRfIBeffj(uJVkAG}eOb|lle~Tp* zc`Jri>{uZdQ1Vku!+_DS7iP1Xnd`v`8$-Y_kop_LV66Xd>U&Gwe^ccfbNmb8?w7~G zTa*l=FHHjhrM{N{?x@be3w7GKlo7Ne)pCkA%P9Yz?BT0&!AEB5!&9=#i{DANEwOzy1#|K zHfKeC)wpm%!}+26H2vu-t1VP+nfcoNl9&mQjw#hx-(*M1K3rQ6zMtx0hhKP6WdV=< z)-%oa5_dF+5;22@tf%@E4Qzbg$uAHas-GMcnmfAN@2q}*^q=u*_ME<}IQhTWd+)fWvbAkEii%1T6;TkPqM}B@ zhA^~kLzE(-LKKupZz3WfJzGUYYHWaj7y$vP5vkH5T|uQ2iWqtiB$SYlWcw{<=FD-< z%sF$OIp=%7-}k)p#|#p7lD*fu*Ilpsx+7}$>fXrk7dpK;Ua->gvmBF}6w7Je>v^{6 zyy@X2>^-05e}IGhaj_S!!Nzbk`t))^MHU1Jq~j!IR!CJ0NQYk5L+;q{6=s5T<_`D^ zZon`l{qM7{{x!JnpLjIiS1s4$w}NEXZ|7Rs8&?pg78QZJ0Zd=AJ%D*Wa2>1(ipQT& zH_!ip;{GEmxbXMikf#*&riT{tEZ0WCn3@xLx0}@2;lQC$1kUGzI zhCj98`x(yXkqYWm{v2&5P-f=<73d*9Tu09Dq8Ze!?98@t_1ic4;8Th zEG|Igp}=r@z}T%17`veo4BV~}4Z@IIKwA^`-GluiT>pj;F4`DWu)idZ8~}BY%lC8> zIpq;cPFDbx4X1Rd?|^X+SZ@yH$< z8|4BPf;)#qDLy5n|G8FwI&A~65hW<$6t~)(OgR)D4ES6Da%U^FC;!wn;iZgLfrdT&e zo8HYk2Fj5KCZ;AnK7*y=%07KV#rxrd;nItf8g(>ZAJpZzPVGzr<%hFuCZ~;y8Gu#| zt7FGgfz+X7Wk4O3 zG;GCI)VSqTiFd4YNsl&Q<4^Jw?~WPECem^M$e#6z_bNxT1@xej?;AaWQy*7&KGlo+ zqJ&&XfwD^CIuvOl-Fr;^C(^x-CDwG6419D)hd4p274~I#AEmg?t{0yVX?gbp==Fbe z03VrF%zq)GyktvL4og@lq*;+q>PSGjpwB*B>WO{G=QAT89LuT)!sN2#EV&f!HP~H$ zW7RIL+SkuKFzcmI=~~>?TA!;YpCl%%U4Vy&{Ea5#U;ZpAtfEfs8^KKyhD_p3F8ATs z>t{%F)zdFT^y30|{3DCpKmNWbVeOQLGPv&LI{*-0zsl{!3z}Zs?TgD%3qZ}=!EL)< zWF6@s>0f9mFH3kEUjEiZ#pQT=;L|N6EmyJYN$WLEsBB)2+6YGUrY?LmpGd{=5Q!o5 z^v725@+#T#y}m$_=Oqn??t`-`1fu>+pG+61?;qO91IK3ALF3lME6^lts`IRA7))r@ zIot4y*sTa=oXWIqKC^fsTeKtL3W(dEey@f9*}nn$OE22E^}sij8Uf5i7%92z5pj#$ zub2i?tbi{5T72Rs*bcy*{w)>oKmIv22B@-k9fD`8K?(y58vOO|s9A~cKK#$*85;gv zo&m#3;sufiZa8)rBNYT*PQCqHEwU1*ouz`jB(gau$vT!%7lSTToVfSR?AA1kf&w!A zqu-X-;figW9|+j8O{ax5rBc}zK(ie6_F;uEzM&A|09FrUs25vajO}?|nBl(8t&pCZ z9w`snadxRm99&J5lb;Qcs3C7+Uouy-;p3rfNQcz-wt)4Qz5%Mu{M?;T&;AyE$m|K{ zs?7Wu{#q*itVa>Ct*&y2GR}u)vW4&~@um7cNz6RD(fsB2@Z{62jw84%08=;%t+8_> zzJczi>!!WX-V^9k1`Uj^8KL2Vpw}9eU!WYS@6v4bYnBDFo$DI0LtivA9Q*>u9C! 
zH}4?r$4QOG6UIHrj7dZhyio{zq2&#=ge%JHwzRP9NU!U>+#d@saYHWCujh^WQM^@p> zF|hGKy7QqhZw_Nkl?1u%D#KD96cyS1U|0<_PH{p9=NR1CvfB#U!WaPPf_59emRF<$ z(kd;7OB=UCbHMnt?}U3)Ti{_4MFc=gs)qR*he-fK|0{`!N7j78BDbdDJ}`sxmYQ7P zv>r=O0XNSFj3fbO6hQy*39l+WNdH=RltyUU!3-1oUyGgQuik|6JY7jz$Yf*4<6YoF z&0FIo5X)0)ZeDUvVr=$}*0cL|fr4obSOxD)sJw0r`qkS_SwWW^#fI}t#9L=%pI7S# z(5`K{UTLy3!h9!NS!^rznsMlJku`$Hm0hOBJ>KSC7QD9Zf`@leBv;=4;f`uM-W8xf z4tN`brNA(Qn-Phx+5^V!iJTh(4Mi-$07u*46G`nN`+nPLrXRVEbXu%;vCuWzCAhPxd-FEi+PE zyb4wFW%BR2KwBTa-=9|Gg1Lja;NDrG#)WP`Xr!v7MsDN)^LSI+Od#g7X2|U6`QzmX z5RP83>whAV1&MSAVl!EVi?(+|`;-@fGjtX#zZ+0NFe(?(miofU1=m6^F0 zIK8>A_=*!R1i0z9&4KJEfE$>+0B+zQd5#LEw_Wxl$uxdg{9rF&JP2ZffbLEJOOdH3 zaV;j@CdrC<;JeMc%1lxqOo@Hk$s~-k7vGNG03tv1hd%OBTSZZyaQID^ z@ehIxT5As}ZM?I*^xaJN9sUTxz7@N1&F8U>D#IKin7Y~`$Pq!m7WQinDQ}F@ea16@lh=vD&D=U%2 zTQtNa{_HALB`>%N-%zr^?EtPoGxjOKw|*zN3yukKhMzbEBZ`NmWB&St|HbRj2$#8~ zoKS?#e^x5Xb0GFkLwEuWCfPJUo}hg6BnP_USQo-q^8(?hLjdg~&#MCNd$EG*<&>p$ zZAt21wF$iNTWTi)r_n6k*(24=<3n^Oz&olBjQ(Ge_LFn|f8{!u*fI`;9I7XSOevtB*PIY!79Kvto`i8+Ue2=O8_ ztV7~tO1Gh0FmnzH6BStI$S;|Je-`=wsn_AP5Zyt9o1l%X6bIf=7Ch21-c-^^)9{q9u-ZOLkpXx_n|A zH`WNTSH<~Jw(Ga`k`q94&k`KN!drhkG`Pzb;0Y7(wmng~rodd>hDV&BgFVBz4X2oJ zEXNOU1&>;f-u|wVzR(D^MD3Gz9|zYibc}g{SAT;2at9d*`S;m`^A11s)GGN0t-JEGhWSqaMAI zKKnO=kwivdEyu$uzz8`RoMo01K06)(# zXKA^ndzn3BG#L~xWyO}Eq|f>NhS>*`#fP1HTZTV*pvxvAJa_@UsXn^5J8fOwz|>s+ zdF-5HBK!&@-MVYszVg7--3fcnyZQ{(F6oXj^q4u8Tj`UxA*uGLUKUTpuRA*Vd{P@V z?r}$W$tvFRog!LB#~N>`7G4BEmH=yzA4_da{p3ny?+AxyYl?T>+P#p4KJVD^GVYc- zaG>vmj5@>vwk(CB(}|K7svT$Z&Qx=w$Pd;pd(WIX4;U3jvyhbyDQbjq=j4lW3U}pk9^`DJMcW zrX%4c)=v~L0%Lk}nWE8-$kDnUlOS;F2(6uGcnK~cD=eteF|kM2n@d`TU%W_GXvgiw zzmwjC6GAA+lHzIaOt}Q6Nn{uE)k|MAAu^CNzD$~{-1Aikxyt>95)1f-3iglAjw>0H z2S_wcEhDpy>FVUIkSXZ#`}g`SP23pbA?jd6ACSMq;?+fCy^0d9c|2EWe#Hyn2>@1$ z{f2Ua!;DgLA;y3RCy1NUg8Fx=IsfV!Q`TeTup)7`@*675f(rtUhezu`fHA)z-4Q@| zmQ5r1fRyr>cZuV>!h5lx8(B!5WEs;xVVIdR|FSXakmr@-rMSbbb^RV^{;a{z$s3u2i+gzfY$nI6o~6QaGT$s8xmNdt|$$N9WB7UJO6z%b(_NmHr2+Y|`E)xIf+wJzE3umSB_We_Vxy zS?FNsCcG6*KNl45#ktC=k}AZ1Jjlv=HUUK2FN;PIbL%w>-UI61u;A=XJb!=!^cX!| zuQc|&6T;a|Q9G$;k$PUHD(sYH)pfEmc!+m)mo=l4Xa|gPpGh!aNs#Ob4w9WO#$VhQ zT5>gliMSOH^>%L+H3l z&(!O=9cnW#=db#Oc>9Z=&2_v6%#h4jzE(=OE`}FPtXLAVweg91KHt|!d2wR<6gOkJ zA@h?9epRp?UZLsg`!Cx|?mCRSDWO}l@wdkBFgvh*Rn*Q0sY`|7v)!3p@VT~0Cb zsSdI1XZu!%%8ZFz)}uxDB))VdFS3am0c>HNk{;yA{1$S$=N@<6(K|g_8R%8euB>bFooT zQZ+QS-xdOusKe6f{{tcaXNO^#IA9R?v1-)uxc6LmikYAuxPzW@EO zbo_S<$Ik#s)MfQ>%~_~BmdsR>N1nT;AkC=@s>Rm^8rX@ZpP_Q3Bfg>Zlv`lC^NYSU z{<+E#p_bH3Vq%__e?#$y;G1?qL8bN!`c??G7vrA6SK+4eCuu-gm>De;`h5?hJ9ul) zp%YMeFSt~XB$PKEk{0(3Rcl-*hQ9kANaY^5RB6jJe3i=o{2dOwWorvZ0NC_%5!U1B z2RDF3;HwT2q87ys1TKwDl_f8;NDkGzo)isM#sY9`5v_4kD&^H#gh9M%#n}?r57*7D z8+8UVN}-W+w*&9XE0OB;nrN|G1O>m{}myB7*p7k7ikW_;e{%Jk4~e^+gwZ|LVF52wVv3D%k zjnwB=k+Q|2=>Bz3wz}>f;D^-rC>atMdz*lYvIK%abz)nu>QN^}{-9x`uo_l~<+;G6 z^2lyYwOShB@Ex@T$gGkZh* z(L?V&H|&nS8c3!cykRZeXB8?pE5NR()U9z-L)>QhSv})Zjpc{)wP3Syxsv zSlat8Jj-}Cu*W=}4Zq8rW9we?-o2e;JfUbj7I^sOjKY*i8L_jLd7jnT7tV5|Q!Djx zV`%8{ihCC{-UV9*>u%Xqf|B)pzX26$KUVp8moR2{FDpf%ci@EGDci)P*p;uNij3^v zgSAfov^a35@)`a?h$}|uI}Wb|W)$dU;r5f*GmNXg+9Bqg-Y+3@vgodTq=<0~TmmfH zu`cJ8jZtx8;mh}?1#iOS_Vk-1AJp7YK7xCNqhraT`?)80aeXYe#8WWZ8jlOh2#04k&yo?z$g%vFG9Dw}el(yKb@r-ag*sCAQ0Fc_p5&>WBEs!L$AN z{)0GDFW#DH=96*A&`47+ADP6j;vQ*0_Hg$C$azJVzyKIry{Y*4XC1uVvK^i`k_J?T zH9UVfjhW&{mUl6H01Ji+m8(FOrJ1ZM!ALxZ>nHPcH`Kj){23P)$dly7mF$o$CEPCA z$%^|J*s-0cP@%o4rs$p7R;6me0cW(LhTeI>wChjRCr#S2UA*Mpy^V_@On;gwK34IxPN=FH}$vf*)|Zhnnve6xO<_t|6xaVe-uH04 z1%f`s6dz3h1Q<0KDB%OA`)l$ndS0@gKl#7nFdHop4(ZJU7wwIjXS#^q7eK?922BlXc=T&N7yg#`aKu;&%OrC z1`wTWXevyewFAcs{qzl$rH8BMuaz@E$kvn(jHB5)cG 
z7G;CB7p|tr+)O8ktsX=b%N22Y?>a!Y6!Q5Q7cL3UZ_HUNJiIzHU@r6#XbUsyy zB~8KdJI*A_rE&@Px;yN|c9puQc-lJ!pEFOQl%r#1*kEJt)LsT9n8Pfq=HUC@>gl(U zmWuLuY%`j!A+Vc!kdXmu4{A9Vnh=)XNyL>g$6dE;&W)y-&`Dss(P=4o=P&3n+)cPt z{3@4t3%|3tO0{lPfe349Ho6a&U|#%HJh8AHuL#M#t0o2{$Wge2_FX&T$O>RiPO`H! zD>}3)y=g0$KZ$?RuqM#S1`%-eT9YtJQ8sk|qzhraUBTA2RZg7=uoGGET1h*-^Kd~G zRq=F=w`BTJy$YgW?Gfln7) zKjxw^FS}7gRb5&;9fSzeCW|hGG1hX7XVvoes-8;Bp;oIFIc_mTyB`cyUhESJzqWXY^605)i&s(#OlHqPmL*Tj5u4pfknB5{poGo%4M zWQAx(h$1sDw58Lu`CxRa21tuaTFHQu?~7}(!e63PI;E@Yd?;b)&G1ADyN!z4%?MMu zg#Kt`rC+apuYBjRirk~RecE}ibI{{R&sLdfbC5Er>zH!a>?7atJ~<(J)s?1fE&Jkl zi~#lp(vHYDcrnu$y#7ZX(F&=rP}N-*Bx}IA@qR&y9?#iuRbjpS3;vG*w=a2qLp{R| z=Udi>H6=#0<#iK2WxB#WMhV(HQZvGE*ebI6;TMCNLGBN*W6H>(`Gjcxd%XuA9g(P5!^IH>Xr&P34b0>_VasH{{hcy@@pcq|M;;M7sZ3@+oKq3V1GM78Paanb1Fm+ z(Q+m7oE>?KUh%aIf43#bF4)YGeaVXne1J@lte%NJN}?H&1Bq?X2v(Yx0F7(GyzTX= z9K-+o#(rlngf*+-sqLdciT5}I2!I4~nA}|hu{Gx^9XUHdnm)IROrkCXh0&?sP!zv9 z9m79k zy!xf@bl7y&@#qpI^s8n?g?*a7fbiASdox4xHWoEHuR9Pm^uo5+0v&Fj%7<+?+u1zc zP#Rg!Y}3ldUI2s#ryx_(dh#8}9$qw>aHmo|N$N3?>9`00FFL|_kGDpWdicumTZ-tU zz!}hva)Ch9eS2>}#k;;BA2Hyy*x9IH253x8gYlsTusH?`-0>Z-)65u&w zeO;7nCuV!oo=ZMv61anO2RicG0cN-)_=9FMKW{$)oFcK&wLdj4;3m)s%8|13Su?RK z#^JTN9EI){2!LX-ZS6$SQSIQN2>=2f!KG8Pw+CMVw&s}ohDeZKX|p}|T$$~6Pc4%S zILXLCZjNbOBgW|r@mg`CVs4>xOyQ7oXxD+zFrGNRso*eZRrd*blgIUP_bPwsL=mzh z`y27>l_OqCsew&uKsSagS?P23To^w@gK?U&;-Gm z{k$qKGXp8-?DW2aZPL7E?M=ee=(e-MJJX-aL#!j&OH~Q+%+Dvk8sHFfyleP8S%{BjZijF z+4BCaPBAa-4Vzr*R^?7X8_f!+tR1(`qAFX4eg9naaWj0N^)wvb0oUnvAin#5oSGhD z5+pL80}?%FmSq2vew_0qS$WO76JHk|*?KPi#Hgp3=^bbnvH^4l$otI^SE$Reh&^{f zDY7CIvDqTa>M`7|SZ&r2X8vIe3ebOf)Z6WrvSaCY6jPm+-q8d2A};BZwh+DBy!~2hx!!# zOZJVd1L`E=SWG*us1EZv@Vw2-;M~@C7UrG41g#mz4r^vLtBG^Y-i*_`J+55$j{cs+ zFgu-_dug2X0p~%b@reubSDZ}b>Ev!oB|5ldo4P0q8{1`HaJxgtFggE3@X^VKp<=66 zFS6RS67dK9+AwP?dULs zy|vyNzrNUUA{8OaRMxyQcRJ|f$InS3J>(9s4pb^rZ>wUH+^J>Lk`8*O@&gAuObSi5 z3oFfRJK>YcxygH0a?l3O7B9t#aE{4TZ zc~OCViR;pleePOq8gcG#Vef~p&fl9yiB+u?Tp@n-`M62(Ib`_<&WR&EMDbB#7M{FM z+s9eF)q+A6P}KdnkIem1ghe6Y!g(Dz zkvy4m7sK$3#c#YddWF`M8HrH?xW{gUhp!VoI9<27Omn08?g}PKFbX*iT_bmrs;JpR z6`iRf!e8nV-4Ezk`JT=^B>`nSRL`G}J-PHLuVvF+6aKA_=a<36`kGfMVG55MaxQi*Y3aKi$TRd(Sx0lb>Yr7`u@N`YTN72V4rjRDF7V&Q;;JJo)62YI4b>W+?cBN`yJ#hOV{U(Rq8! z1*4-3qs<1miHU7b3CRlQw1PfA-g-T!@H*sju7e=N66fgoAGl=c&Y33cfea>XN5@O1 z8D9+5&yQA#j^ZD~rCZeHP63CSnd)Ckc{HBIa0qVQcA;l!#j($R1aeKot3IA39(XII zw#bf4?r^udKRv4cjbb3F!=r}Rn%i_I^Q@X`yQw2f2v&T)gb|w5jb1;ZeyASM4#M#! 
zrC!&{v%@E(;4_#BGHa(BO*GNlhZl=<_GlBbZH!DJEFG=J=8QklS-!sup5L5>xzaZWS{|)K8~Dwg*zYf z9vvGZcPXJHCcQ1Es^M{eb_$8W61M*^-dkCyzVBgRP3~JS?JQNzsz7p+&N>F1I_K|OyQFdo&}@C0GtT=TlW zW!h%`_I-=EhDZlAl}{N5Wdk#Lq+@KBmyBf?bHKLM^x?9^dG{AGZWLz1f1UySJ=f=d z3R(gsrKO&|-cB|OQj4dx%?#u=J(3}uIE%?mQ!^3WwaU{hKWn-3nJ1@>MmUEaA~3Q) zY(mx8>&83$l$EMWKV=rEoh4@%k*$QRoVX^N9>Dzx$w&@2%l&~v>o)!jX5FCu{4?NQ z?}){<8yjvlSKkyplbwxjERne*gLDSLUAQQ9@olH9cFIHVc_BbdXO7Jr{>SFDl&0#p zk|Tz;pal=ovWZMhMVApPjP>4lA8Ad0GQm-gb(WqWj`o{2fpN~~x}`o8vo`k#v`Xr9 z?D6eua#`+#tHMw zq&*K{Egm{_^?a^G+!&%c_)G}Xx9=1iZ(aZHidhsOt|J1V7cN?j(4@~zgG(sUiIt3=LRL~|`-6RwD#XF?R{>WfjI9EE039NYm`e%(Q>QqFZ3+aM8e?_mD`}bL=%&ffG!P7>@$P0+#d0=tp?raCp)Knf zAR|y!H-d|2cze7V-=n8jiGE7I#CG0ofPqgDjRTDq+Vh*rK+)@^+77M+MU;zw zM=rfuT)7~vvUWy0_}CydbEjjp#4!jjJuF>P779QC z002G-v;dqm4&R{Eq1ATc@dQs(Y(8TXz`DSd%JymP|4I@#=F32L?vS=S5PEcvFx)Q`A zCzZ!fG+Ycx!FAL|ux5YS{NmgH-@YrA=@-!(K%y248>!if99K(|QRw7xebaEy9wn%3`% z;Qwf%=4)0mws=zaj1Tz(I;J=KStiDwNP))dQiQyi-t^ONjGZqH8@U8)>f{uxF&V=F zufJ4vmPngyt9$1bo#$b=$6m7i$>h+LbgkAA#L@~^A@H>||) zz?*&g?~eor-z2I2dW+$**C6U18g={tv7(kVaE-kk>5cC4eX>ESYgcUDQCL(9NE(}65txG^0uX>OCl*HO8ejDyt0Gii=6M>PozaGs7<<8!xXzCQ4g`4{?#}=sPu-8T?Kj6YE#u zAzz7(pg&?>%#c|zS~7^w^-lm|dQDwo1Z7TJgV+Kg2M3_%;)o!bF%2QW>!_d%@sBQ3 zk4|nU-lu%rXkMcSz;ww%TJ_KP#kxJMZ>5V@&pq%d=o<*|V5{+S6h{2ttsCD<$y@)^ zxe%NPKU0orjDT}IhPW8rty7c(AdO4p>Z0jsGXyppU|KdVoD9A~jt`MLBW5^l*hEWz zernqpF&}%77QHJW!sg`z_ZY0-bY0-rb#EH*QuYOm!hCtl#I72V+N$?YLEP1m5>?c! zlUf`8ETJD3%X72!@=+UqwxQp;6#7aUlVy?`pgx=t5-Q;R49%i(V2GLBP{TIEx=?7m+C8!=r+$)pYd4Fsk6d3O0 z46g(9y_wb<4}cV@J!Ows`Lp1)ZqS3^X2w!EfOwJ}VC`q!%-gOX6o_PJviX7UJd{be z2*yGQNbVI7s{HsCL^C%V!3x$pU8ZuywDq(_Iwg$@mcU6y?K)8RI!2XdPqrbJySlUV zCbF6LIy=(x(aFk?u<@tILZ#OcX5MEEAuDIQ^SO!~y^*}!H>#PV5=)EPow;VV55P{Q zgpHlf$q(EI3hp<#aJRI^F5ye6R68QafzV7~$y60u!yMU9)&fK0o?Tyj;F5?17i#USfS6i5U);TU4>V)Wsg7g74u|y~ z?@cSJYi8XbsbAJ$gE4(lDTkM z9(NZ=a^sVJlBHi=RnPy|HUD~jZn~=MDIXFWEZN_jM()|tc#i={KhmA)OsWh|9hl&S zGi?8<$P&DOtd`m#IYuaf-i{$Ol#f_R2+}e*)N?QVG`{7hO#z)~q(<`4D9m*Rq^&PR zPElzmz^v72N0z}nmdgV+e8GnPbhEBjIx ziaQsTspXyUxYt7_hcIi+0cOb5UA$)?lU{+MUb+(A*FSy|nan|XdDg7A^gKJsBq>6I z2jARSpvhe@KzhFt&_yFEKe}%KaI%sNv@`+X;VN_18-Lu}p|`iVaffsWT+@ z{LfC1Ew<<0ljV>dkuV2@4VB!z0_t^21=*54dJB+3fTkaWP~LKl_cDPnC+IC;x}7zV;Z?laWgSb;X-+jdo`Xxj`a@_%QI+b3erffKld`tG@FT{@hhC8lqqA;YU`6fJ~`SHBSV@<4GZQ6qj)~~0+QV-lw)7Y5P)W=}sw@s{v@*kZRc zr9J5ksO0=1aoax6H3OT#4_yyNQ74I>%o9we(<=!#X{ zg*VSX_(7C2VRI9qOJ+e&D+nV!D1)k#U~iA5fcUOfeNddW1aJ(0VFmz*uX z#{C}Ss*8iJWs|~&0rr-Y*0=|S#IW-^nU79MVP<8Sz{uYz9G*X_=ts8l(tq`_)7Kt* zDtG<+D%Gb?Ejd#?nj0~EQQRdAS5By(kSRRF&K#5|%@LczITYK4m7Fz2vKIXMo{~rl^V-~>p=X_sj)LJfHK8KXaV4*KX^^#)J@XQ)0j4+7 z1`3)alqP0slSR+eI?8EoMqPgYE_D_E>9A?}p!7!Ode@c>H`spM~A*R8|(;ocA0+mI#{c62?I4h7Uqa>OuL$kKk@TBVI zSi%9rg3}w`sGkHmb=yKPXz+$-H=2(nKv$`Tv$2&z2YRe;?csK&j_^nCLl@@WhFf(v zONsIiUiTfe6ypi7rkSr<8^9Jc&|8Bz2feq`$J~kMvjVo3XcQRh9K2<9>Sa6T;FR#q zCGRiC`*mQZuC43&=(ffj$-GZcHJx{%nUC3+nFoNGR3AG+#Nuef57{CdTl_onD%hgy zW?zN!mFUZKizzGWS^a_w&SjO?Q&57aMVEv!)P&FD@fq=W=An6u=?~j-qTZCQ*|&3* z`P*E1jU`iu-+h9us98z0i*#*fCEc|(!h8t^e`TMYrx>&S;#s{X^urfpuSLJBHe1w7 zmH-;*1z1Gg$$QkSIlozSsL2Y(+Hhr;Ij3@m*e}(CdC@8&U6tzePEo#m!L(BB1~tu_ z9zCJTN-f%p^e(C>(Hu{QZYF$4E|A~g8^yd_D8ne!A=t5fgDIVGw*2_pWG#g?AbhW7+q0KbFeYxy;9$tbjh7?PHBdEeH=O=#>3*v zJ;O@U!N|>JqxJooFOOX{I5N-IQq2yA-oolmO!Qxc8(Q<5}+-R14 z!y*j3Y2b&z*Qu@K1U;6l7z4y(W~9ZJW-8M_Xh$s!I5&@-2AS@qYco0gw8*|fi*BEc zuX-G|Js+5Hc?aqa&x34v_fKx_6Bzp$|Z$fJ2bx7 z>*Tx7&I27slV+FT#W>)BmXy_-KW0kQhu;)dSfQ}zU8pZ@e>H(KaG2EO?BOLySb-WBHXiywA};@MhCUl7!ez=tgofp>wFR7UHgHI^PoHr#zeS zAmtlsx?Xqpensiw^@h*%P8n2O!MZtCZTH@1+-|yW#D&lw$?ZozeFfrkH=22uemZDB 
z`@yBLo1$7!n@8U{?$`(b#8riM$kXC`vj*6W^kP(@Ri{P%AnL}2{_8w}5%xTbRtFH%(k22fPx{V$n>VJO=QJBAx zw1o_$xI3tT97&g{crGsg*fu?Tv?O5nK45NMV8`cuI=EygVSIXJ!!&fcr=z5451`*q zn6ps^Q2mX}qrcMz@tYQwqRC!)2_HZ}0sy(Xp4&Vq71s`qInIy5Giwe@b^V>@-ry=Y z;OIGVwn54tcG-L%0{q^;%NerVTaM6J5a9g%z=8pO%#YXrIzkQ@eO-WZ|CS2hhVRXv zh+%#c)6^CLzj3pVHBut7kZw`p}BL@b*(=A1~@#VcR?+Li=nND*YrMd#mUX?@U@P7d* z)?89Q8W9A6=@WNvkjQ@%1R>E^ZTL8{|G^IN(VdZ%)0~~|Srxk094&W=b;UuaUavDs zFPIwJc(raL3z@i-=Fd9O6wF7c1vi_@EC3KCOJV@rks&=@9(DhGOWLBIK3w50;pbO1 z!_XB1!WfQ3VRlY&&ZEy$3gJ7?$Hxn%?p(hq^eJiJF|u+SN8}Q|4!;NFO2WF8jP#b- zi*Hq524`767^*RY1|FK}Ty?rV^*H7&kN@7Z^7;Y+Otk~kN5#7Ueg`S`UVWY`zBcQU z_$Pdc|4dIeya6aQ*}DK#xY7O4gbELH{`R#!N%sdXOR`1leeb-ya_a=S21|7?!iFFU8>7T-xtCCEB3W58x@`$HLKOnCFf{&Voz;;DAv;8 zi@^xqSxy_aA8>w_n7cSHD{9UY8q1vZSm&g9N?%G44~!%LOFyOxxpL ze_w^T{c;DB&4ilVBINW2itw)BR4*}UG{Oa&>~1~a>lmN5xmfiBsH=|Wj1uX_J{x@^ zdaBZvt=_4ibm4Z#k!!VV2PKknI?VdegT}!9c`C0!#w^(6MxHUUQ=Hs=e?D5dsZmTzCkQoLi1Ei=nJX+B_sCK5~8@uqGn)>B$MdXFq0JdN@z#yg>Q_FP# z9g{`wwY(((6(tSWS)$76{M?{b zJ{CZ{J8r3H{Z~39-{Gbt0LUn70dWktQ+~zO@{6ASWdZ(&&$GCw05o-Kg%GnL4u