|
#!/bin/bash
set -euo pipefail
# ══════════════════════════════════════════════════════════════
# CUBRIC CADENCE ACCUMULATOR — N/N/N/C pattern
#
# HYPOTHESIS: Periodic neural optimization of n-gram hash tables
# will improve BPB (the val_bpb metric reported in the logs) over
# static tables. The C-step uses already-scored data to:
#   (1) decay stale counts,
#   (2) boost patterns where model and n-gram agree,
#   (3) prune noisy hash collisions,
#   (4) reweight orders by tracked accuracy.
# This transforms the n-gram system from a static counter into an
# adaptive pattern reservoir.
#
# EXPECTED: 0.003-0.010 BPB improvement over baseline n-gram.
# The improvement should grow over the eval pass as the C-step
# accumulates more signal about the document.
#
# RISK: C-step could corrupt the tables if pruning/boosting is
# miscalibrated. Count decay could erase good patterns.
#
# ARMS:
#   A: Baseline (n-gram, no cubric)
#   B: Cubric cadence=4  (C every 4 batches, frequent optimization)
#   C: Cubric cadence=10 (C every 10 batches, balanced)
#   D: Cubric cadence=20 (C every 20 batches, conservative)
#
# Score-first legal: C-step only reads from already-scored segments.
#
# Environment overrides read by this script:
#   SEED            seed forwarded to every arm (default 1337)
#   NPROC_PER_NODE  value passed to torchrun --nproc_per_node (default 8)
# ══════════════════════════════════════════════════════════════

# Resolve paths from this file's location so the script can be launched
# from any working directory; everything below runs from the repo root.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)"
cd "${REPO_ROOT}"

#######################################
# Put a directory at the front of PYTHONPATH and export it.
# The ':' separator is only emitted when PYTHONPATH already has content:
# a trailing ':' would create an empty entry, which Python interprets as
# the current working directory.
# Globals:   PYTHONPATH (read, written, exported)
# Arguments: $1 - directory to prepend
#######################################
prepend_pythonpath() {
  export PYTHONPATH="${1}${PYTHONPATH:+:${PYTHONPATH}}"
}

# Prefer the in-repo flash-attention checkout; otherwise fall back to the
# local shim directory if one exists. With neither, PYTHONPATH is untouched.
if [[ -d "flash-attention/hopper" ]]; then
  prepend_pythonpath "${REPO_ROOT}/flash-attention/hopper"
elif [[ -d "local_shims" ]]; then
  prepend_pythonpath "${REPO_ROOT}/local_shims"
fi

SEED="${SEED:-1337}"
NPROC="${NPROC_PER_NODE:-8}"
TRAIN_SCRIPT="${SCRIPT_DIR}/train_gpt_cadence.py"

# VAR=VALUE settings shared by every arm; run_arm hands them to `env`.
COMMON_ENV=(
  SEED="${SEED}"
  MLP_ACT=leaky_relu_sq MLP_LEAKY_SLOPE=0.5
  XSA_LAST_N=4 BIGRAM_VOCAB_SIZE=1536
  ROPE_DIMS=24
  COMPILE_ENABLED=1 COMPILE_FULLGRAPH=0
  NGRAM_EVAL_ORDER=5
  NGRAM_EVAL_ALPHA=0.30
  NGRAM_EVAL_MIN_COUNT=2
  NGRAM_EVAL_BUCKETS=4194304
  NGRAM_EVAL_ADAPTIVE=1
  NGRAM_EVAL_ALPHA_MIN=0.05
  NGRAM_EVAL_ALPHA_MAX=0.60
)
| 57 | + |
#######################################
# Run one experiment arm under torchrun and print its result lines.
# Globals:   COMMON_ENV, SEED, NPROC, TRAIN_SCRIPT (all read)
# Arguments: $1  - arm identifier, embedded in the run id / log file name
#            $2  - human-readable description for the banner
#            $3+ - extra VAR=VALUE settings for this arm, applied after
#                  COMMON_ENV when the environment is built
# Outputs:   banner, live training output and up to three matching
#            result lines on stdout; full output in logs/<run_id>.log
# Returns:   0 on success. The training pipeline itself is not guarded,
#            so when the caller has errexit + pipefail enabled a failing
#            torchrun still stops the script.
#######################################
run_arm() {
  local arm_id="$1"
  local desc="$2"
  shift 2

  # Declared separately from the assignment so `local` cannot mask a
  # failure of the command substitution.
  local run_id
  run_id="cubcad_${arm_id}_s${SEED}_$(date +%Y%m%d_%H%M%S)"
  local log_file="logs/${run_id}.log"

  # tee needs the directory to exist; creating it here keeps the function
  # usable even if the caller has not prepared logs/ yet.
  mkdir -p logs

  echo ""
  echo "═══════════════════════════════════════"
  echo " [${arm_id}] ${desc}"
  echo " RUN_ID: ${run_id}"
  echo "═══════════════════════════════════════"
  env "${COMMON_ENV[@]}" "$@" \
    RUN_ID="$run_id" \
    torchrun --standalone --nproc_per_node="$NPROC" \
    "$TRAIN_SCRIPT" \
    2>&1 | tee "$log_file"
  echo "── [${arm_id}] result ──"
  # Best-effort report: grep exits 1 when nothing matches, which under
  # pipefail + errexit would abort the remaining arms and the summary.
  grep -E "final_int6_sliding_window_ngram.*exact|c_steps=" \
    "$log_file" 2>/dev/null | tail -3 || true
  echo ""
}
| 78 | + |
# Make sure the log directory exists before any arm writes to it.
mkdir -p logs

banner_rule="══════════════════════════════════════════════════"
printf '%s\n' \
  "$banner_rule" \
  " CUBRIC CADENCE — N/N/N/C ACCUMULATOR A/B" \
  "$banner_rule"

# Arm A: control run with the cubric step disabled.
run_arm "A" "CONTROL: static n-gram, no cubric" \
  CUBRIC_CADENCE=0

# Arms B-D share every cubric flag and differ only in cadence, so they are
# described as parallel lists and launched from one loop.
cubric_flags=(
  CUBRIC_COUNT_DECAY=0.02
  CUBRIC_BOOST_CONFIDENT=1
  CUBRIC_PRUNE_NOISY=1
  CUBRIC_REWEIGHT_ORDERS=1
)
cubric_arm_ids=(B C D)
cubric_cadences=(4 10 20)
cubric_labels=("aggressive optimization" "balanced" "conservative")

for arm_idx in "${!cubric_arm_ids[@]}"; do
  run_arm "${cubric_arm_ids[arm_idx]}" \
    "H: C every ${cubric_cadences[arm_idx]} batches (${cubric_labels[arm_idx]})" \
    "CUBRIC_CADENCE=${cubric_cadences[arm_idx]}" "${cubric_flags[@]}"
done
| 99 | + |
#######################################
# Print one summary row for a single log file.
# Arguments: $1 - arm label, $2 - path to the log file
# Outputs:   " [arm] sliding_ngram_bpb=<value|N/A> c_steps=<n|0>"
#######################################
_summary_row() {
  local arm="$1" log="$2"
  local bpb csteps
  # Take the last match so several matching lines cannot produce a
  # multi-line value; `|| true` keeps a no-match from tripping errexit
  # when the caller runs with pipefail.
  bpb=$(grep "final_int6_sliding_window_ngram.*exact" "$log" \
    | grep -oP 'val_bpb:\K[0-9.]+' | tail -n 1) || true
  csteps=$(grep -oP 'c_steps=\K[0-9]+' "$log" | tail -n 1) || true
  echo " [$arm] sliding_ngram_bpb=${bpb:-N/A} c_steps=${csteps:-0}"
}

#######################################
# Print the summary table for logs/cubcad_<arm>_s<seed>_<timestamp>.log.
# Only the newest log of each arm is reported: the timestamp in the file
# name sorts chronologically and glob results are sorted, so the last file
# seen for an arm is its most recent run. Logs left over from earlier
# runs with the same seed therefore no longer show up as duplicate rows.
# Globals:   SEED (read when no argument is given)
# Arguments: $1 - seed to summarise (optional, defaults to $SEED)
# Outputs:   banner plus one row per arm on stdout
#######################################
print_summary() {
  local seed="${1:-${SEED:-}}"
  local rule="══════════════════════════════════════════════════"
  local marker="_s${seed}_"
  local log base arm
  local pending_arm="" pending_log=""

  echo "$rule"
  echo " SUMMARY"
  echo "$rule"
  for log in logs/cubcad_*_s"${seed}"_*.log; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$log" ]] || continue
    base="${log##*/}"
    arm="${base#cubcad_}"
    arm="${arm%%"$marker"*}"
    # Moving on to another arm: the pending file was the newest of the
    # previous one, so report it now.
    if [[ -n "$pending_log" && "$arm" != "$pending_arm" ]]; then
      _summary_row "$pending_arm" "$pending_log"
    fi
    pending_arm="$arm"
    pending_log="$log"
  done
  if [[ -n "$pending_log" ]]; then
    _summary_row "$pending_arm" "$pending_log"
  fi
  echo "$rule"
}

print_summary
0 commit comments