1717# sudo bash scripts/power-cap-sweep.sh --cooling water # tag the run as water-cooled
1818# sudo bash scripts/power-cap-sweep.sh --cooling air # tag as air-cooled
1919# sudo bash scripts/power-cap-sweep.sh --cooling aio # tag as AIO/closed-loop
20+ # sudo bash scripts/power-cap-sweep.sh --load-mode decode-concurrent --concurrency auto
2021# sudo bash scripts/power-cap-sweep.sh --load-mode decode-concurrent --concurrency 8
2122# sudo bash scripts/power-cap-sweep.sh --load-mode decode-concurrent --concurrency 8 --bench-runs 3
2223# sudo bash scripts/power-cap-sweep.sh --load-mode prefill-heavy
3233# Runs N concurrent chat completions and reports aggregate decode TPS. Use
3334# this for realistic multi-request serving load, especially on larger cards
3435# where decode-single is under-loaded and produces flat power curves.
36+ # Pass --concurrency auto to calibrate the stream count before the sweep:
37+ # the script probes increasing concurrency at the highest requested cap and
38+ # selects the first N that reaches --load-target, or the best non-failing N.
3539#
3640# ⚠️ VARIANCE CAVEAT: decode-concurrent defaults to n=1 measured batch per
3741# cap (one batch of N concurrent requests for narr, one for code). Aggregate
@@ -100,8 +104,11 @@ RESET=1 # 1 = reset to stock at end; 0 = leave at last cap
100104COOLING=" unspecified" # air|water|aio|unspecified — affects how to read the data
101105STEP_SIZE=10 # increment in W between caps when --caps not specified (10W matches @laurimyllari's resolution)
102106LOAD_MODE=" decode-single" # decode-single | decode-concurrent | prefill-heavy
103- CONCURRENCY=4 # parallel streams when LOAD_MODE=decode-concurrent (matches typical compose --max-num-seqs)
107+ CONCURRENCY=4 # parallel streams, or "auto", when LOAD_MODE=decode-concurrent
104108BENCH_RUNS=1 # repeated measured batches for decode-concurrent/prefill-heavy (median reported)
109+ MAX_CONCURRENCY_PROBE=32
110+ LOAD_TARGET=0.85 # target actual-power/cap ratio for --concurrency auto
111+ CALIBRATION_NOTE=" "
105112
106113while [ $# -gt 0 ]; do
107114 case " $1 " in
@@ -112,6 +119,8 @@ while [ $# -gt 0 ]; do
112119 --load-mode) LOAD_MODE=" $2 " ; shift 2 ;;
113120 --concurrency) CONCURRENCY=" $2 " ; shift 2 ;;
114121 --bench-runs) BENCH_RUNS=" $2 " ; shift 2 ;;
122+ --max-concurrency-probe) MAX_CONCURRENCY_PROBE=" $2 " ; shift 2 ;;
123+ --load-target) LOAD_TARGET=" $2 " ; shift 2 ;;
115124 --no-reset) RESET=0; shift ;;
116125 -h|--help)
117126 sed -n ' 1,/^set -euo/p' " $0 " | grep ' ^#' | sed ' s/^# \?//'
@@ -125,14 +134,30 @@ case "$LOAD_MODE" in
125134 decode-single|decode-concurrent|prefill-heavy) ;;
126135 * ) echo " [error] --load-mode must be one of: decode-single, decode-concurrent, prefill-heavy" >&2 ; exit 1 ;;
127136esac
128- if ! [[ " $CONCURRENCY " =~ ^[1-9][0-9]* $ ]]; then
129- echo " [error] --concurrency must be a positive integer" >&2
137+ CONCURRENCY_AUTO=0
138+ if [ " $CONCURRENCY " = " auto" ]; then
139+ CONCURRENCY_AUTO=1
140+ elif ! [[ " $CONCURRENCY " =~ ^[1-9][0-9]* $ ]]; then
141+ echo " [error] --concurrency must be a positive integer or 'auto'" >&2
130142 exit 1
131143fi
132144if ! [[ " $BENCH_RUNS " =~ ^[1-9][0-9]* $ ]]; then
133145 echo " [error] --bench-runs must be a positive integer" >&2
134146 exit 1
135147fi
148+ if ! [[ " $MAX_CONCURRENCY_PROBE " =~ ^[1-9][0-9]* $ ]]; then
149+ echo " [error] --max-concurrency-probe must be a positive integer" >&2
150+ exit 1
151+ fi
152+ if ! python3 - " $LOAD_TARGET " << 'PY ' >/dev/null 2>&1
153+ import sys
154+ x = float(sys.argv[1])
155+ raise SystemExit(0 if 0 < x <= 1 else 1)
156+ PY
157+ then
158+ echo " [error] --load-target must be a float in (0, 1]" >&2
159+ exit 1
160+ fi
136161
137162# Validate --cooling value
138163case " $COOLING " in
@@ -223,6 +248,115 @@ cleanup() {
223248}
224249trap cleanup EXIT INT TERM
225250
#######################################
# Probe one concurrency level for --concurrency auto calibration.
#
# Sends N simultaneous chat-completion requests to
# ${URL}/v1/chat/completions while sampling nvidia-smi every 0.25 s, then
# prints one space-separated summary line to stdout:
#
#   <n> <aggregate_tps> <median_power_w|?> <power/cap> <failed_requests> <wall_s>
#
# Globals:   GPU_INDEX, MODEL, URL (read)
# Arguments: $1 - number of concurrent requests (positive integer)
#            $2 - power cap in watts; denominator of the power/cap ratio
#            $3 - scratch directory for request, response, sample and log files
# Outputs:   the summary line on stdout; curl errors are appended to
#            $3/probe-N<n>.log
# Returns:   non-zero if the request bodies or the summary cannot be produced
#######################################
run_concurrency_probe() {
  local n="$1"
  local cap="$2"
  local dir="$3"
  local sample_file="$dir/samples-N${n}.csv"
  local start_ns end_ns fails i pid probe_sampler_pid
  local pids=()

  # Write every request body BEFORE the clock starts. Spawning one python3
  # per stream inside the launch loop would stagger the "concurrent" requests
  # and count interpreter start-up as serving time. Each body carries a unique
  # nonce so no two prompts are identical.
  if ! python3 - "$dir" "$MODEL" "$n" <<'PY'
import json
import sys
import time

out_dir, model, n = sys.argv[1:4]
for i in range(1, int(n) + 1):
    nonce = f"power-cap auto calibration nonce {time.time_ns()} N={n} stream={i}. "
    body = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": nonce + "Write a detailed 300-word essay explaining transformer attention.",
        }],
        "max_tokens": 200,
        "temperature": 0.6,
    }
    with open(f"{out_dir}/req-N{n}-{i}.json", "w", encoding="utf-8") as f:
        json.dump(body, f)
PY
  then
    echo "[calibrate] could not write request bodies for N=${n} in ${dir}" >&2
    return 1
  fi

  # Background sampler: one "index, util, power, temp" CSV row every 0.25 s.
  (
    while true; do
      nvidia-smi --query-gpu=index,utilization.gpu,power.draw,temperature.gpu \
        --format=csv,noheader,nounits -i "$GPU_INDEX" 2>/dev/null | head -1
      sleep 0.25
    done
  ) >"$sample_file" &
  probe_sampler_pid=$!

  # Timed section: launch all N requests back-to-back, then wait for each.
  start_ns=$(date +%s%N)
  for ((i = 1; i <= n; i++)); do
    curl -sS -f --max-time 90 "${URL}/v1/chat/completions" \
      -H 'Content-Type: application/json' \
      -d "@${dir}/req-N${n}-${i}.json" \
      -o "$dir/out-N${n}-${i}.json" 2>>"$dir/probe-N${n}.log" &
    pids+=("$!")
  done

  # A request counts as failed when its curl exits non-zero (-f turns HTTP
  # errors into failures; --max-time bounds each request at 90 s).
  fails=0
  for pid in "${pids[@]}"; do
    if ! wait "$pid"; then
      fails=$((fails + 1))
    fi
  done
  end_ns=$(date +%s%N)

  # stderr is silenced so bash's "Terminated" job notice stays out of the log.
  kill "$probe_sampler_pid" 2>/dev/null || true
  wait "$probe_sampler_pid" 2>/dev/null || true

  # One interpreter does all post-processing and prints the summary line.
  # Paths travel as argv, never spliced into Python source, and token counts
  # are type-checked so a null/absent "usage" cannot break the arithmetic.
  python3 - "$start_ns" "$end_ns" "$dir" "$n" "$sample_file" "$cap" "$fails" <<'PY'
import json
import sys

start_ns, end_ns = int(sys.argv[1]), int(sys.argv[2])
out_dir, n, sample_file, cap_arg, fails = sys.argv[3:8]

wall = (end_ns - start_ns) / 1e9

# Sum completion tokens over every response that parses; a missing, partial
# or malformed response contributes zero.
tokens = 0
for i in range(1, int(n) + 1):
    try:
        with open(f"{out_dir}/out-N{n}-{i}.json", encoding="utf-8") as f:
            count = json.load(f).get("usage", {}).get("completion_tokens", 0)
    except (OSError, ValueError, AttributeError):
        continue
    if isinstance(count, int) and not isinstance(count, bool) and count > 0:
        tokens += count

# Median power over samples taken while GPU utilisation was above 50%, so
# idle samples around the burst do not drag the figure down.
draws = []
try:
    with open(sample_file) as f:
        for line in f:
            try:
                _, util, power, _ = [x.strip() for x in line.split(",")]
                if int(util) > 50:
                    draws.append(float(power))
            except ValueError:
                pass
except OSError:
    pass

if draws:
    draws.sort()
    power_txt = f"{draws[len(draws) // 2]:.2f}"
    try:
        ratio_txt = f"{float(power_txt) / max(float(cap_arg), 0.001):.3f}"
    except ValueError:
        ratio_txt = "0.000"
else:
    power_txt, ratio_txt = "?", "0.000"

print(n, f"{tokens / max(wall, 0.001):.2f}", power_txt, ratio_txt, fails, wall)
PY
}
359+
226360# If --caps not specified, derive a sweep at STEP_SIZE-W increments across the
227361# card's operating range. 10W default matches @laurimyllari's reference
228362# resolution that produced the cleanest 4090 curve. Works on any card class:
254388NUM_CAPS=$( echo " $CAPS " | tr ' ,' ' \n' | wc -l | tr -d ' ' )
255389# ~30s/cap including settle + bench (1 warmup + 2 runs × 500+400 tokens).
256390EST_MIN=$(( (NUM_CAPS * 30 + 59 ) / 60 ))
# Highest requested cap, truncated to integer watts. --concurrency auto
# probes at this cap.
HIGHEST_CAP=$(python3 - "$CAPS" <<'PY'
import sys
print(max(int(float(x.strip())) for x in sys.argv[1].split(",") if x.strip()))
PY
)

# Persistence mode (one-time; idempotent). Do this before optional
# auto-calibration so clocks/caps behave consistently during probes.
nvidia-smi -pm 1 -i "$GPU_INDEX" >/dev/null 2>&1 || true

# --concurrency auto: walk a fixed ladder of stream counts at the highest cap
# and pick the first N whose measured power/cap ratio reaches --load-target.
# If none does, fall back to the non-failing N with the highest aggregate TPS.
if [ "$LOAD_MODE" = "decode-concurrent" ] && [ "$CONCURRENCY_AUTO" -eq 1 ]; then
  LOAD_TARGET_PCT=$(python3 - "$LOAD_TARGET" <<'PY'
import sys
print(f"{float(sys.argv[1]) * 100:.0f}%")
PY
  )
  echo "[calibrate] --concurrency auto: probing stream count at ${HIGHEST_CAP}W cap"
  echo "[calibrate] target load: actual power >= ${LOAD_TARGET_PCT} of cap; max probe concurrency: ${MAX_CONCURRENCY_PROBE}"
  nvidia-smi -pl "$HIGHEST_CAP" -i "$GPU_INDEX" >/dev/null
  sleep 2 # let the new limit take effect before measuring

  CAL_DIR=$(mktemp -d /tmp/power-cap-autoload.XXXXXX)
  BEST_N=1
  BEST_TPS=0
  BEST_POWER="?"
  BEST_RATIO=0
  SELECTED_N=""
  ANY_PROBE_OK=0 # becomes 1 once at least one probe finishes with zero failures
  for CANDIDATE in 1 2 4 6 8 12 16 24 32 48 64; do
    if [ "$CANDIDATE" -gt "$MAX_CONCURRENCY_PROBE" ]; then
      break
    fi
    # Guard the read: an empty or malformed probe result would otherwise
    # abort the script under `set -e` without any diagnostic.
    if ! read -r PROBE_N PROBE_TPS PROBE_POWER PROBE_RATIO PROBE_FAILS PROBE_WALL < <(
      run_concurrency_probe "$CANDIDATE" "$HIGHEST_CAP" "$CAL_DIR"
    ) || ! [[ "$PROBE_FAILS" =~ ^[0-9]+$ ]]; then
      echo "[calibrate] N=${CANDIDATE} probe produced no usable result; stopping probe growth." >&2
      break
    fi
    echo "[calibrate] N=${PROBE_N} draw=${PROBE_POWER}W/${HIGHEST_CAP} (${PROBE_RATIO}) aggregate=${PROBE_TPS} TPS fails=${PROBE_FAILS} wall=${PROBE_WALL}s"
    if [ "$PROBE_FAILS" -gt 0 ]; then
      echo "[calibrate] N=${PROBE_N} had request failures; stopping probe growth."
      break
    fi
    ANY_PROBE_OK=1
    IS_BETTER=$(python3 - "$PROBE_TPS" "$BEST_TPS" <<'PY'
import sys
print("1" if float(sys.argv[1]) > float(sys.argv[2]) else "0")
PY
    )
    if [ "$IS_BETTER" = "1" ]; then
      BEST_N="$PROBE_N"
      BEST_TPS="$PROBE_TPS"
      BEST_POWER="$PROBE_POWER"
      BEST_RATIO="$PROBE_RATIO"
    fi
    REACHED_TARGET=$(python3 - "$PROBE_RATIO" "$LOAD_TARGET" <<'PY'
import sys
print("1" if float(sys.argv[1]) >= float(sys.argv[2]) else "0")
PY
    )
    if [ "$REACHED_TARGET" = "1" ]; then
      SELECTED_N="$PROBE_N"
      echo "[calibrate] selected N=${SELECTED_N}: reached target load (${PROBE_RATIO})."
      break
    fi
  done

  if [ -n "$SELECTED_N" ]; then
    CALIBRATION_NOTE="auto-selected concurrency=${SELECTED_N} at ${HIGHEST_CAP}W cap (target=${LOAD_TARGET}, max-probe=${MAX_CONCURRENCY_PROBE})"
  elif [ "$ANY_PROBE_OK" -eq 1 ]; then
    SELECTED_N="$BEST_N"
    echo "[calibrate] selected N=${SELECTED_N}: best non-failing aggregate TPS before target/load limit (draw=${BEST_POWER}W ratio=${BEST_RATIO})."
    echo "[calibrate] If draw is still far below cap, increase --max-concurrency-probe or use --load-mode prefill-heavy."
    CALIBRATION_NOTE="auto-selected concurrency=${SELECTED_N} at ${HIGHEST_CAP}W cap (target=${LOAD_TARGET}, max-probe=${MAX_CONCURRENCY_PROBE})"
  else
    # No probe finished cleanly: keep the N=1 fallback, but do not present
    # it as a measured "best non-failing" choice in the log or the results.
    SELECTED_N="$BEST_N"
    echo "[calibrate] WARNING: no probe completed without request failures; falling back to N=${SELECTED_N}." >&2
    CALIBRATION_NOTE="auto-calibration found no non-failing probe at ${HIGHEST_CAP}W cap; fell back to concurrency=${SELECTED_N} (target=${LOAD_TARGET}, max-probe=${MAX_CONCURRENCY_PROBE})"
  fi
  CONCURRENCY="$SELECTED_N"
  rm -rf -- "${CAL_DIR:?}"
  echo
fi
257461
258462echo " [setup] GPU $GPU_INDEX : $GPU_NAME ($GPU_VRAM MiB)"
259463echo " [setup] power envelope: ${MIN_LIMIT} W (min) → ${STOCK_TDP} W (default) → ${MAX_LIMIT} W (max)"
266470 echo " [setup] $CAPS W"
267471fi
268472echo " [setup] load mode: $LOAD_MODE $( [ " $LOAD_MODE " = " decode-concurrent" ] && echo " (concurrency=$CONCURRENCY )" ) $( [ " $LOAD_MODE " != " decode-single" ] && echo " (bench-runs=$BENCH_RUNS )" ) "
473+ [ -n " $CALIBRATION_NOTE " ] && echo " [setup] calibration: $CALIBRATION_NOTE "
269474echo " [setup] estimated runtime: ~${EST_MIN} min (${NUM_CAPS} caps × ~30s/cap)"
270475echo " [setup] reset at end: $( [ $RESET -eq 1 ] && echo yes || echo no) "
271476echo
330535 echo
331536fi
332537
333- # Persistence mode (one-time; idempotent)
334- nvidia-smi -pm 1 -i " $GPU_INDEX " > /dev/null 2>&1 || true
335-
336538# Sweep
337539RESULTS_FILE=/tmp/power-cap-summary.md
338540{
@@ -341,6 +543,7 @@ RESULTS_FILE=/tmp/power-cap-summary.md
341543 echo " **GPU:** $GPU_NAME **VRAM:** ${GPU_VRAM} MiB **Stock TDP:** ${STOCK_TDP} W **Cooling:** ${COOLING} "
342544 echo " **Model:** \` ${MODEL} \` **Engine:** \` ${CONTAINER} \` **Endpoint:** ${URL} "
343545 echo " **Load mode:** \` ${LOAD_MODE} \` $( [ " $LOAD_MODE " = " decode-concurrent" ] && echo " (concurrency=${CONCURRENCY} )" ) $( [ " $LOAD_MODE " != " decode-single" ] && echo " (bench-runs=${BENCH_RUNS} )" ) "
546+ [ -n " $CALIBRATION_NOTE " ] && echo " **Calibration:** ${CALIBRATION_NOTE} "
344547 echo " **Date:** $( date -u +%Y-%m-%dT%H:%M:%S) Z"
345548 echo " "
346549 if [ " $COOLING " = " unspecified" ]; then
0 commit comments