Skip to content

Commit 285713d

Browse files
dvcdsys and claude committed
feat(server): dashboard-tunable chunker concurrency + llama --cache-ram (OOM fix)
Two new runtime-config fields, end to end (DB migrations 16/17 → runtimecfg → admin API → openapi → dashboard):

- chunk_max_concurrent — the wasm chunker's instance-concurrency cap, decoupled from embedding concurrency; resizes the live limiter without a restart. Env: CIX_CHUNK_MAX_CONCURRENT; per-instance memory knobs stay env-only (CIX_CHUNK_MEM_LIMIT_PAGES, CIX_CHUNK_RECYCLE_GROWTH_MB, CIX_CHUNK_MAX_IDLE).
- llama_cache_ram_mib — llama-server's HOST prompt cache cap (--cache-ram). Upstream defaults this to 8 GiB (ggml-org/llama.cpp#16391), which is pure waste for an embeddings-only sidecar: prompts are never reused, but the cache fills anyway. Observed on prod: llama-server RSS 365MB→11.3GB within minutes of indexing vscode@main, then cgroup OOM kill — twice at the 10G limit, again at 16G. With --cache-ram 0 (our default; -1 = unlimited) it plateaus at ~900MB under the same load. Env: CIX_LLAMA_CACHE_RAM; shown in the dashboard's Runtime parameters card, applied via Save & Restart.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent ddd16e9 commit 285713d

14 files changed

Lines changed: 813 additions & 513 deletions

File tree

doc/openapi.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3245,6 +3245,8 @@ components:
32453245
- max_embedding_concurrency
32463246
- llama_batch_size
32473247
- index_embed_batch_chunks
3248+
- chunk_max_concurrent
3249+
- llama_cache_ram_mib
32483250
- source
32493251
properties:
32503252
embedding_model:
@@ -3270,6 +3272,14 @@ components:
32703272
type: integer
32713273
minimum: 0
32723274
description: Cross-file embed-batch size for repo indexing (chunks per embed call). 0 = one call per file.
3275+
chunk_max_concurrent:
3276+
type: integer
3277+
minimum: 0
3278+
description: Chunker (tree-sitter wasm) instance-concurrency cap, decoupled from embedding concurrency. Each instance holds ~69 MiB. 0 = recommended (3).
3279+
llama_cache_ram_mib:
3280+
type: integer
3281+
minimum: -1
3282+
description: llama-server host prompt-cache cap in MiB (--cache-ram). 0 = disabled (recommended for embeddings — prompts are never reused, and llama's upstream 8 GiB default grows RSS until the container OOMs), -1 = unlimited.
32733283
source:
32743284
type: object
32753285
additionalProperties:
@@ -3301,6 +3311,8 @@ components:
33013311
- max_embedding_concurrency
33023312
- llama_batch_size
33033313
- index_embed_batch_chunks
3314+
- chunk_max_concurrent
3315+
- llama_cache_ram_mib
33043316
properties:
33053317
embedding_model: { type: string }
33063318
llama_ctx_size: { type: integer }
@@ -3309,6 +3321,8 @@ components:
33093321
max_embedding_concurrency: { type: integer }
33103322
llama_batch_size: { type: integer }
33113323
index_embed_batch_chunks: { type: integer }
3324+
chunk_max_concurrent: { type: integer }
3325+
llama_cache_ram_mib: { type: integer }
33123326

33133327
RuntimeConfigUpdate:
33143328
type: object
@@ -3339,6 +3353,13 @@ components:
33393353
index_embed_batch_chunks:
33403354
type: integer
33413355
nullable: true
3356+
chunk_max_concurrent:
3357+
type: integer
3358+
nullable: true
3359+
llama_cache_ram_mib:
3360+
type: integer
3361+
nullable: true
3362+
description: MiB; 0 clears the override (falls back to env / recommended = disabled), -1 = unlimited.
33423363

33433364
SidecarStatus:
33443365
type: object

server/cmd/cix-server/main.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ import (
1313
_ "net/http/pprof" // opt-in heap/CPU profiling, exposed only when CIX_PPROF_ADDR is set
1414
"os"
1515
"os/signal"
16+
"strconv"
1617
"strings"
1718
"syscall"
1819
"time"
1920

2021
"github.com/dvcdsys/code-index/server/internal/apikeys"
2122
"github.com/dvcdsys/code-index/server/internal/chunker"
23+
"github.com/dvcdsys/code-index/server/internal/chunker/tswasm"
2224
"github.com/dvcdsys/code-index/server/internal/config"
2325
"github.com/dvcdsys/code-index/server/internal/db"
2426
"github.com/dvcdsys/code-index/server/internal/embeddings"
@@ -82,6 +84,17 @@ func main() {
8284

8385
// envPositiveInt reads the environment variable key and returns its value as
// a positive integer. It returns 0 when the variable is unset, empty, not a
// valid base-10 integer (surrounding whitespace is ignored), or not strictly
// positive, so callers can treat 0 as "knob not set". Used for optional
// numeric tuning knobs.
func envPositiveInt(key string) int {
	s := os.Getenv(key)
	if s == "" {
		return 0
	}
	n, err := strconv.Atoi(strings.TrimSpace(s))
	if err != nil || n <= 0 {
		// Best-effort: a malformed or non-positive value is treated as unset.
		return 0
	}
	return n
}

// parseLogLevel maps CIX_LOG_LEVEL (debug|info|warn|error, case-insensitive)
// to a slog level. Unset or unrecognised values fall back to info.
func parseLogLevel(s string) slog.Level {
8699
switch strings.ToLower(strings.TrimSpace(s)) {
87100
case "debug":
@@ -118,6 +131,21 @@ func run() error {
118131
chunker.Configure(cfg.Languages)
119132
logger.Info("chunker languages configured", "active", chunker.SupportedLanguages())
120133

134+
// tree-sitter (wasm) memory knobs. Each parser instance loads all grammar
135+
// tables into its own linear memory (~69 MiB baseline). ChunkMaxConcurrent —
136+
// the chunker's instance-concurrency cap — is dashboard-overridable, so it's
137+
// applied below from the resolved runtime-config snapshot (not here). These
138+
// per-instance memory knobs stay ENV-only (rarely tuned).
139+
if v := envPositiveInt("CIX_CHUNK_MEM_LIMIT_PAGES"); v > 0 {
140+
tswasm.MemLimitPages = uint32(v)
141+
}
142+
if v := envPositiveInt("CIX_CHUNK_RECYCLE_GROWTH_MB"); v > 0 {
143+
tswasm.RecycleGrowthBytes = uint64(v) << 20
144+
}
145+
if v := envPositiveInt("CIX_CHUNK_MAX_IDLE"); v > 0 {
146+
tswasm.MaxIdleInstances = v
147+
}
148+
121149
// The system DB is model-INDEPENDENT (one permanent file at
122150
// cfg.SQLitePath holding accounts + catalog + parsed code). Older
123151
// builds suffixed the model name onto the path; adopt any such legacy
@@ -158,13 +186,18 @@ func run() error {
158186
return fmt.Errorf("load runtime_settings: %w", err)
159187
}
160188
snap.ApplyTo(cfg)
189+
// Apply the chunker instance-concurrency cap from the resolved snapshot
190+
// (DB > env > recommended). Live changes from the dashboard re-apply via
191+
// PutRuntimeConfig → tswasm.SetMaxConcurrent.
192+
tswasm.SetMaxConcurrent(snap.ChunkMaxConcurrent)
161193
logger.Info("runtime config resolved",
162194
"embedding_model", cfg.EmbeddingModel,
163195
"llama_ctx", cfg.LlamaCtxSize,
164196
"n_gpu_layers", cfg.LlamaNGpuLayers,
165197
"n_threads", cfg.LlamaNThreads,
166198
"max_concurrency", cfg.MaxEmbeddingConcurrency,
167199
"batch", cfg.LlamaBatchSize,
200+
"chunk_max_concurrent", snap.ChunkMaxConcurrent,
168201
"sources", snap.Source,
169202
)
170203
// The system DB is model-independent (opened above at cfg.SQLitePath).

server/dashboard/src/modules/server/ServerPage.tsx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ interface Draft {
2828
max_embedding_concurrency: number;
2929
llama_batch_size: number;
3030
index_embed_batch_chunks: number;
31+
chunk_max_concurrent: number;
32+
llama_cache_ram_mib: number;
3133
}
3234

3335
function configToDraft(c: RuntimeConfig): Draft {
@@ -39,6 +41,8 @@ function configToDraft(c: RuntimeConfig): Draft {
3941
max_embedding_concurrency: c.max_embedding_concurrency,
4042
llama_batch_size: c.llama_batch_size,
4143
index_embed_batch_chunks: c.index_embed_batch_chunks,
44+
chunk_max_concurrent: c.chunk_max_concurrent,
45+
llama_cache_ram_mib: c.llama_cache_ram_mib,
4246
};
4347
}
4448

@@ -58,6 +62,8 @@ function diffPatch(c: RuntimeConfig, d: Draft): { patch: RuntimeConfigUpdate; ch
5862
'max_embedding_concurrency',
5963
'llama_batch_size',
6064
'index_embed_batch_chunks',
65+
'chunk_max_concurrent',
66+
'llama_cache_ram_mib',
6167
] as const) {
6268
if (d[k] !== c[k]) {
6369
patch[k] = d[k];
@@ -203,9 +209,11 @@ export default function ServerPage() {
203209
draftCtx={draft.llama_ctx_size}
204210
draftGpuLayers={draft.llama_n_gpu_layers}
205211
draftThreads={draft.llama_n_threads}
212+
draftCacheRAM={draft.llama_cache_ram_mib}
206213
onDraftCtx={(n) => setDraft({ ...draft, llama_ctx_size: n })}
207214
onDraftGpuLayers={(n) => setDraft({ ...draft, llama_n_gpu_layers: n })}
208215
onDraftThreads={(n) => setDraft({ ...draft, llama_n_threads: n })}
216+
onDraftCacheRAM={(n) => setDraft({ ...draft, llama_cache_ram_mib: n })}
209217
/>
210218

211219
<SidecarSection />
@@ -224,9 +232,11 @@ export default function ServerPage() {
224232
draftConcurrency={draft.max_embedding_concurrency}
225233
draftBatch={draft.llama_batch_size}
226234
draftIndexBatch={draft.index_embed_batch_chunks}
235+
draftChunkConc={draft.chunk_max_concurrent}
227236
onDraftConcurrency={(n) => setDraft({ ...draft, max_embedding_concurrency: n })}
228237
onDraftBatch={(n) => setDraft({ ...draft, llama_batch_size: n })}
229238
onDraftIndexBatch={(n) => setDraft({ ...draft, index_embed_batch_chunks: n })}
239+
onDraftChunkConc={(n) => setDraft({ ...draft, chunk_max_concurrent: n })}
230240
isOllama={showOllamaSections}
231241
/>
232242

server/dashboard/src/modules/server/sections/AdvancedSection.tsx

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ interface Props {
1010
draftConcurrency: number;
1111
draftBatch: number;
1212
draftIndexBatch: number;
13+
draftChunkConc: number;
1314
onDraftConcurrency: (n: number) => void;
1415
onDraftBatch: (n: number) => void;
1516
onDraftIndexBatch: (n: number) => void;
17+
onDraftChunkConc: (n: number) => void;
1618
// isOllama controls whether the llama-only batch-size field is
1719
// rendered. Concurrency (the Service-level queue depth) applies to
1820
// every provider — caps how many parallel /v1/embeddings POSTs go
@@ -29,14 +31,17 @@ export function AdvancedSection({
2931
draftConcurrency,
3032
draftBatch,
3133
draftIndexBatch,
34+
draftChunkConc,
3235
onDraftConcurrency,
3336
onDraftBatch,
3437
onDraftIndexBatch,
38+
onDraftChunkConc,
3539
isOllama,
3640
}: Props) {
3741
const concId = useId();
3842
const batchId = useId();
3943
const idxBatchId = useId();
44+
const chunkConcId = useId();
4045
const rec = config?.recommended;
4146
const src = config?.source;
4247

@@ -122,6 +127,37 @@ export function AdvancedSection({
122127
</p>
123128
</div>
124129

130+
<div className="space-y-1.5">
131+
<div className="flex items-center justify-between gap-2">
132+
<Label htmlFor={chunkConcId} className="font-medium">
133+
Chunker concurrency
134+
<span className="ml-2 font-normal text-muted-foreground text-xs">(chunk_max_concurrent)</span>
135+
</Label>
136+
<SourcePill source={src?.chunk_max_concurrent} />
137+
</div>
138+
<Input
139+
id={chunkConcId}
140+
type="number"
141+
min={0}
142+
value={Number.isFinite(draftChunkConc) ? draftChunkConc : 0}
143+
onChange={(e) => {
144+
const n = parseInt(e.target.value, 10);
145+
onDraftChunkConc(Number.isFinite(n) ? n : 0);
146+
}}
147+
className="max-w-xs"
148+
/>
149+
<p className="text-xs text-muted-foreground">
150+
How many tree-sitter (wasm) parser instances run at once —
151+
the chunker's OWN concurrency, decoupled from embedding
152+
concurrency above. Each instance holds ~69&nbsp;MiB, so this
153+
bounds peak chunker memory regardless of how many files embed
154+
in parallel. Raise it on big multi-core boxes; lower it if the
155+
indexer is memory-pressured. Applies live (no restart).
156+
0 = recommended:{' '}
157+
<code>{rec?.chunk_max_concurrent ?? 3}</code>.
158+
</p>
159+
</div>
160+
125161
{isOllama ? (
126162
<div className="space-y-1.5">
127163
<div className="flex items-center justify-between gap-2">

server/dashboard/src/modules/server/sections/RuntimeParamsSection.tsx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,11 @@ interface Props {
5151
draftCtx: number;
5252
draftGpuLayers: number;
5353
draftThreads: number;
54+
draftCacheRAM: number;
5455
onDraftCtx: (n: number) => void;
5556
onDraftGpuLayers: (n: number) => void;
5657
onDraftThreads: (n: number) => void;
58+
onDraftCacheRAM: (n: number) => void;
5759
}
5860

5961
// RuntimeParamsSection: ctx, n_gpu_layers, n_threads, llama_cache_ram_mib form. n_gpu_layers
@@ -64,9 +66,11 @@ export function RuntimeParamsSection({
6466
draftCtx,
6567
draftGpuLayers,
6668
draftThreads,
69+
draftCacheRAM,
6770
onDraftCtx,
6871
onDraftGpuLayers,
6972
onDraftThreads,
73+
onDraftCacheRAM,
7074
}: Props) {
7175
const rec = config?.recommended;
7276
const src = config?.source;
@@ -109,6 +113,16 @@ export function RuntimeParamsSection({
109113
source={src?.llama_n_threads}
110114
onChange={onDraftThreads}
111115
/>
116+
<NumberField
117+
field="llama_cache_ram_mib"
118+
label="Host prompt cache (MiB)"
119+
hint="llama-server's in-RAM prompt cache (--cache-ram). Embeddings never reuse prompts, so 0 (disabled) is right for cix — llama's own 8192 MiB default only inflates RSS until the container hits its memory limit. -1 = unlimited."
120+
value={draftCacheRAM}
121+
recommended={rec?.llama_cache_ram_mib}
122+
source={src?.llama_cache_ram_mib}
123+
onChange={onDraftCacheRAM}
124+
min={-1}
125+
/>
112126
</CardContent>
113127
</Card>
114128
);

server/internal/config/config.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ type Config struct {
4141
// Dashboard-overridable via runtimecfg. Env: CIX_INDEX_EMBED_BATCH_CHUNKS.
4242
IndexEmbedBatchChunks int
4343

44+
// ChunkMaxConcurrent caps how many tree-sitter (wasm) parser instances run
45+
// at once — the chunker's OWN concurrency, decoupled from embedding
46+
// concurrency. Each instance holds ~69 MiB, so this bounds peak chunker
47+
// memory regardless of how many files embed in parallel. 0 → recommended (3).
48+
// Dashboard-overridable via runtimecfg. Env: CIX_CHUNK_MAX_CONCURRENT.
49+
ChunkMaxConcurrent int
50+
4451
// Phase 3 — llama-server sidecar configuration.
4552
GGUFPath string // CIX_GGUF_PATH; absolute path. Empty = auto-resolve via cache / dev-fallback / HF download.
4653
GGUFCacheDir string // CIX_GGUF_CACHE_DIR; where HF downloads land.
@@ -51,6 +58,7 @@ type Config struct {
5158
LlamaNGpuLayers int // CIX_N_GPU_LAYERS; -1 on darwin (Metal all layers), 0 elsewhere.
5259
LlamaNThreads int // CIX_LLAMA_THREADS; CPU thread count for llama-server (--threads). 0 = auto.
5360
LlamaBatchSize int // CIX_LLAMA_BATCH; llama-server logical batch size (-b). 0 = match LlamaCtxSize.
61+
LlamaCacheRAMMiB int // CIX_LLAMA_CACHE_RAM; llama-server host prompt-cache cap in MiB (--cache-ram). 0 = disabled (embeddings get zero prompt reuse; upstream default 8192 caused OOM kills), -1 = unlimited.
5462
LlamaStartupSec int // CIX_LLAMA_STARTUP_TIMEOUT; readiness probe ceiling in seconds.
5563
EmbeddingsEnabled bool // CIX_EMBEDDINGS_ENABLED; test hook to bypass sidecar entirely.
5664

@@ -271,6 +279,12 @@ func Load() (*Config, error) {
271279
}
272280
c.IndexEmbedBatchChunks = idxBatch
273281

282+
chunkConc, err := getenvInt("CIX_CHUNK_MAX_CONCURRENT", 0)
283+
if err != nil {
284+
return nil, err
285+
}
286+
c.ChunkMaxConcurrent = chunkConc
287+
274288
maxChunk, err := getenvInt("CIX_MAX_CHUNK_TOKENS", 1500)
275289
if err != nil {
276290
return nil, err
@@ -330,6 +344,16 @@ func Load() (*Config, error) {
330344
}
331345
c.LlamaBatchSize = batch
332346

347+
// CIX_LLAMA_CACHE_RAM: llama-server's host prompt cache, in MiB. The
348+
// upstream default (8192) is pure host-RAM waste for an embeddings-only
349+
// sidecar — prompts are never reused — and grew llama-server's RSS until
350+
// the container OOM-killed it. 0 (our default) disables it; -1 = unlimited.
351+
cacheRAM, err := getenvInt("CIX_LLAMA_CACHE_RAM", 0)
352+
if err != nil {
353+
return nil, err
354+
}
355+
c.LlamaCacheRAMMiB = cacheRAM
356+
333357
startup, err := getenvInt("CIX_LLAMA_STARTUP_TIMEOUT", 60)
334358
if err != nil {
335359
return nil, err

0 commit comments

Comments
 (0)