dvcdsys
diff --git a/‎doc/openapi.yaml‎
Lines changed: 21 additions & 0 deletions b/‎doc/openapi.yaml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎server/cmd/cix-server/main.go‎
Lines changed: 33 additions & 0 deletions b/‎server/cmd/cix-server/main.go‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎server/dashboard/src/modules/server/ServerPage.tsx‎
Lines changed: 10 additions & 0 deletions b/‎server/dashboard/src/modules/server/ServerPage.tsx‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎server/dashboard/src/modules/server/sections/AdvancedSection.tsx‎
Lines changed: 36 additions & 0 deletions b/‎server/dashboard/src/modules/server/sections/AdvancedSection.tsx‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎server/dashboard/src/modules/server/sections/RuntimeParamsSection.tsx‎
Lines changed: 14 additions & 0 deletions b/‎server/dashboard/src/modules/server/sections/RuntimeParamsSection.tsx‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎server/internal/config/config.go‎
Lines changed: 24 additions & 0 deletions b/‎server/internal/config/config.go‎
Lines changed: 24 additions & 0 deletions
@@ -3245,6 +3245,8 @@ components:
         - max_embedding_concurrency
         - llama_batch_size
         - index_embed_batch_chunks
+        - chunk_max_concurrent
+        - llama_cache_ram_mib
         - source
       properties:
         embedding_model:
@@ -3270,6 +3272,14 @@ components:
           type: integer
           minimum: 0
           description: Cross-file embed-batch size for repo indexing (chunks per embed call). 0 = one call per file.
+        chunk_max_concurrent:
+          type: integer
+          minimum: 0
+          description: Chunker (tree-sitter wasm) instance-concurrency cap, decoupled from embedding concurrency. Each instance holds ~69 MiB. 0 = recommended (3).
+        llama_cache_ram_mib:
+          type: integer
+          minimum: -1
+          description: llama-server host prompt-cache cap in MiB (--cache-ram). 0 = disabled (recommended for embeddings — prompts are never reused, and llama's upstream 8 GiB default grows RSS until the container OOMs), -1 = unlimited.
         source:
           type: object
           additionalProperties:
@@ -3301,6 +3311,8 @@ components:
         - max_embedding_concurrency
         - llama_batch_size
         - index_embed_batch_chunks
+        - chunk_max_concurrent
+        - llama_cache_ram_mib
       properties:
         embedding_model: { type: string }
         llama_ctx_size: { type: integer }
@@ -3309,6 +3321,8 @@ components:
         max_embedding_concurrency: { type: integer }
         llama_batch_size: { type: integer }
         index_embed_batch_chunks: { type: integer }
+        chunk_max_concurrent: { type: integer }
+        llama_cache_ram_mib: { type: integer }
 
     RuntimeConfigUpdate:
       type: object
@@ -3339,6 +3353,13 @@ components:
         index_embed_batch_chunks:
           type: integer
           nullable: true
+        chunk_max_concurrent:
+          type: integer
+          nullable: true
+        llama_cache_ram_mib:
+          type: integer
+          nullable: true
+          description: MiB; 0 clears the override (falls back to env / recommended = disabled), -1 = unlimited.
 
     SidecarStatus:
       type: object
 
@@ -13,12 +13,14 @@ import (
 	_ "net/http/pprof" // opt-in heap/CPU profiling, exposed only when CIX_PPROF_ADDR is set
 	"os"
 	"os/signal"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
 
 	"github.com/dvcdsys/code-index/server/internal/apikeys"
 	"github.com/dvcdsys/code-index/server/internal/chunker"
+	"github.com/dvcdsys/code-index/server/internal/chunker/tswasm"
 	"github.com/dvcdsys/code-index/server/internal/config"
 	"github.com/dvcdsys/code-index/server/internal/db"
 	"github.com/dvcdsys/code-index/server/internal/embeddings"
@@ -82,6 +84,17 @@ func main() {
 
+// envPositiveInt returns the env var's value when it parses as an integer > 0.
+// Unset, empty, unparsable or non-positive values yield 0 (treated as "not set").
+func envPositiveInt(key string) int {
+	if s := os.Getenv(key); s != "" {
+		if n, err := strconv.Atoi(strings.TrimSpace(s)); err == nil && n > 0 {
+			return n
+		}
+	}
+	return 0
+}
+
 // parseLogLevel maps CIX_LOG_LEVEL (debug|info|warn|error, case-insensitive)
 // to a slog level. Unset or unrecognised values fall back to info.
 func parseLogLevel(s string) slog.Level {
 	switch strings.ToLower(strings.TrimSpace(s)) {
 	case "debug":
@@ -118,6 +131,21 @@ func run() error {
 	chunker.Configure(cfg.Languages)
 	logger.Info("chunker languages configured", "active", chunker.SupportedLanguages())
 
+	// tree-sitter (wasm) memory knobs. Each parser instance loads all grammar
+	// tables into its own linear memory (~69 MiB baseline). ChunkMaxConcurrent —
+	// the chunker's instance-concurrency cap — is dashboard-overridable, so it's
+	// applied below from the resolved runtime-config snapshot (not here). These
+	// per-instance memory knobs stay ENV-only (rarely tuned).
+	if v := envPositiveInt("CIX_CHUNK_MEM_LIMIT_PAGES"); v > 0 {
+		tswasm.MemLimitPages = uint32(v)
+	}
+	if v := envPositiveInt("CIX_CHUNK_RECYCLE_GROWTH_MB"); v > 0 {
+		tswasm.RecycleGrowthBytes = uint64(v) << 20
+	}
+	if v := envPositiveInt("CIX_CHUNK_MAX_IDLE"); v > 0 {
+		tswasm.MaxIdleInstances = v
+	}
+
 	// The system DB is model-INDEPENDENT (one permanent file at
 	// cfg.SQLitePath holding accounts + catalog + parsed code). Older
 	// builds suffixed the model name onto the path; adopt any such legacy
@@ -158,13 +186,18 @@ func run() error {
 		return fmt.Errorf("load runtime_settings: %w", err)
 	}
 	snap.ApplyTo(cfg)
+	// Apply the chunker instance-concurrency cap from the resolved snapshot
+	// (DB > env > recommended). Live changes from the dashboard re-apply via
+	// PutRuntimeConfig → tswasm.SetMaxConcurrent.
+	tswasm.SetMaxConcurrent(snap.ChunkMaxConcurrent)
 	logger.Info("runtime config resolved",
 		"embedding_model", cfg.EmbeddingModel,
 		"llama_ctx", cfg.LlamaCtxSize,
 		"n_gpu_layers", cfg.LlamaNGpuLayers,
 		"n_threads", cfg.LlamaNThreads,
 		"max_concurrency", cfg.MaxEmbeddingConcurrency,
 		"batch", cfg.LlamaBatchSize,
+		"chunk_max_concurrent", snap.ChunkMaxConcurrent,
 		"sources", snap.Source,
 	)
 	// The system DB is model-independent (opened above at cfg.SQLitePath).
 
@@ -28,6 +28,8 @@ interface Draft {
   max_embedding_concurrency: number;
   llama_batch_size: number;
   index_embed_batch_chunks: number;
+  chunk_max_concurrent: number;
+  llama_cache_ram_mib: number;
 }
 
 function configToDraft(c: RuntimeConfig): Draft {
@@ -39,6 +41,8 @@ function configToDraft(c: RuntimeConfig): Draft {
     max_embedding_concurrency: c.max_embedding_concurrency,
     llama_batch_size: c.llama_batch_size,
     index_embed_batch_chunks: c.index_embed_batch_chunks,
+    chunk_max_concurrent: c.chunk_max_concurrent,
+    llama_cache_ram_mib: c.llama_cache_ram_mib,
   };
 }
 
@@ -58,6 +62,8 @@ function diffPatch(c: RuntimeConfig, d: Draft): { patch: RuntimeConfigUpdate; ch
     'max_embedding_concurrency',
     'llama_batch_size',
     'index_embed_batch_chunks',
+    'chunk_max_concurrent',
+    'llama_cache_ram_mib',
   ] as const) {
     if (d[k] !== c[k]) {
       patch[k] = d[k];
@@ -203,9 +209,11 @@ export default function ServerPage() {
             draftCtx={draft.llama_ctx_size}
             draftGpuLayers={draft.llama_n_gpu_layers}
             draftThreads={draft.llama_n_threads}
+            draftCacheRAM={draft.llama_cache_ram_mib}
             onDraftCtx={(n) => setDraft({ ...draft, llama_ctx_size: n })}
             onDraftGpuLayers={(n) => setDraft({ ...draft, llama_n_gpu_layers: n })}
             onDraftThreads={(n) => setDraft({ ...draft, llama_n_threads: n })}
+            onDraftCacheRAM={(n) => setDraft({ ...draft, llama_cache_ram_mib: n })}
           />
 
           <SidecarSection />
@@ -224,9 +232,11 @@ export default function ServerPage() {
         draftConcurrency={draft.max_embedding_concurrency}
         draftBatch={draft.llama_batch_size}
         draftIndexBatch={draft.index_embed_batch_chunks}
+        draftChunkConc={draft.chunk_max_concurrent}
         onDraftConcurrency={(n) => setDraft({ ...draft, max_embedding_concurrency: n })}
         onDraftBatch={(n) => setDraft({ ...draft, llama_batch_size: n })}
         onDraftIndexBatch={(n) => setDraft({ ...draft, index_embed_batch_chunks: n })}
+        onDraftChunkConc={(n) => setDraft({ ...draft, chunk_max_concurrent: n })}
         isOllama={showOllamaSections}
       />
 
 
@@ -10,9 +10,11 @@ interface Props {
   draftConcurrency: number;
   draftBatch: number;
   draftIndexBatch: number;
+  draftChunkConc: number;
   onDraftConcurrency: (n: number) => void;
   onDraftBatch: (n: number) => void;
   onDraftIndexBatch: (n: number) => void;
+  onDraftChunkConc: (n: number) => void;
   // isOllama controls whether the llama-only batch-size field is
   // rendered. Concurrency (the Service-level queue depth) applies to
   // every provider — caps how many parallel /v1/embeddings POSTs go
@@ -29,14 +31,17 @@ export function AdvancedSection({
   draftConcurrency,
   draftBatch,
   draftIndexBatch,
+  draftChunkConc,
   onDraftConcurrency,
   onDraftBatch,
   onDraftIndexBatch,
+  onDraftChunkConc,
   isOllama,
 }: Props) {
   const concId = useId();
   const batchId = useId();
   const idxBatchId = useId();
+  const chunkConcId = useId();
   const rec = config?.recommended;
   const src = config?.source;
 
@@ -122,6 +127,37 @@ export function AdvancedSection({
               </p>
             </div>
 
+            <div className="space-y-1.5">
+              <div className="flex items-center justify-between gap-2">
+                <Label htmlFor={chunkConcId} className="font-medium">
+                  Chunker concurrency
+                  <span className="ml-2 font-normal text-muted-foreground text-xs">(chunk_max_concurrent)</span>
+                </Label>
+                <SourcePill source={src?.chunk_max_concurrent} />
+              </div>
+              <Input
+                id={chunkConcId}
+                type="number"
+                min={0}
+                value={Number.isFinite(draftChunkConc) ? draftChunkConc : 0}
+                onChange={(e) => {
+                  const n = parseInt(e.target.value, 10);
+                  onDraftChunkConc(Number.isFinite(n) ? n : 0);
+                }}
+                className="max-w-xs"
+              />
+              <p className="text-xs text-muted-foreground">
+                How many tree-sitter (wasm) parser instances run at once —
+                the chunker's OWN concurrency, decoupled from embedding
+                concurrency above. Each instance holds ~69&nbsp;MiB, so this
+                bounds peak chunker memory regardless of how many files embed
+                in parallel. Raise it on big multi-core boxes; lower it if the
+                indexer is memory-pressured. Applies live (no restart).
+                0 = recommended:{' '}
+                <code>{rec?.chunk_max_concurrent ?? 3}</code>.
+              </p>
+            </div>
+
             {isOllama ? (
               <div className="space-y-1.5">
                 <div className="flex items-center justify-between gap-2">
 
@@ -51,9 +51,11 @@ interface Props {
   draftCtx: number;
   draftGpuLayers: number;
   draftThreads: number;
+  draftCacheRAM: number;
   onDraftCtx: (n: number) => void;
   onDraftGpuLayers: (n: number) => void;
   onDraftThreads: (n: number) => void;
+  onDraftCacheRAM: (n: number) => void;
 }
 
 // RuntimeParamsSection: ctx, n_gpu_layers, n_threads form. n_gpu_layers
@@ -64,9 +66,11 @@ export function RuntimeParamsSection({
   draftCtx,
   draftGpuLayers,
   draftThreads,
+  draftCacheRAM,
   onDraftCtx,
   onDraftGpuLayers,
   onDraftThreads,
+  onDraftCacheRAM,
 }: Props) {
   const rec = config?.recommended;
   const src = config?.source;
@@ -109,6 +113,16 @@ export function RuntimeParamsSection({
           source={src?.llama_n_threads}
           onChange={onDraftThreads}
         />
+        <NumberField
+          field="llama_cache_ram_mib"
+          label="Host prompt cache (MiB)"
+          hint="llama-server's in-RAM prompt cache (--cache-ram). Embeddings never reuse prompts, so 0 (disabled) is right for cix — llama's own 8192 MiB default only inflates RSS until the container hits its memory limit. -1 = unlimited."
+          value={draftCacheRAM}
+          recommended={rec?.llama_cache_ram_mib}
+          source={src?.llama_cache_ram_mib}
+          onChange={onDraftCacheRAM}
+          min={-1}
+        />
       </CardContent>
     </Card>
   );
 
@@ -41,6 +41,13 @@ type Config struct {
 	// Dashboard-overridable via runtimecfg. Env: CIX_INDEX_EMBED_BATCH_CHUNKS.
 	IndexEmbedBatchChunks int
 
+	// ChunkMaxConcurrent caps how many tree-sitter (wasm) parser instances run
+	// at once — the chunker's OWN concurrency, decoupled from embedding
+	// concurrency. Each instance holds ~69 MiB, so this bounds peak chunker
+	// memory regardless of how many files embed in parallel. 0 → recommended (3).
+	// Dashboard-overridable via runtimecfg. Env: CIX_CHUNK_MAX_CONCURRENT.
+	ChunkMaxConcurrent int
+
 	// Phase 3 — llama-server sidecar configuration.
 	GGUFPath          string // CIX_GGUF_PATH; absolute path. Empty = auto-resolve via cache / dev-fallback / HF download.
 	GGUFCacheDir      string // CIX_GGUF_CACHE_DIR; where HF downloads land.
@@ -51,6 +58,7 @@ type Config struct {
 	LlamaNGpuLayers   int    // CIX_N_GPU_LAYERS; -1 on darwin (Metal all layers), 0 elsewhere.
 	LlamaNThreads     int    // CIX_LLAMA_THREADS; CPU thread count for llama-server (--threads). 0 = auto.
 	LlamaBatchSize    int    // CIX_LLAMA_BATCH; llama-server logical batch size (-b). 0 = match LlamaCtxSize.
+	LlamaCacheRAMMiB  int    // CIX_LLAMA_CACHE_RAM; llama-server host prompt-cache cap in MiB (--cache-ram). 0 = disabled (embeddings get zero prompt reuse; upstream default 8192 caused OOM kills), -1 = unlimited.
 	LlamaStartupSec   int    // CIX_LLAMA_STARTUP_TIMEOUT; readiness probe ceiling in seconds.
 	EmbeddingsEnabled bool   // CIX_EMBEDDINGS_ENABLED; test hook to bypass sidecar entirely.
 
@@ -271,6 +279,12 @@ func Load() (*Config, error) {
 	}
 	c.IndexEmbedBatchChunks = idxBatch
 
+	chunkConc, err := getenvInt("CIX_CHUNK_MAX_CONCURRENT", 0)
+	if err != nil {
+		return nil, err
+	}
+	c.ChunkMaxConcurrent = chunkConc
+
 	maxChunk, err := getenvInt("CIX_MAX_CHUNK_TOKENS", 1500)
 	if err != nil {
 		return nil, err
@@ -330,6 +344,16 @@ func Load() (*Config, error) {
 	}
 	c.LlamaBatchSize = batch
 
+	// CIX_LLAMA_CACHE_RAM: llama-server's host prompt cache, in MiB. The
+	// upstream default (8192) is pure host-RAM waste for an embeddings-only
+	// sidecar — prompts are never reused — and grew llama-server's RSS until
+	// the container OOM-killed it. 0 (our default) disables it; -1 = unlimited.
+	cacheRAM, err := getenvInt("CIX_LLAMA_CACHE_RAM", 0)
+	if err != nil {
+		return nil, err
+	}
+	c.LlamaCacheRAMMiB = cacheRAM
+
 	startup, err := getenvInt("CIX_LLAMA_STARTUP_TIMEOUT", 60)
 	if err != nil {
 		return nil, err