Skip to content

Commit ddd16e9

Browse files
dvcdsys and claude committed
feat(chunker): wasm/wazero tree-sitter backend — mmap memory, minified skip, doc-comment attachment
Replace gotreesitter with the official tree-sitter C runtime + 31 grammars compiled to one wasm32-wasi module (ts-core.wasm.br, brotli ~3MB) driven via wazero. No cgo: traps are contained (parse falls back to sliding window, the process survives), and the binary stays CGO_ENABLED=0.

Memory design (measured on the prod-shaped churn workload):
- linear memory is mmap-backed (experimental.WithMemoryAllocator) instead of wazero's default Go-heap append-grow: no realloc-copy garbage on growth and munmap-on-close returns recycled instances' memory to the OS immediately. Churn heapSys 1135→391MB, peak RSS 1070→535MB; full-repo chunking peak RSS 1516→787MB.
- engine pool: hard concurrency cap (dashboard-tunable), 256MiB per-instance linear-memory ceiling (2× headroom over the worst measured instance at the indexer's 512KiB file cap), high-water-mark recycling, 1 idle instance.

Chunker quality fixes:
- minified/bundled js/ts/css (.min., .bundle.js, >2KiB lines) skip the parser straight to sliding window — the pathological input class that ballooned instances for near-zero semantic value.
- a declaration's doc comment now attaches to its chunk (language-agnostic via tree-sitter's extra flag + same-row wrapper climb; verified for Go, TS, C, Python, Rust, Java). Generated files stop spraying comment-only micro chunks: openapi.gen.go 893→517 chunks, median 114→256B, symbols/refs byte-identical.

Memory-stress harnesses are committed but gated behind CIX_MEMSTRESS=1.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 7384cb1 commit ddd16e9

25 files changed

Lines changed: 2227 additions & 357 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ Grep and fuzzy file search work fine for small projects. At scale they break dow
7878
│ └── embedded Swagger UI │
7979
│ │
8080
│ Indexing pipeline │
81-
│ ├── gotreesitter (AST chunking, 200+ languages)
81+
│ ├── tree-sitter/wasm (AST chunking, 31 langs) (wazero)
8282
│ ├── llama-server sidecar (Unix socket → CodeRankEmbed Q8 GGUF) │
8383
│ ├── chromem-go (cosine similarity vector store) │
8484
│ ├── SQLite FTS5 chunk mirror (BM25 — powers hybrid workspace) │

poc/wasm-treesitter/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# build artifact — rebuilt by build.sh; the committed module lives as
2+
# server/internal/chunker/tswasm/ts-core.wasm.br (brotli)
3+
ts-core.wasm

poc/wasm-treesitter/build.sh

Lines changed: 90 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,108 @@
11
#!/usr/bin/env bash
2-
# Builds ts-ts.wasm: the OFFICIAL tree-sitter C runtime + the official
3-
# TypeScript grammar, compiled to a standalone wasm32-wasi reactor module.
4-
# No emscripten, no JS glue, no third-party Go host — just official C sources
5-
# driven from Go via wazero (see wasmts.go).
2+
# Builds ts-core.wasm: the OFFICIAL tree-sitter C runtime + the base grammars +
3+
# our host_extra.c (the batched ts_dump_tree walk), compiled to ONE standalone
4+
# wasm32-wasi reactor module via `zig cc`. No emscripten, no JS glue.
65
#
7-
# Requires: zig (provides clang + wasi-libc cross-compilation), git.
8-
# brew install zig
6+
# Requires: zig (clang + wasi-libc cross-compile), git, and tree-sitter CLI (only
7+
# for grammars whose repo ships no committed parser.c — gen=1 rows).
98
#
10-
# Key point: the only wasmtime-dependent part of the runtime (wasm_store.c) is
11-
# guarded by `#ifdef TREE_SITTER_FEATURE_WASM`, which we do NOT define — so the
12-
# stock amalgamation (lib/src/lib.c) compiles to wasi cleanly with no stubs.
9+
# For wasm we compile each grammar IN PLACE from a full clone, so relative
10+
# includes (e.g. typescript's ../../common/scanner.h) and src-root headers (html
11+
# tag.h, haskell unicode.h) resolve naturally — none of the vendor.sh copy/rewrite
12+
# dance is needed. Quirks that remain: SHA pins (dart), `tree-sitter generate`
13+
# (sql), and a 2nd grammar from one repo (tsx). See plan §6.1.
1314
set -euo pipefail
1415
cd "$(dirname "$0")"
1516

16-
TS_VERSION="${TS_VERSION:-v0.25.10}" # tree-sitter runtime
17-
TS_TS_VERSION="${TS_TS_VERSION:-v0.23.2}" # tree-sitter-typescript grammar
17+
TS_VERSION="${TS_VERSION:-v0.25.10}"
18+
OUT="${OUT:-ts-core.wasm}"
1819
WORK="$(mktemp -d)"
1920
trap 'rm -rf "$WORK"' EXIT
2021

21-
git clone --depth 1 --branch "$TS_VERSION" https://github.com/tree-sitter/tree-sitter "$WORK/tree-sitter"
22-
git clone --depth 1 --branch "$TS_TS_VERSION" https://github.com/tree-sitter/tree-sitter-typescript "$WORK/ts-typescript"
22+
# id repo ref srcsubdir [gen]
23+
GRAMMARS=(
24+
"python tree-sitter/tree-sitter-python v0.25.0 src"
25+
"typescript tree-sitter/tree-sitter-typescript v0.23.2 typescript/src"
26+
"tsx tree-sitter/tree-sitter-typescript v0.23.2 tsx/src"
27+
"javascript tree-sitter/tree-sitter-javascript v0.25.0 src"
28+
"go tree-sitter/tree-sitter-go v0.25.0 src"
29+
"rust tree-sitter/tree-sitter-rust v0.24.2 src"
30+
"java tree-sitter/tree-sitter-java v0.23.5 src"
31+
"c tree-sitter/tree-sitter-c v0.24.2 src"
32+
"cpp tree-sitter/tree-sitter-cpp v0.23.4 src"
33+
"ruby tree-sitter/tree-sitter-ruby v0.23.1 src"
34+
"c_sharp tree-sitter/tree-sitter-c-sharp v0.23.5 src"
35+
"php tree-sitter/tree-sitter-php v0.24.2 php/src"
36+
"swift alex-pinkus/tree-sitter-swift 0.7.3-with-generated-files src"
37+
"kotlin tree-sitter-grammars/tree-sitter-kotlin v1.1.0 src"
38+
"scala tree-sitter/tree-sitter-scala v0.26.0 src"
39+
"bash tree-sitter/tree-sitter-bash v0.25.1 src"
40+
"lua tree-sitter-grammars/tree-sitter-lua v0.5.0 src"
41+
"dart UserNobody14/tree-sitter-dart a9bdfa3 src"
42+
"r r-lib/tree-sitter-r v1.2.0 src"
43+
"objc tree-sitter-grammars/tree-sitter-objc v3.0.2 src"
44+
"html tree-sitter/tree-sitter-html v0.23.2 src"
45+
"css tree-sitter/tree-sitter-css v0.25.0 src"
46+
"scss tree-sitter-grammars/tree-sitter-scss v1.0.0 src"
47+
"sql DerekStride/tree-sitter-sql v0.3.11 src 1"
48+
"markdown tree-sitter-grammars/tree-sitter-markdown v0.5.3 tree-sitter-markdown/src"
49+
"zig tree-sitter-grammars/tree-sitter-zig v1.1.2 src"
50+
"julia tree-sitter/tree-sitter-julia v0.25.0 src"
51+
"fortran stadelmanma/tree-sitter-fortran v0.6.0 src"
52+
"haskell tree-sitter/tree-sitter-haskell v0.23.1 src"
53+
"ocaml tree-sitter/tree-sitter-ocaml v0.25.0 grammars/ocaml/src"
54+
"solidity JoranHonig/tree-sitter-solidity v1.2.13 src"
55+
)
2356

57+
# clone REPO REF DEST — fetch https://github.com/REPO at REF into DEST.
# Fast path: a shallow `--branch` clone, which works when REF is a tag or branch.
# Fallback: a full clone followed by `git checkout REF` (the SHA case).
# All git output is discarded. Returns 0 on success, non-zero when the
# repository cannot be cloned or REF cannot be checked out.
clone() {
  local repo="$1" ref="$2" dest="$3"

  if git clone --depth 1 --branch "$ref" "https://github.com/$repo" "$dest" >/dev/null 2>&1; then
    return 0
  fi

  if ! git clone "https://github.com/$repo" "$dest" >/dev/null 2>&1; then
    return 1
  fi

  # The checkout's exit status is the function's exit status.
  git -C "$dest" checkout "$ref" >/dev/null 2>&1
}
63+
64+
echo "→ tree-sitter runtime $TS_VERSION"
65+
git clone --depth 1 --branch "$TS_VERSION" https://github.com/tree-sitter/tree-sitter "$WORK/tree-sitter" 2>/dev/null
66+
67+
SRCS=( "$WORK/tree-sitter/lib/src/lib.c" "csrc/host_extra.c" )
68+
INCS=( -I "$WORK/tree-sitter/lib/include" -I "$WORK/tree-sitter/lib/src" )
69+
EXPORTS=()
70+
BUILT=() ; FAILED=()
71+
72+
# Walk the GRAMMARS table. For each row: clone the grammar, optionally run
# `tree-sitter generate` (gen=1 rows with no committed parser.c), then register
# its parser/scanner sources, include directory and exported language symbol.
# Rows that fail to clone or end up without a parser.c are appended to FAILED
# and skipped; successful rows are appended to BUILT.
for row in "${GRAMMARS[@]}"; do
  # Row layout: id repo ref srcsubdir [gen]
  read -r id repo ref sub gen <<<"$row"
  printf ' %-12s %s@%s ' "$id" "$repo" "$ref"

  if ! clone "$repo" "$ref" "$WORK/$id"; then
    echo "CLONE FAIL"
    FAILED+=("$id")
    continue
  fi

  gsrc="$WORK/$id/$sub"

  if [[ "${gen:-0}" == "1" && ! -f "$gsrc/parser.c" ]]; then
    # Best effort: a failed generate is caught by the parser.c check below.
    ( cd "$WORK/$id" && tree-sitter generate >/dev/null 2>&1 ) || true
  fi

  if [ ! -f "$gsrc/parser.c" ]; then
    echo "NO parser.c"
    FAILED+=("$id")
    continue
  fi

  SRCS+=( "$gsrc/parser.c" )
  # External scanners are optional and may be C or C++.
  if [ -f "$gsrc/scanner.c" ]; then
    SRCS+=( "$gsrc/scanner.c" )
  fi
  if [ -f "$gsrc/scanner.cc" ]; then
    SRCS+=( "$gsrc/scanner.cc" )
  fi

  INCS+=( -I "$gsrc" )
  EXPORTS+=( -Wl,--export=tree_sitter_$id )
  BUILT+=("$id")
  echo "ok"
done
89+
90+
echo "→ compiling ${#SRCS[@]} sources, ${#BUILT[@]} grammars → $OUT"
2491
zig cc --target=wasm32-wasi-musl -mexec-model=reactor \
25-
-I "$WORK/tree-sitter/lib/include" -I "$WORK/tree-sitter/lib/src" \
26-
-I "$WORK/ts-typescript/typescript/src" \
27-
"$WORK/tree-sitter/lib/src/lib.c" \
28-
"$WORK/ts-typescript/typescript/src/parser.c" \
29-
"$WORK/ts-typescript/typescript/src/scanner.c" \
30-
-o ts-ts.wasm -Oz -fPIC -Wl,--no-entry -Wl,--strip-debug \
92+
"${INCS[@]}" "${SRCS[@]}" \
93+
-o "$OUT" -Oz -fPIC -Wl,--no-entry -Wl,--strip-debug \
3194
-Wl,--export=malloc -Wl,--export=free \
3295
-Wl,--export=ts_parser_new -Wl,--export=ts_parser_delete \
3396
-Wl,--export=ts_parser_set_language -Wl,--export=ts_parser_parse_string \
34-
-Wl,--export=ts_tree_root_node -Wl,--export=ts_tree_delete \
97+
-Wl,--export=ts_parser_reset \
98+
-Wl,--export=ts_tree_delete -Wl,--export=ts_tree_root_node \
3599
-Wl,--export=ts_node_child_count -Wl,--export=ts_node_child \
36100
-Wl,--export=ts_node_type -Wl,--export=ts_node_start_byte \
37101
-Wl,--export=ts_node_end_byte -Wl,--export=ts_node_has_error \
38-
-Wl,--export=tree_sitter_typescript
102+
-Wl,--export=ts_dump_tree -Wl,--export=ts_dump_rec_size \
103+
-Wl,--export=ts_language_symbol_count -Wl,--export=ts_language_symbol_name \
104+
"${EXPORTS[@]}"
39105

40-
echo "built ts-ts.wasm ($(du -h ts-ts.wasm | cut -f1)) from tree-sitter $TS_VERSION + tree-sitter-typescript $TS_TS_VERSION"
106+
echo "built $OUT ($(du -h "$OUT" | cut -f1)) — runtime $TS_VERSION, ${#BUILT[@]} grammars"
107+
[ ${#FAILED[@]} -gt 0 ] && echo "FAILED: ${FAILED[*]}"
108+
echo "grammars: ${BUILT[*]}"
poc/wasm-treesitter/csrc/host_extra.c

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// host_extra.c — custom exports compiled INTO the wasm module alongside the
2+
// official tree-sitter runtime. The whole point is `ts_dump_tree`: it walks the
3+
// parsed tree ENTIRELY inside the guest and writes a flat pre-order array of
4+
// fixed-size records into linear memory in ONE shot. The host then does a single
5+
// Memory.Read and runs the chunker's node matching in pure Go — turning the
6+
// ~3 wazero calls-per-node of the naive walk into ~1 call per parse.
7+
//
8+
// See docs/wasm-treesitter-implementation-plan.md §7.2 / §7.3.
9+
#include "tree_sitter/api.h"
10+
#include <stdint.h>
11+
12+
// NodeRec is one flattened tree node as read by the host. ALL fields are
// uint32_t, so the Go side reads 9 little-endian uint32s with zero packing
// ambiguity (record size = 36 bytes). kind_id is the TSSymbol (fits in 16 bits;
// widened for clean alignment). The host resolves kind_id -> kind-name once per
// language via ts_language_symbol_name, so the per-node string lookup never
// crosses the boundary.
typedef struct {
  uint32_t kind_id;    // ts_node_symbol
  uint32_t start_byte;
  uint32_t end_byte;
  uint32_t start_row;  // ts_node_start_point().row (0-based line)
  uint32_t start_col;  // ts_node_start_point().column
  uint32_t end_row;    // ts_node_end_point().row
  uint32_t end_col;    // ts_node_end_point().column
  uint32_t depth;      // pre-order depth (parent reconstruction via depth stack)
  uint32_t flags;      // bit0 named, bit1 error, bit2 missing, bit3 extra
} NodeRec;

// The host decodes records at a fixed 36-byte stride. Fail the build instead of
// shipping a module whose record layout silently disagrees with that contract.
_Static_assert(sizeof(NodeRec) == 9 * sizeof(uint32_t),
               "NodeRec must be exactly 9 packed uint32 fields (36 bytes)");
28+
29+
// emit fills record `i` of `out` from node `n`: symbol id, byte span, start/end
// points, the caller-supplied pre-order `depth`, and a flag word
// (bit0 named, bit1 error, bit2 missing, bit3 extra).
// The caller must guarantee that `out` has room for index `i`.
static void emit(NodeRec *out, uint32_t i, TSNode n, uint32_t depth) {
  NodeRec *rec = &out[i];
  const TSPoint start = ts_node_start_point(n);
  const TSPoint end = ts_node_end_point(n);

  rec->kind_id = ts_node_symbol(n);
  rec->start_byte = ts_node_start_byte(n);
  rec->end_byte = ts_node_end_byte(n);
  rec->start_row = start.row;
  rec->start_col = start.column;
  rec->end_row = end.row;
  rec->end_col = end.column;
  rec->depth = depth;
  rec->flags = (ts_node_is_named(n) ? 1u : 0u)
             | (ts_node_is_error(n) ? 2u : 0u)
             | (ts_node_is_missing(n) ? 4u : 0u)
             | (ts_node_is_extra(n) ? 8u : 0u);
}
47+
48+
// ts_dump_tree writes up to `cap` records and returns the TRUE node count. If the
49+
// return value > cap the buffer was too small: the host re-mallocs to the
50+
// returned count and calls again. Iterative pre-order DFS via a tree cursor — no
51+
// recursion, no per-node malloc, no boundary crossings.
52+
uint32_t ts_dump_tree(const TSTree *tree, NodeRec *out, uint32_t cap) {
53+
if (tree == 0) return 0;
54+
TSTreeCursor cur = ts_tree_cursor_new(ts_tree_root_node(tree));
55+
uint32_t count = 0;
56+
uint32_t depth = 0;
57+
for (;;) {
58+
if (count < cap) {
59+
emit(out, count, ts_tree_cursor_current_node(&cur), depth);
60+
}
61+
count++;
62+
if (ts_tree_cursor_goto_first_child(&cur)) { depth++; continue; }
63+
for (;;) {
64+
if (ts_tree_cursor_goto_next_sibling(&cur)) break;
65+
if (!ts_tree_cursor_goto_parent(&cur)) {
66+
ts_tree_cursor_delete(&cur);
67+
return count;
68+
}
69+
depth--;
70+
}
71+
}
72+
}
73+
74+
// recSize lets the host assert its struct layout matches the guest's.
75+
uint32_t ts_dump_rec_size(void) { return (uint32_t)sizeof(NodeRec); }

poc/wasm-treesitter/go.mod

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ module github.com/dvcdsys/code-index/poc/wasm-treesitter
22

33
go 1.25.3
44

5-
require (
6-
github.com/tetratelabs/wazero v1.12.0 // indirect
7-
golang.org/x/sys v0.44.0 // indirect
8-
)
5+
require github.com/tetratelabs/wazero v1.12.0
6+
7+
require golang.org/x/sys v0.44.0 // indirect
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package wasmts
2+
3+
import (
4+
"context"
5+
"testing"
6+
)
7+
8+
// minimal valid-ish snippet per base grammar — enough to exercise load + parse +
9+
// per-language symbol-table resolution through the batched path. We assert the
10+
// language loads, parsing does not trap, and produces nodes; per-language SYMBOL
11+
// correctness (which kinds = function/class/...) is the next phase.
12+
var smoke = []struct {
13+
id string
14+
src string
15+
}{
16+
{"python", "def f():\n pass\n"},
17+
{"typescript", "function f(){}\n"},
18+
{"tsx", "const x = 1;\n"},
19+
{"javascript", "function f(){}\n"},
20+
{"go", "package m\nfunc F(){}\n"},
21+
{"rust", "fn f(){}\n"},
22+
{"java", "class C{}\n"},
23+
{"c", "int f(){return 0;}\n"},
24+
{"cpp", "int f(){return 0;}\n"},
25+
{"ruby", "def f\nend\n"},
26+
{"c_sharp", "class C{}\n"},
27+
{"php", "<?php function f(){}\n"},
28+
{"swift", "func f(){}\n"},
29+
{"kotlin", "fun f(){}\n"},
30+
{"scala", "object O{}\n"},
31+
{"bash", "f(){ echo hi; }\n"},
32+
{"lua", "function f() end\n"},
33+
{"dart", "void f(){}\n"},
34+
{"r", "f <- function() {}\n"},
35+
{"objc", "int f(){return 0;}\n"},
36+
{"html", "<div></div>\n"},
37+
{"css", "a{color:red}\n"},
38+
{"scss", "a{color:red}\n"},
39+
{"sql", "SELECT 1;\n"},
40+
{"markdown", "# Title\n"},
41+
{"zig", "fn f() void {}\n"},
42+
{"julia", "function f() end\n"},
43+
{"fortran", "program p\nend program p\n"},
44+
{"haskell", "main = return ()\n"},
45+
{"ocaml", "let f () = ()\n"},
46+
{"solidity", "contract C {}\n"},
47+
}
48+
49+
func TestAllGrammarsLoad(t *testing.T) {
50+
eng, err := New(context.Background(), 0)
51+
if err != nil {
52+
t.Fatal(err)
53+
}
54+
defer eng.Close()
55+
56+
if len(smoke) != 31 {
57+
t.Fatalf("expected 31 grammars in smoke set, got %d", len(smoke))
58+
}
59+
60+
for _, c := range smoke {
61+
t.Run(c.id, func(t *testing.T) {
62+
nodes, err := eng.ParseNodes("tree_sitter_"+c.id, []byte(c.src))
63+
if err != nil {
64+
t.Fatalf("parse trapped: %v", err)
65+
}
66+
if len(nodes) == 0 {
67+
t.Fatal("no nodes produced")
68+
}
69+
// the root node must carry a resolved kind name (symbol table works)
70+
if nodes[0].Kind == "" {
71+
t.Errorf("root kind unresolved (symbol table empty?)")
72+
}
73+
})
74+
}
75+
}

0 commit comments

Comments
 (0)