Skip to content

Commit 871613e

Browse files
committed
chore: include pre-built dist/ for direct GitHub install [fork-only]
1 parent 093925a commit 871613e

33 files changed

Lines changed: 13124 additions & 1 deletion

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
node_modules/
2-
dist/
2+
# dist/ tracked on this branch for direct GitHub install (no build step needed).
33
package-lock.json
44
.npmrc
55
*.sqlite

dist/ast.d.ts

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/**
2+
* AST-aware chunking support via web-tree-sitter.
3+
*
4+
* Provides language detection, AST break point extraction for supported
5+
* code file types, and a stub for future symbol extraction.
6+
*
7+
* All functions degrade gracefully: parse failures or unsupported languages
8+
* return empty arrays, falling back to regex-only chunking.
9+
*
10+
* ## Dependency Note
11+
*
12+
* Grammar packages (tree-sitter-typescript, etc.) are listed as
13+
* optionalDependencies with pinned versions. They ship native prebuilds
14+
* and source files (~72 MB total) but QMD only uses the .wasm files
15+
* (~5 MB). If install size becomes a concern, the .wasm files can be
16+
* bundled directly in the repo (e.g. assets/grammars/) and resolved
17+
* via import.meta.url instead of require.resolve(), eliminating the
18+
* grammar packages entirely.
19+
*/
20+
import type { BreakPoint } from "./store.js";
/** Languages supported by AST-aware chunking. */
export type SupportedLanguage = "typescript" | "tsx" | "javascript" | "python" | "go" | "rust";
/**
 * Detect language from file path extension.
 * Returns null for unsupported or unknown extensions (including .md).
 *
 * @param filepath - Path whose extension selects the language.
 * @returns The detected language, or null when the extension is not supported.
 */
export declare function detectLanguage(filepath: string): SupportedLanguage | null;
/**
 * Parse a source file and return break points at AST node boundaries.
 *
 * Returns an empty array for unsupported languages, parse failures,
 * or grammar loading failures. Never throws.
 *
 * @param content - The file content to parse.
 * @param filepath - The file path (used for language detection).
 * @returns Array of BreakPoint objects suitable for merging with regex break points.
 */
export declare function getASTBreakPoints(content: string, filepath: string): Promise<BreakPoint[]>;
/**
 * Check which tree-sitter grammars are available.
 *
 * @returns An overall `available` flag plus one status entry per supported
 * language; `error` is set on entries whose grammar or query could not be
 * prepared.
 */
export declare function getASTStatus(): Promise<{
    available: boolean;
    languages: {
        language: SupportedLanguage;
        available: boolean;
        error?: string;
    }[];
}>;
/**
 * Metadata about a code symbol within a chunk.
 *
 * Part of the Phase 2 symbol-extraction stub: extractSymbols() currently
 * returns an empty array, so no values of this shape are produced in Phase 1.
 */
export interface SymbolInfo {
    name: string;
    kind: string;
    signature?: string;
    line: number;
}
/**
 * Extract symbol metadata for code within a byte range.
 * Stubbed for Phase 2 — returns empty array.
 */
export declare function extractSymbols(_content: string, _language: string, _startPos: number, _endPos: number): SymbolInfo[];

dist/ast.js

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
/**
2+
* AST-aware chunking support via web-tree-sitter.
3+
*
4+
* Provides language detection, AST break point extraction for supported
5+
* code file types, and a stub for future symbol extraction.
6+
*
7+
* All functions degrade gracefully: parse failures or unsupported languages
8+
* return empty arrays, falling back to regex-only chunking.
9+
*
10+
* ## Dependency Note
11+
*
12+
* Grammar packages (tree-sitter-typescript, etc.) are listed as
13+
* optionalDependencies with pinned versions. They ship native prebuilds
14+
* and source files (~72 MB total) but QMD only uses the .wasm files
15+
* (~5 MB). If install size becomes a concern, the .wasm files can be
16+
* bundled directly in the repo (e.g. assets/grammars/) and resolved
17+
* via import.meta.url instead of require.resolve(), eliminating the
18+
* grammar packages entirely.
19+
*/
20+
import { createRequire } from "node:module";
21+
import { extname } from "node:path";
/**
 * Maps a lower-cased file extension (including the leading dot) to the
 * language used to parse it. `.jsx` is routed to the "tsx" entry.
 */
const EXTENSION_MAP = {
    ".ts": "typescript",
    ".tsx": "tsx",
    ".js": "javascript",
    ".jsx": "tsx",
    ".mts": "typescript",
    ".cts": "typescript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".py": "python",
    ".go": "go",
    ".rs": "rust",
};
/**
 * Detect language from file path extension.
 * Returns null for unsupported or unknown extensions (including .md).
 *
 * @param filepath - Path whose extension selects the language.
 * @returns The EXTENSION_MAP entry for the extension, or null if there is none.
 */
export function detectLanguage(filepath) {
    // Lower-case so that e.g. ".TS" and ".ts" resolve to the same entry.
    const ext = extname(filepath).toLowerCase();
    return EXTENSION_MAP[ext] ?? null;
}
43+
// =============================================================================
44+
// Grammar Resolution
45+
// =============================================================================
/**
 * Maps language to the npm package and wasm filename for the grammar.
 *
 * Note that "javascript" points at the TypeScript grammar wasm, so it shares
 * a grammar file with "typescript".
 */
const GRAMMAR_MAP = {
    typescript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
    tsx: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-tsx.wasm" },
    javascript: { pkg: "tree-sitter-typescript", wasm: "tree-sitter-typescript.wasm" },
    python: { pkg: "tree-sitter-python", wasm: "tree-sitter-python.wasm" },
    go: { pkg: "tree-sitter-go", wasm: "tree-sitter-go.wasm" },
    rust: { pkg: "tree-sitter-rust", wasm: "tree-sitter-rust.wasm" },
};
57+
// =============================================================================
58+
// Per-Language Query Definitions
59+
// =============================================================================
/**
 * Tree-sitter S-expression queries for each language.
 * Each capture name maps to a break point score via SCORE_MAP.
 *
 * For TypeScript/JavaScript, we match export_statement wrappers to get the
 * correct start position (before `export`), plus bare declarations for
 * non-exported code.
 */
const LANGUAGE_QUERIES = {
    typescript: `
    (export_statement) @export
    (class_declaration) @class
    (function_declaration) @func
    (method_definition) @method
    (interface_declaration) @iface
    (type_alias_declaration) @type
    (enum_declaration) @enum
    (import_statement) @import
    (lexical_declaration (variable_declarator value: (arrow_function))) @func
    (lexical_declaration (variable_declarator value: (function_expression))) @func
  `,
    tsx: `
    (export_statement) @export
    (class_declaration) @class
    (function_declaration) @func
    (method_definition) @method
    (interface_declaration) @iface
    (type_alias_declaration) @type
    (enum_declaration) @enum
    (import_statement) @import
    (lexical_declaration (variable_declarator value: (arrow_function))) @func
    (lexical_declaration (variable_declarator value: (function_expression))) @func
  `,
    javascript: `
    (export_statement) @export
    (class_declaration) @class
    (function_declaration) @func
    (method_definition) @method
    (import_statement) @import
    (lexical_declaration (variable_declarator value: (arrow_function))) @func
    (lexical_declaration (variable_declarator value: (function_expression))) @func
  `,
    python: `
    (class_definition) @class
    (function_definition) @func
    (decorated_definition) @decorated
    (import_statement) @import
    (import_from_statement) @import
  `,
    go: `
    (type_declaration) @type
    (function_declaration) @func
    (method_declaration) @method
    (import_declaration) @import
  `,
    rust: `
    (struct_item) @struct
    (impl_item) @impl
    (function_item) @func
    (trait_item) @trait
    (enum_item) @enum
    (use_declaration) @import
    (type_item) @type
    (mod_item) @mod
  `,
};
/**
 * Score mapping from capture names to break point scores.
 * Aligned with the markdown BREAK_PATTERNS scale (h1=100, h2=90, etc.)
 * so findBestCutoff() decay works unchanged.
 */
const SCORE_MAP = {
    class: 100,
    iface: 100,
    struct: 100,
    trait: 100,
    impl: 100,
    mod: 100,
    export: 90,
    func: 90,
    method: 90,
    decorated: 90,
    type: 80,
    enum: 80,
    import: 60,
};
146+
// =============================================================================
147+
// Parser Caching & Initialization
148+
// =============================================================================
/** web-tree-sitter classes, assigned by ensureInit(); null until then. */
let ParserClass = null;
let LanguageClass = null;
let QueryClass = null;
/** Promise for the one-time web-tree-sitter initialization; null until first requested. */
let initPromise = null;
/** Languages that have already failed to load — warn only once per process. */
const failedLanguages = new Set();
/** Cached grammar load promises. */
const grammarCache = new Map();
/** Cached compiled queries per language. */
const queryCache = new Map();
/**
 * Initialize web-tree-sitter. Called once and cached.
 *
 * The first call dynamically imports web-tree-sitter, stores its Parser,
 * Language and Query exports in the module-level variables, and awaits
 * Parser.init(). Every call returns the same promise; it is never reset,
 * so a failed initialization is returned to later callers as well.
 */
async function ensureInit() {
    if (!initPromise) {
        initPromise = (async () => {
            const mod = await import("web-tree-sitter");
            ParserClass = mod.Parser;
            LanguageClass = mod.Language;
            QueryClass = mod.Query;
            await ParserClass.init();
        })();
    }
    return initPromise;
}
/**
 * Resolve the filesystem path to a grammar .wasm file.
 * Uses createRequire to resolve from installed dependency packages.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The resolved path of `<pkg>/<wasm>` for that language.
 */
function resolveGrammarPath(language) {
    const { pkg, wasm } = GRAMMAR_MAP[language];
    // ES modules have no ambient require(); build one anchored at this file.
    const require = createRequire(import.meta.url);
    return require.resolve(`${pkg}/${wasm}`);
}
/**
 * Load and cache a grammar for the given language.
 * Returns null on failure (logs once per language).
 *
 * Load promises are cached by wasm filename, so languages that share a
 * grammar file share a single load.
 *
 * @param language - A key of GRAMMAR_MAP.
 * @returns The loaded grammar, or null if loading failed now or earlier.
 */
async function loadGrammar(language) {
    if (failedLanguages.has(language))
        return null;
    const wasmKey = GRAMMAR_MAP[language].wasm;
    if (!grammarCache.has(wasmKey)) {
        grammarCache.set(wasmKey, (async () => {
            const path = resolveGrammarPath(language);
            return LanguageClass.load(path);
        })());
    }
    try {
        return await grammarCache.get(wasmKey);
    }
    catch (err) {
        // Several callers may be awaiting the same rejected promise. Only the
        // first one to arrive records the failure and warns, so the warning
        // really is emitted once per language.
        if (!failedLanguages.has(language)) {
            failedLanguages.add(language);
            grammarCache.delete(wasmKey);
            console.warn(`[qmd] Failed to load tree-sitter grammar for ${language}: ${err}`);
        }
        return null;
    }
}
/**
 * Get or create a compiled query for the given language.
 *
 * @param language - A key of LANGUAGE_QUERIES; also the cache key.
 * @param grammar - The loaded grammar the query is compiled against.
 * @returns The cached query, compiled on first use. A compilation error
 * propagates to the caller and nothing is cached.
 */
function getQuery(language, grammar) {
    if (!queryCache.has(language)) {
        const source = LANGUAGE_QUERIES[language];
        const query = new QueryClass(grammar, source);
        queryCache.set(language, query);
    }
    return queryCache.get(language);
}
218+
// =============================================================================
219+
// AST Break Point Extraction
220+
// =============================================================================
/**
 * Parse a source file and return break points at AST node boundaries.
 *
 * Returns an empty array for unsupported languages, parse failures,
 * or grammar loading failures. Never throws.
 *
 * @param content - The file content to parse.
 * @param filepath - The file path (used for language detection).
 * @returns Array of BreakPoint objects suitable for merging with regex break points.
 */
export async function getASTBreakPoints(content, filepath) {
    const language = detectLanguage(filepath);
    if (!language)
        return [];
    // Declared outside the try so the finally block can release them on
    // every exit path, including exceptions thrown part-way through.
    let parser = null;
    let tree = null;
    try {
        await ensureInit();
        const grammar = await loadGrammar(language);
        if (!grammar)
            return [];
        parser = new ParserClass();
        parser.setLanguage(grammar);
        tree = parser.parse(content);
        if (!tree)
            return [];
        const query = getQuery(language, grammar);
        const captures = query.captures(tree.rootNode);
        // Deduplicate: when several captures start at the same offset, keep
        // the highest-scoring one (the first capture wins a tie). Captures
        // at different offsets, such as an export_statement and the
        // class_declaration it wraps, are all kept.
        const seen = new Map();
        for (const cap of captures) {
            const pos = cap.node.startIndex;
            // Capture names missing from SCORE_MAP get a low default score.
            const score = SCORE_MAP[cap.name] ?? 20;
            const type = `ast:${cap.name}`;
            const existing = seen.get(pos);
            if (!existing || score > existing.score) {
                seen.set(pos, { pos, score, type });
            }
        }
        return Array.from(seen.values()).sort((a, b) => a.pos - b.pos);
    }
    catch (err) {
        console.warn(`[qmd] AST parse failed for ${filepath}, falling back to regex: ${err instanceof Error ? err.message : err}`);
        return [];
    }
    finally {
        // The tree and parser are released explicitly. Previously this only
        // happened on the success path, so any exception after the parser
        // was created leaked both objects.
        try {
            tree?.delete();
            parser?.delete();
        }
        catch (err) {
            // Keep the "never throws" contract even if cleanup itself fails.
            console.warn(`[qmd] AST cleanup failed for ${filepath}: ${err instanceof Error ? err.message : err}`);
        }
    }
}
271+
// =============================================================================
272+
// Health / Status
273+
// =============================================================================
/**
 * Check which tree-sitter grammars are available.
 * Returns a status object for each supported language.
 *
 * If web-tree-sitter itself fails to initialize, every language is reported
 * unavailable with the init error. Otherwise each language is available when
 * its grammar loads and its query compiles.
 *
 * @returns `{ available, languages }`, where `available` is true if at least
 * one language is available.
 */
export async function getASTStatus() {
    const languages = [];
    try {
        await ensureInit();
    }
    catch (err) {
        return {
            available: false,
            languages: Object.keys(GRAMMAR_MAP).map(lang => ({
                language: lang,
                available: false,
                error: `web-tree-sitter init failed: ${err instanceof Error ? err.message : err}`,
            })),
        };
    }
    for (const lang of Object.keys(GRAMMAR_MAP)) {
        try {
            const grammar = await loadGrammar(lang);
            if (grammar) {
                // Also verify the query compiles
                getQuery(lang, grammar);
                languages.push({ language: lang, available: true });
            }
            else {
                languages.push({ language: lang, available: false, error: "grammar failed to load" });
            }
        }
        catch (err) {
            // Reached when getQuery() throws, i.e. the query did not compile.
            languages.push({
                language: lang,
                available: false,
                error: err instanceof Error ? err.message : String(err),
            });
        }
    }
    return {
        available: languages.some(l => l.available),
        languages,
    };
}
/**
 * Extract symbol metadata for code within a byte range.
 * Stubbed for Phase 2 — returns empty array.
 *
 * All four parameters are currently ignored.
 */
export function extractSymbols(_content, _language, _startPos, _endPos) {
    return [];
}

0 commit comments

Comments
 (0)