Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,30 @@ Supported model families:
> since vectors are not cross-compatible between models. The prompt format is
> automatically adjusted for each model family.

### OpenAI Embeddings (Optional)

As an alternative to local embedding models, you can use OpenAI's API for faster, more reliable embeddings:

```yaml
# ~/.config/qmd/index.yml
embedding:
  provider: openai
  openai:
    api_key: sk-...                       # Optional, falls back to QMD_OPENAI_API_KEY or OPENAI_API_KEY env var
    model: text-embedding-3-small         # Optional, this is the default
    expansion_model: gpt-4o-mini          # Optional, model for query expansion/reranking
    base_url: https://api.openai.com/v1   # Optional, for OpenAI-compatible APIs (Ollama, vLLM, etc.)
```

Benefits:
- **~10x faster** than local CPU inference
- **No GPU required** - works on any machine
- **More reliable** - no local model loading issues
- **Cost:** ~$0.02 per 1M tokens (very cheap)
- **OpenAI-compatible** - works with Ollama, vLLM, Azure, and other compatible APIs via `base_url`

When using OpenAI embeddings, query expansion and reranking use the OpenAI API instead of local models.

## Installation

```sh
Expand Down
228 changes: 129 additions & 99 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@
"better-sqlite3": "12.8.0",
"fast-glob": "3.3.3",
"node-llama-cpp": "3.18.1",
"openai": "^4.77.0",
"picomatch": "4.0.4",
"sqlite-vec": "0.1.9",
"tiktoken": "^1.0.22",
"web-tree-sitter": "0.26.7",
"yaml": "2.8.3",
"zod": "4.2.1"
Expand Down
129 changes: 89 additions & 40 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, getDefaultEmbeddingLLM, getEmbeddingConfig, withLLMSession, pullModels, setEmbeddingConfig, isUsingOpenAI, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import {
formatSearchResults,
formatDocuments,
Expand All @@ -98,6 +98,7 @@ import {
listAllContexts,
setConfigIndexName,
loadConfig,
getEmbeddingConfig as getEmbeddingConfigFromYaml,
} from "../collections.js";
import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";

Expand Down Expand Up @@ -454,8 +455,14 @@ async function showStatus(): Promise<void> {
console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
}

// Models
{
// Models / Provider info
if (isUsingOpenAI()) {
const embCfg = getEmbeddingConfig();
console.log(`\n${c.bold}Provider${c.reset}`);
console.log(` Mode: ${c.green}OpenAI-compatible${c.reset}`);
console.log(` Base URL: ${embCfg.openai?.baseURL || process.env.QMD_OPENAI_BASE_URL || '(default)'}`);
console.log(` Embed model: ${embCfg.openai?.embedModel || 'text-embedding-3-small'}`);
} else {
// hf:org/repo/file.gguf → https://huggingface.co/org/repo
const hfLink = (uri: string) => {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
Expand All @@ -467,7 +474,7 @@ async function showStatus(): Promise<void> {
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}

// Device / GPU info
// Device / GPU info (local mode only — skip in OpenAI mode to avoid triggering compilation).
// Important: probing node-llama-cpp can abort the whole process on machines with
// incompatible GPU drivers (for example Vulkan loader present but no usable driver).
// Keep `qmd status` safe by default and make the expensive/native probe opt-in.
Expand Down Expand Up @@ -498,7 +505,7 @@ async function showStatus(): Promise<void> {
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}probe failed${c.reset}`);
console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
if (error instanceof Error && error.message) {
console.log(` ${c.dim}${error.message}${c.reset}`);
}
Expand Down Expand Up @@ -1709,34 +1716,37 @@ async function vectorIndex(

const startTime = Date.now();

const result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
let result: Awaited<ReturnType<typeof generateEmbeddings>>;
try {
result = await generateEmbeddings(storeInstance, {
force,
model,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
onProgress: (info) => {
if (info.totalBytes === 0) return;
const percent = (info.bytesProcessed / info.totalBytes) * 100;
progress.set(percent);

progress.clear();
cursor.show();
const elapsed = (Date.now() - startTime) / 1000;
const bytesPerSec = info.bytesProcessed / elapsed;
const remainingBytes = info.totalBytes - info.bytesProcessed;
const etaSec = remainingBytes / bytesPerSec;

const bar = renderProgressBar(percent);
const percentStr = percent.toFixed(0).padStart(3);
const throughput = `${formatBytes(bytesPerSec)}/s`;
const eta = elapsed > 2 ? formatETA(etaSec) : "...";
const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";

if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset} `);
},
});
} finally {
progress.clear();
cursor.show();
}

const totalTimeSec = result.durationMs / 1000;

Expand Down Expand Up @@ -2240,10 +2250,8 @@ function search(query: string, opts: OutputOptions): void {

// Use large limit for --all, otherwise fetch more than needed and let outputResults filter
const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
const results = filterByCollections(
searchFTS(db, query, fetchLimit, singleCollection),
collectionNames
);
// Pass collections directly to searchFTS (it now supports arrays)
const results = searchFTS(db, query, fetchLimit, collectionNames.length > 0 ? collectionNames : undefined);

// Add context to results
const resultsWithContext = results.map(r => ({
Expand Down Expand Up @@ -2291,7 +2299,7 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =

checkIndexHealth(store.db);

await withLLMSession(async () => {
const llmSession = async () => {
let results = await vectorSearchQuery(store, query, {
collection: singleCollection,
limit: opts.all ? 500 : (opts.limit || 10),
Expand Down Expand Up @@ -2329,7 +2337,15 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
context: r.context,
docid: r.docid,
})), query, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'vectorSearch' });
};

if (isUsingOpenAI()) {
await llmSession();
} else {
await withLLMSession(async () => llmSession(),
{ maxDuration: 10 * 60 * 1000, name: 'vectorSearch' }
);
}
}

async function querySearch(query: string, opts: OutputOptions, _embedModel: string = DEFAULT_EMBED_MODEL, _rerankModel: string = DEFAULT_RERANK_MODEL): Promise<void> {
Expand All @@ -2347,7 +2363,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
// Intent can come from --intent flag or from intent: line in query document
const intent = opts.intent || parsed?.intent;

await withLLMSession(async () => {
const querySession = async () => {
let results;

if (parsed) {
Expand Down Expand Up @@ -2467,7 +2483,15 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
docid: r.docid,
explain: r.explain,
})), displayQuery, { ...opts, limit: results.length });
}, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
};

if (isUsingOpenAI()) {
await querySession();
} else {
await withLLMSession(async () => querySession(),
{ maxDuration: 10 * 60 * 1000, name: 'querySearch' }
);
}
}

// Parse CLI arguments using util.parseArgs
Expand Down Expand Up @@ -2852,6 +2876,31 @@ if (isMain) {
process.exit(cli.values.help ? 0 : 1);
}

// Load embedding configuration.
// Priority: YAML config > env vars > default (local).
// Setting QMD_OPENAI_BASE_URL alone is enough to activate OpenAI mode.
//
// OpenAI mode is selected when ANY of the following holds:
//   - the YAML `embedding.provider` is 'openai'
//   - QMD_OPENAI_BASE_URL is set to a non-empty value
//   - QMD_OPENAI is exactly '1'
// NOTE(review): because these checks are OR-ed, an explicit `provider: local`
// in YAML does not override QMD_OPENAI_BASE_URL / QMD_OPENAI=1 — confirm this
// is the intended precedence.
const embeddingYamlConfig = getEmbeddingConfigFromYaml();
const useOpenAI = embeddingYamlConfig.provider === 'openai'
|| !!process.env.QMD_OPENAI_BASE_URL
|| process.env.QMD_OPENAI === '1';

// In local mode this block leaves the embedding config untouched
// (setEmbeddingConfig is only called for OpenAI mode).
if (useOpenAI) {
setEmbeddingConfig({
provider: 'openai',
openai: {
// For the three fields with an env fallback, the YAML value wins; `||` means
// an empty YAML string also falls through to the env var.
// NOTE(review): only QMD_OPENAI_API_KEY is read here; any OPENAI_API_KEY
// fallback would have to happen downstream — TODO confirm.
apiKey: embeddingYamlConfig.openai?.api_key || process.env.QMD_OPENAI_API_KEY,
embedModel: embeddingYamlConfig.openai?.model || process.env.QMD_OPENAI_EMBED_MODEL,
// The remaining fields come from YAML only (no env override in this block),
// except baseURL which falls back to QMD_OPENAI_BASE_URL.
expansionModel: embeddingYamlConfig.openai?.expansion_model,
rerankModel: embeddingYamlConfig.openai?.rerank_model,
baseURL: embeddingYamlConfig.openai?.base_url || process.env.QMD_OPENAI_BASE_URL,
chatBaseURL: embeddingYamlConfig.openai?.chat_base_url,
chatApiKey: embeddingYamlConfig.openai?.chat_api_key,
rerankBaseURL: embeddingYamlConfig.openai?.rerank_base_url,
rerankApiKey: embeddingYamlConfig.openai?.rerank_api_key,
},
});
}

switch (cli.command) {
case "context": {
const subcommand = cli.args[0];
Expand Down
28 changes: 28 additions & 0 deletions src/collections.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ export interface ModelsConfig {
generate?: string;
}

/**
 * Shape of the optional `embedding` section of the config file: picks the
 * embedding provider and carries the settings for an OpenAI-compatible endpoint.
 */
export interface EmbeddingProviderConfig {
  /** Which backend produces embeddings. Default: 'local'. */
  provider?: 'local' | 'openai';

  /** Settings for the OpenAI-compatible provider; every field is optional. */
  openai?: {
    /** API key. Falls back to QMD_OPENAI_API_KEY / OPENAI_API_KEY env var. */
    api_key?: string;
    /** Embedding model. Default: 'text-embedding-3-small'. */
    model?: string;
    /** Model used for query expansion. Default: 'gpt-4o-mini'. */
    expansion_model?: string;
    /** Model used for reranking. Default: falls back to expansion_model. */
    rerank_model?: string;
    /** Base URL for embeddings (OpenAI-compatible). */
    base_url?: string;
    /** Separate base URL for expansion (falls back to base_url). */
    chat_base_url?: string;
    /** Separate API key for the chat endpoint (falls back to api_key). */
    chat_api_key?: string;
    /** Separate base URL for reranking (falls back to chat_base_url). */
    rerank_base_url?: string;
    /** Separate API key for the rerank endpoint (falls back to chat_api_key). */
    rerank_api_key?: string;
  };
}

/**
* The complete configuration file structure
*/
Expand All @@ -51,6 +69,7 @@ export interface CollectionConfig {
editor_uri_template?: string; // Alias for editor_uri
collections: Record<string, Collection>; // Collection name -> config
models?: ModelsConfig;
embedding?: EmbeddingProviderConfig; // Optional embedding provider settings
}

/**
Expand Down Expand Up @@ -510,3 +529,12 @@ export function isValidCollectionName(name: string): boolean {
// Allow alphanumeric, hyphens, underscores
return /^[a-zA-Z0-9_-]+$/.test(name);
}

/**
 * Read the `embedding` section from the loaded config file.
 *
 * @returns The configured section when it is present (truthy); otherwise a
 *          default object that selects the local provider.
 */
export function getEmbeddingConfig(): EmbeddingProviderConfig {
  const { embedding } = loadConfig();
  if (embedding) {
    return embedding;
  }
  return { provider: 'local' };
}
1 change: 1 addition & 0 deletions src/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export interface Database {
exec(sql: string): void;
prepare(sql: string): Statement;
loadExtension(path: string): void;
transaction<T>(fn: () => T): () => T;
close(): void;
}

Expand Down
30 changes: 30 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ import {
} from "./store.js";
import {
LlamaCpp,
setEmbeddingConfig,
} from "./llm.js";
import {
setConfigSource,
Expand Down Expand Up @@ -335,6 +336,33 @@ export interface QMDStore {
* await store.close()
* ```
*/
/**
 * Apply the embedding provider selection for a store.
 *
 * OpenAI mode is chosen when the config's `embedding.provider` is 'openai',
 * when QMD_OPENAI_BASE_URL is set to a non-empty value, or when QMD_OPENAI is
 * exactly '1'. In every other case the provider is explicitly set to 'local'.
 *
 * @param config - Parsed collection config, if any. A missing `embedding`
 *                 section is treated as `{ provider: 'local' }`.
 */
function configureEmbeddingProvider(config?: CollectionConfig): void {
  const yamlEmbedding = config?.embedding || { provider: 'local' as const };
  const env = process.env;

  const wantsOpenAI =
    yamlEmbedding.provider === 'openai' ||
    !!env.QMD_OPENAI_BASE_URL ||
    env.QMD_OPENAI === '1';

  // Guard: anything that is not OpenAI mode resets the provider to local.
  if (!wantsOpenAI) {
    setEmbeddingConfig({ provider: 'local' });
    return;
  }

  // YAML values take precedence; `||` lets an empty YAML string fall through
  // to the corresponding env var where one is consulted.
  const yamlOpenAI = yamlEmbedding.openai;
  setEmbeddingConfig({
    provider: 'openai',
    openai: {
      apiKey: yamlOpenAI?.api_key || env.QMD_OPENAI_API_KEY,
      embedModel: yamlOpenAI?.model || env.QMD_OPENAI_EMBED_MODEL,
      expansionModel: yamlOpenAI?.expansion_model,
      rerankModel: yamlOpenAI?.rerank_model,
      baseURL: yamlOpenAI?.base_url || env.QMD_OPENAI_BASE_URL,
      chatBaseURL: yamlOpenAI?.chat_base_url,
      chatApiKey: yamlOpenAI?.chat_api_key,
      rerankBaseURL: yamlOpenAI?.rerank_base_url,
      rerankApiKey: yamlOpenAI?.rerank_api_key,
    },
  });
}

export async function createStore(options: StoreOptions): Promise<QMDStore> {
if (!options.dbPath) {
throw new Error("dbPath is required");
Expand Down Expand Up @@ -365,6 +393,8 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
}
// else: DB-only mode — no external config, use existing store_collections

configureEmbeddingProvider(config);

// Create a per-store LlamaCpp instance — lazy-loads models on first use,
// auto-unloads after 5 min inactivity to free VRAM.
const llm = new LlamaCpp({
Expand Down
Loading