Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,33 @@

## [Unreleased]

### Features

- **Remote embedding/reranking** via OpenAI-compatible endpoints. Set
`QMD_REMOTE_EMBED_URL` and `QMD_REMOTE_RERANK_URL` to route embedding and
reranking to any OpenAI-compatible server (vLLM, TEI, LiteLLM, Ollama, etc.)
without running local GGUF models. Query expansion and generation are handled
by the same backend via `/v1/chat/completions`. To use a dedicated chat server
separate from the embedding server, set `QMD_REMOTE_GEN_URL`.
All endpoints support `QMD_REMOTE_API_KEY` for bearer authentication.
Circuit breakers protect each endpoint independently.
`qmd status` shows remote server URLs when in remote mode.
- SDK: `createStore()` now accepts an `llm?` option to inject a custom LLM backend
(`HybridLLM`, `RemoteLLM`, or any `LLM` implementation). When omitted, remote mode
is enabled automatically if both `QMD_REMOTE_EMBED_URL` and `QMD_REMOTE_RERANK_URL`
are set; otherwise a local `LlamaCpp` instance is used.

### Fixes

- Remote mode: `QMD_REMOTE_EMBED_URL` is no longer overwritten when a YAML
`models:` block is present — the override is skipped when remote mode is active.
- Remote mode: `RemoteLLM.detokenize()` returns a character-length approximation
instead of an empty string, preventing silent chunk loss in the fallback path.
- `chunkDocumentByTokens` accepts an optional `llm?` parameter; internal callers
pass the store-scoped LLM instead of pulling from the global singleton.
- `LLM` interface: `tokenize`, `countTokens`, `detokenize`, `embedBatch` are now
required members. `LLMSessionManager` and `withLLMSessionForLlm` accept `LLM`.
- `RemoteLLM.modelExists()` logs a warning before returning the optimistic
fail-open result when neither server can verify model availability.
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
to the requested collection instead of embedding global pending work.
Scoped `--force` clears only collection-owned vectors, preserves shared
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
],
"scripts": {
"prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true",
"build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
"build": "tsc -p tsconfig.build.json; { echo '#!/usr/bin/env node'; cat dist/cli/qmd.js; } > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
"test": "vitest run --reporter=verbose test/",
"qmd": "tsx src/cli/qmd.ts",
"index": "tsx src/cli/qmd.ts index",
Expand Down
91 changes: 61 additions & 30 deletions src/cli/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, getDefaultLLM, setDefaultLlamaCpp, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { HybridLLM } from "../hybrid-llm.js";
import { RemoteLLM } from "../remote-llm.js";
import {
formatSearchResults,
formatDocuments,
Expand Down Expand Up @@ -108,6 +110,23 @@ import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedde
// resolution. The flag is flipped inside the CLI's main-module guard below so
// it only fires when qmd is actually invoked as a script.

// Remote mode bootstrap. QMD_REMOTE_EMBED_URL and QMD_REMOTE_RERANK_URL are
// read once at module load. When both are present the process-wide default LLM
// becomes a HybridLLM wrapping a RemoteLLM (no local backend); when only one
// of the two is present the configuration is rejected with an error.
const remoteEmbedUrl = process.env.QMD_REMOTE_EMBED_URL;
const remoteRerankUrl = process.env.QMD_REMOTE_RERANK_URL;
if (remoteEmbedUrl && remoteRerankUrl) {
  setDefaultLLM(
    new HybridLLM(
      null,
      new RemoteLLM({
        embedUrl: remoteEmbedUrl,
        rerankUrl: remoteRerankUrl,
        // Optional settings: passed through as-is (may be undefined).
        genUrl: process.env.QMD_REMOTE_GEN_URL,
        apiKey: process.env.QMD_REMOTE_API_KEY,
      }),
    ),
  );
} else if (remoteEmbedUrl || remoteRerankUrl) {
  // Exactly one of the two URLs was supplied — refuse the half-configured state.
  throw new Error("QMD_REMOTE_EMBED_URL and QMD_REMOTE_RERANK_URL must both be set to enable remote embedding/reranking");
}

// =============================================================================
// Store/DB lifecycle (no legacy singletons in store.ts)
// =============================================================================
Expand All @@ -123,7 +142,7 @@ function getStore(): ReturnType<typeof createStore> {
try {
const config = loadConfig();
syncConfigToDb(store.db, config);
if (config.models) {
if (config.models && !getDefaultLLM().isRemote) {
setDefaultLlamaCpp(new LlamaCpp({
embedModel: config.models.embed,
generateModel: config.models.generate,
Expand Down Expand Up @@ -457,15 +476,23 @@ async function showStatus(): Promise<void> {

// Models
{
// hf:org/repo/file.gguf → https://huggingface.co/org/repo
const hfLink = (uri: string) => {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
return match ? `https://huggingface.co/${match[1]}` : uri;
};
console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
const llmForStatus = getDefaultLLM();
if (llmForStatus.isRemote) {
console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${remoteEmbedUrl ?? process.env.QMD_REMOTE_EMBED_URL ?? "(remote)"}`);
console.log(` Reranking: ${remoteRerankUrl ?? process.env.QMD_REMOTE_RERANK_URL ?? "(remote)"}`);
console.log(` Generation: ${process.env.QMD_REMOTE_GEN_URL ?? "(same as embedding)"}`);
} else {
// hf:org/repo/file.gguf → https://huggingface.co/org/repo
const hfLink = (uri: string) => {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
return match ? `https://huggingface.co/${match[1]}` : uri;
};
console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
}
}

// Device / GPU info
Expand All @@ -475,29 +502,33 @@ async function showStatus(): Promise<void> {
if (process.env.QMD_STATUS_DEVICE_PROBE === "1") {
console.log(`\n${c.bold}Device${c.reset}`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
const llm = getDefaultLLM() as Partial<LlamaCpp>;
if (typeof llm.getDeviceInfo === "function") {
const device = await llm.getDeviceInfo({ allowBuild: false });
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
console.log(` Status: ${c.dim}remote LLM (no device info)${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch (error) {
console.log(` Status: ${c.dim}probe failed${c.reset}`);
if (error instanceof Error && error.message) {
Expand Down
96 changes: 96 additions & 0 deletions src/hybrid-llm.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import type {
EmbedOptions,
EmbeddingResult,
GenerateOptions,
GenerateResult,
LLM,
ModelInfo,
Queryable,
RerankDocument,
RerankOptions,
RerankResult,
} from "./llm.js";

/**
 * `LLM` implementation that pairs a mandatory remote backend with an optional
 * local one.
 *
 * Embedding, reranking, generation, query expansion and all tokenization
 * helpers are forwarded to the remote backend unchanged. The local backend,
 * when supplied, is only consulted by {@link HybridLLM.modelExists} and
 * released by {@link HybridLLM.dispose}.
 */
export class HybridLLM implements LLM {
  /** Always `true`; lets callers detect that this backend runs in remote mode. */
  readonly isRemote = true;

  /**
   * @param local  Optional local backend, or `null` for remote-only operation.
   * @param remote Backend that receives every forwarded request.
   */
  constructor(
    private readonly local: LLM | null,
    private readonly remote: LLM,
  ) {}

  /** Embeds a single text via the remote backend. */
  async embed(
    text: string,
    options: EmbedOptions = {},
  ): Promise<EmbeddingResult | null> {
    return this.remote.embed(text, options);
  }

  /** Embeds several texts via the remote backend. */
  async embedBatch(
    texts: string[],
    options: EmbedOptions = {},
  ): Promise<(EmbeddingResult | null)[]> {
    return this.remote.embedBatch(texts, options);
  }

  /** Reranks `documents` against `query` via the remote backend. */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options: RerankOptions = {},
  ): Promise<RerankResult> {
    return this.remote.rerank(query, documents, options);
  }

  /** Generates a completion for `prompt` via the remote backend. */
  async generate(
    prompt: string,
    options: GenerateOptions = {},
  ): Promise<GenerateResult | null> {
    return this.remote.generate(prompt, options);
  }

  /** Expands `query` via the remote backend; `options` is passed through as given. */
  async expandQuery(
    query: string,
    options?: { context?: string; includeLexical?: boolean; intent?: string },
  ): Promise<Queryable[]> {
    return this.remote.expandQuery(query, options);
  }

  // Tokenization helpers are forwarded to the remote backend as well.
  // NOTE(review): the remote side presumably approximates tokens from character
  // counts rather than running a real tokenizer — confirm against RemoteLLM.

  /** Tokenizes `text` using the remote backend. */
  async tokenize(text: string): Promise<readonly unknown[]> {
    return this.remote.tokenize!(text);
  }

  /** Counts the tokens in `text` using the remote backend. */
  async countTokens(text: string): Promise<number> {
    return this.remote.countTokens!(text);
  }

  /** Converts `tokens` back to text using the remote backend. */
  async detokenize(tokens: readonly unknown[]): Promise<string> {
    return this.remote.detokenize!(tokens);
  }

  /**
   * Asks the remote backend — and the local one, if present — whether `model`
   * exists. Both lookups are started before either is awaited.
   *
   * @returns The first positive answer (remote is checked before local), or
   *          `{ name: model, exists: false }` when no backend confirms the
   *          model. A rejected lookup counts as "not confirmed".
   */
  async modelExists(model: string): Promise<ModelInfo> {
    const lookups = [this.remote.modelExists(model)];
    if (this.local) {
      lookups.push(this.local.modelExists(model));
    }

    const outcomes = await Promise.allSettled(lookups);
    for (const outcome of outcomes) {
      if (outcome.status !== "fulfilled") continue;
      if (outcome.value.exists) return outcome.value;
    }
    return { name: model, exists: false };
  }

  /**
   * Disposes the remote backend and, if present, the local one. Rejections
   * from either are absorbed so one failing does not prevent the other from
   * being awaited.
   */
  async dispose(): Promise<void> {
    const pending = this.local
      ? [this.remote.dispose(), this.local.dispose()]
      : [this.remote.dispose()];
    await Promise.allSettled(pending);
  }
}
46 changes: 32 additions & 14 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ import {
type EmbedResult,
type ChunkStrategy,
} from "./store.js";
import {
LlamaCpp,
} from "./llm.js";
import { LlamaCpp, type LLM } from "./llm.js";
import { HybridLLM } from "./hybrid-llm.js";
import { RemoteLLM } from "./remote-llm.js";
import {
setConfigSource,
loadConfig,
Expand Down Expand Up @@ -204,6 +204,12 @@ export interface StoreOptions {
configPath?: string;
/** Inline collection config (mutually exclusive with `configPath`) */
config?: CollectionConfig;
/**
* Custom LLM backend. Supports HybridLLM or RemoteLLM for remote mode.
* When omitted, QMD_REMOTE_EMBED_URL / QMD_REMOTE_RERANK_URL env vars are
* checked, then falls back to a local LlamaCpp instance.
*/
llm?: LLM;
}

/**
Expand Down Expand Up @@ -367,16 +373,28 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
}
// else: DB-only mode — no external config, use existing store_collections

// Create a per-store LlamaCpp instance — lazy-loads models on first use,
// auto-unloads after 5 min inactivity to free VRAM.
const llm = new LlamaCpp({
embedModel: config?.models?.embed,
generateModel: config?.models?.generate,
rerankModel: config?.models?.rerank,
inactivityTimeoutMs: 5 * 60 * 1000,
disposeModelsOnInactivity: true,
});
internal.llm = llm;
// Determine LLM backend: explicit option > env-var remote > local LlamaCpp.
const remoteEmbedUrl = process.env.QMD_REMOTE_EMBED_URL;
const remoteRerankUrl = process.env.QMD_REMOTE_RERANK_URL;
if (options.llm) {
internal.llm = options.llm;
} else if (remoteEmbedUrl && remoteRerankUrl) {
const remote = new RemoteLLM({
embedUrl: remoteEmbedUrl,
rerankUrl: remoteRerankUrl,
genUrl: process.env.QMD_REMOTE_GEN_URL,
apiKey: process.env.QMD_REMOTE_API_KEY,
});
internal.llm = new HybridLLM(null, remote);
} else {
internal.llm = new LlamaCpp({
embedModel: config?.models?.embed,
generateModel: config?.models?.generate,
rerankModel: config?.models?.rerank,
inactivityTimeoutMs: 5 * 60 * 1000,
disposeModelsOnInactivity: true,
});
}

const store: QMDStore = {
internal,
Expand Down Expand Up @@ -532,7 +550,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {

// Lifecycle
close: async () => {
await llm.dispose();
await internal.llm?.dispose?.();
internal.close();
if (hasYamlConfig || options.config) {
setConfigSource(undefined); // Reset config source
Expand Down
Loading
Loading