tobi · alexei-led · Apr 15, 2026 · Apr 15, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,8 +2,33 @@
 
 ## [Unreleased]
 
+### Features
+
+- **Remote embedding/reranking** via OpenAI-compatible endpoints. Set both
+  `QMD_REMOTE_EMBED_URL` and `QMD_REMOTE_RERANK_URL` to route embedding and
+  reranking to any OpenAI-compatible server (vLLM, TEI, LiteLLM, Ollama, etc.)
+  without running local GGUF models. Query expansion and generation are handled
+  by the same backend via `/v1/chat/completions`. To use a dedicated chat server
+  separate from the embedding server, set `QMD_REMOTE_GEN_URL`.
+  All endpoints support `QMD_REMOTE_API_KEY` for bearer authentication.
+  Circuit breakers protect each endpoint independently.
+  `qmd status` shows remote server URLs when in remote mode.
+- SDK: `createStore()` now accepts an `llm?` option to inject a custom LLM backend
+  (`HybridLLM`, `RemoteLLM`, or any `LLM` implementation). When omitted,
+  `QMD_REMOTE_EMBED_URL` / `QMD_REMOTE_RERANK_URL` are checked automatically.
+
 ### Fixes
 
+- Remote mode: `QMD_REMOTE_EMBED_URL` is no longer overwritten when a YAML
+  `models:` block is present — the override is skipped when remote mode is active.
+- Remote mode: `RemoteLLM.detokenize()` returns a character-length approximation
+  instead of an empty string, preventing silent chunk loss in the fallback path.
+- `chunkDocumentByTokens` accepts an optional `llm?` parameter; internal callers
+  pass the store-scoped LLM instead of pulling from the global singleton.
+- `LLM` interface: `tokenize`, `countTokens`, `detokenize`, `embedBatch` are now
+  required members. `LLMSessionManager` and `withLLMSessionForLlm` accept `LLM`.
+- `RemoteLLM.modelExists()` logs a warning before returning the optimistic
+  fail-open result when neither server can verify model availability.
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared

diff --git a/package.json b/package.json
@@ -23,7 +23,7 @@
   ],
   "scripts": {
     "prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true",
-    "build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
+    "build": "tsc -p tsconfig.build.json && { echo '#!/usr/bin/env node'; cat dist/cli/qmd.js; } > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
     "test": "vitest run --reporter=verbose test/",
     "qmd": "tsx src/cli/qmd.ts",
     "index": "tsx src/cli/qmd.ts index",

diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
@@ -78,7 +78,9 @@ import {
   type ReindexResult,
   type ChunkStrategy,
 } from "../store.js";
-import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, getDefaultLLM, setDefaultLlamaCpp, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { HybridLLM } from "../hybrid-llm.js";
+import { RemoteLLM } from "../remote-llm.js";
 import {
   formatSearchResults,
   formatDocuments,
@@ -108,6 +110,23 @@ import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedde
 // resolution. The flag is flipped inside the CLI's main-module guard below so
 // it only fires when qmd is actually invoked as a script.
 
+// Remote mode: when QMD_REMOTE_EMBED_URL and QMD_REMOTE_RERANK_URL are set,
+// embedding and reranking use the remote server instead of local GGUF models.
+const remoteEmbedUrl = process.env.QMD_REMOTE_EMBED_URL;
+const remoteRerankUrl = process.env.QMD_REMOTE_RERANK_URL;
+if (remoteEmbedUrl && remoteRerankUrl) {
+  const remoteBackend = new RemoteLLM({
+    embedUrl: remoteEmbedUrl,
+    rerankUrl: remoteRerankUrl,
+    genUrl: process.env.QMD_REMOTE_GEN_URL,
+    apiKey: process.env.QMD_REMOTE_API_KEY,
+  });
+  setDefaultLLM(new HybridLLM(null, remoteBackend));
+} else if (remoteEmbedUrl || remoteRerankUrl) {
+  // Only one of the two URLs is set: reject the half-configured environment.
+  throw new Error("QMD_REMOTE_EMBED_URL and QMD_REMOTE_RERANK_URL must both be set to enable remote embedding/reranking");
+}
+
 // =============================================================================
 // Store/DB lifecycle (no legacy singletons in store.ts)
 // =============================================================================
@@ -123,7 +142,7 @@ function getStore(): ReturnType<typeof createStore> {
     try {
       const config = loadConfig();
       syncConfigToDb(store.db, config);
-      if (config.models) {
+      if (config.models && !getDefaultLLM().isRemote) {
         setDefaultLlamaCpp(new LlamaCpp({
           embedModel: config.models.embed,
           generateModel: config.models.generate,
@@ -457,15 +476,23 @@ async function showStatus(): Promise<void> {
 
   // Models
   {
-    // hf:org/repo/file.gguf → https://huggingface.co/org/repo
-    const hfLink = (uri: string) => {
-      const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
-      return match ? `https://huggingface.co/${match[1]}` : uri;
-    };
-    console.log(`\n${c.bold}Models${c.reset}`);
-    console.log(`  Embedding:   ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
-    console.log(`  Reranking:   ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
-    console.log(`  Generation:  ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
+    const llmForStatus = getDefaultLLM();
+    if (llmForStatus.isRemote) {
+      console.log(`\n${c.bold}Models${c.reset}`);
+      console.log(`  Embedding:   ${remoteEmbedUrl ?? process.env.QMD_REMOTE_EMBED_URL ?? "(remote)"}`);
+      console.log(`  Reranking:   ${remoteRerankUrl ?? process.env.QMD_REMOTE_RERANK_URL ?? "(remote)"}`);
+      console.log(`  Generation:  ${process.env.QMD_REMOTE_GEN_URL ?? "(same as embedding)"}`);
+    } else {
+      // hf:org/repo/file.gguf → https://huggingface.co/org/repo
+      const hfLink = (uri: string) => {
+        const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
+        return match ? `https://huggingface.co/${match[1]}` : uri;
+      };
+      console.log(`\n${c.bold}Models${c.reset}`);
+      console.log(`  Embedding:   ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
+      console.log(`  Reranking:   ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
+      console.log(`  Generation:  ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
+    }
   }
 
   // Device / GPU info
@@ -475,29 +502,33 @@ async function showStatus(): Promise<void> {
   if (process.env.QMD_STATUS_DEVICE_PROBE === "1") {
     console.log(`\n${c.bold}Device${c.reset}`);
     try {
-      const llm = getDefaultLlamaCpp();
-      const device = await llm.getDeviceInfo({ allowBuild: false });
-      if (device.gpu) {
-        console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
-        if (device.gpuDevices.length > 0) {
-          // Deduplicate and count GPUs
-          const counts = new Map<string, number>();
-          for (const name of device.gpuDevices) {
-            counts.set(name, (counts.get(name) || 0) + 1);
+      const llm = getDefaultLLM() as Partial<LlamaCpp>;
+      if (typeof llm.getDeviceInfo === "function") {
+        const device = await llm.getDeviceInfo({ allowBuild: false });
+        if (device.gpu) {
+          console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
+          if (device.gpuDevices.length > 0) {
+            // Deduplicate and count GPUs
+            const counts = new Map<string, number>();
+            for (const name of device.gpuDevices) {
+              counts.set(name, (counts.get(name) || 0) + 1);
+            }
+            const deviceStr = Array.from(counts.entries())
+              .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
+              .join(', ');
+            console.log(`  Devices:  ${deviceStr}`);
           }
-          const deviceStr = Array.from(counts.entries())
-            .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
-            .join(', ');
-          console.log(`  Devices:  ${deviceStr}`);
-        }
-        if (device.vram) {
-          console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
+          if (device.vram) {
+            console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
+          }
+        } else {
+          console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
+          console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
         }
+        console.log(`  CPU:      ${device.cpuCores} math cores`);
       } else {
-        console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
-        console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
+        console.log(`  Status:   ${c.dim}remote LLM (no device info)${c.reset}`);
       }
-      console.log(`  CPU:      ${device.cpuCores} math cores`);
     } catch (error) {
       console.log(`  Status:   ${c.dim}probe failed${c.reset}`);
       if (error instanceof Error && error.message) {

diff --git a/src/hybrid-llm.ts b/src/hybrid-llm.ts
@@ -0,0 +1,96 @@
+import type {
+  EmbedOptions,
+  EmbeddingResult,
+  GenerateOptions,
+  GenerateResult,
+  LLM,
+  ModelInfo,
+  Queryable,
+  RerankDocument,
+  RerankOptions,
+  RerankResult,
+} from "./llm.js";
+
+/**
+ * LLM facade for remote mode: embed, rerank, generate, expandQuery and the
+ * tokenizer helpers are always forwarded to `remote`.
+ *
+ * `local` is optional and is never used as a routing fallback. It is only
+ * consulted by `modelExists()` and released by `dispose()`; pass null to omit.
+ */
+export class HybridLLM implements LLM {
+  readonly isRemote = true;
+
+  constructor(
+    private readonly local: LLM | null,
+    private readonly remote: LLM,
+  ) {}
+
+  async embed(
+    text: string,
+    options: EmbedOptions = {},
+  ): Promise<EmbeddingResult | null> {
+    return this.remote.embed(text, options);
+  }
+
+  async embedBatch(
+    texts: string[],
+    options: EmbedOptions = {},
+  ): Promise<(EmbeddingResult | null)[]> {
+    return this.remote.embedBatch(texts, options);
+  }
+
+  async rerank(
+    query: string,
+    documents: RerankDocument[],
+    options: RerankOptions = {},
+  ): Promise<RerankResult> {
+    return this.remote.rerank(query, documents, options);
+  }
+
+  async generate(
+    prompt: string,
+    options: GenerateOptions = {},
+  ): Promise<GenerateResult | null> {
+    return this.remote.generate(prompt, options);
+  }
+
+  async expandQuery(
+    query: string,
+    options?: { context?: string; includeLexical?: boolean; intent?: string },
+  ): Promise<Queryable[]> {
+    return this.remote.expandQuery(query, options);
+  }
+
+  // Tokenizer helpers: forwarded to the remote backend as-is. Accuracy depends
+  // on that backend (RemoteLLM is described as a character-based estimate), so
+  // chunk sizes computed through this class may be approximate.
+  async tokenize(text: string): Promise<readonly unknown[]> {
+    return this.remote.tokenize(text);
+  }
+
+  async countTokens(text: string): Promise<number> {
+    return this.remote.countTokens(text);
+  }
+
+  async detokenize(tokens: readonly unknown[]): Promise<string> {
+    return this.remote.detokenize(tokens);
+  }
+
+  async modelExists(model: string): Promise<ModelInfo> {
+    const results = await Promise.allSettled([
+      this.remote.modelExists(model),
+      ...(this.local ? [this.local.modelExists(model)] : []),
+    ]);
+    for (const r of results) {
+      if (r.status === "fulfilled" && r.value.exists) return r.value;
+    }
+    return { name: model, exists: false };
+  }
+
+  async dispose(): Promise<void> {
+    const disposables = [this.remote.dispose()];
+    if (this.local) disposables.push(this.local.dispose());
+    await Promise.allSettled(disposables);
+  }
+}
diff --git a/src/index.ts b/src/index.ts
@@ -64,9 +64,9 @@ import {
   type EmbedResult,
   type ChunkStrategy,
 } from "./store.js";
-import {
-  LlamaCpp,
-} from "./llm.js";
+import { LlamaCpp, type LLM } from "./llm.js";
+import { HybridLLM } from "./hybrid-llm.js";
+import { RemoteLLM } from "./remote-llm.js";
 import {
   setConfigSource,
   loadConfig,
@@ -204,6 +204,12 @@ export interface StoreOptions {
   configPath?: string;
   /** Inline collection config (mutually exclusive with `configPath`) */
   config?: CollectionConfig;
+  /**
+   * Custom LLM backend. Supports HybridLLM or RemoteLLM for remote mode.
+   * When omitted, QMD_REMOTE_EMBED_URL / QMD_REMOTE_RERANK_URL env vars are
+   * checked, then falls back to a local LlamaCpp instance.
+   */
+  llm?: LLM;
 }
 
 /**
@@ -367,16 +373,28 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
   }
   // else: DB-only mode — no external config, use existing store_collections
 
-  // Create a per-store LlamaCpp instance — lazy-loads models on first use,
-  // auto-unloads after 5 min inactivity to free VRAM.
-  const llm = new LlamaCpp({
-    embedModel: config?.models?.embed,
-    generateModel: config?.models?.generate,
-    rerankModel: config?.models?.rerank,
-    inactivityTimeoutMs: 5 * 60 * 1000,
-    disposeModelsOnInactivity: true,
-  });
-  internal.llm = llm;
+  // Determine LLM backend: explicit option > env-var remote > local LlamaCpp.
+  const remoteEmbedUrl = process.env.QMD_REMOTE_EMBED_URL;
+  const remoteRerankUrl = process.env.QMD_REMOTE_RERANK_URL;
+  if (options.llm) {
+    internal.llm = options.llm;
+  } else if (remoteEmbedUrl && remoteRerankUrl) {
+    const remote = new RemoteLLM({
+      embedUrl: remoteEmbedUrl,
+      rerankUrl: remoteRerankUrl,
+      genUrl: process.env.QMD_REMOTE_GEN_URL,
+      apiKey: process.env.QMD_REMOTE_API_KEY,
+    });
+    internal.llm = new HybridLLM(null, remote);
+  } else {
+    internal.llm = new LlamaCpp({
+      embedModel: config?.models?.embed,
+      generateModel: config?.models?.generate,
+      rerankModel: config?.models?.rerank,
+      inactivityTimeoutMs: 5 * 60 * 1000,
+      disposeModelsOnInactivity: true,
+    });
+  }
 
   const store: QMDStore = {
     internal,
@@ -532,7 +550,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
     // Lifecycle
     close: async () => {
-      await llm.dispose();
+      await internal.llm?.dispose?.();
       internal.close();
       if (hasYamlConfig || options.config) {
         setConfigSource(undefined); // Reset config source