tobi
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 8 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 35 additions & 0 deletions
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
Lines changed: 25 additions & 5 deletions
diff --git a/src/collections.ts b/src/collections.ts
Lines changed: 12 additions & 0 deletions
diff --git a/src/hybrid-llm.ts b/src/hybrid-llm.ts
Lines changed: 61 additions & 0 deletions
diff --git a/src/llm.ts b/src/llm.ts
Lines changed: 60 additions & 27 deletions
@@ -2,6 +2,14 @@
 
 ## [Unreleased]
 
+### Changes
+
+- Remote embedding and reranking via OpenAI-compatible API (vLLM, Ollama,
+  OpenAI, etc.). Set `QMD_EMBED_API_URL` and `QMD_EMBED_API_MODEL` env vars
+  or add `embed_api_url`/`embed_api_model` to `models:` in `index.yml`.
+  Local query expansion and tokenization are preserved via a hybrid routing
+  layer. Includes circuit breaker, dimension validation, and batch splitting.
+
 ## [2.1.0] - 2026-04-05
 
 Code files now chunk at function and class boundaries via tree-sitter,
 
@@ -939,6 +939,41 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross
 
 Used for generating query variations via `LlamaChatSession`.
 
+### Remote Embedding & Reranking
+
+QMD can offload embedding and reranking to a remote OpenAI-compatible server (vLLM, Ollama, LM Studio, OpenAI, etc.) while keeping query expansion local.
+
+**Environment variables** (presence of `QMD_EMBED_API_URL` activates remote mode):
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `QMD_EMBED_API_URL` | Yes | Base URL, e.g. `http://gpu-host:8000/v1` |
+| `QMD_EMBED_API_MODEL` | Yes | Model name, e.g. `BAAI/bge-m3` |
+| `QMD_EMBED_API_KEY` | No | Bearer token for auth |
+| `QMD_RERANK_API_URL` | No | Rerank endpoint (defaults to embed URL) |
+| `QMD_RERANK_API_MODEL` | No | Rerank model name |
+| `QMD_RERANK_API_KEY` | No | Rerank auth (defaults to embed key) |
+
+**YAML config** (`~/.config/qmd/index.yml`; environment variables take precedence when both are set):
+```yaml
+models:
+  embed_api_url: "http://gpu-host:8000/v1"
+  embed_api_model: "BAAI/bge-m3"
+  rerank_api_model: "BAAI/bge-reranker-v2-m3"
+```
+
+**Example with vLLM:**
+```sh
+# Start vLLM with an embedding model
+vllm serve BAAI/bge-m3 --task embed
+
+# Point QMD at it
+export QMD_EMBED_API_URL=http://localhost:8000/v1
+export QMD_EMBED_API_MODEL=BAAI/bge-m3
+qmd embed
+qmd query "your search query"
+```
+
 ## License
 
 MIT
@@ -77,7 +77,9 @@ import {
   type ReindexResult,
   type ChunkStrategy,
 } from "../store.js";
-import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLLM, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { RemoteLLM, remoteConfigFromEnv } from "../remote-llm.js";
+import { HybridLLM } from "../hybrid-llm.js";
 import {
   formatSearchResults,
   formatDocuments,
@@ -119,11 +121,28 @@ function getStore(): ReturnType<typeof createStore> {
       const config = loadConfig();
       syncConfigToDb(store.db, config);
       if (config.models) {
-        setDefaultLlamaCpp(new LlamaCpp({
+        const localLlm = new LlamaCpp({
           embedModel: config.models.embed,
           generateModel: config.models.generate,
           rerankModel: config.models.rerank,
-        }));
+        });
+
+        // Check if remote embedding is configured (env vars take precedence over YAML)
+        const remoteConfig = remoteConfigFromEnv(config.models);
+        if (remoteConfig) {
+          const remoteLlm = new RemoteLLM(remoteConfig);
+          setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
+        } else {
+          setDefaultLLM(localLlm);
+        }
+      } else {
+        // No YAML models config — still check env vars for remote embedding
+        const remoteConfig = remoteConfigFromEnv();
+        if (remoteConfig) {
+          const remoteLlm = new RemoteLLM(remoteConfig);
+          const localLlm = new LlamaCpp();
+          setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
+        }
       }
     } catch {
       // Config may not exist yet — that's fine, DB works without it
@@ -462,8 +481,9 @@ async function showStatus(): Promise<void> {
 
   // Device / GPU info
   try {
-    const llm = getDefaultLlamaCpp();
-    const device = await llm.getDeviceInfo();
+    const llm = getDefaultLLM();
+    if (typeof (llm as any).getDeviceInfo !== "function") throw new Error("skip");
+    const device = await (llm as any).getDeviceInfo();
     console.log(`\n${c.bold}Device${c.reset}`);
     if (device.gpu) {
       console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
 
@@ -40,6 +40,18 @@ export interface ModelsConfig {
   embed?: string;
   rerank?: string;
   generate?: string;
+  /** Remote embedding API base URL (e.g. http://gpu-host:8000/v1) */
+  embed_api_url?: string;
+  /** Remote embedding model name (e.g. BAAI/bge-m3) */
+  embed_api_model?: string;
+  /** Bearer token for remote embedding API */
+  embed_api_key?: string;
+  /** Remote rerank API base URL (defaults to embed_api_url) */
+  rerank_api_url?: string;
+  /** Remote rerank model name */
+  rerank_api_model?: string;
+  /** Bearer token for remote rerank API */
+  rerank_api_key?: string;
 }
 
 /**
 
@@ -0,0 +1,61 @@
+/**
+ * hybrid-llm.ts - Compositor that routes LLM operations between remote and local backends
+ *
+ * Embed/rerank → remote (GPU-heavy, benefits from offloading)
+ * Generate/expandQuery → local LlamaCpp (QMD's fine-tuned query expansion model)
+ * modelExists → local LlamaCpp (tokenize/countTokens are not routed by this class)
+ */
+
+import type {
+  LLM,
+  EmbedOptions,
+  EmbeddingResult,
+  GenerateOptions,
+  GenerateResult,
+  ModelInfo,
+  Queryable,
+  RerankDocument,
+  RerankOptions,
+  RerankResult,
+} from "./llm.js";
+
+export class HybridLLM implements LLM {
+  constructor(
+    private readonly remote: LLM,
+    private readonly local: LLM,
+  ) {}
+
+  get embedModelName(): string {
+    return this.remote.embedModelName;
+  }
+
+  // Route to remote
+  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
+    return this.remote.embed(text, options);
+  }
+
+  embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
+    return this.remote.embedBatch(texts, options);
+  }
+
+  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult> {
+    return this.remote.rerank(query, documents, options);
+  }
+
+  // Route to local
+  generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null> {
+    return this.local.generate(prompt, options);
+  }
+
+  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]> {
+    return this.local.expandQuery(query, options);
+  }
+
+  modelExists(model: string): Promise<ModelInfo> {
+    return this.local.modelExists(model);
+  }
+
+  async dispose(): Promise<void> {
+    await Promise.all([this.remote.dispose(), this.local.dispose()]);
+  }
+}
@@ -30,13 +30,24 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean {
   return /qwen.*embed/i.test(modelUri) || /embed.*qwen/i.test(modelUri);
 }
 
+/**
+ * Detect if a model URI refers to a remote API model (not a local GGUF model).
+ * Remote models handle their own prompt formatting, so no prefixes should be added.
+ */
+export function isRemoteModel(modelUri: string): boolean {
+  // Local models use hf: URIs or local file paths ending in .gguf
+  return !modelUri.startsWith("hf:") && !modelUri.endsWith(".gguf");
+}
+
 /**
  * Format a query for embedding.
  * Uses nomic-style task prefix format for embeddinggemma (default).
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
+ * Remote models receive raw text (they handle their own formatting).
  */
 export function formatQueryForEmbedding(query: string, modelUri?: string): string {
   const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  if (isRemoteModel(uri)) return query;
   if (isQwen3EmbeddingModel(uri)) {
     return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
   }
@@ -47,9 +58,11 @@ export function formatQueryForEmbedding(query: string, modelUri?: string): strin
  * Format a document for embedding.
  * Uses nomic-style format with title and text fields (default).
  * Qwen3-Embedding encodes documents as raw text without special prefixes.
+ * Remote models receive raw text (they handle their own formatting).
  */
 export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
   const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  if (isRemoteModel(uri)) return title ? `${title}\n${text}` : text;
   if (isQwen3EmbeddingModel(uri)) {
     // Qwen3-Embedding: documents are raw text, no task prefix
     return title ? `${title}\n${text}` : text;
@@ -319,6 +332,16 @@ export interface LLM {
    */
   embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
 
+  /**
+   * Batch embed multiple texts
+   */
+  embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
+
+  /**
+   * The embedding model name/URI
+   */
+  readonly embedModelName: string;
+
   /**
    * Generate text completion
    */
@@ -1316,11 +1339,11 @@ export class LlamaCpp implements LLM {
  * Coordinates with LlamaCpp idle timeout to prevent disposal during active sessions.
  */
 class LLMSessionManager {
-  private llm: LlamaCpp;
+  private llm: LLM;
   private _activeSessionCount = 0;
   private _inFlightOperations = 0;
 
-  constructor(llm: LlamaCpp) {
+  constructor(llm: LLM) {
     this.llm = llm;
   }
 
@@ -1356,7 +1379,7 @@ class LLMSessionManager {
     this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
   }
 
-  getLlamaCpp(): LlamaCpp {
+  getLLM(): LLM {
     return this.llm;
   }
 }
@@ -1459,38 +1482,38 @@ class LLMSession implements ILLMSession {
   }
 
   async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
-    return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
+    return this.withOperation(() => this.manager.getLLM().embed(text, options));
   }
 
   async embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
-    return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
+    return this.withOperation(() => this.manager.getLLM().embedBatch(texts, options));
   }
 
   async expandQuery(
     query: string,
     options?: { context?: string; includeLexical?: boolean }
   ): Promise<Queryable[]> {
-    return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
+    return this.withOperation(() => this.manager.getLLM().expandQuery(query, options));
   }
 
   async rerank(
     query: string,
     documents: RerankDocument[],
     options?: RerankOptions
   ): Promise<RerankResult> {
-    return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
+    return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options));
   }
 }
 
-// Session manager for the default LlamaCpp instance
+// Session manager for the default LLM instance
 let defaultSessionManager: LLMSessionManager | null = null;
 
 /**
- * Get the session manager for the default LlamaCpp instance.
+ * Get the session manager for the default LLM instance.
  */
 function getSessionManager(): LLMSessionManager {
-  const llm = getDefaultLlamaCpp();
-  if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
+  const llm = getDefaultLLM();
+  if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) {
     defaultSessionManager = new LLMSessionManager(llm);
   }
   return defaultSessionManager;
@@ -1525,11 +1548,11 @@ export async function withLLMSession<T>(
 }
 
 /**
- * Execute a function with a scoped LLM session using a specific LlamaCpp instance.
+ * Execute a function with a scoped LLM session using a specific LLM instance.
  * Unlike withLLMSession, this does not use the global singleton.
  */
 export async function withLLMSessionForLlm<T>(
-  llm: LlamaCpp,
+  llm: LLM,
   fn: (session: ILLMSession) => Promise<T>,
   options?: LLMSessionOptions
 ): Promise<T> {
@@ -1553,35 +1576,45 @@ export function canUnloadLLM(): boolean {
 }
 
 // =============================================================================
-// Singleton for default LlamaCpp instance
+// Singleton for default LLM instance
 // =============================================================================
 
-let defaultLlamaCpp: LlamaCpp | null = null;
+let defaultLLMInstance: LLM | null = null;
 
 /**
- * Get the default LlamaCpp instance (creates one if needed)
+ * Get the default LLM instance (creates a LlamaCpp if none set)
  */
-export function getDefaultLlamaCpp(): LlamaCpp {
-  if (!defaultLlamaCpp) {
-    defaultLlamaCpp = new LlamaCpp();
+export function getDefaultLLM(): LLM {
+  if (!defaultLLMInstance) {
+    defaultLLMInstance = new LlamaCpp();
   }
-  return defaultLlamaCpp;
+  return defaultLLMInstance;
 }
 
 /**
- * Set a custom default LlamaCpp instance (useful for testing)
+ * Set the default LLM instance
  */
-export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
-  defaultLlamaCpp = llm;
+export function setDefaultLLM(llm: LLM | null): void {
+  defaultLLMInstance = llm;
+}
+
+/** @deprecated Use getDefaultLLM(). Now returns the LLM interface, which may not be a concrete LlamaCpp. */
+export function getDefaultLlamaCpp(): LLM {
+  return getDefaultLLM();
+}
+
+/** @deprecated Use setDefaultLLM() */
+export function setDefaultLlamaCpp(llm: LLM | null): void {
+  setDefaultLLM(llm);
 }
 
 /**
- * Dispose the default LlamaCpp instance if it exists.
+ * Dispose the default LLM instance if it exists.
  * Call this before process exit to prevent NAPI crashes.
  */
 export async function disposeDefaultLlamaCpp(): Promise<void> {
-  if (defaultLlamaCpp) {
-    await defaultLlamaCpp.dispose();
-    defaultLlamaCpp = null;
+  if (defaultLLMInstance) {
+    await defaultLLMInstance.dispose();
+    defaultLLMInstance = null;
   }
 }