Skip to content

Commit f640303

Browse files
Jim Smith and claude committed
Add OpenAI-compatible remote embedding and reranking
Support offloading embedding and reranking to remote OpenAI-compatible servers (vLLM, Ollama, LM Studio, OpenAI) while preserving local query expansion and tokenization via a hybrid routing layer.

- RemoteLLM: HTTP client with circuit breaker, dimension validation, batch splitting, auth headers, configurable timeouts
- HybridLLM: routes embed/rerank → remote, generate/expand → local
- LLM interface: add embedBatch, embedModelName; generalize singleton and session management from LlamaCpp to LLM
- Config: QMD_EMBED_API_URL/MODEL env vars or YAML models section
- Skip nomic/Qwen3 text formatting prefixes for remote models
- 36 unit tests + 30 integration tests against live vLLM

Related: #489, #427, #446, #511

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c2f3a40 commit f640303

10 files changed

Lines changed: 1565 additions & 48 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
## [Unreleased]
44

5+
### Changes
6+
7+
- Remote embedding and reranking via OpenAI-compatible API (vLLM, Ollama,
8+
OpenAI, etc.). Set `QMD_EMBED_API_URL` and `QMD_EMBED_API_MODEL` env vars
9+
or add `embed_api_url`/`embed_api_model` to `models:` in `index.yml`.
10+
Local query expansion and tokenization are preserved via a hybrid routing
11+
layer. Includes circuit breaker, dimension validation, and batch splitting.
12+
513
## [2.1.0] - 2026-04-05
614

715
Code files now chunk at function and class boundaries via tree-sitter,

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -939,6 +939,41 @@ Uses node-llama-cpp's `createRankingContext()` and `rankAndSort()` API for cross
939939

940940
Used for generating query variations via `LlamaChatSession`.
941941

942+
### Remote Embedding & Reranking
943+
944+
QMD can offload embedding and reranking to a remote OpenAI-compatible server (vLLM, Ollama, LM Studio, OpenAI, etc.) while keeping query expansion local.
945+
946+
**Environment variables** (presence of `QMD_EMBED_API_URL` activates remote mode):
947+
948+
| Variable | Required | Description |
949+
|----------|----------|-------------|
950+
| `QMD_EMBED_API_URL` | Yes | Base URL, e.g. `http://gpu-host:8000/v1` |
951+
| `QMD_EMBED_API_MODEL` | Yes | Model name, e.g. `BAAI/bge-m3` |
952+
| `QMD_EMBED_API_KEY` | No | Bearer token for auth |
953+
| `QMD_RERANK_API_URL` | No | Rerank endpoint (defaults to embed URL) |
954+
| `QMD_RERANK_API_MODEL` | No | Rerank model name |
955+
| `QMD_RERANK_API_KEY` | No | Rerank auth (defaults to embed key) |
956+
957+
**YAML config** (`~/.config/qmd/index.yml`):
958+
```yaml
959+
models:
960+
embed_api_url: "http://gpu-host:8000/v1"
961+
embed_api_model: "BAAI/bge-m3"
962+
rerank_api_model: "BAAI/bge-reranker-v2-m3"
963+
```
964+
965+
**Example with vLLM:**
966+
```sh
967+
# Start vLLM with an embedding model
968+
vllm serve BAAI/bge-m3 --task embed
969+
970+
# Point QMD at it
971+
export QMD_EMBED_API_URL=http://localhost:8000/v1
972+
export QMD_EMBED_API_MODEL=BAAI/bge-m3
973+
qmd embed
974+
qmd query "your search query"
975+
```
976+
942977
## License
943978

944979
MIT

src/cli/qmd.ts

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ import {
7777
type ReindexResult,
7878
type ChunkStrategy,
7979
} from "../store.js";
80-
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
80+
import { disposeDefaultLlamaCpp, getDefaultLLM, setDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
81+
import { RemoteLLM, remoteConfigFromEnv } from "../remote-llm.js";
82+
import { HybridLLM } from "../hybrid-llm.js";
8183
import {
8284
formatSearchResults,
8385
formatDocuments,
@@ -119,11 +121,28 @@ function getStore(): ReturnType<typeof createStore> {
119121
const config = loadConfig();
120122
syncConfigToDb(store.db, config);
121123
if (config.models) {
122-
setDefaultLlamaCpp(new LlamaCpp({
124+
const localLlm = new LlamaCpp({
123125
embedModel: config.models.embed,
124126
generateModel: config.models.generate,
125127
rerankModel: config.models.rerank,
126-
}));
128+
});
129+
130+
// Check if remote embedding is configured (env vars take precedence over YAML)
131+
const remoteConfig = remoteConfigFromEnv(config.models);
132+
if (remoteConfig) {
133+
const remoteLlm = new RemoteLLM(remoteConfig);
134+
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
135+
} else {
136+
setDefaultLLM(localLlm);
137+
}
138+
} else {
139+
// No YAML models config — still check env vars for remote embedding
140+
const remoteConfig = remoteConfigFromEnv();
141+
if (remoteConfig) {
142+
const remoteLlm = new RemoteLLM(remoteConfig);
143+
const localLlm = new LlamaCpp();
144+
setDefaultLLM(new HybridLLM(remoteLlm, localLlm));
145+
}
127146
}
128147
} catch {
129148
// Config may not exist yet — that's fine, DB works without it
@@ -462,8 +481,9 @@ async function showStatus(): Promise<void> {
462481

463482
// Device / GPU info
464483
try {
465-
const llm = getDefaultLlamaCpp();
466-
const device = await llm.getDeviceInfo();
484+
const llm = getDefaultLLM();
485+
if (typeof (llm as any).getDeviceInfo !== "function") throw new Error("skip");
486+
const device = await (llm as any).getDeviceInfo();
467487
console.log(`\n${c.bold}Device${c.reset}`);
468488
if (device.gpu) {
469489
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);

src/collections.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,18 @@ export interface ModelsConfig {
4040
embed?: string;
4141
rerank?: string;
4242
generate?: string;
43+
/** Remote embedding API base URL (e.g. http://gpu-host:8000/v1) */
44+
embed_api_url?: string;
45+
/** Remote embedding model name (e.g. BAAI/bge-m3) */
46+
embed_api_model?: string;
47+
/** Bearer token for remote embedding API */
48+
embed_api_key?: string;
49+
/** Remote rerank API base URL (defaults to embed_api_url) */
50+
rerank_api_url?: string;
51+
/** Remote rerank model name */
52+
rerank_api_model?: string;
53+
/** Bearer token for remote rerank API */
54+
rerank_api_key?: string;
4355
}
4456

4557
/**

src/hybrid-llm.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/**
2+
* hybrid-llm.ts - Compositor that routes LLM operations between remote and local backends
3+
*
4+
* Embed/rerank → remote (GPU-heavy, benefits from offloading)
5+
* Generate/expandQuery → local LlamaCpp (QMD's fine-tuned query expansion model)
6+
* tokenize/countTokens → local LlamaCpp (CPU-cheap, needed for chunking)
7+
*/
8+
9+
import type {
10+
LLM,
11+
EmbedOptions,
12+
EmbeddingResult,
13+
GenerateOptions,
14+
GenerateResult,
15+
ModelInfo,
16+
Queryable,
17+
RerankDocument,
18+
RerankOptions,
19+
RerankResult,
20+
} from "./llm.js";
21+
22+
/**
 * LLM implementation that splits work between two backends.
 *
 * - `embed`, `embedBatch`, `rerank` and `embedModelName` are served by `remote`.
 * - `generate`, `expandQuery` and `modelExists` are served by `local`.
 *
 * NOTE(review): this class exposes no tokenize/countTokens/getDeviceInfo
 * passthroughs; callers that need those must hold the local backend directly.
 */
export class HybridLLM implements LLM {
  /**
   * @param remote Backend that handles embedding and reranking.
   * @param local Backend that handles generation, query expansion and model lookup.
   */
  constructor(
    private readonly remote: LLM,
    private readonly local: LLM,
  ) {}

  /** Embedding model name, as reported by the remote backend. */
  get embedModelName(): string {
    return this.remote.embedModelName;
  }

  // ---- Routed to the remote backend ----

  /** Embed a single text on the remote backend. */
  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
    return this.remote.embed(text, options);
  }

  /** Embed several texts on the remote backend. */
  embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
    return this.remote.embedBatch(texts, options);
  }

  /** Rerank `documents` against `query` on the remote backend. */
  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult> {
    return this.remote.rerank(query, documents, options);
  }

  // ---- Routed to the local backend ----

  /** Generate a completion on the local backend. */
  generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null> {
    return this.local.generate(prompt, options);
  }

  /** Expand a query into variations on the local backend. */
  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]> {
    return this.local.expandQuery(query, options);
  }

  /** Look up a model via the local backend. */
  modelExists(model: string): Promise<ModelInfo> {
    return this.local.modelExists(model);
  }

  /**
   * Dispose both backends.
   *
   * Both disposals are always awaited to completion, even when one of them
   * fails, so the caller never continues while the other backend is still
   * tearing down. If any disposal failed, the first failure (remote before
   * local) is rethrown once both have settled.
   */
  async dispose(): Promise<void> {
    const outcomes = await Promise.allSettled([this.remote.dispose(), this.local.dispose()]);
    for (const outcome of outcomes) {
      if (outcome.status === "rejected") {
        throw outcome.reason;
      }
    }
  }
}

src/llm.ts

Lines changed: 60 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,24 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean {
3030
return /qwen.*embed/i.test(modelUri) || /embed.*qwen/i.test(modelUri);
3131
}
3232

33+
/**
 * Detect if a model URI refers to a remote API model (not a local GGUF model).
 * Remote models handle their own prompt formatting, so no prefixes should be added.
 *
 * A URI is treated as local when it starts with `hf:` or ends with a `.gguf`
 * extension. The extension is matched case-insensitively so that a local file
 * such as `EMBED.GGUF` is not mistaken for a remote model name.
 *
 * @param modelUri Model URI, file path, or remote model name.
 * @returns `true` when the URI matches neither local form.
 */
export function isRemoteModel(modelUri: string): boolean {
  if (modelUri.startsWith("hf:")) return false;
  // Local file paths end in .gguf; ignore case so .GGUF is recognised as well.
  return !modelUri.toLowerCase().endsWith(".gguf");
}
41+
3342
/**
3443
* Format a query for embedding.
3544
* Uses nomic-style task prefix format for embeddinggemma (default).
3645
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
46+
* Remote models receive raw text (they handle their own formatting).
3747
*/
3848
export function formatQueryForEmbedding(query: string, modelUri?: string): string {
3949
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
50+
if (isRemoteModel(uri)) return query;
4051
if (isQwen3EmbeddingModel(uri)) {
4152
return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
4253
}
@@ -47,9 +58,11 @@ export function formatQueryForEmbedding(query: string, modelUri?: string): strin
4758
* Format a document for embedding.
4859
* Uses nomic-style format with title and text fields (default).
4960
* Qwen3-Embedding encodes documents as raw text without special prefixes.
61+
* Remote models receive raw text (they handle their own formatting).
5062
*/
5163
export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
5264
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
65+
if (isRemoteModel(uri)) return title ? `${title}\n${text}` : text;
5366
if (isQwen3EmbeddingModel(uri)) {
5467
// Qwen3-Embedding: documents are raw text, no task prefix
5568
return title ? `${title}\n${text}` : text;
@@ -319,6 +332,16 @@ export interface LLM {
319332
*/
320333
embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
321334

335+
/**
336+
* Batch embed multiple texts
337+
*/
338+
embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
339+
340+
/**
341+
* The embedding model name/URI
342+
*/
343+
readonly embedModelName: string;
344+
322345
/**
323346
* Generate text completion
324347
*/
@@ -1316,11 +1339,11 @@ export class LlamaCpp implements LLM {
13161339
* Coordinates with LlamaCpp idle timeout to prevent disposal during active sessions.
13171340
*/
13181341
class LLMSessionManager {
1319-
private llm: LlamaCpp;
1342+
private llm: LLM;
13201343
private _activeSessionCount = 0;
13211344
private _inFlightOperations = 0;
13221345

1323-
constructor(llm: LlamaCpp) {
1346+
constructor(llm: LLM) {
13241347
this.llm = llm;
13251348
}
13261349

@@ -1356,7 +1379,7 @@ class LLMSessionManager {
13561379
this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
13571380
}
13581381

1359-
getLlamaCpp(): LlamaCpp {
1382+
getLLM(): LLM {
13601383
return this.llm;
13611384
}
13621385
}
@@ -1459,38 +1482,38 @@ class LLMSession implements ILLMSession {
14591482
}
14601483

14611484
async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
1462-
return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
1485+
return this.withOperation(() => this.manager.getLLM().embed(text, options));
14631486
}
14641487

14651488
async embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]> {
1466-
return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
1489+
return this.withOperation(() => this.manager.getLLM().embedBatch(texts, options));
14671490
}
14681491

14691492
async expandQuery(
14701493
query: string,
14711494
options?: { context?: string; includeLexical?: boolean }
14721495
): Promise<Queryable[]> {
1473-
return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
1496+
return this.withOperation(() => this.manager.getLLM().expandQuery(query, options));
14741497
}
14751498

14761499
async rerank(
14771500
query: string,
14781501
documents: RerankDocument[],
14791502
options?: RerankOptions
14801503
): Promise<RerankResult> {
1481-
return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
1504+
return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options));
14821505
}
14831506
}
14841507

1485-
// Session manager for the default LlamaCpp instance
1508+
// Session manager for the default LLM instance
14861509
let defaultSessionManager: LLMSessionManager | null = null;
14871510

14881511
/**
1489-
* Get the session manager for the default LlamaCpp instance.
1512+
* Get the session manager for the default LLM instance.
14901513
*/
14911514
function getSessionManager(): LLMSessionManager {
1492-
const llm = getDefaultLlamaCpp();
1493-
if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
1515+
const llm = getDefaultLLM();
1516+
if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) {
14941517
defaultSessionManager = new LLMSessionManager(llm);
14951518
}
14961519
return defaultSessionManager;
@@ -1525,11 +1548,11 @@ export async function withLLMSession<T>(
15251548
}
15261549

15271550
/**
1528-
* Execute a function with a scoped LLM session using a specific LlamaCpp instance.
1551+
* Execute a function with a scoped LLM session using a specific LLM instance.
15291552
* Unlike withLLMSession, this does not use the global singleton.
15301553
*/
15311554
export async function withLLMSessionForLlm<T>(
1532-
llm: LlamaCpp,
1555+
llm: LLM,
15331556
fn: (session: ILLMSession) => Promise<T>,
15341557
options?: LLMSessionOptions
15351558
): Promise<T> {
@@ -1553,35 +1576,45 @@ export function canUnloadLLM(): boolean {
15531576
}
15541577

15551578
// =============================================================================
1556-
// Singleton for default LlamaCpp instance
1579+
// Singleton for default LLM instance
15571580
// =============================================================================
15581581

1559-
let defaultLlamaCpp: LlamaCpp | null = null;
1582+
let defaultLLMInstance: LLM | null = null;
15601583

15611584
/**
1562-
* Get the default LlamaCpp instance (creates one if needed)
1585+
* Get the default LLM instance (creates a LlamaCpp if none set)
15631586
*/
1564-
export function getDefaultLlamaCpp(): LlamaCpp {
1565-
if (!defaultLlamaCpp) {
1566-
defaultLlamaCpp = new LlamaCpp();
1587+
export function getDefaultLLM(): LLM {
1588+
if (!defaultLLMInstance) {
1589+
defaultLLMInstance = new LlamaCpp();
15671590
}
1568-
return defaultLlamaCpp;
1591+
return defaultLLMInstance;
15691592
}
15701593

15711594
/**
1572-
* Set a custom default LlamaCpp instance (useful for testing)
1595+
* Set the default LLM instance
15731596
*/
1574-
export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
1575-
defaultLlamaCpp = llm;
1597+
export function setDefaultLLM(llm: LLM | null): void {
1598+
defaultLLMInstance = llm;
1599+
}
1600+
1601+
/**
 * Legacy alias for {@link getDefaultLLM}.
 *
 * The return type is the general `LLM` interface, so the result is not
 * guaranteed to be a `LlamaCpp` instance.
 *
 * @deprecated Use getDefaultLLM()
 */
export function getDefaultLlamaCpp(): LLM {
  const current = getDefaultLLM();
  return current;
}
1605+
1606+
/**
 * Legacy alias for {@link setDefaultLLM}; accepts any `LLM` or `null`.
 *
 * @deprecated Use setDefaultLLM()
 */
export function setDefaultLlamaCpp(llm: LLM | null): void {
  const next = llm;
  setDefaultLLM(next);
}
15771610

15781611
/**
1579-
* Dispose the default LlamaCpp instance if it exists.
1612+
* Dispose the default LLM instance if it exists.
15801613
* Call this before process exit to prevent NAPI crashes.
15811614
*/
15821615
export async function disposeDefaultLlamaCpp(): Promise<void> {
1583-
if (defaultLlamaCpp) {
1584-
await defaultLlamaCpp.dispose();
1585-
defaultLlamaCpp = null;
1616+
if (defaultLLMInstance) {
1617+
await defaultLLMInstance.dispose();
1618+
defaultLLMInstance = null;
15861619
}
15871620
}

0 commit comments

Comments
 (0)