Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clients/new-js/packages/ai-embeddings/all/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"@chroma-core/openai": "workspace:^",
"@chroma-core/together-ai": "workspace:^",
"@chroma-core/voyageai": "workspace:^",
"@chroma-core/chroma-bm25": "workspace:^",
"@chroma-core/chroma-cloud-qwen": "workspace:^",
"@chroma-core/chroma-cloud-splade": "workspace:^"
},
Expand Down
1 change: 1 addition & 0 deletions clients/new-js/packages/ai-embeddings/all/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ export * from "@chroma-core/together-ai";
export * from "@chroma-core/voyageai";
export * from "@chroma-core/chroma-cloud-qwen";
export * from "@chroma-core/chroma-cloud-splade";
export * from "@chroma-core/chroma-bm25";
22 changes: 22 additions & 0 deletions clients/new-js/packages/ai-embeddings/chroma-bm25/jest.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import type { Config } from "jest";

// Pattern for the source files handed to ts-jest: anything ending in .ts or .tsx.
const typescriptSourcePattern = "^.+\\.tsx?$";

// Pattern for relative import specifiers ("./x.js", "../x.js"); the capture
// group keeps everything before the trailing ".js" so the mapper can drop it.
const relativeJsSpecifierPattern = "^(\\.{1,2}/.*)\\.js$";

/**
 * Jest configuration for this package: runs `*.test.ts` files in a Node
 * environment, compiles TypeScript through ts-jest with ESM output enabled,
 * and runs `jest.setup.ts` before the tests.
 */
const config: Config = {
  preset: "ts-jest",
  testEnvironment: "node",
  testMatch: ["**/*.test.ts"],
  setupFiles: ["./jest.setup.ts"],
  extensionsToTreatAsEsm: [".ts"],
  transform: {
    [typescriptSourcePattern]: ["ts-jest", { useESM: true }],
  },
  moduleNameMapper: {
    // Rewrites "./module.js" style imports to "./module".
    [relativeJsSpecifierPattern]: "$1",
  },
};

export default config;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import * as dotenv from "dotenv";

// Location of the env file loaded before the tests run.
// NOTE(review): this is a relative path; presumably it is resolved against the
// directory Jest is started from — confirm before running from another cwd.
const envFilePath = "../../../.env";

dotenv.config({ path: envFilePath });
55 changes: 55 additions & 0 deletions clients/new-js/packages/ai-embeddings/chroma-bm25/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"name": "@chroma-core/chroma-bm25",
"version": "0.1.7",
"private": false,
"description": "Chroma BM25 sparse embedding function implemented in TypeScript",
"main": "dist/cjs/chroma-bm25.cjs",
"types": "dist/chroma-bm25.d.ts",
"module": "dist/chroma-bm25.legacy-esm.js",
"type": "module",
"exports": {
".": {
"import": {
"types": "./dist/chroma-bm25.d.ts",
"default": "./dist/chroma-bm25.mjs"
},
"require": {
"types": "./dist/cjs/chroma-bm25.d.cts",
"default": "./dist/cjs/chroma-bm25.cjs"
}
}
},
"files": [
"src",
"dist"
],
"scripts": {
"clean": "rimraf dist",
"prebuild": "rimraf dist",
"build": "tsup",
"watch": "tsup --watch",
"test": "jest"
},
"devDependencies": {
"@jest/globals": "^29.7.0",
"dotenv": "^16.3.1",
"jest": "^29.7.0",
"rimraf": "^5.0.0",
"ts-jest": "^29.1.2",
"ts-node": "^10.9.2",
"tsup": "^8.3.5"
},
"peerDependencies": {
"chromadb": "workspace:^"
},
"dependencies": {
"@chroma-core/ai-embeddings-common": "workspace:^",
"snowball-stemmers": "^0.6.0"
},
"engines": {
"node": ">=20"
},
"publishConfig": {
"access": "public"
}
}
113 changes: 113 additions & 0 deletions clients/new-js/packages/ai-embeddings/chroma-bm25/src/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import { describe, expect, test } from "@jest/globals";
import {
DEFAULT_CHROMA_BM25_STOPWORDS,
ChromaBm25EmbeddingFunction,
type ChromaBm25Config,
} from "./index";

/**
 * Reports whether `arr` is in non-decreasing order.
 *
 * An array is rejected as soon as any element is strictly smaller than its
 * predecessor; empty and single-element arrays are therefore sorted.
 */
const isSorted = (arr: number[]): boolean =>
  !arr.some(
    (current, position) => position > 0 && current < arr[position - 1],
  );

/**
 * Test suite for ChromaBm25EmbeddingFunction.
 *
 * Covers fixed index/value expectations for two reference sentences,
 * structural invariants of the sparse embeddings (parallel, sorted indices and
 * positive finite values), parity between `generateForQueries` and
 * `generate`, and a config round trip through `buildFromConfig`.
 */
describe("ChromaBm25EmbeddingFunction", () => {
  // One embedder with default settings is shared by every test below.
  const embedder = new ChromaBm25EmbeddingFunction();

  test("matches comprehensive tokenization expectations", async () => {
    const [embedding] = await embedder.generate([
      "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
    ]);

    const expectedIndices = [
      230246813, 395514983, 458027949, 488165615, 729632045, 734978415,
      997512866, 1114505193, 1381820790, 1501587190, 1649421877,
      1837285388,
    ];
    const expectedValue = 1.6391153;

    expect(embedding.indices).toEqual(expectedIndices);
    // Guard against a vacuous pass: without this, an empty `values` array
    // would skip every per-value assertion in the loop below.
    expect(embedding.values).toHaveLength(expectedIndices.length);
    embedding.values.forEach((value) => {
      expect(value).toBeCloseTo(expectedValue, 5);
    });
  });

  // mirrors rust test `test_bm25_stopwords_and_punctuation` to guarantee compatibility
  test("ensure Rust impl compatibilty", async () => {
    const [embedding] = await embedder.generate([
      "The space-time continuum WARPS near massive objects...",
    ]);

    const expectedIndices = [
      90097469, 519064992, 737893654, 1110755108, 1950894484, 2031641008,
      2058513491,
    ];
    const expectedValue = 1.660867;

    expect(embedding.indices).toEqual(expectedIndices);
    // Same guard as above: every index must have a matching value.
    expect(embedding.values).toHaveLength(expectedIndices.length);
    embedding.values.forEach((value) => {
      expect(value).toBeCloseTo(expectedValue, 5);
    });
  });

  test("generates consistent embeddings for multiple documents", async () => {
    const texts = [
      "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
      "The space-time continuum WARPS near massive objects...",
      "BM25 is great for sparse retrieval tasks",
    ];

    const embeddings = await embedder.generate(texts);

    // One embedding per input text.
    expect(embeddings).toHaveLength(texts.length);
    embeddings.forEach((embedding) => {
      // Indices and values are parallel arrays and indices come back ordered.
      expect(embedding.indices.length).toBeGreaterThan(0);
      expect(embedding.values.length).toBe(embedding.indices.length);
      expect(isSorted(embedding.indices)).toBe(true);

      embedding.values.forEach((value) => {
        expect(value).toBeGreaterThan(0);
        expect(Number.isFinite(value)).toBe(true);
      });
    });
  });

  test("generateForQueries mirrors generate", async () => {
    const query = "retrieve BM25 docs";
    const [queryEmbedding] = await embedder.generateForQueries([query]);
    const [docEmbedding] = await embedder.generate([query]);

    expect(queryEmbedding.indices).toEqual(docEmbedding.indices);
    expect(queryEmbedding.values).toEqual(docEmbedding.values);
  });

  test("config round trip maintains settings", () => {
    // The cast lets the numeric fields be read without undefined checks;
    // `stopwords` is nevertheless expected to be absent for the default
    // embedder, as asserted below.
    const config = embedder.getConfig() as Required<ChromaBm25Config>;

    expect(config).toMatchObject({
      k: 1.2,
      b: 0.75,
      avg_doc_length: 256,
      token_max_length: 40,
    });
    expect(config.stopwords).toBeUndefined();

    // Rebuild from the exported config, overriding only the stopword list.
    const custom = ChromaBm25EmbeddingFunction.buildFromConfig({
      ...config,
      stopwords: DEFAULT_CHROMA_BM25_STOPWORDS.slice(0, 10),
    });

    const rebuiltConfig =
      custom.getConfig() as Required<ChromaBm25Config>;
    expect(rebuiltConfig.k).toBeCloseTo(config.k);
    expect(rebuiltConfig.b).toBeCloseTo(config.b);
    expect(rebuiltConfig.avg_doc_length).toBeCloseTo(config.avg_doc_length);
    expect(rebuiltConfig.token_max_length).toBe(config.token_max_length);
    expect(rebuiltConfig.stopwords).toEqual(
      DEFAULT_CHROMA_BM25_STOPWORDS.slice(0, 10),
    );
  });
});
Loading
Loading