Skip to content

Commit fe7fdee

Browse files
committed
feat(vectorsearch): add metadata support to Elasticsearch adapter
Adds the ability to store and search by metadata in the Elasticsearch vector search provider:
- Extended `add_texts` and `update_texts` to accept an optional `metadatas` parameter
- Updated the index mapping to include a dynamic `metadata` object field
- Added a `filter` parameter to the similarity search methods for metadata-based filtering
- All changes maintain backward compatibility with existing code
- Added comprehensive test coverage for the new functionality
- Created an example script to demonstrate metadata usage with Elasticsearch

This enhancement brings the Elasticsearch adapter to parity with other vector stores such as Pinecone, Chroma, PGVector and Qdrant, which already support metadata. Users can now store arbitrary metadata alongside their text and embedding vectors, then filter search results based on metadata fields.
1 parent e912e11 commit fe7fdee

File tree

3 files changed

+152
-27
lines changed

3 files changed

+152
-27
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# frozen_string_literal: true
2+
3+
require "langchain"
4+
require "dotenv/load"
5+
require "ruby/openai"
6+
7+
# This example assumes you are running Elasticsearch in Docker:
8+
#
9+
# docker run --name es8 -d \
10+
# -p 9200:9200 -p 9300:9300 \
11+
# -e "discovery.type=single-node" \
12+
# -e "xpack.security.enabled=false" \
13+
# docker.elastic.co/elasticsearch/elasticsearch:8.12.2
14+
#
15+
# The container exposes the REST API on http://localhost:9200 which
16+
# the script connects to below. If you use a different host/port, set
17+
# the ELASTICSEARCH_URL environment variable accordingly before running
18+
# the script:
19+
# ELASTICSEARCH_URL=http://localhost:9201 ruby examples/...
20+
21+
# Instantiate the Elasticsearch vector store
22+
es = Langchain::Vectorsearch::Elasticsearch.new(
23+
url: ENV.fetch("ELASTICSEARCH_URL", "http://localhost:9200"),
24+
index_name: "documents",
25+
llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
26+
)
27+
28+
# Create the index & mapping. Any error raised here (for example because the index already exists) is rescued below and printed as a warning.
29+
# You may need to delete an old index first if it was created without the metadata field.
30+
begin
31+
es.create_default_schema
32+
rescue => e
33+
warn "Index might already exist: #{e.message}"
34+
end
35+
36+
# Prepare documents with metadata
37+
corpus = [
38+
{
39+
text: "Vector search lets you retrieve semantically similar documents.",
40+
metadata: {lang: "en", author: "alice", topic: "vector-search"}
41+
},
42+
{
43+
text: "Las bases de datos vectoriales permiten búsquedas semánticas.",
44+
metadata: {lang: "es", author: "bob", topic: "vector-search"}
45+
},
46+
{
47+
text: "Ruby makes metaprogramming accessible and fun.",
48+
metadata: {lang: "en", author: "carol", topic: "ruby"}
49+
}
50+
]
51+
52+
puts "\nAdding documents with metadata …"
53+
54+
es.add_texts(
55+
texts: corpus.map { |d| d[:text] },
56+
metadatas: corpus.map { |d| d[:metadata] }
57+
)
58+
59+
sleep 1 # give ES a moment to index
60+
61+
puts "\nSimilarity search for 'vector' restricted to English docs:"
62+
filter = {term: {"metadata.lang" => "en"}}
63+
results = es.similarity_search(text: "vector", k: 2, filter: filter)
64+
pp results
65+
66+
puts "\nSimilarity search by embedding, Spanish docs only:"
67+
embedding = es.llm.embed(text: "vector query").embedding
68+
filter = {term: {"metadata.lang" => "es"}}
69+
pp es.similarity_search_by_vector(embedding: embedding, k: 1, filter: filter)
70+
71+
# Cleanup (optional)
72+
# es.delete_default_schema

lib/langchain/vectorsearch/elasticsearch.rb

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,25 @@ def initialize(url:, index_name:, llm:, api_key: nil, es_options: {})
4848

4949
# Add a list of texts to the index
5050
# @param texts [Array<String>] The list of texts to add
51+
# @param metadatas [Array<Hash>] Optional list of metadata hashes to store alongside each text. Must be the same length as texts when provided.
5152
# @return [Elasticsearch::Response] from the Elasticsearch server
52-
def add_texts(texts: [])
53-
body = texts.map do |text|
53+
def add_texts(texts: [], metadatas: [])
54+
metadatas = Array(metadatas)
55+
56+
if !metadatas.empty? && (metadatas.length != texts.length)
57+
raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
58+
end
59+
60+
body = texts.map.with_index do |text, i|
61+
document_body = {
62+
input: text,
63+
input_vector: llm.embed(text: text).embedding
64+
}
65+
document_body[:metadata] = metadatas[i] if metadatas[i]
66+
5467
[
5568
{index: {_index: index_name}},
56-
{input: text, input_vector: llm.embed(text: text).embedding}
69+
document_body
5770
]
5871
end.flatten
5972

@@ -63,12 +76,25 @@ def add_texts(texts: [])
6376
# Update a list of texts in the index
6477
# @param texts [Array<String>] The list of texts to update
6578
# @param ids [Array<Integer>] The list of ids of the texts to update
79+
# @param metadatas [Array<Hash>] Optional list of metadata hashes to update alongside each text. Must be the same length as texts when provided.
6680
# @return [Elasticsearch::Response] from the Elasticsearch server
67-
def update_texts(texts: [], ids: [])
81+
def update_texts(texts: [], ids: [], metadatas: [])
82+
metadatas = Array(metadatas)
83+
84+
if !metadatas.empty? && (metadatas.length != texts.length)
85+
raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
86+
end
87+
6888
body = texts.map.with_index do |text, i|
89+
document_body = {
90+
input: text,
91+
input_vector: llm.embed(text: text).embedding
92+
}
93+
document_body[:metadata] = metadatas[i] if metadatas[i]
94+
6995
[
7096
{index: {_index: index_name, _id: ids[i]}},
71-
{input: text, input_vector: llm.embed(text: text).embedding}
97+
document_body
7298
]
7399
end.flatten
74100

@@ -118,7 +144,11 @@ def default_schema
118144
input: {
119145
type: "text"
120146
},
121-
input_vector: vector_settings
147+
input_vector: vector_settings,
148+
metadata: {
149+
type: "object",
150+
dynamic: true
151+
}
122152
}
123153
}
124154
}
@@ -163,34 +193,45 @@ def ask(question:, k: 4, &block)
163193
# @param text [String] The text to search for
164194
# @param k [Integer] The number of results to return
165195
# @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
196+
# @param filter [Hash] Elasticsearch filter clause (Optional). When given, the search query is wrapped in a `bool` query with the original query under `must` and this hash under `filter`.
166197
# @return [Elasticsearch::Response] The response from the server
167-
def similarity_search(text: "", k: 10, query: {})
198+
def similarity_search(text: "", k: 10, query: {}, filter: {})
168199
if text.empty? && query.empty?
169200
raise "Either text or query should pass as an argument"
170201
end
171202

203+
# Build base similarity query (script_score by default)
172204
if query.empty?
173205
query_vector = llm.embed(text: text).embedding
174-
175206
query = default_query(query_vector)
176207
end
177208

178-
es_client.search(body: {query: query, size: k}).body
209+
# Apply filter if provided
210+
final_query = if filter.empty?
211+
query
212+
else
213+
{bool: {must: query, filter: filter}}
214+
end
215+
216+
es_client.search(body: {query: final_query, size: k}).body
179217
end
180218

181219
# Search for similar texts by embedding
182220
# @param embedding [Array<Float>] The embedding to search for
183221
# @param k [Integer] The number of results to return
184222
# @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
223+
# @param filter [Hash] Elasticsearch filter clause (Optional). When given, the search query is wrapped in a `bool` query with the original query under `must` and this hash under `filter`.
185224
# @return [Elasticsearch::Response] The response from the server
186-
def similarity_search_by_vector(embedding: [], k: 10, query: {})
225+
def similarity_search_by_vector(embedding: [], k: 10, query: {}, filter: {})
187226
if embedding.empty? && query.empty?
188227
raise "Either embedding or query should pass as an argument"
189228
end
190229

191230
query = default_query(embedding) if query.empty?
192231

193-
es_client.search(body: {query: query, size: k}).body
232+
final_query = filter.empty? ? query : {bool: {must: query, filter: filter}}
233+
234+
es_client.search(body: {query: final_query, size: k}).body
194235
end
195236
end
196237
end

spec/langchain/vectorsearch/elasticsearch_spec.rb

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,23 @@
1717
end
1818

1919
describe "#add_texts" do
20-
it "indexes data into elasticsearch" do
20+
it "indexes data into elasticsearch with metadata" do
21+
metadata = {lang: "en"}
2122
es_body = [
2223
{index: {_index: "langchain"}},
23-
{input: "simple text", input_vector: [0.1, 0.2, 0.3]}
24+
{input: "simple text", input_vector: [0.1, 0.2, 0.3], metadata: metadata}
2425
]
2526

2627
allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
2728
expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
2829

29-
subject.add_texts(texts: ["simple text"])
30+
subject.add_texts(texts: ["simple text"], metadatas: [metadata])
31+
end
32+
33+
it "raises error when metadatas length mismatch" do
34+
expect {
35+
subject.add_texts(texts: ["t1", "t2"], metadatas: [{foo: 1}])
36+
}.to raise_error(ArgumentError)
3037
end
3138
end
3239

@@ -38,16 +45,17 @@
3845
.and_return([0.1, 0.2, 0.3, 0.4])
3946
end
4047

41-
it "updates respective document" do
48+
it "updates respective document with metadata" do
49+
metadata = {version: 2}
4250
es_body = [
4351
{index: {_index: "langchain", _id: 1}},
44-
{input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4]}
52+
{input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4], metadata: metadata}
4553
]
4654

4755
allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
4856
expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
4957

50-
subject.update_texts(texts: ["updated text"], ids: [1])
58+
subject.update_texts(texts: ["updated text"], ids: [1], metadatas: [metadata])
5159
end
5260
end
5361

@@ -100,7 +108,8 @@
100108
input: {
101109
type: "text"
102110
},
103-
input_vector: {type: "dense_vector", dims: 384}
111+
input_vector: {type: "dense_vector", dims: 384},
112+
metadata: {type: "object", dynamic: true}
104113
}
105114
}
106115
}
@@ -117,7 +126,8 @@
117126
input: {
118127
type: "text"
119128
},
120-
input_vector: {type: "dense_vector", dims: 500}
129+
input_vector: {type: "dense_vector", dims: 500},
130+
metadata: {type: "object", dynamic: true}
121131
}
122132
}
123133
}
@@ -145,7 +155,8 @@
145155
end
146156

147157
describe "#similarity_search" do
148-
it "should return similar documents" do
158+
it "should return similar documents with metadata filter" do
159+
filter = {term: {"metadata.lang": "en"}}
149160
response = [
150161
{_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
151162
{_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -154,13 +165,13 @@
154165

155166
allow(es_response).to receive(:body).and_return(response)
156167
allow_any_instance_of(::Elasticsearch::Client)
157-
.to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}).and_return(es_response)
168+
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}).and_return(es_response)
158169

159170
expect_any_instance_of(::Elasticsearch::Client)
160-
.to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5})
171+
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5})
161172
expect(es_response).to receive(:body)
162173

163-
expect(subject.similarity_search(text: "simple", k: 5)).to eq(response)
174+
expect(subject.similarity_search(text: "simple", k: 5, filter: filter)).to eq(response)
164175
end
165176

166177
it "able to search with custom query" do
@@ -197,7 +208,8 @@
197208
end
198209

199210
describe "#similarity_search_by_vector" do
200-
it "should return similar documents" do
211+
it "should return similar documents with metadata filter" do
212+
filter = {term: {"metadata.lang": "en"}}
201213
response = [
202214
{_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
203215
{_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -206,13 +218,13 @@
206218

207219
allow(es_response).to receive(:body).and_return(response)
208220
allow_any_instance_of(::Elasticsearch::Client)
209-
.to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}).and_return(es_response)
221+
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}).and_return(es_response)
210222

211223
expect_any_instance_of(::Elasticsearch::Client)
212-
.to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5})
224+
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5})
213225
expect(es_response).to receive(:body)
214226

215-
expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5)).to eq(response)
227+
expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5, filter: filter)).to eq(response)
216228
end
217229

218230
it "able to search with custom query" do

0 commit comments

Comments
 (0)