feat(vectorsearch): add metadata support to Elasticsearch adapter

sergiobayona · sergiobayona · commit fe7fdeee00f2 · 2025-06-11T11:03:59.000-05:00
Adds the ability to store and search by metadata in the Elasticsearch vector search provider:

- Extended `add_texts` and `update_texts` to accept an optional `metadatas` parameter
- Updated index mapping to include a dynamic `metadata` object field
- Added `filter` parameter to similarity search methods for metadata-based filtering
- All changes maintain backward compatibility with existing code
- Added comprehensive test coverage for the new functionality
- Created an example script to demonstrate metadata usage with Elasticsearch

This enhancement brings the Elasticsearch adapter to parity with other vector
stores such as Pinecone, Chroma, PGVector and Qdrant, which already support metadata.
Users can now store arbitrary metadata alongside their text and embedding vectors,
then filter search results based on metadata fields.
diff --git a/examples/store_and_query_with_elasticsearch_using_metadata.rb b/examples/store_and_query_with_elasticsearch_using_metadata.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+require "langchain"
+require "dotenv/load"
+require "ruby/openai"
+
+# This example assumes you are running Elasticsearch in Docker:
+#
+#   docker run --name es8 -d \
+#     -p 9200:9200 -p 9300:9300 \
+#     -e "discovery.type=single-node" \
+#     -e "xpack.security.enabled=false" \
+#     docker.elastic.co/elasticsearch/elasticsearch:8.12.2
+#
+# The container exposes the REST API on http://localhost:9200 which
+# the script connects to below. If you use a different host/port, set
+# the ELASTICSEARCH_URL environment variable accordingly before running
+# the script:
+#   ELASTICSEARCH_URL=http://localhost:9201 ruby examples/...
+
+# Instantiate the Elasticsearch vector store
+es = Langchain::Vectorsearch::Elasticsearch.new(
+  url: ENV.fetch("ELASTICSEARCH_URL", "http://localhost:9200"),
+  index_name: "documents",
+  llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
+)
+
+# Create the index & mapping; any error raised here (e.g. the index already exists) is rescued below and only printed as a warning
+# You may need to delete an old index first if it was created without the metadata field.
+begin
+  es.create_default_schema
+rescue => e
+  warn "Index might already exist: #{e.message}"
+end
+
+# Prepare documents with metadata
+corpus = [
+  {
+    text: "Vector search lets you retrieve semantically similar documents.",
+    metadata: {lang: "en", author: "alice", topic: "vector-search"}
+  },
+  {
+    text: "Las bases de datos vectoriales permiten búsquedas semánticas.",
+    metadata: {lang: "es", author: "bob", topic: "vector-search"}
+  },
+  {
+    text: "Ruby makes metaprogramming accessible and fun.",
+    metadata: {lang: "en", author: "carol", topic: "ruby"}
+  }
+]
+
+puts "\nAdding documents with metadata …"
+
+es.add_texts(
+  texts: corpus.map { |d| d[:text] },
+  metadatas: corpus.map { |d| d[:metadata] }
+)
+
+sleep 1 # give ES a moment to index
+
+puts "\nSimilarity search for 'vector' restricted to English docs:"
+filter = {term: {"metadata.lang" => "en"}}
+results = es.similarity_search(text: "vector", k: 2, filter: filter)
+pp results
+
+puts "\nSimilarity search by embedding, Spanish docs only:"
+embedding = es.llm.embed(text: "vector query").embedding
+filter = {term: {"metadata.lang" => "es"}}
+pp es.similarity_search_by_vector(embedding: embedding, k: 1, filter: filter)
+
+# Cleanup (optional)
+# es.delete_default_schema
diff --git a/lib/langchain/vectorsearch/elasticsearch.rb b/lib/langchain/vectorsearch/elasticsearch.rb
@@ -48,12 +48,25 @@ def initialize(url:, index_name:, llm:, api_key: nil, es_options: {})
 
     # Add a list of texts to the index
     # @param texts [Array<String>] The list of texts to add
+    # @param metadatas [Array<Hash>] Optional list of metadata hashes to store alongside each text. Must be the same length as texts when provided.
     # @return [Elasticsearch::Response] from the Elasticsearch server
-    def add_texts(texts: [])
-      body = texts.map do |text|
+    def add_texts(texts: [], metadatas: [])
+      metadatas = Array(metadatas)
+
+      if !metadatas.empty? && (metadatas.length != texts.length)
+        raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
+      end
+
+      body = texts.map.with_index do |text, i|
+        document_body = {
+          input: text,
+          input_vector: llm.embed(text: text).embedding
+        }
+        document_body[:metadata] = metadatas[i] if metadatas[i]
+
         [
           {index: {_index: index_name}},
-          {input: text, input_vector: llm.embed(text: text).embedding}
+          document_body
         ]
       end.flatten
 
@@ -63,12 +76,25 @@ def add_texts(texts: [])
     # Add a list of texts to the index
     # @param texts [Array<String>] The list of texts to update
     # @param texts [Array<Integer>] The list of texts to update
+    # @param metadatas [Array<Hash>] Optional list of metadata hashes to update alongside each text. Must be the same length as texts when provided.
     # @return [Elasticsearch::Response] from the Elasticsearch server
-    def update_texts(texts: [], ids: [])
+    def update_texts(texts: [], ids: [], metadatas: [])
+      metadatas = Array(metadatas)
+
+      if !metadatas.empty? && (metadatas.length != texts.length)
+        raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
+      end
+
       body = texts.map.with_index do |text, i|
+        document_body = {
+          input: text,
+          input_vector: llm.embed(text: text).embedding
+        }
+        document_body[:metadata] = metadatas[i] if metadatas[i]
+
         [
           {index: {_index: index_name, _id: ids[i]}},
-          {input: text, input_vector: llm.embed(text: text).embedding}
+          document_body
         ]
       end.flatten
 
@@ -118,7 +144,11 @@ def default_schema
             input: {
               type: "text"
             },
-            input_vector: vector_settings
+            input_vector: vector_settings,
+            metadata: {
+              type: "object",
+              dynamic: true
+            }
           }
         }
       }
@@ -163,34 +193,45 @@ def ask(question:, k: 4, &block)
     # @param text [String] The text to search for
     # @param k [Integer] The number of results to return
     # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
+    # @param filter [Hash] Elasticsearch filter clause (Optional); when non-empty, the query is wrapped as {bool: {must: query, filter: filter}}
     # @return [Elasticsearch::Response] The response from the server
-    def similarity_search(text: "", k: 10, query: {})
+    def similarity_search(text: "", k: 10, query: {}, filter: {})
       if text.empty? && query.empty?
         raise "Either text or query should pass as an argument"
       end
 
+      # Build base similarity query (script_score by default)
       if query.empty?
         query_vector = llm.embed(text: text).embedding
-
         query = default_query(query_vector)
       end
 
-      es_client.search(body: {query: query, size: k}).body
+      # Apply filter if provided
+      final_query = if filter.empty?
+        query
+      else
+        {bool: {must: query, filter: filter}}
+      end
+
+      es_client.search(body: {query: final_query, size: k}).body
     end
 
     # Search for similar texts by embedding
     # @param embedding [Array<Float>] The embedding to search for
     # @param k [Integer] The number of results to return
     # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
+    # @param filter [Hash] Elasticsearch filter clause (Optional); when non-empty, the query is wrapped as {bool: {must: query, filter: filter}}
     # @return [Elasticsearch::Response] The response from the server
-    def similarity_search_by_vector(embedding: [], k: 10, query: {})
+    def similarity_search_by_vector(embedding: [], k: 10, query: {}, filter: {})
       if embedding.empty? && query.empty?
         raise "Either embedding or query should pass as an argument"
       end
 
       query = default_query(embedding) if query.empty?
 
-      es_client.search(body: {query: query, size: k}).body
+      final_query = filter.empty? ? query : {bool: {must: query, filter: filter}}
+
+      es_client.search(body: {query: final_query, size: k}).body
     end
   end
 end
diff --git a/spec/langchain/vectorsearch/elasticsearch_spec.rb b/spec/langchain/vectorsearch/elasticsearch_spec.rb
@@ -17,16 +17,23 @@
   end
 
   describe "#add_texts" do
-    it "indexes data into elasticsearch" do
+    it "indexes data into elasticsearch with metadata" do
+      metadata = {lang: "en"}
       es_body = [
         {index: {_index: "langchain"}},
-        {input: "simple text", input_vector: [0.1, 0.2, 0.3]}
+        {input: "simple text", input_vector: [0.1, 0.2, 0.3], metadata: metadata}
       ]
 
       allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
       expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
 
-      subject.add_texts(texts: ["simple text"])
+      subject.add_texts(texts: ["simple text"], metadatas: [metadata])
+    end
+
+    it "raises error when metadatas length mismatch" do
+      expect {
+        subject.add_texts(texts: ["t1", "t2"], metadatas: [{foo: 1}])
+      }.to raise_error(ArgumentError)
     end
   end
 
@@ -38,16 +45,17 @@
         .and_return([0.1, 0.2, 0.3, 0.4])
     end
 
-    it "updates respective document" do
+    it "updates respective document with metadata" do
+      metadata = {version: 2}
       es_body = [
         {index: {_index: "langchain", _id: 1}},
-        {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4]}
+        {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4], metadata: metadata}
       ]
 
       allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
       expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
 
-      subject.update_texts(texts: ["updated text"], ids: [1])
+      subject.update_texts(texts: ["updated text"], ids: [1], metadatas: [metadata])
     end
   end
 
@@ -100,7 +108,8 @@
             input: {
               type: "text"
             },
-            input_vector: {type: "dense_vector", dims: 384}
+            input_vector: {type: "dense_vector", dims: 384},
+            metadata: {type: "object", dynamic: true}
           }
         }
       }
@@ -117,7 +126,8 @@
             input: {
               type: "text"
             },
-            input_vector: {type: "dense_vector", dims: 500}
+            input_vector: {type: "dense_vector", dims: 500},
+            metadata: {type: "object", dynamic: true}
           }
         }
       }
@@ -145,7 +155,8 @@
   end
 
   describe "#similarity_search" do
-    it "should return similar documents" do
+    it "should return similar documents with metadata filter" do
+      filter = {term: {"metadata.lang": "en"}}
       response = [
         {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
         {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -154,13 +165,13 @@
 
       allow(es_response).to receive(:body).and_return(response)
       allow_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}).and_return(es_response)
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}).and_return(es_response)
 
       expect_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5})
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5})
       expect(es_response).to receive(:body)
 
-      expect(subject.similarity_search(text: "simple", k: 5)).to eq(response)
+      expect(subject.similarity_search(text: "simple", k: 5, filter: filter)).to eq(response)
     end
 
     it "able to search with custom query" do
@@ -197,7 +208,8 @@
   end
 
   describe "#similarity_search_by_vector" do
-    it "should return similar documents" do
+    it "should return similar documents with metadata filter" do
+      filter = {term: {"metadata.lang": "en"}}
       response = [
         {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
         {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -206,13 +218,13 @@
 
       allow(es_response).to receive(:body).and_return(response)
       allow_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}).and_return(es_response)
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}).and_return(es_response)
 
       expect_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5})
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5})
       expect(es_response).to receive(:body)
 
-      expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5)).to eq(response)
+      expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5, filter: filter)).to eq(response)
     end
 
     it "able to search with custom query" do