@@ -112,8 +112,8 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
112112 doc_id_field : Annotated [Optional [str ], "Field containing document ID" ] = None ,
113113 metadata_field_list : Annotated [Optional [str ], "Optional metadata field list" ] = None ,
114114 overwrite : Annotated [bool , "If true, overwrite existing table" ]= False ,
115- upsert : Annotated [bool , "If true (default), update existing documents with same ID. If false, raise error on duplicate ID " ]= True ,
116- vector_dim : Annotated [Optional [ int ] , "Expected dimension of vectors" ]= None
115+ vector_dim : Annotated [Optional [ int ] , "Expected dimension of vectors " ]= None ,
116+ batch_size : Annotated [int , "Batch size for adding vectors" ]= 1 ,
117117 ):
118118 """Add vectors and documents to LanceDB using LanceDBDocumentStore.
119119
@@ -151,16 +151,18 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
151151 # If there's any issue with dropping, continue
152152 logger .warning (f"Could not drop table '{ table_name } ' for overwrite. Continuing without dropping." )
153153
154+ cached_docs = []
154155 for item in items :
155156 # Extract vector
156157 vector = extract_property (item , vector_field , fail_on_missing = True )
157158 if not isinstance (vector , (list , tuple , np .ndarray )):
158159 raise ValueError (f"Vector field '{ vector_field } ' must be a list, tuple, or numpy array" )
159160
160- # Extract document ID if specified
161- doc_id = None
162161 if doc_id_field :
163162 doc_id = extract_property (item , doc_id_field , fail_on_missing = False )
163+ else :
164+ doc_id = str (uuid .uuid4 ())
165+ assign_property (item , "_doc_id" , doc_id )
164166
165167 # Extract metadata
166168 if metadata_field_list :
@@ -171,21 +173,20 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
171173 metadata = {k : v for k , v in item .items () if k != vector_field }
172174 else :
173175 raise ValueError ("If 'metadata_field_list' is not provided, item must be a dict to extract fields." )
174-
176+
175177 # Convert metadata to Document format (string keys and values)
176178 document = {str (k ): str (v ) for k , v in metadata .items ()}
177179
178- # Add to document store (upsert by default, or strict add if upsert=False)
179- if upsert :
180- added_doc_id = doc_store .upsert_vector (vector , document , doc_id )
181- else :
182- added_doc_id = doc_store .add_vector (vector , document , doc_id )
183-
184- # Add the document ID to the item for reference (only if item is a dict)
185- if isinstance (item , dict ):
186- item ["_doc_id" ] = added_doc_id
180+ cached_docs .append ((vector , document , doc_id ))
181+ if len (cached_docs ) >= batch_size :
182+ doc_store .add_vectors (cached_docs )
183+ cached_docs = []
187184
188185 yield item
186+
187+ if len (cached_docs ) > 0 :
188+ doc_store .add_vectors (cached_docs )
189+ cached_docs = []
189190
190191
191192class LanceDBDocumentStore (DocumentStore , VectorAddable , VectorSearchable ):
@@ -224,6 +225,7 @@ def _get_db(self):
224225
225226 def _get_table (self , schema_if_missing = None ):
226227 """Get or create table with provided schema."""
228+ created_and_updated = False
227229 if self ._table is None :
228230 db = self ._get_db ()
229231 try :
@@ -232,9 +234,10 @@ def _get_table(self, schema_if_missing=None):
232234 if schema_if_missing is not None :
233235 # Create table with provided schema data
234236 self ._table = db .create_table (self .table_name , schema_if_missing )
237+ created_and_updated = True
235238 else :
236239 raise ValueError (f"Table '{ self .table_name } ' not found and no schema provided. Please provide a LanceDB compatible schema." )
237- return self ._table
240+ return self ._table , created_and_updated
238241
239242 def _validate_vector (self , vector : VectorLike ) -> List [float ]:
240243 """Validate vector and return as list of floats."""
@@ -269,7 +272,7 @@ def _deserialize_document(self, document_str: str) -> Document:
269272 def get_document (self , doc_id : DocID ) -> Optional [Document ]:
270273 """Retrieve a document by ID."""
271274 try :
272- table = self ._get_table ()
275+ table , created_and_updated = self ._get_table ()
273276 results = table .search ().where (f"id = '{ doc_id } '" ).to_list ()
274277 if results :
275278 return self ._deserialize_document (results [0 ]["document" ])
@@ -279,91 +282,55 @@ def get_document(self, doc_id: DocID) -> Optional[Document]:
279282
280283 # VectorAddable protocol implementation
281284 def add_vector (self , vector : VectorLike , document : Document , doc_id : Optional [DocID ] = None ) -> DocID :
282- """Add a vector to the store."""
283- vec_list = self ._validate_vector (vector )
284-
285- if doc_id is None :
286- doc_id = str (uuid .uuid4 ())
287-
288- # Check if document already exists
289- existing = self .get_document (doc_id )
290- if existing is not None :
291- raise ValueError (f"Document with ID { doc_id } already exists" )
292-
293- # Prepare schema data for table creation if needed
294- schema_data = [{
295- "id" : doc_id ,
296- "vector" : vec_list ,
297- "document" : self ._serialize_document (document )
298- }]
299-
300- table = self ._get_table (schema_if_missing = schema_data )
301-
302- # If table was just created, data is already there, otherwise add it
303- try :
304- # Check if this is a newly created table by seeing if our data is already there
305- existing_check = table .search ().where (f"id = '{ doc_id } '" ).to_list ()
306- if not existing_check :
307- table .add (schema_data )
308- except Exception :
309- # If there's any issue with the check, just try to add the data
310- table .add (schema_data )
311-
312- return doc_id
313-
314- def upsert_vector (self , vector : VectorLike , document : Document , doc_id : Optional [DocID ] = None ) -> DocID :
315- """Add or update a vector in the store (upsert behavior).
316-
317- If a document with the given doc_id exists, it will be updated.
318- If it doesn't exist, a new document will be created.
285+ return self .add_vectors ([(vector , document , doc_id )])[0 ]
319286
287+ def add_vectors (self , documents : List [tuple ]) -> List [DocID ]:
288+ """Add multiple vectors to the store in a batch operation.
289+
320290 Args:
321- vector: The vector to store
322- document: The document metadata to associate with the vector
323- doc_id: Optional document ID. If not provided, a UUID will be generated.
324-
291+ documents: List of tuples in format (vector, document, doc_id) where:
292+ - vector: VectorLike - the vector data
293+ - document: Document - the document metadata
294+ - doc_id: Optional[DocID] - document ID (generated if None)
295+
325296 Returns:
326- The document ID of the added/updated document.
297+ List of document IDs for the added vectors
327298 """
328- vec_list = self ._validate_vector (vector )
329-
330- if doc_id is None :
331- doc_id = str (uuid .uuid4 ())
332-
333- # Check if document already exists
334- existing = self .get_document (doc_id )
335- if existing is not None :
336- # Delete existing document first
337- self .delete_document (doc_id )
338-
339- # Prepare schema data for table creation if needed
340- schema_data = [{
341- "id" : doc_id ,
342- "vector" : vec_list ,
343- "document" : self ._serialize_document (document )
344- }]
345-
346- table = self ._get_table (schema_if_missing = schema_data )
347-
348- # If table was just created, data is already there, otherwise add it
349- try :
350- # Check if this is a newly created table by seeing if our data is already there
351- existing_check = table .search ().where (f"id = '{ doc_id } '" ).to_list ()
352- if not existing_check :
353- table .add (schema_data )
354- except Exception :
355- # If there's any issue with the check, just try to add the data
356- table .add (schema_data )
357-
358- return doc_id
299+ if not documents :
300+ return []
301+
302+ doc_ids = []
303+ schema_data = []
304+
305+ for vector , document , doc_id in documents :
306+ vec_list = self ._validate_vector (vector )
307+
308+ if doc_id is None :
309+ doc_id = str (uuid .uuid4 ())
310+
311+ doc_ids .append (doc_id )
312+
313+ schema_data .append ({
314+ "id" : doc_id ,
315+ "vector" : vec_list ,
316+ "document" : self ._serialize_document (document )
317+ })
318+
319+ table , created_and_updated = self ._get_table (schema_if_missing = schema_data )
320+
321+ if not created_and_updated :
322+ # Table exists, use merge_insert for upsert behavior
323+ table .merge_insert ('id' ).when_matched_update_all ().when_not_matched_insert_all ().execute (schema_data )
324+
325+ return doc_ids
359326
360327 # VectorSearchable protocol implementation
361328 def vector_search (self , vector : VectorLike , limit : int = 10 ) -> List [SearchResult ]:
362329 """Search for vectors similar to the given vector."""
363330 vec_list = self ._validate_vector (vector )
364331
365332 try :
366- table = self ._get_table ()
333+ table , created_and_updated = self ._get_table ()
367334 results = table .search (vec_list ).limit (limit ).to_list ()
368335
369336 search_results = []
@@ -387,7 +354,7 @@ def vector_search(self, vector: VectorLike, limit: int = 10) -> List[SearchResul
387354 def delete_document (self , doc_id : DocID ) -> bool :
388355 """Delete a document by ID."""
389356 try :
390- table = self ._get_table ()
357+ table , created_and_updated = self ._get_table ()
391358 table = table .delete (f"id = '{ doc_id } '" )
392359 return True
393360 except Exception :
@@ -409,7 +376,7 @@ def update_document(self, doc_id: DocID, document: Document, vector: Optional[Ve
409376 vec_list = self ._validate_vector (vector )
410377 else :
411378 # Get the old vector from the table before deletion
412- table = self ._get_table ()
379+ table , created_and_updated = self ._get_table ()
413380 results = table .search ().where (f"id = '{ doc_id } '" ).to_list ()
414381 if not results :
415382 return False
@@ -424,7 +391,7 @@ def update_document(self, doc_id: DocID, document: Document, vector: Optional[Ve
424391 def count (self ) -> int :
425392 """Return the number of documents in the store."""
426393 try :
427- table = self ._get_table ()
394+ table , created_and_updated = self ._get_table ()
428395 # Use count_rows method if available, otherwise fallback to counting all results
429396 if hasattr (table , 'count_rows' ):
430397 return table .count_rows ()
@@ -436,7 +403,7 @@ def count(self) -> int:
436403 def list_ids (self ) -> List [DocID ]:
437404 """Return a list of all document IDs."""
438405 try :
439- table = self ._get_table ()
406+ table , created_and_updated = self ._get_table ()
440407 results = table .search ().select (["id" ]).to_list ()
441408 return [result ["id" ] for result in results ]
442409 except Exception :
0 commit comments