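Add an `optimize_on_batch` option to optimize the LanceDB table after each batch. The table is also optimized after the final batch of remaining items is added (note: this final optimize runs only when leftover items remain to be flushed).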

travis-bauer · travis-bauer · commit d9113293e016 · 2025-12-17T06:07:43.000-07:00
diff --git a/src/talkpipe/pipelines/vector_databases.py b/src/talkpipe/pipelines/vector_databases.py
@@ -27,6 +27,7 @@ def __init__(self,
                  overwrite: Annotated[bool, "If true, overwrite existing table"] = False,
                  fail_on_error: Annotated[bool, "If true, fail on error instead of logging"] = True,
                  batch_size: Annotated[int, "Batch size for committing in the vector database"] = 100,
+                 optimize_on_batch: Annotated[bool, "If true, optimize the table after each batch.  Otherwise optimize after last batch."]=False,
                  ):
         super().__init__()
         self.embedding_model = embedding_model
@@ -47,7 +48,9 @@ def __init__(self,
                                        table_name=self.table_name,
                                        doc_id_field=self.doc_id_field,
                                        overwrite=self.overwrite,
-                                       batch_size=batch_size)
+                                       batch_size=batch_size,
+                                       optimize_on_batch=optimize_on_batch,
+                                       )
 
     def transform(self, input_iter):
         yield from self.pipeline.transform(input_iter)
diff --git a/src/talkpipe/search/lancedb.py b/src/talkpipe/search/lancedb.py
@@ -114,6 +114,7 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
                    overwrite: Annotated[bool, "If true, overwrite existing table"]=False,
                    vector_dim: Annotated[Optional[int], "Expected dimension of vectors"]=None,
                    batch_size: Annotated[int, "Batch size for adding vectors"]=1,
+                   optimize_on_batch: Annotated[bool, "If true, optimize the table after each batch.  Otherwise optimize after last batch."]=False,
                    ):
     """Add vectors and documents to LanceDB using LanceDBDocumentStore.
 
@@ -181,12 +182,15 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
         if len(cached_docs) >= batch_size:
             doc_store.add_vectors(cached_docs)
             cached_docs = []
+            if optimize_on_batch:
+                doc_store._get_table()[0].optimize()
 
         yield item
         
     if len(cached_docs) > 0:
         doc_store.add_vectors(cached_docs)
         cached_docs = []
+        doc_store._get_table()[0].optimize() 
 
 
 class LanceDBDocumentStore(DocumentStore, VectorAddable, VectorSearchable):