Skip to content

Commit d911329

Browse files
committed
Added the ability to optimize the LanceDB table after each batch. The table is now always optimized after all items have been added.
1 parent 2531159 commit d911329

File tree

2 files changed

+8
-1
lines changed

2 files changed

+8
-1
lines changed

src/talkpipe/pipelines/vector_databases.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__(self,
2727
overwrite: Annotated[bool, "If true, overwrite existing table"] = False,
2828
fail_on_error: Annotated[bool, "If true, fail on error instead of logging"] = True,
2929
batch_size: Annotated[int, "Batch size for committing in the vector database"] = 100,
30+
optimize_on_batch: Annotated[bool, "If true, optimize the table after each batch. Otherwise optimize after last batch."]=False,
3031
):
3132
super().__init__()
3233
self.embedding_model = embedding_model
@@ -47,7 +48,9 @@ def __init__(self,
4748
table_name=self.table_name,
4849
doc_id_field=self.doc_id_field,
4950
overwrite=self.overwrite,
50-
batch_size=batch_size)
51+
batch_size=batch_size,
52+
optimize_on_batch=optimize_on_batch,
53+
)
5154

5255
def transform(self, input_iter):
5356
yield from self.pipeline.transform(input_iter)

src/talkpipe/search/lancedb.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
114114
overwrite: Annotated[bool, "If true, overwrite existing table"]=False,
115115
vector_dim: Annotated[Optional[int], "Expected dimension of vectors"]=None,
116116
batch_size: Annotated[int, "Batch size for adding vectors"]=1,
117+
optimize_on_batch: Annotated[bool, "If true, optimize the table after each batch. Otherwise optimize after last batch."]=False,
117118
):
118119
"""Add vectors and documents to LanceDB using LanceDBDocumentStore.
119120
@@ -181,12 +182,15 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
181182
if len(cached_docs) >= batch_size:
182183
doc_store.add_vectors(cached_docs)
183184
cached_docs = []
185+
if optimize_on_batch:
186+
doc_store._get_table()[0].optimize()
184187

185188
yield item
186189

187190
if len(cached_docs) > 0:
188191
doc_store.add_vectors(cached_docs)
189192
cached_docs = []
193+
doc_store._get_table()[0].optimize()
190194

191195

192196
class LanceDBDocumentStore(DocumentStore, VectorAddable, VectorSearchable):

0 commit comments

Comments
 (0)