-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquickstart.py
65 lines (56 loc) · 2.58 KB
/
quickstart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import cocoindex
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
# Add a data source to read files from a directory
data_scope["documents"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="markdown_files"))
# Add a collector for data to be exported to the vector index
doc_embeddings = data_scope.add_collector()
# Transform data of each document
with data_scope["documents"].row() as doc:
# Split the document into chunks, put into `chunks` field
doc["chunks"] = doc["content"].transform(
cocoindex.functions.SplitRecursively(),
language="javascript", chunk_size=300, chunk_overlap=100)
# Transform data of each chunk
with doc["chunks"].row() as chunk:
# Embed the chunk, put into `embedding` field
chunk["embedding"] = chunk["text"].transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"))
# Collect the chunk into the collector.
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
text=chunk["text"], embedding=chunk["embedding"])
# Export collected data to a vector index.
doc_embeddings.export(
"doc_embeddings",
cocoindex.storages.Postgres(),
primary_key_fields=["filename", "location"],
vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
name="SemanticsSearch",
flow=text_embedding_flow,
target_name="doc_embeddings",
query_transform_flow=lambda text: text.transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2")),
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
@cocoindex.main_fn()
def _main():
# Run queries to demonstrate the query capabilities.
while True:
try:
query = input("Enter search query (or Enter to quit): ")
if query == '':
break
results, _ = query_handler.search(query, 10)
print("\nSearch results:")
for result in results:
print(f"[{result.score:.3f}] {result.data['filename']}")
print(f" {result.data['text']}")
print("---")
print()
except KeyboardInterrupt:
break
if __name__ == "__main__":
_main()