
Commit 358dbd6

Use parameter for retriever (#159)
* fix

Signed-off-by: Liangyx2 <[email protected]>

for more information, see https://pre-commit.ci

---------

Signed-off-by: Liangyx2 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 530d047 commit 358dbd6

File tree

11 files changed: +156 -16 lines changed

comps/cores/proto/docarray.py

Lines changed: 8 additions & 0 deletions
@@ -20,11 +20,19 @@ class Base64ByteStrDoc(BaseDoc):
 
 class DocPath(BaseDoc):
     path: str
+    chunk_size: int = 1500
+    chunk_overlap: int = 100
 
 
 class EmbedDoc768(BaseDoc):
     text: str
     embedding: conlist(float, min_length=768, max_length=768)
+    search_type: str = "similarity"
+    k: int = 4
+    distance_threshold: Optional[float] = None
+    fetch_k: int = 20
+    lambda_mult: float = 0.5
+    score_threshold: float = 0.2
 
 
 class Audio2TextDoc(AudioDoc):
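
For reference, the two updated schemas read as follows after this change. This is a hedged sketch that annotates each new field; it assumes the BaseDoc, conlist, and Optional imports already present in comps/cores/proto/docarray.py.

```python
# Sketch of the updated models (assumes the BaseDoc/conlist/Optional imports
# already used in comps/cores/proto/docarray.py).
from typing import Optional

from docarray import BaseDoc
from pydantic import conlist


class DocPath(BaseDoc):
    path: str
    chunk_size: int = 1500    # target size of each text chunk
    chunk_overlap: int = 100  # characters shared between consecutive chunks


class EmbedDoc768(BaseDoc):
    text: str
    embedding: conlist(float, min_length=768, max_length=768)
    search_type: str = "similarity"  # also: similarity_distance_threshold, similarity_score_threshold, mmr
    k: int = 4                       # number of documents to return
    distance_threshold: Optional[float] = None  # required for similarity_distance_threshold
    fetch_k: int = 20                # candidate pool size before MMR re-ranking
    lambda_mult: float = 0.5         # MMR relevance/diversity trade-off
    score_threshold: float = 0.2     # cutoff for similarity_score_threshold
```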

comps/dataprep/milvus/README.md

Lines changed: 6 additions & 0 deletions
@@ -53,3 +53,9 @@ Once document preparation microservice for Qdrant is started, user can use below
 ```bash
 curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name"}' http://localhost:6010/v1/dataprep
 ```
+
+You can specify chunk_size and chunk_overlap with the following command.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","chunk_size":1500,"chunk_overlap":100}' http://localhost:6010/v1/dataprep
+```

comps/dataprep/milvus/prepare_doc_milvus.py

Lines changed: 6 additions & 4 deletions
@@ -31,11 +31,13 @@
 # @opea_telemetry
 def ingest_documents(doc_path: DocPath):
     """Ingest document to Milvus."""
-    doc_path = doc_path.path
-    print(f"Parsing document {doc_path}.")
+    path = doc_path.path
+    print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
-    content = document_loader(doc_path)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_overlap, add_start_index=True
+    )
+    content = document_loader(path)
     chunks = text_splitter.split_text(content)
 
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

comps/dataprep/qdrant/README.md

Lines changed: 6 additions & 0 deletions
@@ -69,3 +69,9 @@ Once document preparation microservice for Qdrant is started, user can use below
 ```bash
 curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
 ```
+
+You can specify chunk_size and chunk_overlap with the following command.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep
+```

comps/dataprep/qdrant/prepare_doc_qdrant.py

Lines changed: 6 additions & 4 deletions
@@ -25,11 +25,13 @@
 @opea_telemetry
 def ingest_documents(doc_path: DocPath):
     """Ingest document to Qdrant."""
-    doc_path = doc_path.path
-    print(f"Parsing document {doc_path}.")
+    path = doc_path.path
+    print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
-    content = document_loader(doc_path)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_overlap, add_start_index=True
+    )
+    content = document_loader(path)
     chunks = text_splitter.split_text(content)
 
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

comps/dataprep/redis/README.md

Lines changed: 11 additions & 0 deletions
@@ -140,6 +140,17 @@ curl -X POST \
   http://localhost:6007/v1/dataprep
 ```
 
+You can specify chunk_size and chunk_overlap with the following command.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \
+  -F "chunk_size=1500" \
+  -F "chunk_overlap=100" \
+  http://localhost:6007/v1/dataprep
+```
+
 - Multiple file upload
 
 ```bash
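
If you prefer Python over curl, the multipart request shown in the added README snippet can also be sent with the requests package; the snippet below is a hypothetical equivalent (the file path is a placeholder).

```python
# Hypothetical Python equivalent of the multipart curl example above (assumes `requests`).
import requests

url = "http://localhost:6007/v1/dataprep"
with open("/path/to/your_document.pdf", "rb") as f:          # placeholder path
    resp = requests.post(
        url,
        files={"files": f},                                   # maps to the `files` form field
        data={"chunk_size": 1500, "chunk_overlap": 100},      # splitter parameters
    )
print(resp.status_code, resp.text)
```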

comps/dataprep/redis/langchain/prepare_doc_redis.py

Lines changed: 11 additions & 6 deletions
@@ -33,11 +33,13 @@ async def save_file_to_local_disk(save_path: str, file):
 
 def ingest_data_to_redis(doc_path: DocPath):
     """Ingest document to Redis."""
-    doc_path = doc_path.path
-    print(f"Parsing document {doc_path}.")
+    path = doc_path.path
+    print(f"Parsing document {path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
-    content = document_loader(doc_path)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_overlap, add_start_index=True
+    )
+    content = document_loader(path)
     chunks = text_splitter.split_text(content)
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
 

@@ -99,7 +101,10 @@ def ingest_link_to_redis(link_list: List[str]):
 @register_microservice(name="opea_service@prepare_doc_redis", endpoint="/v1/dataprep", host="0.0.0.0", port=6007)
 @traceable(run_type="tool")
 async def ingest_documents(
-    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None)
+    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
+    link_list: Optional[str] = Form(None),
+    chunk_size: int = Form(1500),
+    chunk_overlap: int = Form(100),
 ):
     print(f"files:{files}")
     print(f"link_list:{link_list}")

@@ -115,7 +120,7 @@ async def ingest_documents(
         for file in files:
             save_path = upload_folder + file.filename
             await save_file_to_local_disk(save_path, file)
-            ingest_data_to_redis(DocPath(path=save_path))
+            ingest_data_to_redis(DocPath(path=save_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap))
             print(f"Successfully saved file {save_path}")
         return {"status": 200, "message": "Data preparation succeeded"}
 

comps/retrievers/langchain/milvus/README.md

Lines changed: 34 additions & 0 deletions
@@ -66,3 +66,37 @@ curl http://${your_ip}:7000/v1/retrieval \
   -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
   -H 'Content-Type: application/json'
 ```
+
+You can set the parameters for the retriever.
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \
+  -H 'Content-Type: application/json'
+```
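
The same requests can be issued from Python; the snippet below is a hypothetical equivalent of the curl examples above, using the requests package and a random embedding as a stand-in for a real one.

```python
# Hypothetical Python equivalent of the curl examples above (assumes `requests`).
import random

import requests

payload = {
    "text": "What is the revenue of Nike in 2023?",
    "embedding": [random.uniform(-1, 1) for _ in range(768)],  # stand-in for a real embedding
    "search_type": "mmr",  # or: similarity, similarity_distance_threshold, similarity_score_threshold
    "k": 4,
    "fetch_k": 20,
    "lambda_mult": 0.5,
}
resp = requests.post("http://localhost:7000/v1/retrieval", json=payload)
print(resp.json())
```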

comps/retrievers/langchain/milvus/retriever_milvus.py

Lines changed: 17 additions & 1 deletion
@@ -38,7 +38,23 @@ def retrieve(input: EmbedDoc768) -> SearchedDoc:
         collection_name=COLLECTION_NAME,
     )
     start = time.time()
-    search_res = vector_db.similarity_search_by_vector(embedding=input.embedding)
+    if input.search_type == "similarity":
+        search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k)
+    elif input.search_type == "similarity_distance_threshold":
+        if input.distance_threshold is None:
+            raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever")
+        search_res = vector_db.similarity_search_by_vector(
+            embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold
+        )
+    elif input.search_type == "similarity_score_threshold":
+        docs_and_similarities = vector_db.similarity_search_with_relevance_scores(
+            query=input.text, k=input.k, score_threshold=input.score_threshold
+        )
+        search_res = [doc for doc, _ in docs_and_similarities]
+    elif input.search_type == "mmr":
+        search_res = vector_db.max_marginal_relevance_search(
+            query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult
+        )
     searched_docs = []
     for r in search_res:
         searched_docs.append(TextDoc(text=r.page_content))
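
Note that the new branch chain has no final else, so an unrecognized search_type would leave search_res undefined and fail later with a NameError. The sketch below is not part of this commit; it shows the same dispatch written as a standalone helper with an explicit fallback, where vector_db is any LangChain vector store and input is an EmbedDoc768.

```python
# Sketch (not in this commit): the commit's dispatch as a helper with an explicit
# fallback for unknown search types. `vector_db` is a LangChain vector store,
# `input` an EmbedDoc768 from comps/cores/proto/docarray.py.
def search(vector_db, input):
    if input.search_type == "similarity":
        return vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k)
    if input.search_type == "similarity_distance_threshold":
        if input.distance_threshold is None:
            raise ValueError("distance_threshold must be provided for similarity_distance_threshold retriever")
        return vector_db.similarity_search_by_vector(
            embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold
        )
    if input.search_type == "similarity_score_threshold":
        docs_and_scores = vector_db.similarity_search_with_relevance_scores(
            query=input.text, k=input.k, score_threshold=input.score_threshold
        )
        return [doc for doc, _ in docs_and_scores]
    if input.search_type == "mmr":
        return vector_db.max_marginal_relevance_search(
            query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult
        )
    raise ValueError(f"Unsupported search_type: {input.search_type}")
```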

comps/retrievers/langchain/redis/README.md

Lines changed: 34 additions & 0 deletions
@@ -117,3 +117,37 @@ curl http://${your_ip}:7000/v1/retrieval \
   -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
   -H 'Content-Type: application/json'
 ```
+
+You can set the parameters for the retriever.
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \
+  -H 'Content-Type: application/json'
+```
