Skip to content

Commit 697bcba

Browse files
Included multi-value JSON benchmarks. Extended the en-wiki benchmarks with an option to generate term-based queries (#78)
1 parent 6579afa commit 697bcba

File tree

5 files changed

+813
-8
lines changed

5 files changed

+813
-8
lines changed

scripts/datagen_redisearch/enwiki_abstract/ftsb_generate_enwiki_abstract.py

Lines changed: 150 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,18 @@
1616
from tqdm import tqdm
1717

1818
# package local imports
19+
1920
sys.path.append(os.getcwd() + "/..")
2021
field_tokenization = ",.<>{}[]\"':;!@#$%^&*()-+=~"
2122

23+
# Names of the query types this generator can emit. The benchmark-command
# loop compares the sampled query choice against these values and passes the
# matching one as the query-name field of the generated FT.SEARCH row.
# Single-term search using the first query word as-is.
SIMPLE_WORD_QUERY = "simple-1word-query"
24+
# Two query words joined with a space ("w1 w2").
SIMPLE_2WORD_UNION_QUERY = "2word-union-query"
25+
# Two query words joined with a pipe ("w1|w2").
SIMPLE_2WORD_INT_QUERY = "2word-intersection-query"
26+
# Pattern with a "*" placed inside a slice of the first query word.
WILDCARD_QUERY = "wildcard"
27+
# Pattern of the form "*chars".
SUFFIX_QUERY = "suffix"
28+
# Pattern of the form "*chars*".
CONTAINS_QUERY = "contains"
29+
# Pattern of the form "chars*".
PREFIX_QUERY = "prefix"
30+
2231
from common_datagen import (
2332
download_url,
2433
generate_setup_json,
@@ -140,22 +149,64 @@ def generate_benchmark_commands(
140149
doc = docs[random_doc_pos]
141150
words, totalW = getQueryWords(doc, stop_words, 2)
142151
choice = random.choices(query_choices)[0]
152+
if len(words) < 1:
153+
continue
154+
term = words[0]
155+
len_w1 = len(term)
156+
prefix_min = 3
157+
prefix_max = 3
143158
generated_row = None
144-
if choice == "simple-1word-query" and len(words) >= 1:
159+
if choice == SIMPLE_WORD_QUERY and len(words) >= 1:
145160
generated_row = generate_ft_search_row(
146-
indexname, "simple-1word-query", words[0], search_no_content
161+
indexname, SIMPLE_WORD_QUERY, words[0], search_no_content
162+
)
163+
elif choice == WILDCARD_QUERY and len(term) >= prefix_max:
164+
generated_row = generate_wildcard_row(
165+
indexname,
166+
WILDCARD_QUERY,
167+
term,
168+
prefix_min,
169+
prefix_max,
170+
search_no_content,
147171
)
148-
elif choice == "2word-union-query" and len(words) >= 2:
172+
elif choice == PREFIX_QUERY and len(term) >= prefix_max:
173+
generated_row = generate_prefix_row(
174+
indexname,
175+
PREFIX_QUERY,
176+
term,
177+
prefix_min,
178+
prefix_max,
179+
search_no_content,
180+
)
181+
elif choice == SUFFIX_QUERY and len(term) >= prefix_max:
182+
generated_row = generate_suffix_row(
183+
indexname,
184+
SUFFIX_QUERY,
185+
term,
186+
prefix_min,
187+
prefix_max,
188+
search_no_content,
189+
)
190+
elif choice == CONTAINS_QUERY and len(term) >= prefix_max:
191+
generated_row = generate_contains_row(
192+
indexname,
193+
CONTAINS_QUERY,
194+
term,
195+
prefix_min,
196+
prefix_max,
197+
search_no_content,
198+
)
199+
elif choice == SIMPLE_2WORD_UNION_QUERY and len(words) >= 2:
149200
generated_row = generate_ft_search_row(
150201
indexname,
151-
"2word-union-query",
202+
SIMPLE_2WORD_UNION_QUERY,
152203
"{} {}".format(words[0], words[1]),
153204
search_no_content,
154205
)
155-
elif choice == "2word-intersection-query" and len(words) >= 2:
206+
elif choice == SIMPLE_2WORD_INT_QUERY and len(words) >= 2:
156207
generated_row = generate_ft_search_row(
157208
indexname,
158-
"2word-intersection-query",
209+
SIMPLE_2WORD_INT_QUERY,
159210
"{}|{}".format(words[0], words[1]),
160211
search_no_content,
161212
)
@@ -169,6 +220,76 @@ def generate_benchmark_commands(
169220
all_csvfile.close()
170221

171222

223+
def generate_wildcard_row(
    index, query_name, query, prefix_min, prefix_max, search_no_content
):
    """Build a benchmark row for an FT.SEARCH query with an interior ``*``.

    The search pattern is ``query[:prefix_min]``, followed by ``*``, followed
    by ``query[prefix_min + 1:window_end]``. The character at position
    ``prefix_min`` is therefore replaced by the ``*``.

    Args:
        index: Index name placed after ``FT.SEARCH``.
        query_name: Label stored as the second element of the row.
        query: Word the pattern is sliced from.
        prefix_min: Number of leading characters kept before the ``*``.
        prefix_max: Requested end of the slice taken after the ``*``; it is
            raised to ``prefix_min + 2`` when it is not already larger.
        search_no_content: When truthy, ``NOCONTENT`` is appended to the row.

    Returns:
        list: ``["READ", query_name, 1, "FT.SEARCH", index, pattern]`` with an
        optional trailing ``"NOCONTENT"``.
    """
    # Make sure the slice after the "*" can hold at least one character.
    window_end = prefix_max
    if prefix_max - 2 <= prefix_min:
        window_end = prefix_min + 2

    head = query[:prefix_min]
    tail = query[prefix_min + 1 : window_end]
    pattern = head + "*" + tail

    row = [
        "READ",
        query_name,
        1,
        "FT.SEARCH",
        "{}".format(index),
        "{}".format(pattern),
    ]
    if search_no_content:
        row.append("NOCONTENT")
    return row
240+
241+
242+
def generate_prefix_row(
    index, query_name, query, prefix_min, prefix_max, search_no_content
):
    """Build a benchmark row for an FT.SEARCH prefix pattern (``chars*``).

    Args:
        index: Index name placed after ``FT.SEARCH``.
        query_name: Label stored as the second element of the row.
        query: Word whose first ``prefix_min`` characters form the pattern.
        prefix_min: Number of leading characters of ``query`` to keep.
        prefix_max: Not used by this function; accepted so the signature
            matches the other pattern-row generators.
        search_no_content: When truthy, ``NOCONTENT`` is appended to the row.

    Returns:
        list: ``["READ", query_name, 1, "FT.SEARCH", index, pattern]`` with an
        optional trailing ``"NOCONTENT"``.
    """
    pattern = "{}*".format(query[:prefix_min])
    row = ["READ", query_name, 1, "FT.SEARCH"]
    row.append("{}".format(index))
    row.append("{}".format(pattern))
    if search_no_content:
        row.append("NOCONTENT")
    return row
257+
258+
259+
def generate_suffix_row(
    index, query_name, query, prefix_min, prefix_max, search_no_content
):
    """Build a benchmark row for an FT.SEARCH suffix pattern (``*chars``).

    The pattern uses the LAST ``prefix_min`` characters of ``query``, so the
    generated suffix pattern can match the word it was derived from. Using
    the leading characters would give a pattern such as ``*hel`` for
    ``hello``, which the source word does not end with.

    Args:
        index: Index name placed after ``FT.SEARCH``.
        query_name: Label stored as the second element of the row.
        query: Word whose trailing characters form the pattern.
        prefix_min: Number of trailing characters of ``query`` to keep. A
            value of 0 (or less) yields the bare pattern ``*``.
        prefix_max: Not used by this function; accepted so the signature
            matches the other pattern-row generators.
        search_no_content: When truthy, ``NOCONTENT`` is appended to the row.

    Returns:
        list: ``["READ", query_name, 1, "FT.SEARCH", index, pattern]`` with an
        optional trailing ``"NOCONTENT"``.
    """
    # query[-0:] would return the whole word, so handle a non-positive
    # length explicitly and keep the pattern at just "*".
    tail = query[-prefix_min:] if prefix_min > 0 else ""
    term = "*" + tail
    cmd = [
        "READ",
        query_name,
        1,
        "FT.SEARCH",
        "{index}".format(index=index),
        "{query}".format(query=term),
    ]
    if search_no_content:
        cmd.append("NOCONTENT")
    return cmd
274+
275+
276+
def generate_contains_row(
    index, query_name, query, prefix_min, prefix_max, search_no_content
):
    """Build a benchmark row for an FT.SEARCH contains pattern (``*chars*``).

    Args:
        index: Index name placed after ``FT.SEARCH``.
        query_name: Label stored as the second element of the row.
        query: Word whose first ``prefix_min`` characters are wrapped in
            ``*`` on both sides.
        prefix_min: Number of leading characters of ``query`` to keep.
        prefix_max: Not used by this function; accepted so the signature
            matches the other pattern-row generators.
        search_no_content: When truthy, ``NOCONTENT`` is appended to the row.

    Returns:
        list: ``["READ", query_name, 1, "FT.SEARCH", index, pattern]`` with an
        optional trailing ``"NOCONTENT"``.
    """
    fragment = query[:prefix_min]
    pattern = "*{}*".format(fragment)
    row = ["READ", query_name, 1, "FT.SEARCH"]
    row.extend(["{}".format(index), "{}".format(pattern)])
    if search_no_content:
        row.append("NOCONTENT")
    return row
291+
292+
172293
def generate_ft_search_row(index, query_name, query, search_no_content):
173294
cmd = [
174295
"READ",
@@ -236,8 +357,24 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
236357
parser.add_argument(
237358
"--query-choices",
238359
type=str,
239-
default="simple-1word-query,2word-union-query,2word-intersection-query",
240-
help="comma separated list of queries to produce. one of: simple-1word-query,2word-union-query,2word-intersection-query",
360+
default=",".join(
361+
[
362+
SIMPLE_WORD_QUERY,
363+
SIMPLE_2WORD_UNION_QUERY,
364+
SIMPLE_2WORD_INT_QUERY,
365+
]
366+
),
367+
help="comma separated list of queries to produce. one of: {}".format(
368+
[
369+
SIMPLE_WORD_QUERY,
370+
SIMPLE_2WORD_UNION_QUERY,
371+
SIMPLE_2WORD_INT_QUERY,
372+
PREFIX_QUERY,
373+
SUFFIX_QUERY,
374+
CONTAINS_QUERY,
375+
WILDCARD_QUERY,
376+
]
377+
),
241378
)
242379
parser.add_argument(
243380
"--upload-artifacts-s3-uncompressed",
@@ -428,6 +565,7 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
428565
tree = ET.iterparse(decompressed_fname)
429566
print("Reading {}\n".format(decompressed_fname))
430567
progress = tqdm(unit="docs")
568+
total_produced = 0
431569
for event, elem in tree:
432570
if elem.tag == "doc":
433571
doc = {}
@@ -437,6 +575,10 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
437575
doc["abstract"] = elem.findtext("abstract")
438576
docs.append(doc)
439577
progress.update()
578+
total_produced = total_produced + 1
579+
if total_produced >= doc_limit and doc_limit > 0:
580+
print("stopping doc read process")
581+
break
440582
elem.clear() # won't need the children any more
441583
progress.close()
442584

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import sys
2+
import os
3+
4+
sys.path.append(os.getcwd() + "/..")

0 commit comments

Comments
 (0)