 from tqdm import tqdm

 # package local imports
+
 sys.path.append(os.getcwd() + "/..")
 field_tokenization = ",.<>{}[]\"':;!@#$%^&*()-+=~"

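+# query type names, reused by the --query-choices argument and as the query_name
+# label written into each generated command row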
+SIMPLE_WORD_QUERY = "simple-1word-query"
+SIMPLE_2WORD_UNION_QUERY = "2word-union-query"
+SIMPLE_2WORD_INT_QUERY = "2word-intersection-query"
+WILDCARD_QUERY = "wildcard"
+SUFFIX_QUERY = "suffix"
+CONTAINS_QUERY = "contains"
+PREFIX_QUERY = "prefix"
+
 from common_datagen import (
     download_url,
     generate_setup_json,
@@ -140,22 +149,64 @@ def generate_benchmark_commands(
         doc = docs[random_doc_pos]
         words, totalW = getQueryWords(doc, stop_words, 2)
         choice = random.choices(query_choices)[0]
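+        # skip docs that yielded no query words; the first word becomes the term
+        # used for the wildcard/prefix/suffix/contains patterns built below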
+        if len(words) < 1:
+            continue
+        term = words[0]
+        len_w1 = len(term)
+        prefix_min = 3
+        prefix_max = 3
         generated_row = None
-        if choice == "simple-1word-query" and len(words) >= 1:
+        if choice == SIMPLE_WORD_QUERY and len(words) >= 1:
             generated_row = generate_ft_search_row(
-                indexname, "simple-1word-query", words[0], search_no_content
+                indexname, SIMPLE_WORD_QUERY, words[0], search_no_content
+            )
+        elif choice == WILDCARD_QUERY and len(term) >= prefix_max:
+            generated_row = generate_wildcard_row(
+                indexname,
+                WILDCARD_QUERY,
+                term,
+                prefix_min,
+                prefix_max,
+                search_no_content,
             )
-        elif choice == "2word-union-query" and len(words) >= 2:
+        elif choice == PREFIX_QUERY and len(term) >= prefix_max:
+            generated_row = generate_prefix_row(
+                indexname,
+                PREFIX_QUERY,
+                term,
+                prefix_min,
+                prefix_max,
+                search_no_content,
+            )
+        elif choice == SUFFIX_QUERY and len(term) >= prefix_max:
+            generated_row = generate_suffix_row(
+                indexname,
+                SUFFIX_QUERY,
+                term,
+                prefix_min,
+                prefix_max,
+                search_no_content,
+            )
+        elif choice == CONTAINS_QUERY and len(term) >= prefix_max:
+            generated_row = generate_contains_row(
+                indexname,
+                CONTAINS_QUERY,
+                term,
+                prefix_min,
+                prefix_max,
+                search_no_content,
+            )
+        elif choice == SIMPLE_2WORD_UNION_QUERY and len(words) >= 2:
             generated_row = generate_ft_search_row(
                 indexname,
-                "2word-union-query",
+                SIMPLE_2WORD_UNION_QUERY,
                 "{} {}".format(words[0], words[1]),
                 search_no_content,
             )
-        elif choice == "2word-intersection-query" and len(words) >= 2:
+        elif choice == SIMPLE_2WORD_INT_QUERY and len(words) >= 2:
             generated_row = generate_ft_search_row(
                 indexname,
-                "2word-intersection-query",
+                SIMPLE_2WORD_INT_QUERY,
                 "{}|{}".format(words[0], words[1]),
                 search_no_content,
             )
@@ -169,6 +220,76 @@ def generate_benchmark_commands(
     all_csvfile.close()


+def generate_wildcard_row(
+    index, query_name, query, prefix_min, prefix_max, search_no_content
+):
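+    # widen prefix_max so at least one character survives after the "*", then
+    # build a mid-word wildcard pattern such as "abc*e" from the sampled term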
+    if (prefix_max - 2) <= prefix_min:
+        prefix_max = prefix_min + 2
+    term = query[:prefix_min] + "*" + query[prefix_min + 1 : prefix_max]
+    cmd = [
+        "READ",
+        query_name,
+        1,
+        "FT.SEARCH",
+        "{index}".format(index=index),
+        "{query}".format(query=term),
+    ]
+    if search_no_content:
+        cmd.append("NOCONTENT")
+    return cmd
+
+
+def generate_prefix_row(
+    index, query_name, query, prefix_min, prefix_max, search_no_content
+):
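+    # prefix pattern: keep the first prefix_min characters and append "*" (e.g. "abc*")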
+    term = query[:prefix_min] + "*"
+    cmd = [
+        "READ",
+        query_name,
+        1,
+        "FT.SEARCH",
+        "{index}".format(index=index),
+        "{query}".format(query=term),
+    ]
+    if search_no_content:
+        cmd.append("NOCONTENT")
+    return cmd
+
+
+def generate_suffix_row(
+    index, query_name, query, prefix_min, prefix_max, search_no_content
+):
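+    # suffix pattern: "*" followed by the first prefix_min characters of the
+    # sampled term (e.g. "*abc")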
+    term = "*" + query[:prefix_min]
+    cmd = [
+        "READ",
+        query_name,
+        1,
+        "FT.SEARCH",
+        "{index}".format(index=index),
+        "{query}".format(query=term),
+    ]
+    if search_no_content:
+        cmd.append("NOCONTENT")
+    return cmd
+
+
+def generate_contains_row(
+    index, query_name, query, prefix_min, prefix_max, search_no_content
+):
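+    # contains pattern: wrap the first prefix_min characters in "*...*" (e.g. "*abc*")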
+    term = "*" + query[:prefix_min] + "*"
+    cmd = [
+        "READ",
+        query_name,
+        1,
+        "FT.SEARCH",
+        "{index}".format(index=index),
+        "{query}".format(query=term),
+    ]
+    if search_no_content:
+        cmd.append("NOCONTENT")
+    return cmd
+
+
 def generate_ft_search_row(index, query_name, query, search_no_content):
     cmd = [
         "READ",
@@ -236,8 +357,24 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
     parser.add_argument(
         "--query-choices",
         type=str,
-        default="simple-1word-query,2word-union-query,2word-intersection-query",
-        help="comma separated list of queries to produce. one of: simple-1word-query,2word-union-query,2word-intersection-query",
+        default=",".join(
+            [
+                SIMPLE_WORD_QUERY,
+                SIMPLE_2WORD_UNION_QUERY,
+                SIMPLE_2WORD_INT_QUERY,
+            ]
+        ),
+        help="comma separated list of queries to produce. one of: {}".format(
+            [
+                SIMPLE_WORD_QUERY,
+                SIMPLE_2WORD_UNION_QUERY,
+                SIMPLE_2WORD_INT_QUERY,
+                PREFIX_QUERY,
+                SUFFIX_QUERY,
+                CONTAINS_QUERY,
+                WILDCARD_QUERY,
+            ]
+        ),
     )
     parser.add_argument(
         "--upload-artifacts-s3-uncompressed",
@@ -428,6 +565,7 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
     tree = ET.iterparse(decompressed_fname)
     print("Reading {}\n".format(decompressed_fname))
     progress = tqdm(unit="docs")
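+    # count parsed docs so the read loop can stop once doc_limit is reached
+    # (doc_limit <= 0 leaves the reader unbounded)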
+    total_produced = 0
     for event, elem in tree:
         if elem.tag == "doc":
             doc = {}
@@ -437,6 +575,10 @@ def generate_ft_search_row(index, query_name, query, search_no_content):
             doc["abstract"] = elem.findtext("abstract")
             docs.append(doc)
             progress.update()
+            total_produced = total_produced + 1
+            if total_produced >= doc_limit and doc_limit > 0:
+                print("stopping doc read process")
+                break
         elem.clear()  # won't need the children any more
     progress.close()
