1- import re
21from typing import Optional
32
43
5- def __compute_numerical_range (str_a , str_b , any_digit = "[0-9]" , start_appender_str = "" ):
def __digit_range(start: int, end: int) -> str:
    """Return the shortest regex fragment matching one digit in ``start``..``end``.

    Args:
        start: Lowest digit to match (inclusive).
        end: Highest digit to match (inclusive).

    Returns:
        ``str(start)`` when the range is a single digit, ``\\d`` when it
        spans 0 through 9, and a ``[start-end]`` character class otherwise.
    """
    if start == end:
        # A one-element class like "[9-9]" is just the literal digit.
        return str(start)
    if start == 0 and end == 9:
        # NOTE: for ``str`` patterns ``\d`` also matches non-ASCII decimal
        # digits unless the pattern is compiled with ``re.ASCII``.
        return r"\d"
    # No whitespace inside the class: "[2 -4 ]" would also match a space and
    # every character between " " and "4".
    return f"[{start}-{end}]"
10+
11+
def __tokenize_numeric_pattern(pattern: str) -> list[str]:
    """Split a generated numeric regex fragment into per-digit tokens.

    Each token is one of: a bracketed character class (``[0-3]``), the
    two-character ``\\d`` escape, or any other single character.

    Args:
        pattern: Regex fragment to split, e.g. ``32[0-1]\\d``.

    Returns:
        The tokens in order of appearance, e.g.
        ``['3', '2', '[0-1]', '\\d']``.

    Raises:
        ValueError: If a ``[`` has no closing ``]``.
    """
    tokens: list[str] = []
    i = 0
    while i < len(pattern):
        if pattern[i] == "[":
            end = pattern.find("]", i)
            if end == -1:
                raise ValueError(f"Malformed range expression: {pattern}")
            # Keep the whole class, brackets included, as one token.
            tokens.append(pattern[i:end + 1])
            i = end + 1
        elif pattern.startswith(r"\d", i):
            # Treat the escape as a single token so "\" and "d" are never
            # split apart; startswith() also handles a trailing lone "\".
            tokens.append(r"\d")
            i += 2
        else:
            tokens.append(pattern[i])
            i += 1
    return tokens
29+
30+
31+ def __compute_numerical_range (str_a , str_b , start_appender_str = "" ):
632 """
733 Helps generating regex for numerical range.
834 Assumptions are int(str_a) <= int(str_b) and should have equal number of digits.
@@ -19,7 +45,7 @@ def __compute_numerical_range(str_a, str_b, any_digit="[0-9]", start_appender_st
1945 if str_a == str_b :
2046 return start_appender_str + str_a
2147 if str_len == 1 :
22- return f"{ start_appender_str } [ { str_a } - { str_b } ] "
48+ return f"{ start_appender_str } { __digit_range ( int ( str_a ), int ( str_b )) } "
2349 # Counting index position till the characteres are equal
2450 check_equal = - 1
2551 for i in range (str_len ):
@@ -31,7 +57,6 @@ def __compute_numerical_range(str_a, str_b, any_digit="[0-9]", start_appender_st
3157 return __compute_numerical_range (
3258 str_a [check_equal + 1 :],
3359 str_b [check_equal + 1 :],
34- any_digit = any_digit ,
3560 start_appender_str = start_appender_str + str_a [: check_equal + 1 ],
3661 )
3762
@@ -41,37 +66,37 @@ def __compute_numerical_range(str_a, str_b, any_digit="[0-9]", start_appender_st
4166 patterns = []
4267 if intermediate_range :
4368 patterns .append (
44- f"{ start_appender_str } [ { intermediate_range [0 ]} - { intermediate_range [- 1 ]} ] { '' .join ([any_digit ]* (str_len - 1 ))} "
69+ f"{ start_appender_str } { __digit_range ( intermediate_range [0 ], intermediate_range [- 1 ]) } { '' .join ([r'\d' ]* (str_len - 1 ))} "
4570 )
4671 # patterns for the above part ['[2-4][0-9][0-9]']
4772
4873 # Case for str_a
4974 for loop_counter in range (str_len - 1 ): # no_of_digits-1 units
5075 if loop_counter == str_len - 2 : # Find the last loop
5176 patterns .append (
52- f"{ start_appender_str } { str_a [:loop_counter + 1 ]} [ { str_a [- 1 ]} -9] "
77+ f"{ start_appender_str } { str_a [:loop_counter + 1 ]} { __digit_range ( int ( str_a [- 1 ]), 9 ) } "
5378 )
5479 else :
5580 if (
5681 str_a [loop_counter + 1 ] != "9"
5782 ): # if 599 then avoid 10 in '[6-8]...|5[10-9]..|59[9-9].|598[9-9]'
5883 patterns .append (
59- f"{ start_appender_str } { str_a [:loop_counter + 1 ]} [ { int (str_a [loop_counter + 1 ])+ 1 } -9] { '' .join ([any_digit ]* (str_len - 2 - loop_counter ))} "
84+ f"{ start_appender_str } { str_a [:loop_counter + 1 ]} { __digit_range ( int (str_a [loop_counter + 1 ]) + 1 , 9 ) } { '' .join ([r'\d' ]* (str_len - 2 - loop_counter ))} "
6085 )
6186 # patterns for the above part ['1[7-9][0-9]','16[9-9]']
6287
6388 # Case for str_b
6489 for loop_counter in range (str_len - 1 ): # no_of_digits-1 units
6590 if loop_counter == str_len - 2 : # Find the last loop
6691 patterns .append (
67- f"{ start_appender_str } { str_b [:loop_counter + 1 ]} [0- { str_b [- 1 ]} ] "
92+ f"{ start_appender_str } { str_b [:loop_counter + 1 ]} { __digit_range ( 0 , int ( str_b [- 1 ])) } "
6893 )
6994 else :
7095 if (
7196 str_b [loop_counter + 1 ] != "0"
7297 ): # if 1102 then avoid -1 in '11[0--1].|110[0-2]'
7398 patterns .append (
74- f"{ start_appender_str } { str_b [:loop_counter + 1 ]} [0- { int (str_b [loop_counter + 1 ])- 1 } ] { '' .join ([any_digit ]* (str_len - 2 - loop_counter ))} "
99+ f"{ start_appender_str } { str_b [:loop_counter + 1 ]} { __digit_range ( 0 , int (str_b [loop_counter + 1 ]) - 1 ) } { '' .join ([r'\d' ]* (str_len - 2 - loop_counter ))} "
75100 )
76101 # patterns for the above part ['5[0-3][0-9]','54[0-3]']
77102
@@ -169,29 +194,22 @@ def _range_regex(a, b):
169194 ranges = __range_splitter (a , b )
170195 intermediate_regex = "|" .join (
171196 [
172- __compute_numerical_range (
173- str (r [0 ]), str (r [1 ]), any_digit = "[0-9]" , start_appender_str = r [2 ]
174- )
197+ __compute_numerical_range (str (r [0 ]), str (r [1 ]), start_appender_str = r [2 ])
175198 for r in ranges
176199 ]
177200 )
178201
179202 # Modifying the integer supported regex to support float
180203 new_regex = []
181204 for p in intermediate_regex .split ("|" ):
182- if p .find ("[" ) == - 1 :
183- x = [c for c in p if c != "-" ]
184- else :
185- x = [
186- c for d in re .findall (r"-{0,1}(\d+)\[\d-\d\]*" , p ) for c in d
187- ] + re .findall (r"-{0,1}[\d]*(\[\d-\d\]*)" , p )
205+ x = __tokenize_numeric_pattern (p [1 :] if p .startswith ("-" ) else p )
188206
189207 # If x = ['[0-9]'] and max_num_decimal = 2, We need x = ['0','[0-9]']
190208 if len (x ) < max_num_decimal :
191209 x = (["0" ] * (max_num_decimal - len (x ))) + x
192210
193211 # Example x = ['3', '2', '[0-1]', '[0-9]'] for p=32[0-1][0-9]
194- start_appender_str = "-" if re . findall ( "^-" , p ) else ""
212+ start_appender_str = "-" if p . startswith ( "-" ) else ""
195213 # Add a decimal point inbetween, keep the next digit mandatory and others optional (32.[0-1][0-9]?[0-9]*)
196214 fractional_part = (
197215 [x [- max_num_decimal ]] + [z + "?" for z in x [- max_num_decimal + 1 :]]
@@ -202,7 +220,7 @@ def _range_regex(a, b):
202220 "" .join (x [:- max_num_decimal ]) if "" .join (x [:- max_num_decimal ]) else "0?"
203221 )
204222 new_regex .append (
205- rf"{ start_appender_str } { non_fractional_part } \.{ '' .join (fractional_part )} [0-9] *"
223+ rf"{ start_appender_str } { non_fractional_part } \.{ '' .join (fractional_part )} \d *"
206224 )
207225 regex = f"(?:{ '|' .join (new_regex )} )"
208226 return regex
@@ -211,7 +229,7 @@ def _range_regex(a, b):
211229 elif isinstance (a , (int )) and isinstance (b , (int )):
212230 a , b = (a , b ) if a < b else (b , a )
213231 ranges = __range_splitter (a , b )
214- regex = f"(?:{ '|' .join ([__compute_numerical_range (str (r [0 ]),str (r [1 ]),any_digit = '[0-9]' , start_appender_str = r [2 ]) for r in ranges ])} )"
232+ regex = f"(?:{ '|' .join ([__compute_numerical_range (str (r [0 ]),str (r [1 ]),start_appender_str = r [2 ]) for r in ranges ])} )"
215233 return regex
216234
217235 # Neither integer nor float
@@ -234,7 +252,7 @@ def range_regex(minimum: Optional[int] = None, maximum: Optional[int] = None):
234252 If you omit both, all numbers will be matched.
235253 """
236254 if minimum is None and maximum is None :
237- return r"-?(?:[1-9][0-9] *|0)"
255+ return r"-?(?:[1-9]\d *|0)"
238256 if minimum is None :
239257 if maximum == 0 :
240258 return r"(?:-[1-9]\d*|0)"
0 commit comments