Skip to content

Commit 05bfaef

Browse files
author
Zoltan Herczeg
committed
Auto generate unicode property tests.
1 parent 6614b28 commit 05bfaef

File tree

4 files changed

+6420
-2
lines changed

4 files changed

+6420
-2
lines changed

RunTest

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
8080
title23="Test 23: \C disabled test"
8181
title24="Test 24: Non-UTF pattern conversion tests"
8282
title25="Test 25: UTF pattern conversion tests"
83-
maxtest=25
83+
title26="Test 26: Auto-generated unicode property tests"
84+
maxtest=26
8485

8586
if [ $# -eq 1 -a "$1" = "list" ]; then
8687
echo $title0
@@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
109110
echo $title23
110111
echo $title24
111112
echo $title25
113+
echo $title26
112114
exit 0
113115
fi
114116

@@ -238,6 +240,7 @@ do22=no
238240
do23=no
239241
do24=no
240242
do25=no
243+
do26=no
241244

242245
while [ $# -gt 0 ] ; do
243246
case $1 in
@@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
267270
23) do23=yes;;
268271
24) do24=yes;;
269272
25) do25=yes;;
273+
26) do26=yes;;
270274
-8) arg8=yes;;
271275
-16) arg16=yes;;
272276
-32) arg32=yes;;
@@ -417,7 +421,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
417421
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
418422
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
419423
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
420-
$do24 = no -a $do25 = no \
424+
$do24 = no -a $do25 = no -a $do26 = no \
421425
]; then
422426
do0=yes
423427
do1=yes
@@ -445,6 +449,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
445449
do23=yes
446450
do24=yes
447451
do25=yes
452+
do26=yes
448453
fi
449454

450455
# Handle any explicit skips at this stage, so that an argument list may consist
@@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
863868
fi
864869
fi
865870

871+
# Auto-generated unicode property tests
872+
873+
if [ $do26 = yes ] ; then
874+
echo $title26
875+
if [ $utf -eq 0 ] ; then
876+
echo " Skipped because UTF-$bits support is not available"
877+
else
878+
for opt in "" $jitopt; do
879+
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
880+
checkresult $? 26 "$opt"
881+
done
882+
fi
883+
fi
884+
866885
# End of loop for 8/16/32-bit tests
867886
done
868887

maint/GenerateTest26.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#! /usr/bin/python
2+
3+
# PCRE2 UNICODE PROPERTY SUPPORT
4+
# ------------------------------
5+
#
6+
# This file auto-generates unicode property tests and their expected output.
7+
# It is recommended to re-run this generator after the unicode files are
8+
# updated. The names of the generated files are `testinput26` and `testoutput26`.
9+
10+
import re
11+
import sys
12+
13+
from GenerateCommon import \
14+
script_names, \
15+
script_abbrevs
16+
17+
def write_both(text):
    """Write *text* to the generated input file and then to the output file.

    Both files are the module-level `input_file` and `output_file` objects.
    """
    for stream in (input_file, output_file):
        stream.write(text)
20+
21+
def to_string_char(ch_idx):
    """Return the text used for code point *ch_idx* in the generated tests.

    Printable ASCII characters (0x20 to 0x7e) are returned unchanged. Every
    other code point is returned as a ``\\x{...}`` hexadecimal escape that is
    padded to at least two digits.
    """
    # DEL (0x7f) is below 128 but is a control character: pcre2test reports
    # non-printing characters as hexadecimal escapes, so it has to be escaped
    # here as well instead of being written as a raw byte.
    # NOTE(review): space and backslash are still returned raw, as before;
    # confirm no generated subject ever starts with either of them.
    if 32 <= ch_idx < 127:
        return chr(ch_idx)
    return "\\x{%02x}" % ch_idx
28+
29+
# The generated files go to the directory named by the single optional
# command-line argument, or to the current directory when none is given.
cli_args = sys.argv[1:]

if len(cli_args) > 1:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

output_directory = cli_args[0] if cli_args else ""
if cli_args and not output_directory.endswith("/"):
    # The file names below are appended directly, so a separator is required.
    output_directory += "/"

input_path = output_directory + "testinput26"
output_path = output_directory + "testoutput26"

try:
    input_file = open(input_path, "w")
    output_file = open(output_path, "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
53+
54+
def gen_script_tests():
    """Generate the script and script-extension property tests.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt`` (paths relative to the current
    directory), then writes a group of patterns and subjects for every entry
    of `script_names` except "Unknown". Patterns and subjects go to both
    generated files; the expected results go to the output file only.
    """
    # One record per script, indexed like script_names:
    #   [0] low code point of the first Scripts.txt line seen for the script
    #   [1] high code point of the latest Scripts.txt line seen for the script
    #   [2] lowest code point listed for the script in ScriptExtensions.txt
    #   [3] highest code point listed for the script in ScriptExtensions.txt
    #   [4] first extension code point found whose base script is different
    script_data = [None] * len(script_names)
    # Base script name of each code point (None when Scripts.txt has no entry).
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            name = match_obj.group(3)
            # Only repeat the list search when the script name changes.
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            char_data[low] = name

            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)
                for code_point in range(low + 1, high + 1):
                    char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Maps a script abbreviation to its index once it has been seen in
    # ScriptExtensions.txt.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    script_idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = script_idx
                    rec = script_data[script_idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    script_idx = extended_script_indices[abbrev]
                    rec = script_data[script_idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    # Look for a code point that is listed for this script
                    # here but has a different base script in Scripts.txt.
                    name = script_names[script_idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternates between the short and long property name in the
    # extension tests.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        write_both("/^\\p{sc=%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(rec[0]))
        output_file.write(" 0: %s\n" % to_string_char(rec[0]))
        write_both("\n")

        write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
        write_both(" %s\n" % to_string_char(rec[1]))
        output_file.write(" 0: %s\n" % to_string_char(rec[1]))
        write_both("\n")

        # NOTE(review): the nesting of this branch (the toggle and the
        # if/else on rec[4]) is reconstructed from a flattened copy of the
        # file - confirm against the repository version.
        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            write_both("/^\\p{%s}/utf\n" % script_name)
            write_both(" %s\n" % to_string_char(rec[2]))
            output_file.write(" 0: %s\n" % to_string_char(rec[2]))
            write_both("\n")

            write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
            write_both(" %s\n" % to_string_char(rec[3]))
            output_file.write(" 0: %s\n" % to_string_char(rec[3]))
            write_both("\n")

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                write_both("/^\\p{%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write(" 0: %s\n" % to_string_char(rec[4]))
                write_both("\n")

                write_both("/^\\p{sc=%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write("No match\n")
                write_both("\n")
            else:
                print("External character has not found for %s" % script_name)

        # The code point just above the highest recorded one is used as the
        # subject that is expected not to match.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        write_both("/^\\p{%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(high + 1))
        output_file.write("No match\n")
        write_both("\n")
184+
185+
186+
gen_script_tests()

write_both("# End of testinput26\n")

# Close both generated files explicitly so that buffered text is flushed to
# disk here, not left to interpreter shutdown.
input_file.close()
output_file.close()

0 commit comments

Comments
 (0)