Skip to content

Commit fbfec25

Browse files
author
Zoltan Herczeg
committed
Fix script extension support in the JIT compiler
1 parent c24047f commit fbfec25

File tree

5 files changed

+65
-41
lines changed

5 files changed

+65
-41
lines changed

src/pcre2_jit_compile.c

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7413,14 +7413,18 @@ return cc;
74137413
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
74147414

74157415
#ifdef SUPPORT_UNICODE
7416-
#define XCLASS_SAVE_CHAR 0x01
7417-
#define XCLASS_CHAR_SAVED 0x02
7418-
#define XCLASS_HAS_TYPE 0x04
7419-
#define XCLASS_HAS_SCRIPT 0x08
7420-
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
7421-
#define XCLASS_HAS_BIDICO 0x20
7422-
#define XCLASS_HAS_BIDICL 0x40
7416+
#define XCLASS_SAVE_CHAR 0x001
7417+
#define XCLASS_CHAR_SAVED 0x002
7418+
#define XCLASS_HAS_TYPE 0x004
7419+
#define XCLASS_HAS_SCRIPT 0x008
7420+
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
7421+
#define XCLASS_HAS_BIDICO 0x020
7422+
#define XCLASS_HAS_BIDICL 0x040
74237423
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
7424+
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
7425+
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
7426+
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200
7427+
74247428
#endif /* SUPPORT_UNICODE */
74257429

74267430
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
@@ -7521,6 +7525,11 @@ while (*cc != XCL_END)
75217525

75227526
case PT_SCX:
75237527
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
7528+
if (cc[-1] == XCL_NOTPROP)
7529+
{
7530+
unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
7531+
break;
7532+
}
75247533
compares++;
75257534

75267535
case PT_SC:
@@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD)
76797688
{
76807689
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
76817690
cc++;
7682-
if (*cc == PT_SC || *cc == PT_SCX)
7691+
switch (*cc)
76837692
{
7693+
case PT_SCX:
7694+
if (cc[-1] == XCL_NOTPROP)
7695+
break;
7696+
7697+
case PT_SC:
76847698
compares--;
76857699
invertcmp = (compares == 0 && list != backtracks);
76867700
if (cc[-1] == XCL_NOTPROP)
76877701
invertcmp ^= 0x1;
7688-
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
7689-
add_jump(compiler, compares > 0 ? list : backtracks, jump);
7702+
7703+
add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
76907704
}
76917705
cc += 2;
76927706
}
@@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD)
76977711

76987712
if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
76997713
{
7714+
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
7715+
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
7716+
7717+
if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
7718+
{
7719+
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE))
7720+
{
7721+
if (unicode_status & XCLASS_SAVE_CHAR)
7722+
{
7723+
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0);
7724+
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0;
7725+
}
7726+
else
7727+
{
7728+
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
7729+
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
7730+
}
7731+
}
7732+
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
7733+
}
7734+
77007735
while (*cc != XCL_END)
77017736
{
77027737
if (*cc == XCL_SINGLE)
@@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD)
77167751
cc++;
77177752
if (*cc == PT_SCX)
77187753
{
7719-
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
7720-
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
7721-
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
7722-
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
7723-
77247754
compares--;
77257755
invertcmp = (compares == 0 && list != backtracks);
7756+
7757+
jump = NULL;
77267758
if (cc[-1] == XCL_NOTPROP)
7759+
{
7760+
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
7761+
if (invertcmp)
7762+
{
7763+
add_jump(compiler, backtracks, jump);
7764+
jump = NULL;
7765+
}
77277766
invertcmp ^= 0x1;
7728-
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
7729-
add_jump(compiler, compares > 0 ? list : backtracks, jump);
7767+
}
7768+
7769+
OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
7770+
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
7771+
7772+
if (jump != NULL)
7773+
JUMPHERE(jump);
77307774
}
77317775
cc += 2;
77327776
}
77337777
}
77347778

7779+
if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0)
7780+
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
7781+
else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
7782+
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
77357783
cc = ccbegin;
77367784
}
77377785

testdata/testinput4

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,8 +1144,6 @@
11441144
\= Expect no match
11451145
X\x{06e9}
11461146

1147-
#subject no_jit
1148-
11491147
/^\P{Katakana}+/utf
11501148
\x{3105}
11511149
\= Expect no match
@@ -1157,8 +1155,6 @@
11571155
\x{a014}
11581156
\x{a4c6}
11591157

1160-
#subject -no_jit
1161-
11621158
/^\p{Any}X/utf
11631159
AXYZ
11641160
\x{1234}XYZ
@@ -1410,8 +1406,6 @@
14101406
\x{2116}
14111407
\x{1D183}
14121408

1413-
#subject no_jit
1414-
14151409
/^\p{Inherited}/utf
14161410
\x{200c}
14171411
\= Expect no match
@@ -1464,8 +1458,6 @@
14641458
/\p{sc:katakana}{3,}?/utf
14651459
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
14661460

1467-
#subject -no_jit
1468-
14691461
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
14701462
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
14711463

testdata/testinput5

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2035,8 +2035,6 @@
20352035
# doesn't recognize all these scripts. In time these three tests can be moved
20362036
# to test 4.
20372037

2038-
#subject no_jit
2039-
20402038
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
20412039
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
20422040
(\p{Zanabazar_Square}+)/x,utf
@@ -2085,8 +2083,6 @@
20852083
\x{655}
20862084
\x{1D1AA}
20872085

2088-
#subject -no_jit
2089-
20902086
/\N{U+}/
20912087

20922088
/\N{U+}/utf

testdata/testoutput4

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1892,8 +1892,6 @@ No match
18921892
X\x{06e9}
18931893
No match
18941894

1895-
#subject no_jit
1896-
18971895
/^\P{Katakana}+/utf
18981896
\x{3105}
18991897
0: \x{3105}
@@ -1910,8 +1908,6 @@ No match
19101908
\x{a4c6}
19111909
No match
19121910

1913-
#subject -no_jit
1914-
19151911
/^\p{Any}X/utf
19161912
AXYZ
19171913
0: AX
@@ -2312,8 +2308,6 @@ No match
23122308
\x{1D183}
23132309
0: \x{1d183}
23142310

2315-
#subject no_jit
2316-
23172311
/^\p{Inherited}/utf
23182312
\x{200c}
23192313
0: \x{200c}
@@ -2392,8 +2386,6 @@ No match
23922386
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
23932387
0: \x{30a1}\x{30fa}\x{32d0}
23942388

2395-
#subject -no_jit
2396-
23972389
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
23982390
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
23992391
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}

testdata/testoutput5

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4599,8 +4599,6 @@ No match
45994599
# doesn't recognize all these scripts. In time these three tests can be moved
46004600
# to test 4.
46014601

4602-
#subject no_jit
4603-
46044602
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
46054603
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
46064604
(\p{Zanabazar_Square}+)/x,utf
@@ -4742,8 +4740,6 @@ Callout 0: last capture = 1
47424740
\x{1D1AA}
47434741
0: \x{1d1aa}
47444742

4745-
#subject -no_jit
4746-
47474743
/\N{U+}/
47484744
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode
47494745

0 commit comments

Comments
 (0)