Skip to content

Implement script extension support in JIT. #66

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion maint/GenerateUcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ def write_records(records, record_size):
bitwords = [0] * script_list_item_size

for idx in d:
bitwords[idx // 32] |= 1 << (idx % 31)
bitwords[idx // 32] |= 1 << (idx & 31)

s = " "
for x in bitwords:
Expand Down
54 changes: 50 additions & 4 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7417,9 +7417,10 @@ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHA
#define XCLASS_CHAR_SAVED 0x02
#define XCLASS_HAS_TYPE 0x04
#define XCLASS_HAS_SCRIPT 0x08
#define XCLASS_HAS_BIDICO 0x10
#define XCLASS_HAS_BIDICL 0x20
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
#define XCLASS_HAS_BIDICO 0x20
#define XCLASS_HAS_BIDICL 0x40
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#endif /* SUPPORT_UNICODE */

static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
Expand Down Expand Up @@ -7518,6 +7519,10 @@ while (*cc != XCL_END)
unicode_status |= XCLASS_HAS_TYPE;
break;

case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
compares++;

case PT_SC:
unicode_status |= XCLASS_HAS_SCRIPT;
break;
Expand Down Expand Up @@ -7674,7 +7679,7 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SC)
if (*cc == PT_SC || *cc == PT_SCX)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
Expand All @@ -7690,6 +7695,46 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc = ccbegin;
}

if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{
while (*cc != XCL_END)
{
if (*cc == XCL_SINGLE)
{
cc ++;
GETCHARINCTEST(c, cc);
}
else if (*cc == XCL_RANGE)
{
cc ++;
GETCHARINCTEST(c, cc);
GETCHARINCTEST(c, cc);
}
else
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SCX)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));

compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}
cc += 2;
}
}

cc = ccbegin;
}

if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL))
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bidi));
Expand Down Expand Up @@ -7879,6 +7924,7 @@ while (*cc != XCL_END)
break;

case PT_SC:
case PT_SCX:
case PT_BIDICO:
case PT_BIDICL:
compares++;
Expand Down
84 changes: 42 additions & 42 deletions src/pcre2_ucd.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,66 +172,66 @@ const uint32_t PRIV(ucd_script_sets)[] = {
0x00000000u, 0x00000000u, 0x00000000u,
0x00000002u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00008000u, 0x00000000u,
0x00000000u, 0x00004000u, 0x00000000u,
0x00000800u, 0x00000000u, 0x00000000u,
0x00004000u, 0x00000000u, 0x00000000u,
0x00100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00000000u, 0x00000004u,
0x00000000u, 0x00000000u, 0x00000001u,
0x20000000u, 0x00000000u, 0x00000000u,
0x00000021u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000001u, 0x00000000u,
0x00000001u, 0x00000040u, 0x00000000u,
0x00000001u, 0x40000000u, 0x00000000u,
0x00000001u, 0x00000020u, 0x00000000u,
0x20000001u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000000u,
0x00000102u, 0x00000000u, 0x00000000u,
0x00004004u, 0x00000000u, 0x00000000u,
0x00000008u, 0x00000200u, 0x00000000u,
0x00000008u, 0x00000100u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000000u,
0x00000480u, 0x00000000u, 0x00000000u,
0x00100080u, 0x00000000u, 0x00000000u,
0x00000080u, 0x00800000u, 0x00000000u,
0x00000080u, 0x00400000u, 0x00000000u,
0x20000080u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00010000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000004u,
0x00000100u, 0x00002000u, 0x00000000u,
0x00000100u, 0x00000004u, 0x00000000u,
0x00000100u, 0x00008000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000001u,
0x00000100u, 0x00001000u, 0x00000000u,
0x00000100u, 0x00000002u, 0x00000000u,
0x00100200u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00010004u, 0x00000000u,
0x00001000u, 0x00020000u, 0x00000000u,
0x00002000u, 0x04000000u, 0x00000000u,
0x00000000u, 0x00008002u, 0x00000000u,
0x00001000u, 0x00010000u, 0x00000000u,
0x00002000u, 0x02000000u, 0x00000000u,
0x00104000u, 0x00000000u, 0x00000000u,
0x000a0000u, 0x00000000u, 0x00000000u,
0x00040000u, 0x00000000u, 0x00000004u,
0x00040000u, 0x00000000u, 0x00000001u,
0x01100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00200000u, 0x00000020u,
0x01000000u, 0x00000080u, 0x00000000u,
0x20000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000008u,
0x10000002u, 0x00001000u, 0x00000000u,
0x02000000u, 0x00001002u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000010u,
0x00400040u, 0x00080000u, 0x00000000u,
0x00040100u, 0x00010000u, 0x00000000u,
0x00100100u, 0x00010000u, 0x00000000u,
0x00000000u, 0x00100000u, 0x00000008u,
0x01000000u, 0x00000040u, 0x00000000u,
0x20000001u, 0x00000008u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000002u,
0x10000002u, 0x00000800u, 0x00000000u,
0x02000000u, 0x00000801u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000004u,
0x00400040u, 0x00040000u, 0x00000000u,
0x00040100u, 0x00008000u, 0x00000000u,
0x00100100u, 0x00008000u, 0x00000000u,
0x000a4000u, 0x00000000u, 0x00000000u,
0x02100000u, 0x00000100u, 0x00000000u,
0x00040102u, 0x00010000u, 0x00000000u,
0x40010011u, 0x00000000u, 0x00000000u,
0x00000100u, 0x20100400u, 0x00000000u,
0x02100000u, 0x00000080u, 0x00000000u,
0x00040102u, 0x00008000u, 0x00000000u,
0xc0010010u, 0x00000000u, 0x00000000u,
0x00000100u, 0x10080200u, 0x00000000u,
0x000ac004u, 0x00000000u, 0x00000000u,
0x20000001u, 0x00000051u, 0x00000008u,
0x000ac004u, 0x00000020u, 0x00000000u,
0x04840100u, 0x0000000cu, 0x00000000u,
0x20000001u, 0x08000051u, 0x00000008u,
0x04040102u, 0x02010008u, 0x00000004u,
0x20000001u, 0x09200803u, 0x00000020u,
0x00003100u, 0x22564400u, 0x00000000u,
0x04943102u, 0x0201000cu, 0x00000000u,
0x04943102u, 0x0201200cu, 0x00000000u,
0x00043100u, 0x22564400u, 0x00000004u,
0x00843100u, 0x22564400u, 0x00000004u,
0x1c843102u, 0x7215400cu, 0x00000004u,
0x1ca43102u, 0x7215400cu, 0x00000004u,
0x20000001u, 0x40000028u, 0x00000002u,
0x000ac004u, 0x00000010u, 0x00000000u,
0x04840100u, 0x00000006u, 0x00000000u,
0x20000001u, 0x44000028u, 0x00000002u,
0x04040102u, 0x01008004u, 0x00000001u,
0x20000001u, 0xc4900400u, 0x00000008u,
0x00003100u, 0x112b2200u, 0x00000000u,
0x04943102u, 0x01008006u, 0x00000000u,
0x04943102u, 0x01009006u, 0x00000000u,
0x00043100u, 0x112b2200u, 0x00000001u,
0x00843100u, 0x112b2200u, 0x00000001u,
0x1c843102u, 0x390aa006u, 0x00000001u,
0x1ca43102u, 0x390aa006u, 0x00000001u,
};

/* These are the main two-stage UCD tables. The fields in each record are:
Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput4
Original file line number Diff line number Diff line change
Expand Up @@ -1133,8 +1133,6 @@
A\x{300}\x{301}\x{302}BC
\x{300}

#subject no_jit

/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
\= Expect no match
Expand All @@ -1157,8 +1155,6 @@
\x{a014}
\x{a4c6}

#subject -no_jit

/^\p{Any}X/utf
AXYZ
\x{1234}XYZ
Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -1337,8 +1337,6 @@

# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE

#subject no_jit

/^[\p{Batak}]/utf
\x{1bc0}
\x{1bff}
Expand All @@ -1358,8 +1356,6 @@
\x{85c}
\x{85d}

#subject -no_jit

/(\X*)(.)/s,utf
A\x{300}

Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput4
Original file line number Diff line number Diff line change
Expand Up @@ -1876,8 +1876,6 @@ No match
\x{300}
0: \x{300}

#subject no_jit

/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
0: \x{2e81}\x{3007}\x{2f804}
Expand Down Expand Up @@ -1910,8 +1908,6 @@ No match
\x{a4c6}
No match

#subject -no_jit

/^\p{Any}X/utf
AXYZ
0: AX
Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -2842,8 +2842,6 @@ No match

# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE

#subject no_jit

/^[\p{Batak}]/utf
\x{1bc0}
0: \x{1bc0}
Expand Down Expand Up @@ -2873,8 +2871,6 @@ No match
\x{85d}
No match

#subject -no_jit

/(\X*)(.)/s,utf
A\x{300}
0: A
Expand Down