Skip to content

Commit 6185344

Browse files
authored
Improve error offsets for character classes (PCRE2Project#548)
* Error offset should be advanced by one character for the "[\d-z]" invalid range error.
  The code does a 1-char lookahead for a hyphen, but then doesn't advance the pointer to consume the hyphen when returning the error. Perl's error message (with "use warnings") does advance to just after the hyphen, so PCRE2 should match. Fixes PCRE2Project#545.
* Also improve error offsets for the [[:bad:]], [[=...=]] and [z-\p{...}] cases.
1 parent c192b8c commit 6185344

File tree

5 files changed

+112
-75
lines changed

5 files changed

+112
-75
lines changed

src/pcre2_compile.c

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3702,6 +3702,7 @@ while (ptr < ptrend)
37023702

37033703
if (class_range_state == RANGE_STARTED)
37043704
{
3705+
ptr = tempptr + 2;
37053706
errorcode = ERR50;
37063707
goto FAILED;
37073708
}
@@ -3723,8 +3724,9 @@ while (ptr < ptrend)
37233724

37243725
if (*ptr != CHAR_COLON)
37253726
{
3727+
ptr = tempptr + 2;
37263728
errorcode = ERR13;
3727-
goto FAILED_BACK;
3729+
goto FAILED;
37283730
}
37293731

37303732
if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
@@ -3734,19 +3736,18 @@ while (ptr < ptrend)
37343736
}
37353737

37363738
posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3739+
ptr = tempptr + 2;
37373740
if (posix_class < 0)
37383741
{
37393742
errorcode = ERR30;
37403743
goto FAILED;
37413744
}
3742-
ptr = tempptr + 2;
37433745

37443746
/* Set "a hyphen is forbidden to be the start of a range". For the '-]'
37453747
case, the hyphen is treated as a literal, but for '-1' it is disallowed
37463748
(because it would be interpreted as range). */
37473749

37483750
class_range_state = RANGE_FORBID_NO;
3749-
class_range_forbid_ptr = ptr;
37503751
class_op_state = CLASS_OP_OPERAND;
37513752

37523753
/* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
@@ -3989,6 +3990,7 @@ while (ptr < ptrend)
39893990
{
39903991
*parsed_pattern++ = CHAR_MINUS;
39913992
class_range_state = RANGE_FORBID_STARTED;
3993+
class_range_forbid_ptr = ptr;
39923994
}
39933995

39943996
/* Handle a literal character */
@@ -4073,40 +4075,8 @@ while (ptr < ptrend)
40734075
errorcode = ERR7;
40744076
ptr--;
40754077
goto FAILED;
4076-
}
4077-
4078-
/* The second part of a range can be a single-character escape
4079-
sequence (detected above), but not any of the other escapes. Perl
4080-
treats a hyphen as a literal in such circumstances. However, in Perl's
4081-
warning mode, a warning is given, so PCRE now faults it, as it is
4082-
almost certainly a mistake on the user's part. */
40834078

4084-
if (class_range_state == RANGE_STARTED)
4085-
{
4086-
errorcode = ERR50;
4087-
goto FAILED;
4088-
}
4089-
4090-
/* Perl gives a warning unless the hyphen following a multi-character
4091-
escape is the last character in the class. PCRE throws an error. */
4092-
4093-
if (class_range_state == RANGE_FORBID_STARTED)
4094-
{
4095-
ptr = class_range_forbid_ptr;
4096-
errorcode = ERR50;
4097-
goto FAILED;
4098-
}
4099-
4100-
/* Of the remaining escapes, only those that define characters are
4101-
allowed in a class. None may start a range. */
4102-
4103-
class_range_state = RANGE_FORBID_NO;
4104-
class_range_forbid_ptr = ptr;
4105-
class_op_state = CLASS_OP_OPERAND;
4106-
4107-
switch(escape)
4108-
{
4109-
case ESC_N:
4079+
case ESC_N: /* Not permitted by Perl either */
41104080
errorcode = ERR71;
41114081
goto FAILED;
41124082

@@ -4143,7 +4113,6 @@ while (ptr < ptrend)
41434113
if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
41444114
*parsed_pattern++ = META_ESCAPE + escape;
41454115
*parsed_pattern++ = (ptype << 16) | pdata;
4146-
class_range_forbid_ptr = ptr;
41474116
}
41484117
#else
41494118
errorcode = ERR45;
@@ -4156,6 +4125,34 @@ while (ptr < ptrend)
41564125
ptr--;
41574126
goto FAILED;
41584127
}
4128+
4129+
/* All the switch-cases above which end in "break" describe a set
4130+
of characters. None may start a range. */
4131+
4132+
/* The second part of a range can be a single-character escape
4133+
sequence (detected above), but not any of the other escapes. Perl
4134+
treats a hyphen as a literal in such circumstances. However, in Perl's
4135+
warning mode, a warning is given, so PCRE now faults it, as it is
4136+
almost certainly a mistake on the user's part. */
4137+
4138+
if (class_range_state == RANGE_STARTED)
4139+
{
4140+
errorcode = ERR50;
4141+
goto FAILED;
4142+
}
4143+
4144+
/* Perl gives a warning unless the hyphen following a multi-character
4145+
escape is the last character in the class. PCRE throws an error. */
4146+
4147+
if (class_range_state == RANGE_FORBID_STARTED)
4148+
{
4149+
ptr = class_range_forbid_ptr;
4150+
errorcode = ERR50;
4151+
goto FAILED;
4152+
}
4153+
4154+
class_range_state = RANGE_FORBID_NO;
4155+
class_op_state = CLASS_OP_OPERAND;
41594156
}
41604157

41614158
/* Proceed to next thing in the class. */

testdata/testinput2

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7008,4 +7008,12 @@ a)"xI
70087008

70097009
/[[:digit:]\Q\E-H]+/
70107010

7011+
/[z-[:space:]]/
7012+
7013+
/[z-\d]/
7014+
7015+
/[[:space:]-z]/
7016+
7017+
/[\d-z]/
7018+
70117019
# End of testinput2

testdata/testinput5

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3189,4 +3189,12 @@
31893189
/^([\h\x{9000}\x{9002}\x{9004}][\v\x{9000}\x{9002}\x{9004}\x{9006}\x{9008}][\h\v\x{9000}],){4}$/B,utf
31903190
\x09\x0a\x0d,\x{1680}\x{2028}\x{1680},\x{180e}\x{2029}\x{180e},\x{9000}\x{9000}\x{9000},
31913191

3192+
/[z-\p{Lu}]/
3193+
3194+
/[z-\pL]/
3195+
3196+
/[\p{Lu}-z]/
3197+
3198+
/[\pL-z]/
3199+
31923200
# End of testinput5

testdata/testoutput2

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2170,13 +2170,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W
21702170
Subject length lower bound = 1
21712171

21722172
/[[.ch.]]/I
2173-
Failed: error 113 at offset 1: POSIX collating elements are not supported
2173+
Failed: error 113 at offset 7: POSIX collating elements are not supported
21742174

21752175
/[[=ch=]]/I
2176-
Failed: error 113 at offset 1: POSIX collating elements are not supported
2176+
Failed: error 113 at offset 7: POSIX collating elements are not supported
21772177

21782178
/[[:rhubarb:]]/I
2179-
Failed: error 130 at offset 3: unknown POSIX class name
2179+
Failed: error 130 at offset 12: unknown POSIX class name
21802180

21812181
/[[:upper:]]/Ii
21822182
Capture group count = 0
@@ -8775,31 +8775,31 @@ Failed: error 162 at offset 4: subpattern name expected
87758775
Failed: error 162 at offset 4: subpattern name expected
87768776

87778777
/[[:foo:]]/
8778-
Failed: error 130 at offset 3: unknown POSIX class name
8778+
Failed: error 130 at offset 8: unknown POSIX class name
87798779

87808780
/[[:1234:]]/
8781-
Failed: error 130 at offset 3: unknown POSIX class name
8781+
Failed: error 130 at offset 9: unknown POSIX class name
87828782

87838783
/[[:f\oo:]]/
8784-
Failed: error 130 at offset 3: unknown POSIX class name
8784+
Failed: error 130 at offset 9: unknown POSIX class name
87858785

87868786
/[[: :]]/
8787-
Failed: error 130 at offset 3: unknown POSIX class name
8787+
Failed: error 130 at offset 6: unknown POSIX class name
87888788

87898789
/[[:...:]]/
8790-
Failed: error 130 at offset 3: unknown POSIX class name
8790+
Failed: error 130 at offset 8: unknown POSIX class name
87918791

87928792
/[[:l\ower:]]/
8793-
Failed: error 130 at offset 3: unknown POSIX class name
8793+
Failed: error 130 at offset 11: unknown POSIX class name
87948794

87958795
/[[:abc\:]]/
8796-
Failed: error 130 at offset 3: unknown POSIX class name
8796+
Failed: error 130 at offset 9: unknown POSIX class name
87978797

87988798
/[abc[:x\]pqr:]]/
8799-
Failed: error 130 at offset 6: unknown POSIX class name
8799+
Failed: error 130 at offset 14: unknown POSIX class name
88008800

88018801
/[[:a\dz:]]/
8802-
Failed: error 130 at offset 3: unknown POSIX class name
8802+
Failed: error 130 at offset 9: unknown POSIX class name
88038803

88048804
/(^(a|b\g<-1'c))/
88058805
Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
@@ -11524,7 +11524,7 @@ Failed: error 171 at offset 4: \N is not supported in a class
1152411524
aNc
1152511525

1152611526
/a[B-\Nc]/
11527-
Failed: error 150 at offset 6: invalid range in character class
11527+
Failed: error 171 at offset 6: \N is not supported in a class
1152811528

1152911529
/a[B\Nc]/
1153011530
Failed: error 171 at offset 5: \N is not supported in a class
@@ -13347,16 +13347,16 @@ Failed: error 178 at offset 5: digits missing after \x or in \x{} or \o{} or \N{
1334713347
------------------------------------------------------------------
1334813348

1334913349
/[a-[:digit:]]+/
13350-
Failed: error 150 at offset 4: invalid range in character class
13350+
Failed: error 150 at offset 12: invalid range in character class
1335113351

1335213352
/[A-[:digit:]]+/
13353-
Failed: error 150 at offset 4: invalid range in character class
13353+
Failed: error 150 at offset 12: invalid range in character class
1335413354

1335513355
/[a-[.xxx.]]+/
13356-
Failed: error 150 at offset 4: invalid range in character class
13356+
Failed: error 150 at offset 10: invalid range in character class
1335713357

1335813358
/[a-[=xxx=]]+/
13359-
Failed: error 150 at offset 4: invalid range in character class
13359+
Failed: error 150 at offset 10: invalid range in character class
1336013360

1336113361
/[a-[!xxx!]]+/
1336213362
Failed: error 108 at offset 3: range out of order in character class
@@ -13477,7 +13477,7 @@ No match
1347713477
No match
1347813478

1347913479
/[a[:<:]] should give error/
13480-
Failed: error 130 at offset 4: unknown POSIX class name
13480+
Failed: error 130 at offset 7: unknown POSIX class name
1348113481

1348213482
/(?=ab\K)/aftertext,allow_lookaround_bsk
1348313483
abcd\=startchar
@@ -15879,11 +15879,11 @@ Failed: error 125 at offset 13: length of lookbehind assertion is not limited
1587915879
# Perl accepts these, but gives a warning. We can't warn, so give an error.
1588015880

1588115881
/[a-[:digit:]]+/
15882-
Failed: error 150 at offset 4: invalid range in character class
15882+
Failed: error 150 at offset 12: invalid range in character class
1588315883
a-a9-a
1588415884

1588515885
/[A-[:digit:]]+/
15886-
Failed: error 150 at offset 4: invalid range in character class
15886+
Failed: error 150 at offset 12: invalid range in character class
1588715887
A-A9-A
1588815888

1588915889
/[a-\d]+/
@@ -16020,7 +16020,7 @@ Failed: error 128 at offset 63: atomic assertion expected after (?( or (?(?C)
1602016020
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
1602116021

1602216022
/[:[:alnum:]-[[a:lnum:]+/
16023-
Failed: error 150 at offset 11: invalid range in character class
16023+
Failed: error 150 at offset 12: invalid range in character class
1602416024

1602516025
/((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/
1602616026
Failed: error 128 at offset 11: atomic assertion expected after (?( or (?(?C)
@@ -16654,10 +16654,10 @@ Subject length lower bound = 3
1665416654
------------------------------------------------------------------
1665516655

1665616656
/[Q-\N]/B,bad_escape_is_literal
16657-
Failed: error 150 at offset 5: invalid range in character class
16657+
Failed: error 171 at offset 5: \N is not supported in a class
1665816658

1665916659
/[\s-_]/bad_escape_is_literal
16660-
Failed: error 150 at offset 3: invalid range in character class
16660+
Failed: error 150 at offset 4: invalid range in character class
1666116661

1666216662
/[_-\s]/bad_escape_is_literal
1666316663
Failed: error 150 at offset 5: invalid range in character class
@@ -16857,19 +16857,19 @@ No match
1685716857
No match
1685816858

1685916859
/[[:digit:]-a]/
16860-
Failed: error 150 at offset 10: invalid range in character class
16860+
Failed: error 150 at offset 11: invalid range in character class
1686116861

1686216862
/[[:digit:]-[:print:]]/
16863-
Failed: error 150 at offset 10: invalid range in character class
16863+
Failed: error 150 at offset 11: invalid range in character class
1686416864

1686516865
/[\d-a]/
16866-
Failed: error 150 at offset 3: invalid range in character class
16866+
Failed: error 150 at offset 4: invalid range in character class
1686716867

1686816868
/[\H-z]/
16869-
Failed: error 150 at offset 3: invalid range in character class
16869+
Failed: error 150 at offset 4: invalid range in character class
1687016870

1687116871
/[\d-[:print:]]/
16872-
Failed: error 150 at offset 3: invalid range in character class
16872+
Failed: error 150 at offset 4: invalid range in character class
1687316873

1687416874
# Perl gets the second of these wrong, giving no match.
1687516875

@@ -20619,7 +20619,7 @@ Failed: error 211 at offset 7: brackets needed to clarify operator precedence in
2061920619
No match
2062020620

2062120621
/[\d-z]/B,alt_extended_class
20622-
Failed: error 150 at offset 3: invalid range in character class
20622+
Failed: error 150 at offset 4: invalid range in character class
2062320623

2062420624
/[z-\d]/B,alt_extended_class
2062520625
Failed: error 150 at offset 5: invalid range in character class
@@ -20654,16 +20654,28 @@ Failed: error 207 at offset 118: character classes are too deeply nested
2065420654
# --------------
2065520655

2065620656
/[[:digit:] -Z]/xx
20657-
Failed: error 150 at offset 10: invalid range in character class
20657+
Failed: error 150 at offset 14: invalid range in character class
2065820658

2065920659
/[\d -Z]/xx
20660-
Failed: error 150 at offset 3: invalid range in character class
20660+
Failed: error 150 at offset 7: invalid range in character class
2066120661

2066220662
/[[:digit:]\E-H]/
20663-
Failed: error 150 at offset 10: invalid range in character class
20663+
Failed: error 150 at offset 13: invalid range in character class
2066420664

2066520665
/[[:digit:]\Q\E-H]+/
20666-
Failed: error 150 at offset 10: invalid range in character class
20666+
Failed: error 150 at offset 15: invalid range in character class
20667+
20668+
/[z-[:space:]]/
20669+
Failed: error 150 at offset 12: invalid range in character class
20670+
20671+
/[z-\d]/
20672+
Failed: error 150 at offset 5: invalid range in character class
20673+
20674+
/[[:space:]-z]/
20675+
Failed: error 150 at offset 11: invalid range in character class
20676+
20677+
/[\d-z]/
20678+
Failed: error 150 at offset 4: invalid range in character class
2066720679

2066820680
# End of testinput2
2066920681
Error -70: PCRE2_ERROR_BADDATA (unknown error number)

0 commit comments

Comments
 (0)