Skip to content

add PCRE2_ASCII (RFC) #186

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/pcre2_compile.3
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ The option bits are:
PCRE2_ALT_BSUX Alternative handling of \eu, \eU, and \ex
PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode
PCRE2_ALT_VERBNAMES Process backslashes in verb names
PCRE2_ASCII Prefer ASCII in conflicting UTF classes
PCRE2_AUTO_CALLOUT Compile automatic callouts
PCRE2_CASELESS Do caseless matching
PCRE2_DOLLAR_ENDONLY $ not to match newline at end
Expand Down
6 changes: 6 additions & 0 deletions doc/pcre2api.3
Original file line number Diff line number Diff line change
Expand Up @@ -1446,6 +1446,12 @@ included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped
whitespace in verb names is skipped and #-comments are recognized, exactly as
in the rest of the pattern.
.sp
PCRE2_ASCII
.sp
When PCRE2_UTF and PCRE2_UCP are both set, some character classes are extended
to match Unicode characters beyond their original ASCII sets. This option can
be set to restrict \ed to match only the digits it matches without PCRE2_UCP.
.sp
PCRE2_AUTO_CALLOUT
.sp
Expand Down
12 changes: 7 additions & 5 deletions doc/pcre2pattern.3
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,11 @@ appearance in a pattern causes an error.
.sp
Another special sequence that may appear at the start of a pattern is (*UCP).
This has the same effect as setting the PCRE2_UCP option: it causes sequences
such as \ed and \ew to use Unicode properties to determine character types,
instead of recognizing only characters with codes less than 256 via a lookup
table. If also causes upper/lower casing operations to use Unicode properties
for characters with code points greater than 127, even when UTF is not set.
such as \ed (unless PCRE2_ASCII is set) and \ew to use Unicode properties
to determine character types, instead of recognizing only characters with
codes less than 256 via a lookup table. It also causes upper/lower casing
operations to use Unicode properties for characters with code points greater
than 127, even when UTF is not set.
.P
Some applications that allow their users to supply patterns may wish to
restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to
Expand Down Expand Up @@ -670,7 +671,8 @@ determine character types, as follows:
\ew any character that matches \ep{L} or \ep{N}, plus underscore
.sp
The upper case escapes match the inverse sets of characters. Note that \ed
matches only decimal digits, whereas \ew matches any Unicode digit, as well as
matches only decimal digits (and can be restricted to its original, non-UCP
set by also setting PCRE2_ASCII), whereas \ew matches any Unicode digit, as well as
any Unicode letter, and underscore. Note also that PCRE2_UCP affects \eb, and
\eB because they are defined in terms of \ew and \eW. Matching these sequences
is noticeably slower when PCRE2_UCP is set.
Expand Down
3 changes: 2 additions & 1 deletion doc/pcre2syntax.3
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
happening, \es and \ew may also match characters with code points in the range
128-255. If the PCRE2_UCP option is set, the behaviour of these escape
sequences is changed to use Unicode properties and they match many more
characters.
characters. However, if the PCRE2_ASCII option is also set, the original
definition of \ed is preserved.
.P
Property descriptions in \ep and \eP are matched caselessly; hyphens,
underscores, and white space are ignored, in accordance with Unicode's "loose
Expand Down
1 change: 1 addition & 0 deletions src/pcre2.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
#define PCRE2_LITERAL 0x02000000u /* C */
#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */
#define PCRE2_ASCII 0x08000000u /* C */

/* An additional compile options word is available in the compile context. */

Expand Down
26 changes: 17 additions & 9 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ are allowed. */
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ASCII)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
Expand Down Expand Up @@ -3124,14 +3124,18 @@ while (ptr < ptrend)
}
else
{
*parsed_pattern++ = META_ESCAPE +
((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
ESC_p : ESC_P);
if ((options & PCRE2_ASCII) == 0)
*parsed_pattern++ = META_ESCAPE +
((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
ESC_p : ESC_P);
else
*parsed_pattern++ = META_ESCAPE + escape;
switch(escape)
{
case ESC_d:
case ESC_D:
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
if ((options & PCRE2_ASCII) == 0)
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
break;

case ESC_s:
Expand Down Expand Up @@ -3671,14 +3675,18 @@ while (ptr < ptrend)
}
else
{
*parsed_pattern++ = META_ESCAPE +
((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
ESC_p : ESC_P);
if ((options & PCRE2_ASCII) == 0)
*parsed_pattern++ = META_ESCAPE +
((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
ESC_p : ESC_P);
else
*parsed_pattern++ = META_ESCAPE + escape;
switch(escape)
{
case ESC_d:
case ESC_D:
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
if ((options & PCRE2_ASCII) == 0)
*parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
break;

case ESC_s:
Expand Down
8 changes: 5 additions & 3 deletions src/pcre2test.c
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,7 @@ static modstruct modlist[] = {
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
{ "ascii", MOD_PATP, MOD_OPT, PCRE2_ASCII, PO(options) },
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
{ "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) },
{ "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) },
Expand Down Expand Up @@ -762,8 +763,8 @@ static modstruct modlist[] = {
/* Controls and options that are supported for use with the POSIX interface. */

#define POSIX_SUPPORTED_COMPILE_OPTIONS ( \
PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_LITERAL|PCRE2_MULTILINE|PCRE2_UCP| \
PCRE2_UTF|PCRE2_UNGREEDY)
PCRE2_ASCII|PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_LITERAL|PCRE2_MULTILINE| \
PCRE2_UCP| PCRE2_UTF|PCRE2_UNGREEDY)

#define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0)

Expand Down Expand Up @@ -4202,12 +4203,13 @@ static void
show_compile_options(uint32_t options, const char *before, const char *after)
{
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "",
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
((options & PCRE2_ASCII) != 0)? " ascii" : "",
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
((options & PCRE2_CASELESS) != 0)? " caseless" : "",
Expand Down