diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3 index 5a07b8b03..49644db2b 100644 --- a/doc/pcre2_compile.3 +++ b/doc/pcre2_compile.3 @@ -44,6 +44,7 @@ The option bits are: PCRE2_ALT_BSUX Alternative handling of \eu, \eU, and \ex PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode PCRE2_ALT_VERBNAMES Process backslashes in verb names + PCRE2_ASCII Prefer ASCII in conflicting UTF classes PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching PCRE2_DOLLAR_ENDONLY $ not to match newline at end diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 28c6033a1..78f4638b4 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1446,6 +1446,12 @@ included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped whitespace in verb names is skipped and #-comments are recognized, exactly as in the rest of the pattern. +.sp + PCRE2_ASCII +.sp +When PCRE2_UTF and PCRE2_UCP are both in use, some classes are changed in +ways that conflict between UTF and ASCII characters. This option can be set +to restrict \ed to match only ASCII digits. .sp PCRE2_AUTO_CALLOUT .sp diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 3088ec0fb..9ae782912 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -73,10 +73,11 @@ appearance in a pattern causes an error. .sp Another special sequence that may appear at the start of a pattern is (*UCP). This has the same effect as setting the PCRE2_UCP option: it causes sequences -such as \ed and \ew to use Unicode properties to determine character types, -instead of recognizing only characters with codes less than 256 via a lookup -table. If also causes upper/lower casing operations to use Unicode properties -for characters with code points greater than 127, even when UTF is not set. 
+such as \ed (unless PCRE2_ASCII is set) and \ew to use Unicode properties +to determine character types, instead of recognizing only characters with +codes less than 256 via a lookup table. It also causes upper/lower casing +operations to use Unicode properties for characters with code points greater +than 127, even when UTF is not set. .P Some applications that allow their users to supply patterns may wish to restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to @@ -670,7 +671,8 @@ determine character types, as follows: \ew any character that matches \ep{L} or \ep{N}, plus underscore .sp The upper case escapes match the inverse sets of characters. Note that \ed -matches only decimal digits, whereas \ew matches any Unicode digit, as well as +matches only decimal digits and can be forced to match only the original +set with PCRE2_ASCII, whereas \ew matches any Unicode digit, as well as any Unicode letter, and underscore. Note also that PCRE2_UCP affects \eb, and \eB because they are defined in terms of \ew and \eW. Matching these sequences is noticeably slower when PCRE2_UCP is set. diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index c0a496f4f..dfe272e51 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -101,7 +101,8 @@ or in the 16-bit and 32-bit libraries. However, if locale-specific matching is happening, \es and \ew may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behaviour of these escape sequences is changed to use Unicode properties and they match many more -characters. +characters. However, if the PCRE2_ASCII option is also set, the original +definition of \ed is preserved. 
.P Property descriptions in \ep and \eP are matched caselessly; hyphens, underscores, and white space are ignored, in accordance with Unicode's "loose diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 7b8818de8..5af08952f 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTENDED_MORE 0x01000000u /* C */ #define PCRE2_LITERAL 0x02000000u /* C */ #define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */ +#define PCRE2_ASCII 0x08000000u /* C */ /* An additional compile options word is available in the compile context. */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index edf7e82e6..b8230307d 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -776,7 +776,7 @@ are allowed. */ PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \ PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ - PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY) + PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_ASCII) #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \ (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD) @@ -3124,14 +3124,18 @@ while (ptr < ptrend) } else { - *parsed_pattern++ = META_ESCAPE + - ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? - ESC_p : ESC_P); + if ((options & PCRE2_ASCII) == 0) + *parsed_pattern++ = META_ESCAPE + + ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? + ESC_p : ESC_P); + else + *parsed_pattern++ = META_ESCAPE + escape; switch(escape) { case ESC_d: case ESC_D: - *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; + if ((options & PCRE2_ASCII) == 0) + *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; break; case ESC_s: @@ -3671,14 +3675,18 @@ while (ptr < ptrend) } else { - *parsed_pattern++ = META_ESCAPE + - ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? 
- ESC_p : ESC_P); + if ((options & PCRE2_ASCII) == 0) + *parsed_pattern++ = META_ESCAPE + + ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? + ESC_p : ESC_P); + else + *parsed_pattern++ = META_ESCAPE + escape; switch(escape) { case ESC_d: case ESC_D: - *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; + if ((options & PCRE2_ASCII) == 0) + *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; break; case ESC_s: diff --git a/src/pcre2test.c b/src/pcre2test.c index 4fa588423..60dc449f6 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -640,6 +640,7 @@ static modstruct modlist[] = { { "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) }, { "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) }, { "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) }, + { "ascii", MOD_PATP, MOD_OPT, PCRE2_ASCII, PO(options) }, { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, { "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) }, { "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) }, @@ -762,8 +763,8 @@ static modstruct modlist[] = { /* Controls and options that are supported for use with the POSIX interface. */ #define POSIX_SUPPORTED_COMPILE_OPTIONS ( \ - PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_LITERAL|PCRE2_MULTILINE|PCRE2_UCP| \ - PCRE2_UTF|PCRE2_UNGREEDY) + PCRE2_ASCII|PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_LITERAL|PCRE2_MULTILINE| \ + PCRE2_UCP| PCRE2_UTF|PCRE2_UNGREEDY) #define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0) @@ -4202,12 +4203,13 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? 
" alt_circumflex" : "", ((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "", + ((options & PCRE2_ASCII) != 0)? " ascii" : "", ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", ((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "", ((options & PCRE2_CASELESS) != 0)? " caseless" : "",