diff --git a/CHANGELOG.md b/CHANGELOG.md index 41a6d8d4d4..57b3a31457 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +1.3.0 (2019-09-02) +================== +This release adds a plethora of new crate features that permit users of regex +to shrink its size considerably, in exchange for giving up either functionality +(such as Unicode support) or runtime performance. When all such features are +disabled, the dependency tree for `regex` shrinks to exactly 1 crate +(`regex-syntax`). More information about the new crate features can be +[found in the docs](https://docs.rs/regex/*/#crate-features). + +Note that while this is a new minor version release, the minimum supported +Rust version for this crate remains at `1.28.0`. + +New features: + +* [FEATURE #474](https://github.com/rust-lang/regex/issues/474): + The `use_std` feature has been deprecated in favor of the `std` feature. + The `use_std` feature will be removed in regex 2. Until then, `use_std` will + remain as an alias for the `std` feature. +* [FEATURE #583](https://github.com/rust-lang/regex/issues/583): + Add a substantial number of crate features shrinking `regex`. + + 1.2.1 (2019-08-03) ================== This release does a bit of house cleaning. Namely: diff --git a/Cargo.toml b/Cargo.toml index 7ae61bc5fe..2663e17f42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,15 +24,105 @@ members = [ "bench", "regex-capi", "regex-debug", "regex-syntax", ] -[dependencies] +[lib] +# There are no benchmarks in the library code itself +bench = false +# Doc tests fail when some features aren't present. The easiest way to work +# around this is to disable automatic doc testing, but explicitly test them +# with `cargo test --doc`. 
+doctest = false + +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex/*/#crate-features +[features] +default = ["std", "perf", "unicode"] + +# ECOSYSTEM FEATURES + +# The 'std' feature permits the regex crate to use the standard library. This +# is intended to support future use cases where the regex crate may be able +# to compile without std, and instead just rely on 'core' and 'alloc' (for +# example). Currently, this isn't supported, and removing the 'std' feature +# will prevent regex from compiling. +std = [] +# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until +# then, it is alias for the 'std' feature. +use_std = ["std"] + + +# PERFORMANCE FEATURES + +# Enables all performance features. +perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"] +# Enables fast caching. (If disabled, caching is still used, but is slower.) +perf-cache = ["thread_local"] +# Enables use of a lazy DFA when possible. +perf-dfa = [] +# Enables aggressive use of inlining. +perf-inline = [] +# Enables literal optimizations. +perf-literal = ["aho-corasick", "memchr"] + + +# UNICODE DATA FEATURES + +# Enables all Unicode features. This expands if new Unicode features are added. +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", +] +# Enables use of the `Age` property, e.g., `\p{Age:3.0}`. +unicode-age = ["regex-syntax/unicode-age"] +# Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. +unicode-bool = ["regex-syntax/unicode-bool"] +# Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. +unicode-case = ["regex-syntax/unicode-case"] +# Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. +unicode-gencat = ["regex-syntax/unicode-gencat"] +# Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. 
+unicode-perl = ["regex-syntax/unicode-perl"] +# Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. +unicode-script = ["regex-syntax/unicode-script"] +# Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. +unicode-segment = ["regex-syntax/unicode-segment"] + + +# UNSTABLE FEATURES (requires Rust nightly) + +# A blanket feature that governs whether unstable features are enabled or not. +# Unstable features are disabled by default, and typically rely on unstable +# features in rustc itself. +unstable = ["pattern"] + +# Enable to use the unstable pattern traits defined in std. This is enabled +# by default if the unstable feature is enabled. +pattern = [] + # For very fast prefix literal matching. -aho-corasick = "0.7.6" +[dependencies.aho-corasick] +version = "0.7.6" +optional = true + # For skipping along search text quickly when a leading byte is known. -memchr = "2.2.1" +[dependencies.memchr] +version = "2.2.1" +optional = true + # For managing regex caches quickly across multiple threads. -thread_local = "0.3.6" +[dependencies.thread_local] +version = "0.3.6" +optional = true + # For parsing regular expressions. -regex-syntax = { path = "regex-syntax", version = "0.6.11" } +[dependencies.regex-syntax] +path = "regex-syntax" +version = "0.6.11" +default-features = false [dev-dependencies] # For examples. @@ -44,26 +134,6 @@ rand = "0.6.5" # To check README's example doc-comment = "0.3" -[features] -default = ["use_std"] -# The 'use_std' feature permits the regex crate to use the standard library. -# This is intended to support future use cases where the regex crate may be -# able to compile without std, and instead just rely on 'core' and 'alloc' -# (for example). Currently, this isn't supported, and removing the 'use_std' -# feature will prevent regex from compiling. -use_std = [] -# A blanket feature that governs whether unstable features are enabled or not. 
-# Unstable features are disabled by default, and typically rely on unstable -# features in rustc itself. -unstable = ["pattern"] -# Enable to use the unstable pattern traits defined in std. This is enabled -# by default if the unstable feature is enabled. -pattern = [] - -[lib] -# There are no benchmarks in the library code itself -bench = false - # Run the test suite on the default behavior of Regex::new. # This includes a mish mash of NFAs and DFAs, which are chosen automatically # based on the regex. We test both of the NFA implementations by forcing their diff --git a/README.md b/README.md index cea3da5549..50436498f8 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ linear time with respect to the size of the regular expression and search text. Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). -[![Build Status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) +[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) [![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) [![Coverage Status](https://coveralls.io/repos/github/rust-lang/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang/regex?branch=master) [![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) @@ -201,9 +201,36 @@ recommended for general use. [Documentation `regex-syntax`.](https://docs.rs/regex-syntax) +### Crate features + +This crate comes with several features that permit tweaking the trade off +between binary size, compilation time and runtime performance. Users of this +crate can selectively disable Unicode tables, or choose from a variety of +optimizations performed by this crate to disable. 
+ +When all of these features are disabled, runtime match performance may be much +worse, but if you're matching on short strings, or if high performance isn't +necessary, then such a configuration is perfectly serviceable. To disable +all such features, use the following `Cargo.toml` dependency configuration: + +```toml +[dependencies.regex] +version = "1.3" +default-features = false +# regex currently requires the standard library, you must re-enable it. +features = ["std"] +``` + +This will reduce the dependency tree of `regex` down to a single crate +(`regex-syntax`). + +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex/*/#crate-features). + + ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.28.1`. +This crate's minimum supported `rustc` version is `1.28.0`. The current **tentative** policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if diff --git a/ci/script.sh b/ci/script.sh index 2e2cb77c68..fed3748875 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -1,5 +1,7 @@ #!/bin/sh +# vim: tabstop=2 shiftwidth=2 softtabstop=2 + # This is the main CI script for testing the regex crate and its sub-crates. set -ex @@ -18,26 +20,33 @@ if [ "$TRAVIS_RUST_VERSION" = "$MSRV" ]; then exit fi -# Run tests. If we have nightly, then enable our nightly features. -# Right now there are no nightly features, but that may change in the -# future. -CARGO_TEST_EXTRA_FLAGS="" -if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then - CARGO_TEST_EXTRA_FLAGS="" +# Check formatting, but make sure we use the stable version of rustfmt. +if [ "$TRAVIS_RUST_VERSION" = "stable" ]; then + rustup component add rustfmt + cargo fmt --all -- --check fi -cargo test --verbose ${CARGO_TEST_EXTRA_FLAGS} -# Run the random tests in release mode, as this is faster. 
-RUST_REGEX_RANDOM_TEST=1 \ - cargo test --release --verbose \ - ${CARGO_TEST_EXTRA_FLAGS} --test crates-regex +# Only run the full test suite on one job to keep build times lower. +if [ "$TRAVIS_RUST_VERSION" = "stable" ]; then + ./test + + # Run the random tests in release mode, as this is faster. + RUST_REGEX_RANDOM_TEST=1 cargo test --release --verbose --test crates-regex +else + cargo test --verbose --test default +fi # Run a test that confirms the shootout benchmarks are correct. ci/run-shootout-test # Run tests on regex-syntax crate. -cargo test --verbose --manifest-path regex-syntax/Cargo.toml cargo doc --verbose --manifest-path regex-syntax/Cargo.toml +# Only run the full test suite on one job, to conserve resources. +if [ "$TRAVIS_RUST_VERSION" = "stable" ]; then + (cd regex-syntax && ./test) +else + cargo test --verbose --manifest-path regex-syntax/Cargo.toml +fi # Run tests on regex-capi crate. ci/test-regex-capi @@ -50,17 +59,10 @@ if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then # Test minimal versions. # - # For now, we remove this check, because it doesn't seem possible to convince - # some maintainers of *core* crates that this is a worthwhile test to add. - # In particular, this test uncovers any *incorrect* dependency specification - # in the chain of dependencies. - # - # We might consider figuring out how to migrate off of rand in order to get - # this check working. (This will be hard, since it either requires dropping - # quickcheck or migrating quickcheck off of rand, which is just probably - # not practical.) - # - # So frustrating. + # rand has started putting the minimal version check in their CI, so we + # should be able to re-enable this soon. This will require upgrading to + # rand 0.7, which breaks our MSRV since it relies on Rust 2018 features in + # order to read the Cargo.toml. 
# cargo +nightly generate-lockfile -Z minimal-versions # cargo build --verbose # cargo test --verbose diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index ac8f5d5c81..b77b1d573e 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,3 +8,25 @@ documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." workspace = ".." + +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex-syntax/*/#crate-features +[features] +default = ["unicode"] + +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", +] +unicode-age = [] +unicode-bool = [] +unicode-case = [] +unicode-gencat = [] +unicode-perl = [] +unicode-script = [] +unicode-segment = [] diff --git a/regex-syntax/README.md b/regex-syntax/README.md new file mode 100644 index 0000000000..e904601148 --- /dev/null +++ b/regex-syntax/README.md @@ -0,0 +1,99 @@ +regex-syntax +============ +This crate provides a robust regular expression parser. + +[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) +[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) +[![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax) +[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) + + +### Documentation + +https://docs.rs/regex-syntax + + +### Overview + +There are two primary types exported by this crate: `Ast` and `Hir`. The former +is a faithful abstract syntax of a regular expression, and can convert regular +expressions back to their concrete syntax while mostly preserving its original +form. 
The latter type is a high level intermediate representation of a regular +expression that is amenable to analysis and compilation into byte codes or +automata. An `Hir` achieves this by drastically simplifying the syntactic +structure of the regular expression. While an `Hir` can be converted back to +its equivalent concrete syntax, the result is unlikely to resemble the original +concrete syntax that produced the `Hir`. + + +### Example + +This example shows how to parse a pattern string into its HIR: + +```rust +use regex_syntax::Parser; +use regex_syntax::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +### Safety + +This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's +possible this crate could use `unsafe` code in the future, the standard +for doing so is extremely high. In general, most code in this crate is not +performance critical, since it tends to be dwarfed by the time it takes to +compile a regular expression into an automaton. Therefore, there is little need +for extreme optimization, and therefore, use of `unsafe`. + +The standard for using `unsafe` in this crate is extremely high because this +crate is intended to be reasonably safe to use with user supplied regular +expressions. Therefore, while their may be bugs in the regex parser itself, +they should _never_ result in memory unsafety unless there is either a bug +in the compiler or the standard library. (Since `regex-syntax` has zero +dependencies.) + + +### Crate features + +By default, this crate bundles a fairly large amount of Unicode data tables +(a source size of ~750KB). Because of their large size, one can disable some +or all of these data tables. If a regular expression attempts to use Unicode +data that is not available, then an error will occur when translating the `Ast` +to the `Hir`. 
+ +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). + + +### Testing + +Simply running `cargo test` will give you very good coverage. However, because +of the large number of features exposed by this crate, a `test` script is +included in this directory which will test several feature combinations. This +is the same script that is run in CI. + + +### Motivation + +The primary purpose of this crate is to provide the parser used by `regex`. +Specifically, this crate is treated as an implementation detail of the `regex`, +and is primarily developed for the needs of `regex`. + +Since this crate is an implementation detail of `regex`, it may experience +breaking change releases at a different cadence from `regex`. This is only +possible because this crate is _not_ a public dependency of `regex`. + +Another consequence of this de-coupling is that there is no direct way to +compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must +first convert the `Hir` to a string (via its `std::fmt::Display`) and then +compile that via `Regex::new`. While this does repeat some work, compilation +typically takes much longer than parsing. + +Stated differently, the coupling between `regex` and `regex-syntax` exists only +at the level of the concrete syntax. diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 33115b97bd..c063ea9dc2 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -643,6 +643,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes the parser is currently positioned at `|` and will advance /// the parser to the character following `|`. 
+ #[inline(never)] fn push_alternate(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '|'); concat.span.end = self.pos(); @@ -680,6 +681,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If there was a problem parsing the start of the group, then an error /// is returned. + #[inline(never)] fn push_group(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); match self.parse_group()? { @@ -720,6 +722,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If no such group could be popped, then an unopened group error is /// returned. + #[inline(never)] fn pop_group(&self, mut group_concat: ast::Concat) -> Result { use self::GroupState::*; @@ -771,6 +774,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// error. /// /// This assumes that the parser has advanced to the end. + #[inline(never)] fn pop_group_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); @@ -813,6 +817,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// If there was a problem parsing the opening of the class, then an error /// is returned. Otherwise, a new union of set items for the class is /// returned (which may be populated with either a `]` or a `-`). + #[inline(never)] fn push_class_open( &self, parent_union: ast::ClassSetUnion, @@ -841,6 +846,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If there is no corresponding opening bracket on the parser's stack, /// then an error is returned. + #[inline(never)] fn pop_class( &self, nested_union: ast::ClassSetUnion, @@ -889,6 +895,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// recently opened class. /// /// This should only be called while parsing a character class. 
+ #[inline(never)] fn unclosed_class_error(&self) -> ast::Error { for state in self.parser().stack_class.borrow().iter().rev() { match *state { @@ -909,6 +916,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// A fresh set union is returned, which should be used to build the right /// hand side of this operator. + #[inline(never)] fn push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, @@ -928,6 +936,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// given set unchanged. If the top of the stack is an operation, then the /// given set will be used as the rhs of the operation on the top of the /// stack. In that case, the binary operation is returned as a set. + #[inline(never)] fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { let mut stack = self.parser().stack_class.borrow_mut(); let (kind, lhs) = match stack.pop() { @@ -1021,6 +1030,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// The caller should include the concatenation that is being built. The /// concatenation returned includes the repetition operator applied to the /// last expression in the given concatenation. + #[inline(never)] fn parse_uncounted_repetition( &self, mut concat: ast::Concat, @@ -1075,6 +1085,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// The caller should include the concatenation that is being built. The /// concatenation returned includes the repetition operator applied to the /// last expression in the given concatenation. + #[inline(never)] fn parse_counted_repetition( &self, mut concat: ast::Concat, @@ -1182,6 +1193,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. + #[inline(never)] fn parse_group(&self) -> Result> { assert_eq!(self.char(), '('); let open_span = self.span_char(); @@ -1248,6 +1260,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// following the closing `>`. /// /// The caller must provide the capture index of the group for this name. 
+ #[inline(never)] fn parse_capture_name( &self, capture_index: u32, @@ -1308,6 +1321,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If no flags could be found or if the negation operation is not followed /// by any flags, then an error is returned. + #[inline(never)] fn parse_flags(&self) -> Result { let mut flags = ast::Flags { span: self.span(), items: vec![] }; let mut last_was_negation = None; @@ -1359,6 +1373,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// # Errors /// /// If the flag is not recognized, then an error is returned. + #[inline(never)] fn parse_flag(&self) -> Result { match self.char() { 'i' => Ok(ast::Flag::CaseInsensitive), @@ -1425,6 +1440,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// This assumes the parser is positioned at the start of the escape /// sequence, i.e., `\`. It advances the parser to the first position /// immediately following the escape sequence. + #[inline(never)] fn parse_escape(&self) -> Result { assert_eq!(self.char(), '\\'); let start = self.pos(); @@ -1526,6 +1542,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// escapes is enabled. /// /// Assuming the preconditions are met, this routine can never fail. + #[inline(never)] fn parse_octal(&self) -> ast::Literal { use std::char; use std::u32; @@ -1559,6 +1576,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to /// the first character immediately following the hexadecimal literal. + #[inline(never)] fn parse_hex(&self) -> Result { assert!( self.char() == 'x' || self.char() == 'u' || self.char() == 'U' @@ -1588,6 +1606,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) /// or 8 (for `\UNNNNNNNN`). 
+ #[inline(never)] fn parse_hex_digits( &self, kind: ast::HexLiteralKind, @@ -1633,6 +1652,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// Parse a hex representation of any Unicode scalar value. This expects /// the parser to be positioned at the opening brace `{` and will advance /// the parser to the first character following the closing brace `}`. + #[inline(never)] fn parse_hex_brace( &self, kind: ast::HexLiteralKind, @@ -1726,6 +1746,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// This assumes the parser is positioned at the opening `[`. If parsing /// is successful, then the parser is advanced to the position immediately /// following the closing `]`. + #[inline(never)] fn parse_set_class(&self) -> Result { assert_eq!(self.char(), '['); @@ -1792,6 +1813,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// If an invalid escape is found, or if a character class is found where /// a simple literal is expected (e.g., in a range), then an error is /// returned. + #[inline(never)] fn parse_set_class_range(&self) -> Result { let prim1 = self.parse_set_class_item()?; self.bump_space(); @@ -1838,6 +1860,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// Note that it is the caller's responsibility to report an error if an /// illegal primitive was parsed. + #[inline(never)] fn parse_set_class_item(&self) -> Result { if self.char() == '\\' { self.parse_escape() @@ -1868,6 +1891,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// the parser to the first non-special byte of the character class. /// /// An error is returned if EOF is found. + #[inline(never)] fn parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { @@ -1941,6 +1965,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// advance the parser and `None` is returned. Otherwise, the parser is /// advanced to the first byte following the closing `]` and the /// corresponding ASCII class is returned. 
+ #[inline(never)] fn maybe_parse_ascii_class(&self) -> Option { // ASCII character classes are interesting from a parsing perspective // because parsing cannot fail with any interesting error. For example, @@ -2011,6 +2036,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// advance the parser to the character immediately following the class. /// /// Note that this does not check whether the class name is valid or not. + #[inline(never)] fn parse_unicode_class(&self) -> Result { assert!(self.char() == 'p' || self.char() == 'P'); @@ -2083,6 +2109,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the /// parser is currently at a valid character class name and will be /// advanced to the character immediately following the class. + #[inline(never)] fn parse_perl_class(&self) -> ast::ClassPerl { let c = self.char(); let span = self.span_char(); @@ -2115,6 +2142,7 @@ impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { NestLimiter { p: p, depth: 0 } } + #[inline(never)] fn check(self, ast: &Ast) -> Result<()> { ast::visit(ast, self) } diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 0fc1a8ecdf..51eed52595 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -4,6 +4,8 @@ use std::fmt::Debug; use std::slice; use std::u8; +use unicode; + // This module contains an *internal* implementation of interval sets. // // The primary invariant that interval sets guards is canonical ordering. That @@ -14,7 +16,8 @@ use std::u8; // // Since case folding (as implemented below) breaks that invariant, we roll // that into this API even though it is a little out of place in an otherwise -// generic interval set. +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. 
@@ -72,13 +75,20 @@ impl IntervalSet { /// characters. For example, if this class consists of the range `a-z`, /// then applying case folding will result in the class containing both the /// ranges `a-z` and `A-Z`. - pub fn case_fold_simple(&mut self) { + /// + /// This returns an error if the necessary case mapping data is not + /// available. + pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; - range.case_fold_simple(&mut self.ranges); + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } } self.canonicalize(); + Ok(()) } /// Union this set with the given set, in place. @@ -331,7 +341,10 @@ pub trait Interval: fn upper(&self) -> Self::Bound; fn set_lower(&mut self, bound: Self::Bound); fn set_upper(&mut self, bound: Self::Bound); - fn case_fold_simple(&self, intervals: &mut Vec); + fn case_fold_simple( + &self, + intervals: &mut Vec, + ) -> Result<(), unicode::CaseFoldError>; /// Create a new interval. 
fn create(lower: Self::Bound, upper: Self::Bound) -> Self { diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 0971410874..3ba225c657 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -1105,6 +1105,7 @@ mod tests { test_lit!(pfx_one_lit1, prefixes, "a", M("a")); test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); test_lit!( @@ -1114,6 +1115,7 @@ mod tests { M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83") ); + #[cfg(feature = "unicode-case")] test_lit!( pfx_class3, prefixes, @@ -1122,11 +1124,11 @@ mod tests { M("\\xe2\\x85\\xb0"), M("\\xe2\\x98\\x83") ); - test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", M("A"), M("a")); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); test_lit!( pfx_one_lit_casei2, prefixes, - "(?i)abc", + "(?i-u)abc", M("ABC"), M("aBC"), M("AbC"), @@ -1158,7 +1160,7 @@ mod tests { test_lit!( pfx_cat3, prefixes, - "(?i)[ab]z", + "(?i-u)[ab]z", M("AZ"), M("BZ"), M("aZ"), @@ -1295,7 +1297,7 @@ mod tests { test_exhausted!( pfx_exhausted4, prefixes, - "(?i)foobar", + "(?i-u)foobar", C("FO"), C("fO"), C("Fo"), @@ -1336,6 +1338,7 @@ mod tests { test_lit!(sfx_one_lit1, suffixes, "a", M("a")); test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); test_lit!( @@ -1345,6 +1348,7 @@ mod tests { M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83") ); + #[cfg(feature = "unicode-case")] test_lit!( sfx_class3, suffixes, @@ -1353,11 +1357,11 @@ mod tests { M("\\xe2\\x85\\xb0"), M("\\xe2\\x98\\x83") ); 
- test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", M("A"), M("a")); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); test_lit!( sfx_one_lit_casei2, suffixes, - "(?i)abc", + "(?i-u)abc", M("ABC"), M("ABc"), M("AbC"), @@ -1389,7 +1393,7 @@ mod tests { test_lit!( sfx_cat3, suffixes, - "(?i)[ab]z", + "(?i-u)[ab]z", M("AZ"), M("Az"), M("BZ"), @@ -1480,7 +1484,7 @@ mod tests { test_exhausted!( sfx_exhausted4, suffixes, - "(?i)foobar", + "(?i-u)foobar", C("AR"), C("Ar"), C("aR"), diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index e938de80de..ee08e83dba 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -5,6 +5,7 @@ use std::char; use std::cmp; use std::error; use std::fmt; +use std::result; use std::u8; use ast::Span; @@ -12,6 +13,7 @@ use hir::interval::{Interval, IntervalSet, IntervalSetIter}; use unicode; pub use hir::visitor::{visit, Visitor}; +pub use unicode::CaseFoldError; mod interval; pub mod literal; @@ -65,6 +67,14 @@ pub enum ErrorKind { /// This occurs when an unrecognized Unicode property value could not /// be found. UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, /// This occurs when the translator attempts to construct a character class /// that is empty. 
/// @@ -88,8 +98,16 @@ impl ErrorKind { InvalidUtf8 => "pattern can match invalid UTF-8", UnicodePropertyNotFound => "Unicode property not found", UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } EmptyClassNotAllowed => "empty character classes are not allowed", - _ => unreachable!(), + __Nonexhaustive => unreachable!(), } } } @@ -848,8 +866,38 @@ impl ClassUnicode { /// characters, according to Unicode's "simple" mapping. For example, if /// this class consists of the range `a-z`, then applying case folding will /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. pub fn case_fold_simple(&mut self) { - self.set.case_fold_simple(); + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. 
+ /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn try_case_fold_simple( + &mut self, + ) -> result::Result<(), CaseFoldError> { + self.set.case_fold_simple() } /// Negate this character class. @@ -957,9 +1005,12 @@ impl Interval for ClassUnicodeRange { /// /// Additional ranges are appended to the given vector. Canonical ordering /// is *not* maintained in the given vector. - fn case_fold_simple(&self, ranges: &mut Vec) { - if !unicode::contains_simple_case_mapping(self.start, self.end) { - return; + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !unicode::contains_simple_case_mapping(self.start, self.end)? { + return Ok(()); } let start = self.start as u32; let end = (self.end as u32).saturating_add(1); @@ -968,7 +1019,7 @@ impl Interval for ClassUnicodeRange { if next_simple_cp.map_or(false, |next| cp < next) { continue; } - let it = match unicode::simple_fold(cp) { + let it = match unicode::simple_fold(cp)? { Ok(it) => it, Err(next) => { next_simple_cp = next; @@ -979,6 +1030,7 @@ impl Interval for ClassUnicodeRange { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } + Ok(()) } } @@ -1057,7 +1109,7 @@ impl ClassBytes { /// Note that this only applies ASCII case folding, which is limited to the /// characters `a-z` and `A-Z`. pub fn case_fold_simple(&mut self) { - self.set.case_fold_simple(); + self.set.case_fold_simple().expect("ASCII case folding never fails"); } /// Negate this byte class. @@ -1151,7 +1203,10 @@ impl Interval for ClassBytesRange { /// /// Additional ranges are appended to the given vector. Canonical ordering /// is *not* maintained in the given vector. 
- fn case_fold_simple(&self, ranges: &mut Vec) { + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { let lower = cmp::max(self.start, b'a'); let upper = cmp::min(self.end, b'z'); @@ -1162,6 +1217,7 @@ impl Interval for ClassBytesRange { let upper = cmp::min(self.end, b'Z'); ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); } + Ok(()) } } @@ -1473,6 +1529,7 @@ mod tests { cls.iter().map(|x| (x.start(), x.end())).collect() } + #[cfg(feature = "unicode-case")] fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls.clone(); cls_.case_fold_simple(); @@ -1643,6 +1700,7 @@ mod tests { } #[test] + #[cfg(feature = "unicode-case")] fn class_case_fold_unicode() { let cls = uclass(&[ ('C', 'F'), @@ -1700,6 +1758,37 @@ mod tests { assert_eq!(cls, ucasefold(&cls)); } + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + #[test] fn class_case_fold_bytes() { let cls = bclass(&[ diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 359ec7fae8..3db8796140 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -297,7 +297,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Class(ast::Class::Perl(ref x)) => { if self.flags().unicode() { - let cls = self.hir_perl_unicode_class(x); + let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); 
self.push(HirFrame::Expr(Hir::class(hcls))); } else { @@ -313,7 +313,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); - self.unicode_fold_and_negate(ast.negated, &mut cls); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; if cls.iter().next().is_none() { return Err(self.error( ast.span, @@ -431,7 +435,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { for &(s, e) in ascii_class(&x.kind) { cls.push(hir::ClassUnicodeRange::new(s, e)); } - self.unicode_fold_and_negate(x.negated, &mut cls); + self.unicode_fold_and_negate( + &x.span, x.negated, &mut cls, + )?; self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); @@ -450,7 +456,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } ast::ClassSetItem::Perl(ref x) => { if self.flags().unicode() { - let xcls = self.hir_perl_unicode_class(x); + let xcls = self.hir_perl_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); @@ -464,7 +470,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast::ClassSetItem::Bracketed(ref ast) => { if self.flags().unicode() { let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); - self.unicode_fold_and_negate(ast.negated, &mut cls1); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); cls2.union(&cls1); @@ -527,8 +537,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let mut lhs = self.pop().unwrap().unwrap_class_unicode(); let mut cls = self.pop().unwrap().unwrap_class_unicode(); if self.flags().case_insensitive() { - rhs.case_fold_simple(); - lhs.case_fold_simple(); + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + 
lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; } match op.kind { Intersection => lhs.intersect(&rhs), @@ -659,21 +679,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { span: Span, c: char, ) -> Result { - // If case folding won't do anything, then don't bother trying. - if !unicode::contains_simple_case_mapping(c, c) { - return self.hir_from_char(span, c); - } if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. + let map = + unicode::contains_simple_case_mapping(c, c).map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return self.hir_from_char(span, c); + } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( c, c, )]); - cls.case_fold_simple(); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; Ok(Hir::class(hir::Class::Unicode(cls))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); } + // If case folding won't do anything, then don't bother trying. 
+ match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return self.hir_from_char(span, c), + } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( c as u8, c as u8, @@ -800,47 +831,40 @@ impl<'t, 'p> TranslatorI<'t, 'p> { property_value: value, }, }; - match unicode::class(query) { - Ok(mut class) => { - self.unicode_fold_and_negate(ast_class.negated, &mut class); - Ok(class) - } - Err(unicode::Error::PropertyNotFound) => { - Err(self - .error(ast_class.span, ErrorKind::UnicodePropertyNotFound)) - } - Err(unicode::Error::PropertyValueNotFound) => Err(self.error( - ast_class.span, - ErrorKind::UnicodePropertyValueNotFound, - )), + let mut result = self.convert_unicode_class_error( + &ast_class.span, + unicode::class(query), + ); + if let Ok(ref mut class) = result { + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; } + result } fn hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassUnicode { + ) -> Result { use ast::ClassPerlKind::*; - use unicode_tables::perl_word::PERL_WORD; assert!(self.flags().unicode()); - let mut class = match ast_class.kind { - Digit => { - let query = ClassQuery::Binary("Decimal_Number"); - unicode::class(query).unwrap() - } - Space => { - let query = ClassQuery::Binary("Whitespace"); - unicode::class(query).unwrap() - } - Word => unicode::hir_class(PERL_WORD), + let result = match ast_class.kind { + Digit => unicode::perl_digit(), + Space => unicode::perl_space(), + Word => unicode::perl_word(), }; + let mut class = + self.convert_unicode_class_error(&ast_class.span, result)?; // We needn't apply case folding here because the Perl Unicode classes // are already closed under Unicode simple case folding. if ast_class.negated { class.negate(); } - class + Ok(class) } fn hir_perl_byte_class( @@ -863,21 +887,50 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class } + /// Converts the given Unicode specific error to an HIR translation error. 
+ /// + /// The span given should approximate the position at which an error would + /// occur. + fn convert_unicode_class_error( + &self, + span: &Span, + result: unicode::Result, + ) -> Result { + result.map_err(|err| { + let sp = span.clone(); + match err { + unicode::Error::PropertyNotFound => { + self.error(sp, ErrorKind::UnicodePropertyNotFound) + } + unicode::Error::PropertyValueNotFound => { + self.error(sp, ErrorKind::UnicodePropertyValueNotFound) + } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } + } + }) + } + fn unicode_fold_and_negate( &self, + span: &Span, negated: bool, class: &mut hir::ClassUnicode, - ) { + ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation field, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { - class.case_fold_simple(); + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; } if negated { class.negate(); } + Ok(()) } fn bytes_fold_and_negate( @@ -1017,74 +1070,28 @@ fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { use ast::ClassAsciiKind::*; - - // The contortions below with `const` appear necessary for older versions - // of Rust. 
- type T = &'static [(char, char)]; match *kind { - Alnum => { - const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; - X - } - Alpha => { - const X: T = &[('A', 'Z'), ('a', 'z')]; - X - } - Ascii => { - const X: T = &[('\x00', '\x7F')]; - X - } - Blank => { - const X: T = &[('\t', '\t'), (' ', ' ')]; - X - } - Cntrl => { - const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; - X - } - Digit => { - const X: T = &[('0', '9')]; - X - } - Graph => { - const X: T = &[('!', '~')]; - X - } - Lower => { - const X: T = &[('a', 'z')]; - X - } - Print => { - const X: T = &[(' ', '~')]; - X - } - Punct => { - const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; - X - } - Space => { - const X: T = &[ - ('\t', '\t'), - ('\n', '\n'), - ('\x0B', '\x0B'), - ('\x0C', '\x0C'), - ('\r', '\r'), - (' ', ' '), - ]; - X - } - Upper => { - const X: T = &[('A', 'Z')]; - X - } - Word => { - const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')]; - X - } - Xdigit => { - const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')]; - X - } + Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], + Alpha => &[('A', 'Z'), ('a', 'z')], + Ascii => &[('\x00', '\x7F')], + Blank => &[('\t', '\t'), (' ', ' ')], + Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], + Digit => &[('0', '9')], + Graph => &[('!', '~')], + Lower => &[('a', 'z')], + Print => &[(' ', '~')], + Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + Space => &[ + ('\t', '\t'), + ('\n', '\n'), + ('\x0B', '\x0B'), + ('\x0C', '\x0C'), + ('\r', '\r'), + (' ', ' '), + ], + Upper => &[('A', 'Z')], + Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], + Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], } } @@ -1240,13 +1247,14 @@ mod tests { Hir::concat(exprs) } + #[allow(dead_code)] fn hir_uclass_query(query: ClassQuery) -> Hir { Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) } + #[allow(dead_code)] fn hir_uclass_perl_word() -> Hir { - use unicode_tables::perl_word::PERL_WORD; - 
Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD))) + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } fn hir_uclass(ranges: &[(char, char)]) -> Hir { @@ -1297,6 +1305,7 @@ mod tests { } } + #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> Hir { use hir::Class::{Bytes, Unicode}; @@ -1313,6 +1322,7 @@ mod tests { } } + #[allow(dead_code)] fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { use hir::Class::{Bytes, Unicode}; @@ -1412,11 +1422,14 @@ mod tests { #[test] fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)"), hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), hir_cat(vec![ @@ -1425,6 +1438,7 @@ mod tests { hir_lit("a"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)ab@c"), hir_cat(vec![ @@ -1434,12 +1448,14 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)β"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) ); assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ @@ -1610,6 +1626,7 @@ mod tests { #[test] fn flags() { + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), hir_cat(vec![ @@ -1624,6 +1641,7 @@ mod tests { hir_lit("β"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), hir_cat(vec![ @@ -1631,6 +1649,7 @@ mod tests { hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^"), hir_cat(vec![ @@ -1638,6 +1657,7 @@ mod tests { hir_anchor(hir::Anchor::StartLine), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^(?i-m)a^"), hir_cat(vec![ @@ -1656,6 +1676,7 @@ mod tests { hir_star(false, hir_lit("a")), ]) ); + #[cfg(feature = "unicode-case")] 
assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ @@ -1666,6 +1687,7 @@ mod tests { hir_lit("a"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ @@ -1855,6 +1877,7 @@ mod tests { t("[[:^lower:]]"), hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[[:lower:]]"), hir_uclass(&[ @@ -1899,19 +1922,23 @@ mod tests { } #[test] + #[cfg(feature = "unicode-perl")] fn class_perl() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\d"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\s"), hir_uclass_query(ClassQuery::Binary("space")) ); + #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); // Unicode, negated @@ -1924,14 +1951,17 @@ mod tests { hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\D"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\S"), hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); + #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); // ASCII only @@ -2000,7 +2030,56 @@ mod tests { } #[test] - fn class_unicode() { + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + 
TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!( @@ -2038,21 +2117,6 @@ mod tests { hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) ); - assert_eq!( - t(r"\p{Greek}"), - hir_uclass_query(ClassQuery::Binary("Greek")) - ); - assert_eq!( - t(r"(?i)\p{Greek}"), - hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) - ); - assert_eq!( - t(r"(?i)\P{Greek}"), - hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( - "Greek" - )))) - ); - assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); assert_eq!( t(r"\p{assigned}"), @@ -2125,6 +2189,54 @@ mod tests { ), } ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + 
hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + assert_eq!( t_err(r"\p{sc:Foo}"), TestError { @@ -2145,6 +2257,37 @@ mod tests { ), } ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { assert_eq!( t_err(r"\p{age:Foo}"), TestError { @@ -2157,6 +2300,21 @@ mod tests { ); } + #[test] + #[cfg(not(feature = "unicode-age"))] + fn class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + #[test] fn class_bracketed() { assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); @@ -2167,28 +2325,39 @@ mod tests { assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\pZ]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\p{separator}]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!(t(r"[^\D]"), 
hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\PZ]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\P{separator}]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] assert_eq!( t(r"(?i)[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[^\P{greek}]"), hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) @@ -2198,11 +2367,14 @@ mod tests { assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[k]"), hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[β]"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) @@ -2215,24 +2387,29 @@ mod tests { t_bytes("(?-u)[^a]"), hir_negate(hir_bclass(&[(b'a', b'a')])) ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!( t(r"[^\d]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\pZ]"), hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\p{separator}]"), hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[^\p{greek}]"), hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( "greek" )))) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[\P{greek}]"), 
hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( @@ -2271,6 +2448,7 @@ mod tests { ), } ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] assert_eq!( t_err(r"[^\s\S]"), TestError { @@ -2281,6 +2459,7 @@ mod tests { ), } ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] assert_eq!( t_err(r"(?-u)[^\s\S]"), TestError { @@ -2296,6 +2475,7 @@ mod tests { #[test] fn class_bracketed_union() { assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[a\pZb]"), hir_union( @@ -2303,6 +2483,7 @@ mod tests { hir_uclass_query(ClassQuery::Binary("separator")) ) ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] assert_eq!( t(r"[\pZ\p{Greek}]"), hir_union( @@ -2310,6 +2491,11 @@ mod tests { hir_uclass_query(ClassQuery::Binary("separator")) ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[\p{age:3.0}\pZ\p{Greek}]"), hir_union( @@ -2323,6 +2509,11 @@ mod tests { ) ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), hir_union( @@ -2340,6 +2531,12 @@ mod tests { ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), hir_case_fold(hir_union( @@ -2353,6 +2550,11 @@ mod tests { ) )) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[^\p{age:3.0}\pZ\p{Greek}]"), hir_negate(hir_union( @@ -2366,6 +2568,12 @@ mod tests { ) )) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), hir_negate(hir_case_fold(hir_union( @@ -2390,16 
+2598,20 @@ mod tests { assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a[^c]]"), hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a-b[^c]]"), hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) ); + #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[^a-b[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]) @@ -2415,6 +2627,7 @@ mod tests { ), } ); + #[cfg(feature = "unicode-case")] assert_eq!( t_err(r"(?i)[^a-c[^c]]"), TestError { @@ -2446,26 +2659,32 @@ mod tests { assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[abc&&b-c]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[abc&&[b-c]]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[[abc]&&[b-c]]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[a-z&&b-y&&c-x]"), hir_case_fold(hir_uclass(&[('c', 'x')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[c-da-b&&a-d]"), hir_case_fold(hir_uclass(&[('a', 'd')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[a-d&&c-da-b]"), hir_case_fold(hir_uclass(&[('a', 'd')])) @@ -2513,21 +2732,26 @@ mod tests { #[test] fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] assert_eq!( t(r"[^\w&&\d]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] assert_eq!( t(r"[^[\w&&\d]]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature 
= "unicode-perl")] assert_eq!( t(r"[^[^\w&&\d]]"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(feature = "unicode-perl")] assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), hir_negate(hir_bclass_from_char(ascii_class( @@ -2558,6 +2782,7 @@ mod tests { #[test] fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\pL--[:ascii:]]"), hir_difference( @@ -2574,6 +2799,7 @@ mod tests { #[test] fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] assert_eq!( t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), hir_uclass(&[ @@ -2610,6 +2836,7 @@ mod tests { ); assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"(?x)\p # comment { # comment @@ -2832,6 +3059,7 @@ mod tests { assert!(t(r"a{0,}").is_match_empty()); assert!(t(r"a{0,1}").is_match_empty()); assert!(t(r"a{0,10}").is_match_empty()); + #[cfg(feature = "unicode-gencat")] assert!(t(r"\pL*").is_match_empty()); assert!(t(r"a*|b").is_match_empty()); assert!(t(r"b|a*").is_match_empty()); diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 238c5dd3b8..e9f59d8e7e 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -91,12 +91,75 @@ know a regular expression must match a prefix or suffix literal, then it is often quicker to search for instances of that literal, and then confirm or deny the match using the full regular expression engine. These optimizations are done automatically in the `regex` crate. + + +# Crate features + +An important feature provided by this crate is its Unicode support. This +includes things like case folding, boolean properties, general categories, +scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. +However, a downside of this support is that it requires bundling several +Unicode data tables that are substantial in size. 
+ +A fair number of use cases do not require full Unicode support. For this +reason, this crate exposes a number of features to control which Unicode +data is available. + +If a regular expression attempts to use a Unicode feature that is not available +because the corresponding crate feature was disabled, then translating that +regular expression to an `Hir` will return an error. (It is still possible +to construct an `Ast` for such a regular expression, since Unicode data is not +used until translation to an `Hir`.) Stated differently, enabling or disabling +any of the features below can only add or subtract from the total set of valid +regular expressions. Enabling or disabling a feature will never modify the +match semantics of a regular expression. + +The following features are available: + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0. +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. 
+* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. */ #![deny(missing_docs)] +#![forbid(unsafe_code)] pub use error::{Error, Result}; pub use parser::{Parser, ParserBuilder}; +pub use unicode::UnicodeWordError; pub mod ast; mod either; @@ -156,24 +219,35 @@ pub fn is_meta_character(c: char) -> bool { /// is considered a word character if it is in either of the `Alphabetic` or /// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` /// or `Connector_Punctuation` general categories. +/// +/// # Panics +/// +/// If the `unicode-perl` feature is not enabled, then this function panics. +/// For this reason, it is recommended that callers use +/// [`try_is_word_character`](fn.try_is_word_character.html) +/// instead. 
pub fn is_word_character(c: char) -> bool { - use std::cmp::Ordering; - use unicode_tables::perl_word::PERL_WORD; + try_is_word_character(c).expect("unicode-perl feature must be enabled") +} - if c <= 0x7F as char && is_word_byte(c as u8) { - return true; - } - PERL_WORD - .binary_search_by(|&(start, end)| { - if start <= c && c <= end { - Ordering::Equal - } else if start > c { - Ordering::Greater - } else { - Ordering::Less - } - }) - .is_ok() +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Errors +/// +/// If the `unicode-perl` feature is not enabled, then this function always +/// returns an error. +pub fn try_is_word_character( + c: char, +) -> std::result::Result { + unicode::is_word_character(c) } /// Returns true if and only if the given character is an ASCII word character. 
@@ -200,10 +274,14 @@ mod tests { } #[test] - fn word() { + fn word_byte() { assert!(is_word_byte(b'a')); assert!(!is_word_byte(b'-')); + } + #[test] + #[cfg(feature = "unicode-perl")] + fn word_char() { assert!(is_word_character('a'), "ASCII"); assert!(is_word_character('à'), "Latin-1"); assert!(is_word_character('β'), "Greek"); @@ -216,4 +294,17 @@ mod tests { assert!(!is_word_character('-')); assert!(!is_word_character('☃')); } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_panic() { + assert!(is_word_character('a')); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_error() { + assert!(try_is_word_character('a').is_err()); + } } diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index f95321e426..38a996aa05 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -1,20 +1,15 @@ -use std::cmp::Ordering; +use std::error; +use std::fmt; use std::result; use hir; -use unicode_tables::age; -use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; -use unicode_tables::general_category; -use unicode_tables::grapheme_cluster_break; -use unicode_tables::property_bool; -use unicode_tables::property_names::PROPERTY_NAMES; -use unicode_tables::property_values::PROPERTY_VALUES; -use unicode_tables::script; -use unicode_tables::script_extension; -use unicode_tables::sentence_break; -use unicode_tables::word_break; - -type Result = result::Result; + +/// A type alias for errors specific to Unicode handling of classes. +pub type Result = result::Result; + +/// An inclusive range of codepoints from a generated file (hence the static +/// lifetime). +type Range = &'static [(char, char)]; /// An error that occurs when dealing with Unicode. /// @@ -24,17 +19,51 @@ type Result = result::Result; pub enum Error { PropertyNotFound, PropertyValueNotFound, + // Not used when unicode-perl is enabled. 
+ #[allow(dead_code)] + PerlClassNotFound, } -/// An iterator over a codepoint's simple case equivalence class. +/// A type alias for errors specific to Unicode case folding. +pub type FoldResult = result::Result; + +/// An error that occurs when Unicode-aware simple case folding fails. +/// +/// This error can occur when the case mapping tables necessary for Unicode +/// aware case folding are unavailable. This only occurs when the +/// `unicode-case` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct CaseFoldError(()); + +impl error::Error for CaseFoldError {} + +impl fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Unicode-aware case folding is not available \ + (probably because the unicode-case feature is not enabled)" + ) + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. This only occurs when the +/// `unicode-perl` feature is disabled. (The feature is enabled by default.) #[derive(Debug)] -pub struct SimpleFoldIter(::std::slice::Iter<'static, char>); +pub struct UnicodeWordError(()); -impl Iterator for SimpleFoldIter { - type Item = char; +impl error::Error for UnicodeWordError {} - fn next(&mut self) -> Option { - self.0.next().map(|c| *c) +impl fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Unicode-aware \\w class is not available \ + (probably because the unicode-perl feature is not enabled)" + ) } } @@ -47,17 +76,40 @@ impl Iterator for SimpleFoldIter { /// scalar value exists, then `None` is returned. The point of this behavior /// is to permit callers to avoid calling `simple_fold` more than they need /// to, since there is some cost to fetching the equivalence class. 
-pub fn simple_fold(c: char) -> result::Result> { - CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter())) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - }) +/// +/// This returns an error if the Unicode case folding tables are not available. +pub fn simple_fold( + c: char, +) -> FoldResult, Option>> { + #[cfg(not(feature = "unicode-case"))] + fn imp( + _: char, + ) -> FoldResult, Option>> + { + use std::option::IntoIter; + Err::, _>, _>(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp( + c: char, + ) -> FoldResult, Option>> + { + use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + Ok(CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + })) + } + + imp(c) } /// Returns true if and only if the given (inclusive) range contains at least @@ -65,19 +117,37 @@ pub fn simple_fold(c: char) -> result::Result> { /// mapping. /// /// This function panics if `end < start`. -pub fn contains_simple_case_mapping(start: char, end: char) -> bool { - assert!(start <= end); - CASE_FOLDING_SIMPLE - .binary_search_by(|&(c, _)| { - if start <= c && c <= end { - Ordering::Equal - } else if c > end { - Ordering::Greater - } else { - Ordering::Less - } - }) - .is_ok() +/// +/// This returns an error if the Unicode case folding tables are not available. 
+pub fn contains_simple_case_mapping( + start: char, + end: char, +) -> FoldResult { + #[cfg(not(feature = "unicode-case"))] + fn imp(_: char, _: char) -> FoldResult { + Err(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp(start: char, end: char) -> FoldResult { + use std::cmp::Ordering; + use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + assert!(start <= end); + Ok(CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(start, end) } /// A query for finding a character class defined by Unicode. This supports @@ -123,27 +193,27 @@ impl<'a> ClassQuery<'a> { let property_name = symbolic_name_normalize(property_name); let property_value = symbolic_name_normalize(property_value); - let canon_name = match canonical_prop(&property_name) { + let canon_name = match canonical_prop(&property_name)? { None => return Err(Error::PropertyNotFound), Some(canon_name) => canon_name, }; Ok(match canon_name { "General_Category" => { - let canon = match canonical_gencat(&property_value) { + let canon = match canonical_gencat(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::GeneralCategory(canon) } "Script" => { - let canon = match canonical_script(&property_value) { + let canon = match canonical_script(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::Script(canon) } _ => { - let vals = match property_values(canon_name) { + let vals = match property_values(canon_name)? { None => return Err(Error::PropertyValueNotFound), Some(vals) => vals, }; @@ -167,13 +237,13 @@ impl<'a> ClassQuery<'a> { fn canonical_binary(&self, name: &str) -> Result { let norm = symbolic_name_normalize(name); - if let Some(canon) = canonical_prop(&norm) { + if let Some(canon) = canonical_prop(&norm)? 
{ return Ok(CanonicalClassQuery::Binary(canon)); } - if let Some(canon) = canonical_gencat(&norm) { + if let Some(canon) = canonical_gencat(&norm)? { return Ok(CanonicalClassQuery::GeneralCategory(canon)); } - if let Some(canon) = canonical_script(&norm) { + if let Some(canon) = canonical_script(&norm)? { return Ok(CanonicalClassQuery::Script(canon)); } Err(Error::PropertyNotFound) @@ -211,25 +281,9 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? { - Binary(name) => property_set(property_bool::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyNotFound), - GeneralCategory("Any") => Ok(hir_class(&[('\0', '\u{10FFFF}')])), - GeneralCategory("Assigned") => { - let mut cls = - property_set(general_category::BY_NAME, "Unassigned") - .map(hir_class) - .ok_or(Error::PropertyNotFound)?; - cls.negate(); - Ok(cls) - } - GeneralCategory("ASCII") => Ok(hir_class(&[('\0', '\x7F')])), - GeneralCategory(name) => property_set(general_category::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), - Script(name) => property_set(script::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), + Binary(name) => bool_property(name), + GeneralCategory(name) => gencat(name), + Script(name) => script(name), ByValue { property_name: "Age", property_value } => { let mut class = hir::ClassUnicode::empty(); for set in ages(property_value)? 
{ @@ -238,25 +292,17 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { Ok(class) } ByValue { property_name: "Script_Extensions", property_value } => { - property_set(script_extension::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + script_extension(property_value) } ByValue { property_name: "Grapheme_Cluster_Break", property_value, - } => property_set(grapheme_cluster_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), + } => gcb(property_value), ByValue { property_name: "Sentence_Break", property_value } => { - property_set(sentence_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + sb(property_value) } ByValue { property_name: "Word_Break", property_value } => { - property_set(word_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + wb(property_value) } _ => { // What else should we support? @@ -265,6 +311,72 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { } } +/// Returns a Unicode aware class for \w. +/// +/// This returns an error if the data is not available for \w. +pub fn perl_word() -> Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(feature = "unicode-perl")] + fn imp() -> Result { + use unicode_tables::perl_word::PERL_WORD; + Ok(hir_class(PERL_WORD)) + } + + imp() +} + +/// Returns a Unicode aware class for \s. +/// +/// This returns an error if the data is not available for \s. 
+pub fn perl_space() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] + fn imp() -> Result { + use unicode_tables::perl_space::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + #[cfg(feature = "unicode-bool")] + fn imp() -> Result { + use unicode_tables::property_bool::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + imp() +} + +/// Returns a Unicode aware class for \d. +/// +/// This returns an error if the data is not available for \d. +pub fn perl_digit() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] + fn imp() -> Result { + use unicode_tables::perl_decimal::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + #[cfg(feature = "unicode-gencat")] + fn imp() -> Result { + use unicode_tables::general_category::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + imp() +} + /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { let hir_ranges: Vec = ranges @@ -274,6 +386,40 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { hir::ClassUnicode::new(hir_ranges) } +/// Returns true only if the given codepoint is in the `\w` character class. +/// +/// If the `unicode-perl` feature is not enabled, then this returns an error. 
+pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
+    #[cfg(not(feature = "unicode-perl"))]
+    fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
+        Err(UnicodeWordError(()))
+    }
+
+    #[cfg(feature = "unicode-perl")]
+    fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
+        use is_word_byte;
+        use std::cmp::Ordering;
+        use unicode_tables::perl_word::PERL_WORD;
+
+        if c <= 0x7F as char && is_word_byte(c as u8) {
+            return Ok(true);
+        }
+        Ok(PERL_WORD
+            .binary_search_by(|&(start, end)| {
+                if start <= c && c <= end {
+                    Ordering::Equal
+                } else if start > c {
+                    Ordering::Greater
+                } else {
+                    Ordering::Less
+                }
+            })
+            .is_ok())
+    }
+
+    imp(c)
+}
+
 /// A mapping of property values for a specific property.
 ///
 /// The first element of each tuple is a normalized property value while the
@@ -281,21 +427,21 @@
 /// value.
 type PropertyValues = &'static [(&'static str, &'static str)];
 
-fn canonical_gencat(normalized_value: &str) -> Option<&'static str> {
-    match normalized_value {
+fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
+    Ok(match normalized_value {
         "any" => Some("Any"),
         "assigned" => Some("Assigned"),
         "ascii" => Some("ASCII"),
         _ => {
-            let gencats = property_values("General_Category").unwrap();
+            let gencats = property_values("General_Category")?.unwrap();
            canonical_value(gencats, normalized_value)
         }
-    }
+    })
 }
 
-fn canonical_script(normalized_value: &str) -> Option<&'static str> {
-    let scripts = property_values("Script").unwrap();
-    canonical_value(scripts, normalized_value)
+fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
+    let scripts = property_values("Script")?.unwrap();
+    Ok(canonical_value(scripts, normalized_value))
 }
 
 /// Find the canonical property name for the given normalized property name.
@@ -304,11 +450,39 @@ fn canonical_script(normalized_value: &str) -> Option<&'static str> {
 ///
 /// The normalized property name must have been normalized according to
 /// UAX44 LM3, which can be done using `symbolic_name_normalize`.
-fn canonical_prop(normalized_name: &str) -> Option<&'static str> {
-    PROPERTY_NAMES
-        .binary_search_by_key(&normalized_name, |&(n, _)| n)
-        .ok()
-        .map(|i| PROPERTY_NAMES[i].1)
+///
+/// If the property names data is not available, then an error is returned.
+fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
+    #[cfg(not(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    )))]
+    fn imp(_: &str) -> Result<Option<&'static str>> {
+        Err(Error::PropertyNotFound)
+    }
+
+    #[cfg(any(
+        feature = "unicode-age",
+        feature = "unicode-bool",
+        feature = "unicode-gencat",
+        feature = "unicode-perl",
+        feature = "unicode-script",
+        feature = "unicode-segment",
+    ))]
+    fn imp(name: &str) -> Result<Option<&'static str>> {
+        use unicode_tables::property_names::PROPERTY_NAMES;
+
+        Ok(PROPERTY_NAMES
+            .binary_search_by_key(&name, |&(n, _)| n)
+            .ok()
+            .map(|i| PROPERTY_NAMES[i].1))
+    }
+
+    imp(normalized_name)
 }
 
 /// Find the canonical property value for the given normalized property
@@ -330,79 +504,291 @@ fn canonical_value(
         .map(|i| vals[i].1)
 }
 
+/// Return the table of property values for the given property name.
+///
+/// If the property values data is not available, then an error is returned.
fn property_values( canonical_property_name: &'static str, -) -> Option { - PROPERTY_VALUES - .binary_search_by_key(&canonical_property_name, |&(n, _)| n) - .ok() - .map(|i| PROPERTY_VALUES[i].1) +) -> Result> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &'static str) -> Result> { + Err(Error::PropertyValueNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &'static str) -> Result> { + use unicode_tables::property_values::PROPERTY_VALUES; + + Ok(PROPERTY_VALUES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_VALUES[i].1)) + } + + imp(canonical_property_name) } +// This is only used in some cases, but small enough to just let it be dead +// instead of figuring out (and maintaining) the right set of features. +#[allow(dead_code)] fn property_set( - name_map: &'static [(&'static str, &'static [(char, char)])], + name_map: &'static [(&'static str, Range)], canonical: &'static str, -) -> Option<&'static [(char, char)]> { +) -> Option { name_map .binary_search_by_key(&canonical, |x| x.0) .ok() .map(|i| name_map[i].1) } -/// An iterator over Unicode Age sets. Each item corresponds to a set of -/// codepoints that were added in a particular revision of Unicode. The +/// Returns an iterator over Unicode Age sets. Each item corresponds to a set +/// of codepoints that were added in a particular revision of Unicode. The /// iterator yields items in chronological order. -#[derive(Debug)] -struct AgeIter { - ages: &'static [(&'static str, &'static [(char, char)])], -} +/// +/// If the given age value isn't valid or if the data isn't available, then an +/// error is returned instead. 
+fn ages(canonical_age: &str) -> Result> { + #[cfg(not(feature = "unicode-age"))] + fn imp(_: &str) -> Result> { + use std::option::IntoIter; + Err::, _>(Error::PropertyNotFound) + } -fn ages(canonical_age: &str) -> Result { - const AGES: &'static [(&'static str, &'static [(char, char)])] = &[ - ("V1_1", age::V1_1), - ("V2_0", age::V2_0), - ("V2_1", age::V2_1), - ("V3_0", age::V3_0), - ("V3_1", age::V3_1), - ("V3_2", age::V3_2), - ("V4_0", age::V4_0), - ("V4_1", age::V4_1), - ("V5_0", age::V5_0), - ("V5_1", age::V5_1), - ("V5_2", age::V5_2), - ("V6_0", age::V6_0), - ("V6_1", age::V6_1), - ("V6_2", age::V6_2), - ("V6_3", age::V6_3), - ("V7_0", age::V7_0), - ("V8_0", age::V8_0), - ("V9_0", age::V9_0), - ("V10_0", age::V10_0), - ("V11_0", age::V11_0), - ("V12_0", age::V12_0), - ("V12_1", age::V12_1), - ]; - assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); - - let pos = AGES.iter().position(|&(age, _)| canonical_age == age); - match pos { - None => Err(Error::PropertyValueNotFound), - Some(i) => Ok(AgeIter { ages: &AGES[..i + 1] }), + #[cfg(feature = "unicode-age")] + fn imp(canonical_age: &str) -> Result> { + use unicode_tables::age; + + const AGES: &'static [(&'static str, Range)] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ("V11_0", age::V11_0), + ("V12_0", age::V12_0), + ("V12_1", age::V12_1), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| 
classes)), + } } + + imp(canonical_age) } -impl Iterator for AgeIter { - type Item = &'static [(char, char)]; +/// Returns the Unicode HIR class corresponding to the given general category. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given general category could not be found, or if the general +/// category data is not available, then an error is returned. +fn gencat(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-gencat"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } - fn next(&mut self) -> Option<&'static [(char, char)]> { - if self.ages.is_empty() { - None - } else { - let set = self.ages[0]; - self.ages = &self.ages[1..]; - Some(set.1) + #[cfg(feature = "unicode-gencat")] + fn imp(name: &'static str) -> Result { + use unicode_tables::general_category::BY_NAME; + match name { + "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), + "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), + "Assigned" => { + let mut cls = gencat("Unassigned")?; + cls.negate(); + Ok(cls) + } + name => property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound), } } + + match canonical_name { + "Decimal_Number" => perl_digit(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given script. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script could not be found, or if the script data is not +/// available, then an error is returned. 
+fn script(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use unicode_tables::script::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given script extension. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script extension could not be found, or if the script data is +/// not available, then an error is returned. +fn script_extension( + canonical_name: &'static str, +) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use unicode_tables::script_extension::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given Unicode boolean +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given boolean property could not be found, or if the boolean +/// property data is not available, then an error is returned. 
+fn bool_property(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-bool"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-bool")] + fn imp(name: &'static str) -> Result { + use unicode_tables::property_bool::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + "White_Space" => perl_space(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given grapheme cluster +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn gcb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::grapheme_cluster_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given word break +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. 
+fn wb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::word_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given sentence +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn sb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::sentence_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) } /// Like symbolic_name_normalize_bytes, but operates on a string. @@ -413,8 +799,9 @@ fn symbolic_name_normalize(x: &str) -> String { // This should always succeed because `symbolic_name_normalize_bytes` // guarantees that `&tmp[..len]` is always valid UTF-8. // - // N.B. We could use unsafe here to avoid the additional UTF-8 check here, - // but it's unlikely to be worth it. A benchmark must justify it first. + // N.B. We could avoid the additional UTF-8 check here, but it's unlikely + // to be worth skipping the additional safety check. A benchmark must + // justify it first. 
String::from_utf8(tmp).unwrap() } @@ -446,7 +833,7 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { } let mut next_write = 0; for i in start..slice.len() { - // SAFETY ARGUMENT: To guarantee that the resulting slice is valid + // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid // UTF-8, we ensure that the slice contains only ASCII bytes. In // particular, we drop every non-ASCII byte from the normalized string. let b = slice[i]; @@ -480,57 +867,93 @@ mod tests { symbolic_name_normalize_bytes, }; + #[cfg(feature = "unicode-case")] + fn simple_fold_ok(c: char) -> impl Iterator { + simple_fold(c).unwrap().unwrap() + } + + #[cfg(feature = "unicode-case")] + fn simple_fold_err(c: char) -> Option { + match simple_fold(c).unwrap() { + Ok(_) => unreachable!("simple_fold returned Ok iterator"), + Err(next) => next, + } + } + + #[cfg(feature = "unicode-case")] + fn contains_case_map(start: char, end: char) -> bool { + contains_simple_case_mapping(start, end).unwrap() + } + #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_k() { - let xs: Vec = simple_fold('k').unwrap().collect(); + let xs: Vec = simple_fold_ok('k').collect(); assert_eq!(xs, vec!['K', 'K']); - let xs: Vec = simple_fold('K').unwrap().collect(); + let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, vec!['k', 'K']); - let xs: Vec = simple_fold('K').unwrap().collect(); + let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, vec!['K', 'k']); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_a() { - let xs: Vec = simple_fold('a').unwrap().collect(); + let xs: Vec = simple_fold_ok('a').collect(); assert_eq!(xs, vec!['A']); - let xs: Vec = simple_fold('A').unwrap().collect(); + let xs: Vec = simple_fold_ok('A').collect(); assert_eq!(xs, vec!['a']); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold('?').unwrap_err()); - assert_eq!(Some('A'), simple_fold('@').unwrap_err()); - 
assert_eq!(Some('a'), simple_fold('[').unwrap_err()); - assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err()); + assert_eq!(Some('A'), simple_fold_err('?')); + assert_eq!(Some('A'), simple_fold_err('@')); + assert_eq!(Some('a'), simple_fold_err('[')); + assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_max() { - assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err()); - assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err()); + assert_eq!(None, simple_fold_err('\u{10FFFE}')); + assert_eq!(None, simple_fold_err('\u{10FFFF}')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn simple_fold_disabled() { + assert!(simple_fold('a').is_err()); } #[test] + #[cfg(feature = "unicode-case")] fn range_contains() { - assert!(contains_simple_case_mapping('A', 'A')); - assert!(contains_simple_case_mapping('Z', 'Z')); - assert!(contains_simple_case_mapping('A', 'Z')); - assert!(contains_simple_case_mapping('@', 'A')); - assert!(contains_simple_case_mapping('Z', '[')); - assert!(contains_simple_case_mapping('☃', 'Ⰰ')); + assert!(contains_case_map('A', 'A')); + assert!(contains_case_map('Z', 'Z')); + assert!(contains_case_map('A', 'Z')); + assert!(contains_case_map('@', 'A')); + assert!(contains_case_map('Z', '[')); + assert!(contains_case_map('☃', 'Ⰰ')); - assert!(!contains_simple_case_mapping('[', '[')); - assert!(!contains_simple_case_mapping('[', '`')); + assert!(!contains_case_map('[', '[')); + assert!(!contains_case_map('[', '`')); - assert!(!contains_simple_case_mapping('☃', '☃')); + assert!(!contains_case_map('☃', '☃')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn range_contains_disabled() { + assert!(contains_simple_case_mapping('a', 'a').is_err()); } #[test] + #[cfg(feature = "unicode-gencat")] fn regression_466() { use super::{CanonicalClassQuery, ClassQuery}; diff --git a/regex-syntax/src/unicode_tables/age.rs b/regex-syntax/src/unicode_tables/age.rs index f4d8c182f0..30e6ee174a 100644 
--- a/regex-syntax/src/unicode_tables/age.rs +++ b/regex-syntax/src/unicode_tables/age.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate age /tmp/ucd-12.1.0/ --chars +// ucd-generate age /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/case_folding_simple.rs b/regex-syntax/src/unicode_tables/case_folding_simple.rs index ee3b7d3643..39f4176585 100644 --- a/regex-syntax/src/unicode_tables/case_folding_simple.rs +++ b/regex-syntax/src/unicode_tables/case_folding_simple.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate case-folding-simple /tmp/ucd-12.1.0/ --chars --all-pairs +// ucd-generate case-folding-simple /tmp/ucd/12.1.0/ --chars --all-pairs // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/general_category.rs b/regex-syntax/src/unicode_tables/general_category.rs index 09399a0bee..7662c46aff 100644 --- a/regex-syntax/src/unicode_tables/general_category.rs +++ b/regex-syntax/src/unicode_tables/general_category.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate general-category /tmp/ucd-12.1.0/ --chars --exclude surrogate +// ucd-generate general-category /tmp/ucd/12.1.0/ --chars --exclude surrogate // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs b/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs index ea7eeb8aba..2855eadc58 100644 --- a/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs +++ b/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate grapheme-cluster-break /tmp/ucd-12.1.0/ --chars +// ucd-generate grapheme-cluster-break /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. 
diff --git a/regex-syntax/src/unicode_tables/mod.rs b/regex-syntax/src/unicode_tables/mod.rs index b9adf7ec5c..20736c7ac8 100644 --- a/regex-syntax/src/unicode_tables/mod.rs +++ b/regex-syntax/src/unicode_tables/mod.rs @@ -1,12 +1,57 @@ +#[cfg(feature = "unicode-age")] pub mod age; + +#[cfg(feature = "unicode-case")] pub mod case_folding_simple; + +#[cfg(feature = "unicode-gencat")] pub mod general_category; + +#[cfg(feature = "unicode-segment")] pub mod grapheme_cluster_break; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] +#[allow(dead_code)] +pub mod perl_decimal; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] +#[allow(dead_code)] +pub mod perl_space; + +#[cfg(feature = "unicode-perl")] pub mod perl_word; + +#[cfg(feature = "unicode-bool")] pub mod property_bool; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] pub mod property_names; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] pub mod property_values; + +#[cfg(feature = "unicode-script")] pub mod script; + +#[cfg(feature = "unicode-script")] pub mod script_extension; + +#[cfg(feature = "unicode-segment")] pub mod sentence_break; + +#[cfg(feature = "unicode-segment")] pub mod word_break; diff --git a/regex-syntax/src/unicode_tables/perl_decimal.rs b/regex-syntax/src/unicode_tables/perl_decimal.rs new file mode 100644 index 0000000000..8f6a046f65 --- /dev/null +++ b/regex-syntax/src/unicode_tables/perl_decimal.rs @@ -0,0 +1,70 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category /tmp/ucd/12.1.0/ --chars --include decimalnumber +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("Decimal_Number", DECIMAL_NUMBER)]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𖩠', '𖩩'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('\u{1e140}', '\u{1e149}'), + ('\u{1e2f0}', '\u{1e2f9}'), + ('𞥐', '𞥙'), +]; diff --git a/regex-syntax/src/unicode_tables/perl_space.rs b/regex-syntax/src/unicode_tables/perl_space.rs new file mode 100644 index 0000000000..515724521c --- /dev/null +++ b/regex-syntax/src/unicode_tables/perl_space.rs @@ -0,0 +1,21 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool /tmp/ucd/12.1.0/ --chars --include whitespace +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("White_Space", WHITE_SPACE)]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; diff --git a/regex-syntax/src/unicode_tables/perl_word.rs b/regex-syntax/src/unicode_tables/perl_word.rs index e4d8b65684..f4f5706242 100644 --- a/regex-syntax/src/unicode_tables/perl_word.rs +++ b/regex-syntax/src/unicode_tables/perl_word.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate perl-word /tmp/ucd-12.1.0/ --chars +// ucd-generate perl-word /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/property_bool.rs b/regex-syntax/src/unicode_tables/property_bool.rs index 10aeff5273..59713a882e 100644 --- a/regex-syntax/src/unicode_tables/property_bool.rs +++ b/regex-syntax/src/unicode_tables/property_bool.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate property-bool /tmp/ucd-12.1.0/ --chars +// ucd-generate property-bool /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/property_names.rs b/regex-syntax/src/unicode_tables/property_names.rs index c0b2316879..1064b61cd4 100644 --- a/regex-syntax/src/unicode_tables/property_names.rs +++ b/regex-syntax/src/unicode_tables/property_names.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate property-names /tmp/ucd-12.1.0/ +// ucd-generate property-names /tmp/ucd/12.1.0/ // // ucd-generate is available on crates.io. 
diff --git a/regex-syntax/src/unicode_tables/property_values.rs b/regex-syntax/src/unicode_tables/property_values.rs index 415aa20a4e..0f960b0d44 100644 --- a/regex-syntax/src/unicode_tables/property_values.rs +++ b/regex-syntax/src/unicode_tables/property_values.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate property-values /tmp/ucd-12.1.0/ --include gc,script,scx,age,gcb,wb,sb +// ucd-generate property-values /tmp/ucd/12.1.0/ --include gc,script,scx,age,gcb,wb,sb // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/script.rs b/regex-syntax/src/unicode_tables/script.rs index 67b43f1756..12ddf0167f 100644 --- a/regex-syntax/src/unicode_tables/script.rs +++ b/regex-syntax/src/unicode_tables/script.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate script /tmp/ucd-12.1.0/ --chars +// ucd-generate script /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/script_extension.rs b/regex-syntax/src/unicode_tables/script_extension.rs index 81c845823b..a86b17eb22 100644 --- a/regex-syntax/src/unicode_tables/script_extension.rs +++ b/regex-syntax/src/unicode_tables/script_extension.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate script-extension /tmp/ucd-12.1.0/ --chars +// ucd-generate script-extension /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/sentence_break.rs b/regex-syntax/src/unicode_tables/sentence_break.rs index 9214ca1776..2ee7ae5b63 100644 --- a/regex-syntax/src/unicode_tables/sentence_break.rs +++ b/regex-syntax/src/unicode_tables/sentence_break.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate sentence-break /tmp/ucd-12.1.0/ --chars +// ucd-generate sentence-break /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/src/unicode_tables/word_break.rs b/regex-syntax/src/unicode_tables/word_break.rs index 212905b4e9..24f6fefd2c 100644 --- a/regex-syntax/src/unicode_tables/word_break.rs +++ b/regex-syntax/src/unicode_tables/word_break.rs @@ -1,6 +1,6 @@ // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // -// ucd-generate word-break /tmp/ucd-12.1.0/ --chars +// ucd-generate word-break /tmp/ucd/12.1.0/ --chars // // ucd-generate is available on crates.io. diff --git a/regex-syntax/test b/regex-syntax/test new file mode 100755 index 0000000000..9970a9945a --- /dev/null +++ b/regex-syntax/test @@ -0,0 +1,20 @@ +#!/bin/bash + +# This is a convenience script for running a broad swath of the syntax tests. +echo "===== DEFAULT FEATURES ===" +cargo test + +features=( + unicode + unicode-age + unicode-bool + unicode-case + unicode-gencat + unicode-perl + unicode-script + unicode-segment +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f ===" + cargo test --no-default-features --features "$f" +done diff --git a/scripts/generate-unicode-tables b/scripts/generate-unicode-tables index e532d7a163..c01df16e7e 100755 --- a/scripts/generate-unicode-tables +++ b/scripts/generate-unicode-tables @@ -31,8 +31,6 @@ ucd-generate general-category "$ucddir" \ --chars --exclude surrogate > "$out/general_category.rs" ucd-generate grapheme-cluster-break "$ucddir" \ --chars > "$out/grapheme_cluster_break.rs" -ucd-generate perl-word "$ucddir" \ - --chars > "$out/perl_word.rs" ucd-generate property-bool "$ucddir" \ --chars > "$out/property_bool.rs" ucd-generate property-names "$ucddir" \ @@ -47,3 +45,19 @@ ucd-generate sentence-break "$ucddir" \ --chars > "$out/sentence_break.rs" ucd-generate word-break "$ucddir" \ --chars > "$out/word_break.rs" + +# These generate the \w, 
\d and \s Unicode-aware character classes. \d and \s +# are technically part of the general category and boolean properties generated +# above. However, these are generated separately to make it possible to enable +# or disable them via Cargo features independently of whether all boolean +# properties or general categories are enabled or disabled. The crate ensures +# that only one copy is compiled. +ucd-generate perl-word "$ucddir" \ + --chars > "$out/perl_word.rs" +ucd-generate general-category "$ucddir" \ + --chars --include decimalnumber > "$out/perl_decimal.rs" +ucd-generate property-bool "$ucddir" \ + --chars --include whitespace > "$out/perl_space.rs" + +# Make sure everything is formatted. +cargo +stable fmt --all diff --git a/scripts/regex-match-tests.py b/scripts/regex-match-tests.py index 24635e7797..42311285c6 100755 --- a/scripts/regex-match-tests.py +++ b/scripts/regex-match-tests.py @@ -1,15 +1,5 @@ #!/usr/bin/env python2 -# Copyright 2014 The Rust Project Developers. See the COPYRIGHT -# file at the top-level directory of this distribution and at -# http://rust-lang.org/COPYRIGHT. -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - from __future__ import absolute_import, division, print_function import argparse import datetime @@ -86,19 +76,10 @@ def group_tostr(g): for f in args.files: tests += read_tests(f) - tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - + tpl = ''' // DO NOT EDIT. Automatically generated by 'scripts/regex-match-tests.py' // on {date}. 
-''' +'''.lstrip() print(tpl.format(date=str(datetime.datetime.now()))) for f in args.files: diff --git a/src/backtrack.rs b/src/backtrack.rs index 12a3cf595e..2eaeb72e55 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -156,7 +156,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { if matched && self.prog.matches.len() == 1 { return true; } - if at.pos() == end || at.is_end() { + if at.pos() >= end { break; } at = self.input.at(at.next_pos()); diff --git a/src/cache.rs b/src/cache.rs new file mode 100644 index 0000000000..d8991ce4a3 --- /dev/null +++ b/src/cache.rs @@ -0,0 +1,100 @@ +// This module defines a common API for caching internal runtime state. +// The `thread_local` crate provides an extremely optimized version of this. +// However, if the perf-cache feature is disabled, then we drop the +// thread_local dependency and instead use a pretty naive caching mechanism +// with a mutex. +// +// Strictly speaking, the CachedGuard isn't necessary for the much more +// flexible thread_local API, but implementing thread_local's API doesn't +// seem possible in purely safe code. 
+ +pub use self::imp::{Cached, CachedGuard}; + +#[cfg(feature = "perf-cache")] +mod imp { + use thread_local::CachedThreadLocal; + + #[derive(Debug)] + pub struct Cached(CachedThreadLocal); + + #[derive(Debug)] + pub struct CachedGuard<'a, T: 'a>(&'a T); + + impl Cached { + pub fn new() -> Cached { + Cached(CachedThreadLocal::new()) + } + + pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard { + CachedGuard(self.0.get_or(|| Box::new(create()))) + } + } + + impl<'a, T: Send> CachedGuard<'a, T> { + pub fn value(&self) -> &T { + self.0 + } + } +} + +#[cfg(not(feature = "perf-cache"))] +mod imp { + use std::marker::PhantomData; + use std::panic::UnwindSafe; + use std::sync::Mutex; + + #[derive(Debug)] + pub struct Cached { + stack: Mutex>, + /// When perf-cache is enabled, the thread_local crate is used, and + /// its CachedThreadLocal impls Send, Sync and UnwindSafe, but NOT + /// RefUnwindSafe. However, a Mutex impls RefUnwindSafe. So in order + /// to keep the APIs consistent regardless of whether perf-cache is + /// enabled, we force this type to NOT impl RefUnwindSafe too. + /// + /// Ideally, we should always impl RefUnwindSafe, but it seems a little + /// tricky to do that right now. 
+ /// + /// See also: https://github.com/rust-lang/regex/issues/576 + _phantom: PhantomData>, + } + + #[derive(Debug)] + pub struct CachedGuard<'a, T: 'a + Send> { + cache: &'a Cached, + value: Option, + } + + impl Cached { + pub fn new() -> Cached { + Cached { stack: Mutex::new(vec![]), _phantom: PhantomData } + } + + pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard { + let mut stack = self.stack.lock().unwrap(); + match stack.pop() { + None => CachedGuard { cache: self, value: Some(create()) }, + Some(value) => CachedGuard { cache: self, value: Some(value) }, + } + } + + fn put(&self, value: T) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } + } + + impl<'a, T: Send> CachedGuard<'a, T> { + pub fn value(&self) -> &T { + self.value.as_ref().unwrap() + } + } + + impl<'a, T: Send> Drop for CachedGuard<'a, T> { + fn drop(&mut self) { + if let Some(value) = self.value.take() { + self.cache.put(value); + } + } + } +} diff --git a/src/compile.rs b/src/compile.rs index 5be9f3ee0a..1f69967192 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -297,11 +297,25 @@ impl Compiler { self.c_empty_look(prog::EmptyLook::EndText) } WordBoundary(hir::WordBoundary::Unicode) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); self.c_empty_look(prog::EmptyLook::WordBoundary) } WordBoundary(hir::WordBoundary::UnicodeNegate) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); self.c_empty_look(prog::EmptyLook::NotWordBoundary) diff --git a/src/dfa.rs b/src/dfa.rs index f12558b491..decc3b9874 
100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -224,6 +224,7 @@ impl Result { /// Maps the given function onto T and returns the result. /// /// If this isn't a match, then this is a no-op. + #[cfg(feature = "perf-literal")] pub fn map U>(self, mut f: F) -> Result { match self { Result::Match(t) => Result::Match(f(t)), @@ -442,7 +443,7 @@ impl CacheInner { } impl<'a> Fsm<'a> { - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn forward( prog: &'a Program, cache: &ProgramCache, @@ -472,7 +473,7 @@ impl<'a> Fsm<'a> { dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn reverse( prog: &'a Program, cache: &ProgramCache, @@ -502,7 +503,7 @@ impl<'a> Fsm<'a> { dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn forward_many( prog: &'a Program, cache: &ProgramCache, @@ -550,7 +551,7 @@ impl<'a> Fsm<'a> { /// Executes the DFA on a forward NFA. /// /// {qcur,qnext} are scratch ordered sets which may be non-empty. - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn exec_at( &mut self, qcur: &mut SparseSet, @@ -743,7 +744,7 @@ impl<'a> Fsm<'a> { } /// Executes the DFA on a reverse NFA. - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn exec_at_reverse( &mut self, qcur: &mut SparseSet, @@ -848,7 +849,7 @@ impl<'a> Fsm<'a> { /// corresponds to text[i]. /// /// This elides bounds checks, and is therefore unsafe. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr { // What is the argument for safety here? 
// We have three unchecked accesses that could possibly violate safety: @@ -1363,7 +1364,7 @@ impl<'a> Fsm<'a> { /// then it is computed, cached and a pointer to it is returned. /// /// This may return STATE_DEAD but never STATE_UNKNOWN. - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn start_state( &mut self, q: &mut SparseSet, @@ -1525,7 +1526,7 @@ impl<'a> Fsm<'a> { /// Given an input byte or the special EOF sentinel, return its /// corresponding byte class. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn byte_class(&self, b: Byte) -> usize { match b.as_byte() { None => self.num_byte_classes() - 1, @@ -1534,7 +1535,7 @@ impl<'a> Fsm<'a> { } /// Like byte_class, but explicitly for u8s. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn u8_class(&self, b: u8) -> usize { self.prog.byte_classes[b as usize] as usize } diff --git a/src/exec.rs b/src/exec.rs index 46ee6e0d8e..2ae7842204 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -2,14 +2,16 @@ use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; +#[cfg(feature = "perf-literal")] use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; use syntax::hir::literal::Literals; use syntax::hir::Hir; use syntax::ParserBuilder; -use thread_local::CachedThreadLocal; use backtrack; +use cache::{Cached, CachedGuard}; use compile::Compiler; +#[cfg(feature = "perf-dfa")] use dfa; use error::Error; use input::{ByteInput, CharInput}; @@ -32,7 +34,7 @@ pub struct Exec { /// All read only state. ro: Arc, /// Caches for the various matching engines. - cache: CachedThreadLocal, + cache: Cached, } /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This @@ -43,7 +45,7 @@ pub struct ExecNoSync<'c> { /// All read only state. ro: &'c Arc, /// Caches for the various matching engines. 
- cache: &'c ProgramCache, + cache: CachedGuard<'c, ProgramCache>, } /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. @@ -85,6 +87,7 @@ struct ExecReadOnly { /// N.B. We use u32 as a state ID representation under the assumption that /// if we were to exhaust the ID space, we probably would have long /// surpassed the compilation size limit. + #[cfg(feature = "perf-literal")] ac: Option>, /// match_type encodes as much upfront knowledge about how we're going to /// execute a search as possible. @@ -234,39 +237,41 @@ impl ExecBuilder { parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; bytes = bytes || !expr.is_always_utf8(); - if !expr.is_anchored_start() && expr.is_any_anchored_start() { - // Partial anchors unfortunately make it hard to use prefixes, - // so disable them. - prefixes = None; - } else if is_set && expr.is_anchored_start() { - // Regex sets with anchors do not go well with literal - // optimizations. - prefixes = None; - } - prefixes = prefixes.and_then(|mut prefixes| { - if !prefixes.union_prefixes(&expr) { - None - } else { - Some(prefixes) + if cfg!(feature = "perf-literal") { + if !expr.is_anchored_start() && expr.is_any_anchored_start() { + // Partial anchors unfortunately make it hard to use + // prefixes, so disable them. + prefixes = None; + } else if is_set && expr.is_anchored_start() { + // Regex sets with anchors do not go well with literal + // optimizations. + prefixes = None; } - }); + prefixes = prefixes.and_then(|mut prefixes| { + if !prefixes.union_prefixes(&expr) { + None + } else { + Some(prefixes) + } + }); - if !expr.is_anchored_end() && expr.is_any_anchored_end() { - // Partial anchors unfortunately make it hard to use suffixes, - // so disable them. - suffixes = None; - } else if is_set && expr.is_anchored_end() { - // Regex sets with anchors do not go well with literal - // optimizations. 
- suffixes = None; - } - suffixes = suffixes.and_then(|mut suffixes| { - if !suffixes.union_suffixes(&expr) { - None - } else { - Some(suffixes) + if !expr.is_anchored_end() && expr.is_any_anchored_end() { + // Partial anchors unfortunately make it hard to use + // suffixes, so disable them. + suffixes = None; + } else if is_set && expr.is_anchored_end() { + // Regex sets with anchors do not go well with literal + // optimizations. + suffixes = None; } - }); + suffixes = suffixes.and_then(|mut suffixes| { + if !suffixes.union_suffixes(&expr) { + None + } else { + Some(suffixes) + } + }); + } exprs.push(expr); } Ok(Parsed { @@ -288,10 +293,11 @@ impl ExecBuilder { dfa: Program::new(), dfa_reverse: Program::new(), suffixes: LiteralSearcher::empty(), + #[cfg(feature = "perf-literal")] ac: None, match_type: MatchType::Nothing, }); - return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }); + return Ok(Exec { ro: ro, cache: Cached::new() }); } let parsed = self.parse()?; let mut nfa = Compiler::new() @@ -311,43 +317,55 @@ impl ExecBuilder { .reverse(true) .compile(&parsed.exprs)?; + #[cfg(feature = "perf-literal")] + let ac = self.build_aho_corasick(&parsed); nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); dfa.prefixes = nfa.prefixes.clone(); dfa.dfa_size_limit = self.options.dfa_size_limit; dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; - let mut ac = None; - if parsed.exprs.len() == 1 { - if let Some(lits) = alternation_literals(&parsed.exprs[0]) { - // If we have a small number of literals, then let Teddy - // handle things (see literal/mod.rs). - if lits.len() > 32 { - let fsm = AhoCorasickBuilder::new() - .match_kind(MatchKind::LeftmostFirst) - .auto_configure(&lits) - // We always want this to reduce size, regardless of - // what auto-configure does. 
- .byte_classes(true) - .build_with_size::(&lits) - .expect("AC automaton too big"); - ac = Some(fsm); - } - } - } - let mut ro = ExecReadOnly { res: self.options.pats, nfa: nfa, dfa: dfa, dfa_reverse: dfa_reverse, suffixes: LiteralSearcher::suffixes(parsed.suffixes), + #[cfg(feature = "perf-literal")] ac: ac, match_type: MatchType::Nothing, }; ro.match_type = ro.choose_match_type(self.match_type); let ro = Arc::new(ro); - Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }) + Ok(Exec { ro: ro, cache: Cached::new() }) + } + + #[cfg(feature = "perf-literal")] + fn build_aho_corasick(&self, parsed: &Parsed) -> Option> { + if parsed.exprs.len() != 1 { + return None; + } + let lits = match alternation_literals(&parsed.exprs[0]) { + None => return None, + Some(lits) => lits, + }; + // If we have a small number of literals, then let Teddy handle + // things (see literal/mod.rs). + if lits.len() <= 32 { + return None; + } + Some( + AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .auto_configure(&lits) + // We always want this to reduce size, regardless + // of what auto-configure does. + .byte_classes(true) + .build_with_size::(&lits) + // This should never happen because we'd long exceed the + // compilation limit for regexes first. 
+ .expect("AC automaton too big"), + ) } } @@ -362,22 +380,22 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> { next_utf8(text.as_bytes(), i) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn shortest_match_at(&self, text: &str, start: usize) -> Option { self.0.shortest_match_at(text.as_bytes(), start) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match_at(&self, text: &str, start: usize) -> bool { self.0.is_match_at(text.as_bytes(), start) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { self.0.find_at(text.as_bytes(), start) } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn captures_read_at( &self, locs: &mut Locations, @@ -404,15 +422,17 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// Returns the end of a match location, possibly occurring before the /// end location of the correct leftmost-first match. 
- #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn shortest_match_at(&self, text: &[u8], start: usize) -> Option { if !self.is_anchor_end_match(text) { return None; } match self.ro.match_type { + #[cfg(feature = "perf-literal")] MatchType::Literal(ty) => { self.find_literals(ty, text, start).map(|(_, e)| e) } + #[cfg(feature = "perf-dfa")] MatchType::Dfa | MatchType::DfaMany => { match self.shortest_dfa(text, start) { dfa::Result::Match(end) => Some(end), @@ -420,10 +440,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.shortest_nfa(text, start), } } + #[cfg(feature = "perf-dfa")] MatchType::DfaAnchoredReverse => { match dfa::Fsm::reverse( &self.ro.dfa_reverse, - self.cache, + self.cache.value(), true, &text[start..], text.len(), @@ -433,6 +454,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.shortest_nfa(text, start), } } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] MatchType::DfaSuffix => { match self.shortest_dfa_reverse_suffix(text, start) { dfa::Result::Match(e) => Some(e), @@ -449,7 +471,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// /// For single regular expressions, this is equivalent to calling /// shortest_match(...).is_some(). - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match_at(&self, text: &[u8], start: usize) -> bool { if !self.is_anchor_end_match(text) { return false; @@ -458,9 +480,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { // filling in captures[1], but a RegexSet has no captures. In other // words, a RegexSet can't (currently) use shortest_match. 
---AG match self.ro.match_type { + #[cfg(feature = "perf-literal")] MatchType::Literal(ty) => { self.find_literals(ty, text, start).is_some() } + #[cfg(feature = "perf-dfa")] MatchType::Dfa | MatchType::DfaMany => { match self.shortest_dfa(text, start) { dfa::Result::Match(_) => true, @@ -468,10 +492,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.match_nfa(text, start), } } + #[cfg(feature = "perf-dfa")] MatchType::DfaAnchoredReverse => { match dfa::Fsm::reverse( &self.ro.dfa_reverse, - self.cache, + self.cache.value(), true, &text[start..], text.len(), @@ -481,6 +506,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.match_nfa(text, start), } } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] MatchType::DfaSuffix => { match self.shortest_dfa_reverse_suffix(text, start) { dfa::Result::Match(_) => true, @@ -495,13 +521,15 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// Finds the start and end location of the leftmost-first match, starting /// at the given location. 
- #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { if !self.is_anchor_end_match(text) { return None; } match self.ro.match_type { + #[cfg(feature = "perf-literal")] MatchType::Literal(ty) => self.find_literals(ty, text, start), + #[cfg(feature = "perf-dfa")] MatchType::Dfa => match self.find_dfa_forward(text, start) { dfa::Result::Match((s, e)) => Some((s, e)), dfa::Result::NoMatch(_) => None, @@ -509,6 +537,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { self.find_nfa(MatchNfaType::Auto, text, start) } }, + #[cfg(feature = "perf-dfa")] MatchType::DfaAnchoredReverse => { match self.find_dfa_anchored_reverse(text, start) { dfa::Result::Match((s, e)) => Some((s, e)), @@ -518,6 +547,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } } } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] MatchType::DfaSuffix => { match self.find_dfa_reverse_suffix(text, start) { dfa::Result::Match((s, e)) => Some((s, e)), @@ -529,6 +559,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } MatchType::Nfa(ty) => self.find_nfa(ty, text, start), MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] MatchType::DfaMany => { unreachable!("BUG: RegexSet cannot be used with find") } @@ -570,6 +601,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { return None; } match self.ro.match_type { + #[cfg(feature = "perf-literal")] MatchType::Literal(ty) => { self.find_literals(ty, text, start).and_then(|(s, e)| { self.captures_nfa_type( @@ -581,6 +613,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { ) }) } + #[cfg(feature = "perf-dfa")] MatchType::Dfa => { if self.ro.nfa.is_anchored_start { self.captures_nfa(slots, text, start) @@ -600,6 +633,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } } } + #[cfg(feature = "perf-dfa")] MatchType::DfaAnchoredReverse => { match self.find_dfa_anchored_reverse(text, start) { dfa::Result::Match((s, e)) => 
self.captures_nfa_type( @@ -613,6 +647,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.captures_nfa(slots, text, start), } } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] MatchType::DfaSuffix => { match self.find_dfa_reverse_suffix(text, start) { dfa::Result::Match((s, e)) => self.captures_nfa_type( @@ -630,6 +665,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { self.captures_nfa_type(ty, slots, text, start, text.len()) } MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] MatchType::DfaMany => { unreachable!("BUG: RegexSet cannot be used with captures") } @@ -639,7 +675,8 @@ impl<'c> RegularExpression for ExecNoSync<'c> { impl<'c> ExecNoSync<'c> { /// Finds the leftmost-first match using only literal search. - #[inline(always)] // reduces constant overhead + #[cfg(feature = "perf-literal")] + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_literals( &self, ty: MatchLiteralType, @@ -682,7 +719,8 @@ impl<'c> ExecNoSync<'c> { /// /// If the result returned indicates that the DFA quit, then another /// matching engine should be used. - #[inline(always)] // reduces constant overhead + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_dfa_forward( &self, text: &[u8], @@ -691,7 +729,7 @@ impl<'c> ExecNoSync<'c> { use dfa::Result::*; let end = match dfa::Fsm::forward( &self.ro.dfa, - self.cache, + self.cache.value(), false, text, start, @@ -704,7 +742,7 @@ impl<'c> ExecNoSync<'c> { // Now run the DFA in reverse to find the start of the match. match dfa::Fsm::reverse( &self.ro.dfa_reverse, - self.cache, + self.cache.value(), false, &text[start..], end - start, @@ -721,7 +759,8 @@ impl<'c> ExecNoSync<'c> { /// /// If the result returned indicates that the DFA quit, then another /// matching engine should be used. 
- #[inline(always)] // reduces constant overhead + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_dfa_anchored_reverse( &self, text: &[u8], @@ -730,7 +769,7 @@ impl<'c> ExecNoSync<'c> { use dfa::Result::*; match dfa::Fsm::reverse( &self.ro.dfa_reverse, - self.cache, + self.cache.value(), false, &text[start..], text.len() - start, @@ -742,15 +781,16 @@ impl<'c> ExecNoSync<'c> { } /// Finds the end of the shortest match using only the DFA. - #[inline(always)] // reduces constant overhead + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result { - dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start) + dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) } /// Finds the end of the shortest match using only the DFA by scanning for /// suffix literals. - /// - #[inline(always)] // reduces constant overhead + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] fn shortest_dfa_reverse_suffix( &self, text: &[u8], @@ -775,7 +815,8 @@ impl<'c> ExecNoSync<'c> { /// /// If the result returned indicates that the DFA quit, then another /// matching engine should be used. - #[inline(always)] // reduces constant overhead + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] fn exec_dfa_reverse_suffix( &self, text: &[u8], @@ -796,7 +837,7 @@ impl<'c> ExecNoSync<'c> { end = last_literal + lcs.len(); match dfa::Fsm::reverse( &self.ro.dfa_reverse, - self.cache, + self.cache.value(), false, &text[start..end], end - start, @@ -819,7 +860,8 @@ impl<'c> ExecNoSync<'c> { /// /// If the result returned indicates that the DFA quit, then another /// matching engine should be used. 
- #[inline(always)] // reduces constant overhead + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] fn find_dfa_reverse_suffix( &self, text: &[u8], @@ -841,7 +883,7 @@ impl<'c> ExecNoSync<'c> { // leftmost-first match.) match dfa::Fsm::forward( &self.ro.dfa, - self.cache, + self.cache.value(), false, text, match_start, @@ -857,6 +899,7 @@ impl<'c> ExecNoSync<'c> { /// Ideally, we could use shortest_nfa(...).is_some() and get the same /// performance characteristics, but regex sets don't have captures, which /// shortest_nfa depends on. + #[cfg(feature = "perf-dfa")] fn match_nfa(&self, text: &[u8], start: usize) -> bool { self.match_nfa_type(MatchNfaType::Auto, text, start) } @@ -868,10 +911,20 @@ impl<'c> ExecNoSync<'c> { text: &[u8], start: usize, ) -> bool { - self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len()) + self.exec_nfa( + ty, + &mut [false], + &mut [], + true, + false, + text, + start, + text.len(), + ) } /// Finds the shortest match using an NFA. + #[cfg(feature = "perf-dfa")] fn shortest_nfa(&self, text: &[u8], start: usize) -> Option { self.shortest_nfa_type(MatchNfaType::Auto, text, start) } @@ -889,6 +942,7 @@ impl<'c> ExecNoSync<'c> { &mut [false], &mut slots, true, + true, text, start, text.len(), @@ -912,6 +966,7 @@ impl<'c> ExecNoSync<'c> { &mut [false], &mut slots, false, + false, text, start, text.len(), @@ -928,6 +983,7 @@ impl<'c> ExecNoSync<'c> { /// Like find_nfa, but fills in captures. /// /// `slots` should have length equal to `2 * nfa.captures.len()`. 
+ #[cfg(feature = "perf-dfa")] fn captures_nfa( &self, slots: &mut [Slot], @@ -952,7 +1008,16 @@ impl<'c> ExecNoSync<'c> { start: usize, end: usize, ) -> Option<(usize, usize)> { - if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) { + if self.exec_nfa( + ty, + &mut [false], + slots, + false, + false, + text, + start, + end, + ) { match (slots[0], slots[1]) { (Some(s), Some(e)) => Some((s, e)), _ => None, @@ -968,6 +1033,7 @@ impl<'c> ExecNoSync<'c> { matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, + quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, @@ -980,17 +1046,20 @@ impl<'c> ExecNoSync<'c> { ty = PikeVM; } } - match ty { - Auto => unreachable!(), - Backtrack => self.exec_backtrack(matches, slots, text, start, end), - PikeVM => self.exec_pikevm( + // The backtracker can't return the shortest match position as it is + // implemented today. So if someone calls `shortest_match` and we need + // to run an NFA, then use the PikeVM. 
+ if quit_after_match_with_pos || ty == PikeVM { + self.exec_pikevm( matches, slots, quit_after_match, text, start, end, - ), + ) + } else { + self.exec_backtrack(matches, slots, text, start, end) } } @@ -1007,7 +1076,7 @@ impl<'c> ExecNoSync<'c> { if self.ro.nfa.uses_bytes() { pikevm::Fsm::exec( &self.ro.nfa, - self.cache, + self.cache.value(), matches, slots, quit_after_match, @@ -1018,7 +1087,7 @@ impl<'c> ExecNoSync<'c> { } else { pikevm::Fsm::exec( &self.ro.nfa, - self.cache, + self.cache.value(), matches, slots, quit_after_match, @@ -1041,7 +1110,7 @@ impl<'c> ExecNoSync<'c> { if self.ro.nfa.uses_bytes() { backtrack::Bounded::exec( &self.ro.nfa, - self.cache, + self.cache.value(), matches, slots, ByteInput::new(text, self.ro.nfa.only_utf8), @@ -1051,7 +1120,7 @@ impl<'c> ExecNoSync<'c> { } else { backtrack::Bounded::exec( &self.ro.nfa, - self.cache, + self.cache.value(), matches, slots, CharInput::new(text), @@ -1079,15 +1148,17 @@ impl<'c> ExecNoSync<'c> { return false; } match self.ro.match_type { + #[cfg(feature = "perf-literal")] Literal(ty) => { debug_assert_eq!(matches.len(), 1); matches[0] = self.find_literals(ty, text, start).is_some(); matches[0] } - Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { + #[cfg(feature = "perf-dfa")] + Dfa | DfaAnchoredReverse | DfaMany => { match dfa::Fsm::forward_many( &self.ro.dfa, - self.cache, + self.cache.value(), matches, text, start, @@ -1099,6 +1170,30 @@ impl<'c> ExecNoSync<'c> { matches, &mut [], false, + false, + text, + start, + text.len(), + ), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, text, start, text.len(), @@ -1110,6 +1205,7 @@ impl<'c> ExecNoSync<'c> { matches, &mut [], false, + false, 
text, start, text.len(), @@ -1118,16 +1214,26 @@ impl<'c> ExecNoSync<'c> { } } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_anchor_end_match(&self, text: &[u8]) -> bool { - // Only do this check if the haystack is big (>1MB). - if text.len() > (1 << 20) && self.ro.nfa.is_anchored_end { - let lcs = self.ro.suffixes.lcs(); - if lcs.len() >= 1 && !lcs.is_suffix(text) { - return false; + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { + true + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { + // Only do this check if the haystack is big (>1MB). + if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + let lcs = ro.suffixes.lcs(); + if lcs.len() >= 1 && !lcs.is_suffix(text) { + return false; + } } + true } - true + + imp(&self.ro, text) } pub fn capture_name_idx(&self) -> &Arc> { @@ -1143,10 +1249,9 @@ impl<'c> ExecNoSyncStr<'c> { impl Exec { /// Get a searcher that isn't Sync. - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn searcher(&self) -> ExecNoSync { - let create = - || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro))); + let create = || RefCell::new(ProgramCacheInner::new(&self.ro)); ExecNoSync { ro: &self.ro, // a clone is too expensive here! (and not needed) cache: self.cache.get_or(create), @@ -1154,7 +1259,7 @@ impl Exec { } /// Get a searcher that isn't Sync and can match on &str. 
- #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn searcher_str(&self) -> ExecNoSyncStr { ExecNoSyncStr(self.searcher()) } @@ -1201,72 +1306,115 @@ impl Exec { impl Clone for Exec { fn clone(&self) -> Exec { - Exec { ro: self.ro.clone(), cache: CachedThreadLocal::new() } + Exec { ro: self.ro.clone(), cache: Cached::new() } } } impl ExecReadOnly { fn choose_match_type(&self, hint: Option) -> MatchType { - use self::MatchType::*; - if let Some(Nfa(_)) = hint { + if let Some(MatchType::Nfa(_)) = hint { return hint.unwrap(); } // If the NFA is empty, then we'll never match anything. if self.nfa.insts.is_empty() { - return Nothing; + return MatchType::Nothing; } - // If our set of prefixes is complete, then we can use it to find - // a match in lieu of a regex engine. This doesn't quite work well in - // the presence of multiple regexes, so only do it when there's one. - // - // TODO(burntsushi): Also, don't try to match literals if the regex is - // partially anchored. We could technically do it, but we'd need to - // create two sets of literals: all of them and then the subset that - // aren't anchored. We would then only search for all of them when at - // the beginning of the input and use the subset in all other cases. - if self.res.len() == 1 { - if self.ac.is_some() { - return Literal(MatchLiteralType::AhoCorasick); + if let Some(literalty) = self.choose_literal_match_type() { + return literalty; + } + if let Some(dfaty) = self.choose_dfa_match_type() { + return dfaty; + } + // We're so totally hosed. + MatchType::Nfa(MatchNfaType::Auto) + } + + /// If a plain literal scan can be used, then a corresponding literal + /// search type is returned. 
+ fn choose_literal_match_type(&self) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly) -> Option { + None + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly) -> Option { + // If our set of prefixes is complete, then we can use it to find + // a match in lieu of a regex engine. This doesn't quite work well + // in the presence of multiple regexes, so only do it when there's + // one. + // + // TODO(burntsushi): Also, don't try to match literals if the regex + // is partially anchored. We could technically do it, but we'd need + // to create two sets of literals: all of them and then the subset + // that aren't anchored. We would then only search for all of them + // when at the beginning of the input and use the subset in all + // other cases. + if ro.res.len() != 1 { + return None; + } + if ro.ac.is_some() { + return Some(MatchType::Literal( + MatchLiteralType::AhoCorasick, + )); } - if self.nfa.prefixes.complete() { - return if self.nfa.is_anchored_start { - Literal(MatchLiteralType::AnchoredStart) + if ro.nfa.prefixes.complete() { + return if ro.nfa.is_anchored_start { + Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) } else { - Literal(MatchLiteralType::Unanchored) + Some(MatchType::Literal(MatchLiteralType::Unanchored)) }; } - if self.suffixes.complete() { - return if self.nfa.is_anchored_end { - Literal(MatchLiteralType::AnchoredEnd) + if ro.suffixes.complete() { + return if ro.nfa.is_anchored_end { + Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) } else { // This case shouldn't happen. When the regex isn't // anchored, then complete prefixes should imply complete // suffixes. - Literal(MatchLiteralType::Unanchored) + Some(MatchType::Literal(MatchLiteralType::Unanchored)) }; } + None + } + + imp(self) + } + + /// If a DFA scan can be used, then choose the appropriate DFA strategy. 
+ fn choose_dfa_match_type(&self) -> Option { + #[cfg(not(feature = "perf-dfa"))] + fn imp(_: &ExecReadOnly) -> Option { + None } - // If we can execute the DFA, then we totally should. - if dfa::can_exec(&self.dfa) { + + #[cfg(feature = "perf-dfa")] + fn imp(ro: &ExecReadOnly) -> Option { + if !dfa::can_exec(&ro.dfa) { + return None; + } // Regex sets require a slightly specialized path. - if self.res.len() >= 2 { - return DfaMany; + if ro.res.len() >= 2 { + return Some(MatchType::DfaMany); } // If the regex is anchored at the end but not the start, then // just match in reverse from the end of the haystack. - if !self.nfa.is_anchored_start && self.nfa.is_anchored_end { - return DfaAnchoredReverse; + if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + return Some(MatchType::DfaAnchoredReverse); } - // If there's a longish suffix literal, then it might be faster - // to look for that first. - if self.should_suffix_scan() { - return DfaSuffix; + #[cfg(feature = "perf-literal")] + { + // If there's a longish suffix literal, then it might be faster + // to look for that first. + if ro.should_suffix_scan() { + return Some(MatchType::DfaSuffix); + } } // Fall back to your garden variety forward searching lazy DFA. - return Dfa; + Some(MatchType::Dfa) } - // We're so totally hosed. - Nfa(MatchNfaType::Auto) + + imp(self) } /// Returns true if the program is amenable to suffix scanning. @@ -1283,6 +1431,7 @@ impl ExecReadOnly { /// account for but (2) is harder. As a proxy, we assume that longer /// strings are generally rarer, so we only enable this optimization when /// we have a meaty suffix. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] fn should_suffix_scan(&self) -> bool { if self.suffixes.is_empty() { return false; @@ -1296,14 +1445,19 @@ impl ExecReadOnly { enum MatchType { /// A single or multiple literal search. This is only used when the regex /// can be decomposed into a literal search. 
+ #[cfg(feature = "perf-literal")] Literal(MatchLiteralType), /// A normal DFA search. + #[cfg(feature = "perf-dfa")] Dfa, /// A reverse DFA search starting from the end of a haystack. + #[cfg(feature = "perf-dfa")] DfaAnchoredReverse, /// A reverse DFA search with suffix literal scanning. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] DfaSuffix, /// Use the DFA on two or more regular expressions. + #[cfg(feature = "perf-dfa")] DfaMany, /// An NFA variant. Nfa(MatchNfaType), @@ -1312,6 +1466,7 @@ enum MatchType { } #[derive(Clone, Copy, Debug)] +#[cfg(feature = "perf-literal")] enum MatchLiteralType { /// Match literals anywhere in text. Unanchored, @@ -1324,7 +1479,7 @@ enum MatchLiteralType { AhoCorasick, } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum MatchNfaType { /// Choose between Backtrack and PikeVM. Auto, @@ -1348,7 +1503,9 @@ pub type ProgramCache = RefCell; pub struct ProgramCacheInner { pub pikevm: pikevm::Cache, pub backtrack: backtrack::Cache, + #[cfg(feature = "perf-dfa")] pub dfa: dfa::Cache, + #[cfg(feature = "perf-dfa")] pub dfa_reverse: dfa::Cache, } @@ -1357,7 +1514,9 @@ impl ProgramCacheInner { ProgramCacheInner { pikevm: pikevm::Cache::new(&ro.nfa), backtrack: backtrack::Cache::new(&ro.nfa), + #[cfg(feature = "perf-dfa")] dfa: dfa::Cache::new(&ro.dfa), + #[cfg(feature = "perf-dfa")] dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), } } @@ -1365,6 +1524,7 @@ impl ProgramCacheInner { /// Alternation literals checks if the given HIR is a simple alternation of /// literals, and if so, returns them. Otherwise, this returns None. 
+#[cfg(feature = "perf-literal")] fn alternation_literals(expr: &Hir) -> Option>> { use syntax::hir::{HirKind, Literal}; diff --git a/src/expand.rs b/src/expand.rs index f0a4554bf3..528f55e717 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -1,6 +1,6 @@ use std::str; -use memchr::memchr; +use find_byte::find_byte; use re_bytes; use re_unicode; @@ -11,7 +11,7 @@ pub fn expand_str( dst: &mut String, ) { while !replacement.is_empty() { - match memchr(b'$', replacement.as_bytes()) { + match find_byte(b'$', replacement.as_bytes()) { None => break, Some(i) => { dst.push_str(&replacement[..i]); @@ -53,7 +53,7 @@ pub fn expand_bytes( dst: &mut Vec, ) { while !replacement.is_empty() { - match memchr(b'$', replacement) { + match find_byte(b'$', replacement) { None => break, Some(i) => { dst.extend(&replacement[..i]); diff --git a/src/find_byte.rs b/src/find_byte.rs new file mode 100644 index 0000000000..e95f72afb9 --- /dev/null +++ b/src/find_byte.rs @@ -0,0 +1,18 @@ +/// Searches for the given needle in the given haystack. +/// +/// If the perf-literal feature is enabled, then this uses the super optimized +/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation. +pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(needle: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == needle) + } + + #[cfg(feature = "perf-literal")] + fn imp(needle: u8, haystack: &[u8]) -> Option { + use memchr::memchr; + memchr(needle, haystack) + } + + imp(needle, haystack) +} diff --git a/src/input.rs b/src/input.rs index b1aa6d9231..3afa2d0f6c 100644 --- a/src/input.rs +++ b/src/input.rs @@ -66,7 +66,7 @@ impl InputAt { } /// An abstraction over input used in the matching engines. -pub trait Input { +pub trait Input: fmt::Debug { /// Return an encoding of the position at byte offset `i`. 
fn at(&self, i: usize) -> InputAt; @@ -158,8 +158,12 @@ impl<'t> ops::Deref for CharInput<'t> { impl<'t> Input for CharInput<'t> { fn at(&self, i: usize) -> InputAt { - let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); - InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); + InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } + } } fn next_char(&self, at: InputAt) -> Char { @@ -243,7 +247,16 @@ impl<'t> ops::Deref for ByteInput<'t> { impl<'t> Input for ByteInput<'t> { fn at(&self, i: usize) -> InputAt { - InputAt { pos: i, c: None.into(), byte: self.get(i).cloned(), len: 1 } + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + InputAt { + pos: i, + c: None.into(), + byte: self.get(i).cloned(), + len: 1, + } + } } fn next_char(&self, at: InputAt) -> Char { @@ -352,16 +365,20 @@ impl Char { /// Returns the length of the character's UTF-8 encoding. /// - /// If the character is absent, then `0` is returned. + /// If the character is absent, then `1` is returned. #[inline] pub fn len_utf8(self) -> usize { - char::from_u32(self.0).map_or(0, |c| c.len_utf8()) + char::from_u32(self.0).map_or(1, |c| c.len_utf8()) } /// Returns true iff the character is a word character. /// /// If the character is absent, then false is returned. pub fn is_word_char(self) -> bool { + // is_word_character can panic if the Unicode data for \w isn't + // available. However, our compiler ensures that if a Unicode word + // boundary is used, then the data must also be available. If it isn't, + // then the compiler returns an error. 
char::from_u32(self.0).map_or(false, syntax::is_word_character) } diff --git a/src/lib.rs b/src/lib.rs index 4aa4eaa0a3..2a74bf8185 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,7 +22,7 @@ used by adding `regex` to your dependencies in your project's `Cargo.toml`. regex = "1" ``` -and this to your crate root: +If you're using Rust 2015, then you'll also need to add it to your crate root: ```rust extern crate regex; @@ -204,7 +204,8 @@ instead.) # Unicode This implementation executes regular expressions **only** on valid UTF-8 -while exposing match locations as byte indices into the search string. +while exposing match locations as byte indices into the search string. (To +relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) Only simple case folding is supported. Namely, when matching case-insensitively, the characters are first mapped using the "simple" case @@ -272,6 +273,12 @@ example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an `\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based regexes. +Finally, since Unicode support requires bundling large Unicode data +tables, this crate exposes knobs to disable the compilation of those +data tables, which can be useful for shrinking binary size and reducing +compilation times. For details on how to do that, see the section on [crate +features](#crate-features). + # Syntax The syntax supported in this crate is documented below. @@ -477,6 +484,104 @@ These classes are based on the definitions provided in [[:xdigit:]] hex digit ([0-9A-Fa-f]) +# Crate features + +By default, this crate tries pretty hard to make regex matching both as fast +as possible and as correct as it can be, within reason. This means that there +is a lot of code dedicated to performance, the handling of Unicode data and the +Unicode data itself. Overall, this leads to more dependencies, larger binaries +and longer compile times. 
This trade off may not be appropriate in all cases, +and indeed, even when all Unicode and performance features are disabled, one +is still left with a perfectly serviceable regex engine that will work well +in many cases. + +This crate exposes a number of features for controlling that trade off. Some +of these features are strictly performance oriented, such that disabling them +won't result in a loss of functionality, but may result in worse performance. +Other features, such as the ones controlling the presence or absence of Unicode +data, can result in a loss of functionality. For example, if one disables the +`unicode-case` feature (described below), then compiling the regex `(?i)a` +will fail since Unicode case insensitivity is enabled by default. Instead, +callers must use `(?i-u)a` instead to disable Unicode case folding. Stated +differently, enabling or disabling any of the features below can only add or +subtract from the total set of valid regular expressions. Enabling or disabling +a feature will never modify the match semantics of a regular expression. + +All features below are enabled by default. + +### Ecosystem features + +* **std** - + When enabled, this will cause `regex` to use the standard library. Currently, + disabling this feature will always result in a compilation error. It is + intended to add `alloc`-only support to regex in the future. + +### Performance features + +* **perf** - + Enables all performance related features. This feature is enabled by default + and will always cover all features that improve performance, even if more + are added in the future. +* **perf-cache** - + Enables the use of very fast thread safe caching for internal match state. + When this is disabled, caching is still used, but with a slower and simpler + implementation. Disabling this drops the `thread_local` and `lazy_static` + dependencies. +* **perf-dfa** - + Enables the use of a lazy DFA for matching. 
The lazy DFA is used to compile + portions of a regex to a very fast DFA on an as-needed basis. This can + result in substantial speedups, usually by an order of magnitude on large + haystacks. The lazy DFA does not bring in any new dependencies, but it can + make compile times longer. +* **perf-inline** - + Enables the use of aggressive inlining inside match routines. This reduces + the overhead of each match. The aggressive inlining, however, increases + compile times and binary size. +* **perf-literal** - + Enables the use of literal optimizations for speeding up matches. In some + cases, literal optimizations can result in speedups of _several_ orders of + magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. 
+* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. + + # Untrusted input This crate can handle both untrusted regular expressions and untrusted @@ -511,32 +616,35 @@ another matching engine with fixed memory requirements. 
#![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] -#[cfg(not(feature = "use_std"))] -compile_error!("`use_std` feature is currently required to build this crate"); +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); +#[cfg(feature = "perf-literal")] extern crate aho_corasick; +#[cfg(test)] +extern crate doc_comment; +#[cfg(feature = "perf-literal")] extern crate memchr; -extern crate thread_local; #[cfg(test)] -#[macro_use] +#[cfg_attr(feature = "perf-literal", macro_use)] extern crate quickcheck; -#[cfg(test)] -extern crate doc_comment; extern crate regex_syntax as syntax; +#[cfg(feature = "perf-cache")] +extern crate thread_local; #[cfg(test)] doc_comment::doctest!("../README.md"); -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub use error::Error; -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub use re_builder::set_unicode::*; -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub use re_builder::unicode::*; -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub use re_set::unicode::*; -#[cfg(feature = "use_std")] -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] +#[cfg(feature = "std")] pub use re_unicode::{ escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, @@ -630,7 +738,7 @@ When the `s` flag is enabled, `.` matches any byte. In general, one should expect performance on `&[u8]` to be roughly similar to performance on `&str`. 
*/ -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub mod bytes { pub use re_builder::bytes::*; pub use re_builder::set_bytes::*; @@ -639,11 +747,15 @@ pub mod bytes { } mod backtrack; +mod cache; mod compile; +#[cfg(feature = "perf-dfa")] mod dfa; mod error; mod exec; mod expand; +mod find_byte; +#[cfg(feature = "perf-literal")] mod freqs; mod input; mod literal; @@ -663,7 +775,7 @@ mod utf8; /// testing different matching engines and supporting the `regex-debug` CLI /// utility. #[doc(hidden)] -#[cfg(feature = "use_std")] +#[cfg(feature = "std")] pub mod internal { pub use compile::Compiler; pub use exec::{Exec, ExecBuilder}; diff --git a/src/literal.rs b/src/literal/imp.rs similarity index 99% rename from src/literal.rs rename to src/literal/imp.rs index ae405cbf4a..38ebd295f4 100644 --- a/src/literal.rs +++ b/src/literal/imp.rs @@ -80,7 +80,7 @@ impl LiteralSearcher { } /// Find the position of a literal in `haystack` if it exists. - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { use self::Matcher::*; match self.matcher { @@ -339,7 +339,7 @@ impl SingleByteSet { } /// Faster find that special cases certain sizes to use memchr. 
- #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] fn find(&self, text: &[u8]) -> Option { match self.dense.len() { 0 => None, @@ -452,7 +452,7 @@ impl FreqyPacked { } } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn find(&self, haystack: &[u8]) -> Option { let pat = &*self.pat; if haystack.len() < pat.len() || pat.is_empty() { @@ -478,7 +478,7 @@ impl FreqyPacked { None } - #[inline(always)] // reduces constant overhead + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn is_suffix(&self, text: &[u8]) -> bool { if text.len() < self.len() { return false; diff --git a/src/literal/mod.rs b/src/literal/mod.rs new file mode 100644 index 0000000000..783c63bd33 --- /dev/null +++ b/src/literal/mod.rs @@ -0,0 +1,55 @@ +pub use self::imp::*; + +#[cfg(feature = "perf-literal")] +mod imp; + +#[allow(missing_docs)] +#[cfg(not(feature = "perf-literal"))] +mod imp { + use syntax::hir::literal::Literals; + + #[derive(Clone, Debug)] + pub struct LiteralSearcher(()); + + impl LiteralSearcher { + pub fn empty() -> Self { + LiteralSearcher(()) + } + + pub fn prefixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn suffixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn complete(&self) -> bool { + false + } + + pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn is_empty(&self) -> bool { + true + } + + pub fn len(&self) -> usize { + 0 + } + + pub fn approximate_size(&self) -> usize { + 0 + } + } +} diff --git a/src/pikevm.rs b/src/pikevm.rs index f88e78ddee..c106c76f31 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -199,7 +199,7 @@ impl<'r, I: Input> Fsm<'r, I> { } } } - if at.pos() == end || at.is_end() { + if at.pos() >= end 
{ break; } at = at_next; diff --git a/src/prog.rs b/src/prog.rs index 3a76efdcb6..6cf4961830 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -161,7 +161,7 @@ impl Program { impl Deref for Program { type Target = [Inst]; - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn deref(&self) -> &Self::Target { &*self.insts } diff --git a/src/re_bytes.rs b/src/re_bytes.rs index b4816415f3..2e38c10ca8 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -5,7 +5,7 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; -use memchr::memchr; +use find_byte::find_byte; use error::Error; use exec::{Exec, ExecNoSync}; @@ -1121,7 +1121,7 @@ impl<'a> Replacer for &'a [u8] { } fn no_expansion(&mut self) -> Option> { - match memchr(b'$', *self) { + match find_byte(b'$', *self) { Some(_) => None, None => Some(Cow::Borrowed(*self)), } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 1512fbe3e0..81aac15260 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -5,7 +5,7 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; -use memchr::memchr; +use find_byte::find_byte; use syntax; use error::Error; @@ -1163,7 +1163,7 @@ impl<'a> Replacer for &'a str { } fn no_expansion(&mut self) -> Option> { - match memchr(b'$', self.as_bytes()) { + match find_byte(b'$', self.as_bytes()) { Some(_) => None, None => Some(Cow::Borrowed(*self)), } diff --git a/src/testdata/basic.dat b/src/testdata/basic.dat index e55efaeec0..632e1bb416 100644 --- a/src/testdata/basic.dat +++ b/src/testdata/basic.dat @@ -48,7 +48,7 @@ E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) E (a.|.a.)*|(a|.a...) 
aa (0,2)(0,2) E ab|a xabc (1,3) E ab|a xxabc (2,4) -Ei (Ab|cD)* aBcD (0,4)(2,4) +Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4) BE [^-] --a (2,3) BE [a-]* --a (0,3) BE [a-m-]* --amoma-- (0,4) diff --git a/test b/test new file mode 100755 index 0000000000..3d1351c957 --- /dev/null +++ b/test @@ -0,0 +1,28 @@ +#!/bin/bash + +# This is a convenience script for running a broad swath of tests across +# features. We don't test the complete space, since the complete space is quite +# large. Hopefully once we migrate the test suite to better infrastructure +# (like regex-automata), we'll be able to test more of the space. +echo "===== DEFAULT FEATURES ===" +cargo test + +echo "===== DOC TESTS ===" +cargo test --doc + +features=( + "std" + "std unicode" + "std unicode-perl" + "std perf" + "std perf-cache" + "std perf-dfa" + "std perf-inline" + "std perf-literal" +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f (default) ===" + cargo test --test default --no-default-features --features "$f" + echo "===== FEATURE: $f (default-bytes) ===" + cargo test --test default-bytes --no-default-features --features "$f" +done diff --git a/tests/api.rs b/tests/api.rs index 4399975277..ff136217e1 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -12,13 +12,13 @@ fn empty_regex_nonempty_match() { #[test] fn one_zero_length_match() { - let re = regex!(r"\d*"); + let re = regex!(r"[0-9]*"); assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2")); } #[test] fn many_zero_length_match() { - let re = regex!(r"\d*"); + let re = regex!(r"[0-9]*"); assert_eq!( vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)], findall!(re, "a1bbb2") @@ -27,7 +27,7 @@ fn many_zero_length_match() { #[test] fn many_sequential_zero_length_match() { - let re = regex!(r"\d?"); + let re = regex!(r"[0-9]?"); assert_eq!( vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)], findall!(re, "a12b3c") @@ -121,7 +121,7 @@ fn capture_index_lifetime() { // This is a test of whether the types on `caps["..."]` are general // enough. 
If not, this will fail to typecheck. fn inner(s: &str) -> usize { - let re = regex!(r"(?P\d+)"); + let re = regex!(r"(?P[0-9]+)"); let caps = re.captures(t!(s)).unwrap(); caps["number"].len() } @@ -172,38 +172,38 @@ fn sub_capture_matches() { assert_eq!(t!("5"), match_text!(subs[4].unwrap())); } -expand!(expand1, r"(?P\w+)", "abc", "$foo", "abc"); -expand!(expand2, r"(?P\w+)", "abc", "$0", "abc"); -expand!(expand3, r"(?P\w+)", "abc", "$1", "abc"); -expand!(expand4, r"(?P\w+)", "abc", "$$1", "$1"); -expand!(expand5, r"(?P\w+)", "abc", "$$foo", "$foo"); -expand!(expand6, r"(?P\w+)\s+(?P\d+)", "abc 123", "$b$a", "123abc"); -expand!(expand7, r"(?P\w+)\s+(?P\d+)", "abc 123", "z$bz$az", "z"); +expand!(expand1, r"(?-u)(?P\w+)", "abc", "$foo", "abc"); +expand!(expand2, r"(?-u)(?P\w+)", "abc", "$0", "abc"); +expand!(expand3, r"(?-u)(?P\w+)", "abc", "$1", "abc"); +expand!(expand4, r"(?-u)(?P\w+)", "abc", "$$1", "$1"); +expand!(expand5, r"(?-u)(?P\w+)", "abc", "$$foo", "$foo"); +expand!(expand6, r"(?-u)(?P\w+)\s+(?P\d+)", "abc 123", "$b$a", "123abc"); +expand!(expand7, r"(?-u)(?P\w+)\s+(?P\d+)", "abc 123", "z$bz$az", "z"); expand!( expand8, - r"(?P\w+)\s+(?P\d+)", + r"(?-u)(?P\w+)\s+(?P\d+)", "abc 123", ".$b.$a.", ".123.abc." 
); expand!( expand9, - r"(?P\w+)\s+(?P\d+)", + r"(?-u)(?P\w+)\s+(?P\d+)", "abc 123", " $b $a ", " 123 abc " ); -expand!(expand10, r"(?P\w+)\s+(?P\d+)", "abc 123", "$bz$az", ""); +expand!(expand10, r"(?-u)(?P\w+)\s+(?P\d+)", "abc 123", "$bz$az", ""); split!( split1, - r"\s+", + r"(?-u)\s+", "a b\nc\td\n\t e", &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")] ); split!( split2, - r"\b", + r"(?-u)\b", "a b c", &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")] ); diff --git a/tests/bytes.rs b/tests/bytes.rs index bf1e7ea737..6c5a11ac77 100644 --- a/tests/bytes.rs +++ b/tests/bytes.rs @@ -10,15 +10,20 @@ impl<'a> R<'a> { } mat!(word_boundary, r"(?-u) \b", " δ", None); +#[cfg(feature = "unicode-perl")] mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] mat!(word_not_boundary_unicode, r" \B", " δ", None); mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); +#[cfg(feature = "unicode-perl")] mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); +#[cfg(feature = "unicode-perl")] mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); // The first `(.+)` matches two Unicode codepoints, but can't match the 5th @@ -35,6 +40,7 @@ mat!( mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); +#[cfg(feature = "unicode-case")] mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); diff --git a/tests/crazy.rs b/tests/crazy.rs index 6c45ceb38e..20a3371b2e 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -3,13 +3,13 @@ mat!(ascii_literal, r"a", "a", Some((0, 1))); // Some crazy expressions from regular-expressions.info. 
mat!( match_ranges, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", "num: 255", Some((5, 8)) ); mat!( match_ranges_not, - r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", "num: 256", None ); @@ -19,13 +19,13 @@ mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))); mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None); mat!( match_email, - r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", "mine is jam.slam@gmail.com ", Some((8, 26)) ); mat!( match_email_not, - r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", "mine is jam.slam@gmail ", None ); @@ -33,19 +33,19 @@ mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{| "mine is jam.slam@gmail.com ", Some((8, 26))); mat!( match_date1, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", "1900-01-01", Some((0, 10)) ); mat!( match_date2, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", "1900-00-01", None ); mat!( match_date3, - r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", "1900-13-01", None ); @@ -81,11 +81,11 @@ matiter!( // Test negated character classes. 
mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3))); mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3))); -mat!(negclass_letter_space, r"[^a\s]", "a x", Some((2, 3))); +mat!(negclass_letter_space, r"[^a[:space:]]", "a x", Some((2, 3))); mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3))); -mat!(negclass_space, r"[^\s]", " a", Some((1, 2))); -mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3))); -mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3))); +mat!(negclass_space, r"[^[:space:]]", " a", Some((1, 2))); +mat!(negclass_space_comma, r"[^,[:space:]]", ", a", Some((2, 3))); +mat!(negclass_comma_space, r"[^[:space:],]", " ,a", Some((2, 3))); mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); // Test that repeated empty expressions don't loop forever. @@ -150,7 +150,7 @@ fn nest_limit_makes_it_parse() { use regex::RegexBuilder; RegexBuilder::new( - r#" + r#"(?-u) 2(?: [45]\d{3}| 7(?: diff --git a/tests/flags.rs b/tests/flags.rs index ed6650eaf0..c33b82d434 100644 --- a/tests/flags.rs +++ b/tests/flags.rs @@ -1,16 +1,31 @@ -mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))); -mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))); -mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None); -mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))); -mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))); -mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None); +mat!(match_flag_case, "(?-u)(?i)abc", "ABC", Some((0, 3))); +mat!(match_flag_weird_case, "(?-u)(?i)a(?-i)bc", "Abc", Some((0, 3))); +mat!(match_flag_weird_case_not, "(?-u)(?i)a(?-i)bc", "ABC", None); +mat!(match_flag_case_dotnl, "(?-u)(?is)a(?u:.)", "A\n", Some((0, 2))); +mat!( + match_flag_case_dotnl_toggle, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\nab", + Some((0, 4)) +); +mat!( + match_flag_case_dotnl_toggle_not, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\na\n", + None +); mat!( match_flag_case_dotnl_toggle_ok, - 
"(?is)a.(?-is:a.)?", + "(?-u)(?is)a(?u:.)(?-is:a(?u:.))?", "A\na\n", Some((0, 2)) ); -mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))); +mat!( + match_flag_multi, + r"(?-u)(?m)(?:^\d+$\n?)+", + "123\n456\n789", + Some((0, 11)) +); mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))); mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))); mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))); diff --git a/tests/fowler.rs b/tests/fowler.rs index 45a225172b..5da32935e7 100644 --- a/tests/fowler.rs +++ b/tests/fowler.rs @@ -1,5 +1,5 @@ // DO NOT EDIT. Automatically generated by 'scripts/regex-match-tests.py' -// on 2015-02-28 11:00:00.161706. +// on 2019-09-02 11:07:37.849994. // Tests from basic.dat mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))); @@ -115,7 +115,13 @@ mat!( ); mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))); mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))); -mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4))); +mat!( + match_basic_51, + r"(?i)(?-u)(Ab|cD)*", + r"aBcD", + Some((0, 4)), + Some((2, 4)) +); mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))); mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))); mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))); diff --git a/tests/regression.rs b/tests/regression.rs index 143486f3fc..4ab8c73406 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -27,11 +27,13 @@ fn regression_invalid_flags_expression() { } // See: https://github.com/rust-lang/regex/issues/75 -mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2))); -mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2))); +mat!(regression_unsorted_binary_search_1, r"(?i-u)[a_]+", "A_", Some((0, 2))); +mat!(regression_unsorted_binary_search_2, r"(?i-u)[A_]+", "a_", Some((0, 2))); // See: https://github.com/rust-lang/regex/issues/99 +#[cfg(feature = 
"unicode-case")] mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None); +#[cfg(feature = "unicode-case")] mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None); // See: https://github.com/rust-lang/regex/issues/101 @@ -53,17 +55,19 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); // See: https://github.com/rust-lang/regex/issues/76 +#[cfg(all(feature = "unicode-case", feature = "unicode-gencat"))] mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); // See: https://github.com/rust-lang/regex/issues/191 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3))); // burntsushi was bad and didn't create an issue for this bug. -mat!(anchored_prefix1, r"^a\S", "a ", None); -mat!(anchored_prefix2, r"^a\S", "foo boo a ", None); +mat!(anchored_prefix1, r"^a[[:^space:]]", "a ", None); +mat!(anchored_prefix2, r"^a[[:^space:]]", "foo boo a ", None); mat!(anchored_prefix3, r"^-[a-z]", "r-f", None); // See: https://github.com/rust-lang/regex/issues/204 +#[cfg(feature = "unicode-perl")] split!( split_on_word_boundary, r"\b", @@ -78,6 +82,7 @@ split!( t!("?)") ] ); +#[cfg(feature = "unicode-perl")] matiter!( word_boundary_dfa, r"\b", @@ -116,6 +121,7 @@ mat!( mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4))); mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4))); mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4))); +#[cfg(feature = "unicode-perl")] mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1))); // See: https://github.com/rust-lang/regex/issues/321 @@ -124,8 +130,8 @@ ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false); // See: https://github.com/BurntSushi/ripgrep/issues/1203 ismatch!(reverse_suffix1, r"[0-4][0-4][0-4]000", "153.230000", true); 
-ismatch!(reverse_suffix2, r"\d\d\d000", "153.230000\n", true); -matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10)); +ismatch!(reverse_suffix2, r"[0-9][0-9][0-9]000", "153.230000\n", true); +matiter!(reverse_suffix3, r"[0-9][0-9][0-9]000", "153.230000\n", (4, 10)); // See: https://github.com/rust-lang/regex/issues/334 // See: https://github.com/rust-lang/regex/issues/557 @@ -150,7 +156,7 @@ mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0))); // See: https://github.com/rust-lang/regex/issues/437 ismatch!( literal_panic, - r"typename type\-parameter\-\d+\-\d+::.+", + r"typename type\-parameter\-[0-9]+\-[0-9]+::.+", "test", false ); @@ -188,6 +194,7 @@ mat!( // See: https://github.com/BurntSushi/ripgrep/issues/1247 #[test] +#[cfg(feature = "unicode-perl")] fn regression_nfa_stops1() { let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap(); assert_eq!(0, re.find_iter(b"s\xE4").count()); diff --git a/tests/replace.rs b/tests/replace.rs index f552203095..c156a399ff 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -9,14 +9,21 @@ macro_rules! 
replace( ); ); -replace!(first, replace, r"\d", "age: 26", t!("Z"), "age: Z6"); -replace!(plus, replace, r"\d+", "age: 26", t!("Z"), "age: Z"); -replace!(all, replace_all, r"\d", "age: 26", t!("Z"), "age: ZZ"); -replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1"); +replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6"); +replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z"); +replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); +replace!( + groups, + replace, + r"(?-u)(\S+)\s+(\S+)", + "w1 w2", + t!("$2 $1"), + "w2 w1" +); replace!( double_dollar, replace, - r"(\S+)\s+(\S+)", + r"(?-u)(\S+)\s+(\S+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -26,7 +33,7 @@ replace!( replace!( named, replace_all, - r"(?P\S+)\s+(?P\S+)(?P\s*)", + r"(?-u)(?P\S+)\s+(?P\S+)(?P\s*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -41,12 +48,26 @@ replace!( ); replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); // replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); -replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a"); -replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1"); +replace!( + simple_expand, + replace_all, + r"(?-u)(\w) (\w)", + "a b", + t!("$2 $1"), + "b a" +); +replace!( + literal_dollar1, + replace_all, + r"(?-u)(\w+) (\w+)", + "a b", + t!("$$1"), + "$1" +); replace!( literal_dollar2, replace_all, - r"(\w+) (\w+)", + r"(?-u)(\w+) (\w+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -54,7 +75,7 @@ replace!( replace!( no_expand1, replace, - r"(\S+)\s+(\S+)", + r"(?-u)(\S+)\s+(\S+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -62,7 +83,7 @@ replace!( replace!( no_expand2, replace, - r"(\S+)\s+(\S+)", + r"(?-u)(\S+)\s+(\S+)", "w1 w2", no_expand!("$$1"), "$$1" @@ -71,7 +92,7 @@ use_!(Captures); replace!( closure_returning_reference, replace, - r"(\d+)", + r"([0-9]+)", "age: 26", |captures: &Captures| { 
match_text!(captures.get(1).unwrap())[0..1].to_owned() @@ -81,7 +102,7 @@ replace!( replace!( closure_returning_value, replace, - r"\d+", + r"[0-9]+", "age: 26", |_captures: &Captures| t!("Z").to_owned(), "age: Z" diff --git a/tests/set.rs b/tests/set.rs index 4aee11397a..3e9755cc26 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -12,7 +12,7 @@ matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1); matset!(set14, &[r".*", "a"], "zzzzzz", 0); -matset!(set15, &[r"\ba\b"], "hello a bye", 0); +matset!(set15, &[r"(?-u)\ba\b"], "hello a bye", 0); matset!(set16, &["a"], "a", 0); matset!(set17, &[".*a"], "a", 0); matset!(set18, &["a", "β"], "β", 1); diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index 660fbe11ba..617185f46f 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -51,6 +51,9 @@ mod replace; mod searcher; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod word_boundary_unicode; diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs index a6e2911334..17df4d85e4 100644 --- a/tests/test_backtrack_bytes.rs +++ b/tests/test_backtrack_bytes.rs @@ -50,6 +50,9 @@ mod regression; mod replace; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod word_boundary_ascii; diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index 143ee9b00c..78a0135bd9 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -53,6 +53,9 @@ mod replace; mod searcher; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod word_boundary_unicode; diff --git 
a/tests/test_default.rs b/tests/test_default.rs index 93a8442b22..c0979c10fc 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -54,8 +54,11 @@ mod searcher; mod set; mod shortest_match; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod word_boundary_unicode; #[test] diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs index d8e78bd7f9..e4a25dc408 100644 --- a/tests/test_default_bytes.rs +++ b/tests/test_default_bytes.rs @@ -70,6 +70,9 @@ mod replace; mod set; mod shortest_match; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; -mod word_boundary_ascii; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index b7f0aab613..05dad2311c 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -45,6 +45,9 @@ mod replace; mod searcher; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod word_boundary_unicode; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index b5b9a7e40c..104231852c 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -50,6 +50,9 @@ mod regression; mod replace; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; -mod word_boundary_ascii; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index e7f7e4ebd6..86487a1ee4 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -49,6 +49,9 @@ mod replace; mod searcher; mod set; mod suffix_reverse; +#[cfg(feature = "unicode")] mod unicode; +#[cfg(feature = "unicode-perl")] mod word_boundary; +#[cfg(feature = "unicode-perl")] mod 
word_boundary_unicode;