Skip to content
This repository was archived by the owner on May 29, 2023. It is now read-only.

Commit 7ad050d

Browse files
authored
Merge pull request #12 from robinst/handle-escapes-in-character-classes
Handle fancy escapes in character classes
2 parents af9f249 + bfa1b99 commit 7ad050d

File tree

2 files changed

+54
-10
lines changed

2 files changed

+54
-10
lines changed

src/parse.rs

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
//! A regex parser yielding an AST.
2222
23+
use regex::escape;
2324
use bit_set::BitSet;
2425
use std::str::FromStr;
2526
use std::usize;
@@ -199,7 +200,13 @@ impl<'a> Parser<'a> {
199200
}
200201
)),
201202
b'(' => self.parse_group(ix, depth),
202-
b'\\' => self.parse_escape(ix),
203+
b'\\' => {
204+
let (next, expr) = try!(self.parse_escape(ix));
205+
if let Expr::Backref(group) = expr {
206+
self.backrefs.insert(group);
207+
}
208+
Ok((next, expr))
209+
},
203210
b'+' | b'*' | b'?' | b'|' | b')' =>
204211
Ok((ix, Expr::Empty)),
205212
b'[' => self.parse_class(ix),
@@ -221,7 +228,7 @@ impl<'a> Parser<'a> {
221228
}
222229

223230
// ix points to \ character
224-
fn parse_escape(&mut self, ix: usize) -> Result<(usize, Expr)> {
231+
fn parse_escape(&self, ix: usize) -> Result<(usize, Expr)> {
225232
if ix + 1 == self.re.len() {
226233
return Err(Error::TrailingBackslash);
227234
}
@@ -233,7 +240,6 @@ impl<'a> Parser<'a> {
233240
if let Some((end, group)) = parse_decimal(self.re, ix + 1) {
234241
// protect BitSet against unreasonably large value
235242
if group < self.re.len() / 2 {
236-
self.backrefs.insert(group);
237243
return Ok((end, Expr::Backref(group)));
238244
}
239245
}
@@ -331,9 +337,9 @@ impl<'a> Parser<'a> {
331337
fn parse_class(&self, ix: usize) -> Result<(usize, Expr)> {
332338
let bytes = self.re.as_bytes();
333339
let mut ix = ix + 1; // skip opening '['
334-
let mut inner = String::new();
340+
let mut class = String::new();
335341
let mut nest = 1;
336-
inner.push('[');
342+
class.push('[');
337343
loop {
338344
ix = self.optional_whitespace(ix);
339345
if ix == self.re.len() {
@@ -344,27 +350,46 @@ impl<'a> Parser<'a> {
344350
if ix + 1 == self.re.len() {
345351
return Err(Error::InvalidClass);
346352
}
347-
ix + 1 + codepoint_len(bytes[ix + 1])
353+
354+
// We support more escapes than the regex crate, so parse the escape ourselves before delegating.
355+
let (end, expr) = try!(self.parse_escape(ix));
356+
match expr {
357+
Expr::Literal { val, .. } => {
358+
class.push_str(&escape(&val));
359+
}
360+
Expr::Delegate { inner, .. } => {
361+
class.push_str(&inner);
362+
}
363+
_ => {
364+
return Err(Error::InvalidClass);
365+
}
366+
}
367+
end
348368
}
349369
b'[' => {
350370
nest += 1;
371+
class.push('[');
351372
ix + 1
352373
}
353374
b']' => {
354375
nest -= 1;
355376
if nest == 0 {
356377
break;
357378
}
379+
class.push(']');
358380
ix + 1
359381
}
360-
b => ix + codepoint_len(b)
382+
b => {
383+
let end = ix + codepoint_len(b);
384+
class.push_str(&self.re[ix..end]);
385+
end
386+
}
361387
};
362-
inner.push_str(&self.re[ix..end]);
363388
ix = end;
364389
}
365-
inner.push(']');
390+
class.push(']');
366391
let ix = ix + 1; // skip closing ']'
367-
Ok((ix, Expr::Delegate { inner: inner, size: 1 }))
392+
Ok((ix, Expr::Delegate { inner: class, size: 1 }))
368393
}
369394

370395
fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {

tests/matching.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,25 @@ fn control_character_escapes() {
1414
assert_matches(r"\v", "\x0B");
1515
}
1616

17+
#[test]
18+
fn character_class_escapes() {
19+
assert_matches(r"[\[]", "[");
20+
assert_matches(r"[\^]", "^");
21+
22+
// The regex crate would reject the following because it's not necessary to escape them.
23+
// Other engines allow escaping any non-alphanumeric character.
24+
assert_matches(r"[\<]", "<");
25+
assert_matches(r"[\>]", ">");
26+
assert_matches(r"[\.]", ".");
27+
28+
// Character class escape
29+
assert_matches(r"[\d]", "1");
30+
31+
// Control characters
32+
assert_matches(r"[\e]", "\x1B");
33+
assert_matches(r"[\n]", "\x0A");
34+
}
35+
1736

1837
fn assert_matches(re: &str, text: &str) {
1938
let parse_result = Regex::new(re);

0 commit comments

Comments
 (0)