Skip to content

Commit aa7f631

Browse files
committed
syntax: simplify hir::Repetition
This greatly simplifies how repetitions are represented in the HIR from a sprawling set of variants down to just a simple `(u32, Option<u32>)`. This is much simpler and still permits us to specialize all of the cases we did before if necessary. This also simplifies some of the HIR printer's output. e.g., 'a{1}' is just 'a'.
1 parent c5d32f6 commit aa7f631

File tree

5 files changed

+188
-147
lines changed

5 files changed

+188
-147
lines changed

regex-syntax/src/hir/literal/mod.rs

+18-30
Original file line numberDiff line numberDiff line change
@@ -602,26 +602,19 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
602602
HirKind::Group(hir::Group { ref hir, .. }) => {
603603
prefixes(&**hir, lits);
604604
}
605-
HirKind::Repetition(ref x) => match x.kind {
606-
hir::RepetitionKind::ZeroOrOne => {
605+
HirKind::Repetition(ref x) => match (x.min, x.max) {
606+
(0, Some(1)) => {
607607
repeat_zero_or_one_literals(&x.hir, lits, prefixes);
608608
}
609-
hir::RepetitionKind::ZeroOrMore => {
609+
(0, None) => {
610610
repeat_zero_or_more_literals(&x.hir, lits, prefixes);
611611
}
612-
hir::RepetitionKind::OneOrMore => {
612+
(1, None) => {
613613
repeat_one_or_more_literals(&x.hir, lits, prefixes);
614614
}
615-
hir::RepetitionKind::Range(ref rng) => {
616-
let (min, max) = match *rng {
617-
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
618-
hir::RepetitionRange::AtLeast(m) => (m, None),
619-
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
620-
};
621-
repeat_range_literals(
622-
&x.hir, min, max, x.greedy, lits, prefixes,
623-
)
624-
}
615+
(min, max) => repeat_range_literals(
616+
&x.hir, min, max, x.greedy, lits, prefixes,
617+
),
625618
},
626619
HirKind::Concat(ref es) if es.is_empty() => {}
627620
HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
@@ -678,26 +671,19 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
678671
HirKind::Group(hir::Group { ref hir, .. }) => {
679672
suffixes(&**hir, lits);
680673
}
681-
HirKind::Repetition(ref x) => match x.kind {
682-
hir::RepetitionKind::ZeroOrOne => {
674+
HirKind::Repetition(ref x) => match (x.min, x.max) {
675+
(0, Some(1)) => {
683676
repeat_zero_or_one_literals(&x.hir, lits, suffixes);
684677
}
685-
hir::RepetitionKind::ZeroOrMore => {
678+
(0, None) => {
686679
repeat_zero_or_more_literals(&x.hir, lits, suffixes);
687680
}
688-
hir::RepetitionKind::OneOrMore => {
681+
(1, None) => {
689682
repeat_one_or_more_literals(&x.hir, lits, suffixes);
690683
}
691-
hir::RepetitionKind::Range(ref rng) => {
692-
let (min, max) = match *rng {
693-
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
694-
hir::RepetitionRange::AtLeast(m) => (m, None),
695-
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
696-
};
697-
repeat_range_literals(
698-
&x.hir, min, max, x.greedy, lits, suffixes,
699-
)
700-
}
684+
(min, max) => repeat_range_literals(
685+
&x.hir, min, max, x.greedy, lits, suffixes,
686+
),
701687
},
702688
HirKind::Concat(ref es) if es.is_empty() => {}
703689
HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
@@ -736,7 +722,8 @@ fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
736722
) {
737723
f(
738724
&Hir::repetition(hir::Repetition {
739-
kind: hir::RepetitionKind::ZeroOrMore,
725+
min: 0,
726+
max: None,
740727
// FIXME: Our literal extraction doesn't care about greediness.
741728
// Which is partially why we're treating 'e?' as 'e*'. Namely,
742729
// 'ab??' yields [Complete(ab), Complete(a)], but it should yield
@@ -794,7 +781,8 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
794781
// just treat it as `e*`.
795782
f(
796783
&Hir::repetition(hir::Repetition {
797-
kind: hir::RepetitionKind::ZeroOrMore,
784+
min: 0,
785+
max: None,
798786
greedy,
799787
hir: Box::new(e.clone()),
800788
}),

regex-syntax/src/hir/mod.rs

+18-35
Original file line numberDiff line numberDiff line change
@@ -1362,8 +1362,18 @@ pub enum GroupKind {
13621362
/// sub-expression.
13631363
#[derive(Clone, Debug, Eq, PartialEq)]
13641364
pub struct Repetition {
1365-
/// The kind of this repetition operator.
1366-
pub kind: RepetitionKind,
1365+
/// The minimum range of the repetition.
1366+
///
1367+
/// Note that special cases like `?`, `+` and `*` all get translated into
1368+
/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1369+
pub min: u32,
1370+
/// The maximum range of the repetition.
1371+
///
1372+
/// Note that when `max` is `None`, `min` acts as a lower bound but where
1373+
/// there is no upper bound. For something like `x{5}` where the min and
1374+
/// max are equivalent, `min` will be set to `5` and `max` will be set to
1375+
/// `Some(5)`.
1376+
pub max: Option<u32>,
13671377
/// Whether this repetition operator is greedy or not. A greedy operator
13681378
/// will match as much as it can. A non-greedy operator will match as
13691379
/// little as it can.
@@ -1385,42 +1395,14 @@ impl Repetition {
13851395
/// string and one or more occurrences of something that matches the
13861396
/// empty string will always match the empty string. In order to get the
13871397
/// inductive definition, see the corresponding method on [`Hir`].
1398+
///
1399+
/// This returns true in precisely the cases that [`Repetition::min`]
1400+
/// is equal to `0`.
13881401
pub fn is_match_empty(&self) -> bool {
1389-
match self.kind {
1390-
RepetitionKind::ZeroOrOne => true,
1391-
RepetitionKind::ZeroOrMore => true,
1392-
RepetitionKind::OneOrMore => false,
1393-
RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1394-
RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1395-
RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1396-
}
1402+
self.min == 0
13971403
}
13981404
}
13991405

1400-
/// The kind of a repetition operator.
1401-
#[derive(Clone, Debug, Eq, PartialEq)]
1402-
pub enum RepetitionKind {
1403-
/// Matches a sub-expression zero or one times.
1404-
ZeroOrOne,
1405-
/// Matches a sub-expression zero or more times.
1406-
ZeroOrMore,
1407-
/// Matches a sub-expression one or more times.
1408-
OneOrMore,
1409-
/// Matches a sub-expression within a bounded range of times.
1410-
Range(RepetitionRange),
1411-
}
1412-
1413-
/// The kind of a counted repetition operator.
1414-
#[derive(Clone, Debug, Eq, PartialEq)]
1415-
pub enum RepetitionRange {
1416-
/// Matches a sub-expression exactly this many times.
1417-
Exactly(u32),
1418-
/// Matches a sub-expression at least this many times.
1419-
AtLeast(u32),
1420-
/// Matches a sub-expression at least `m` times and at most `n` times.
1421-
Bounded(u32, u32),
1422-
}
1423-
14241406
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
14251407
/// space but heap space proportional to the depth of the total `Hir`.
14261408
impl Drop for Hir {
@@ -2257,7 +2239,8 @@ mod tests {
22572239
hir: Box::new(expr),
22582240
});
22592241
expr = Hir::repetition(Repetition {
2260-
kind: RepetitionKind::ZeroOrOne,
2242+
min: 0,
2243+
max: Some(1),
22612244
greedy: true,
22622245
hir: Box::new(expr),
22632246
});

regex-syntax/src/hir/print.rs

+113-22
Original file line numberDiff line numberDiff line change
@@ -176,27 +176,31 @@ impl<W: fmt::Write> Visitor for Writer<W> {
176176
| HirKind::Concat(_)
177177
| HirKind::Alternation(_) => {}
178178
HirKind::Repetition(ref x) => {
179-
match x.kind {
180-
hir::RepetitionKind::ZeroOrOne => {
179+
match (x.min, x.max) {
180+
(0, Some(1)) => {
181181
self.wtr.write_str("?")?;
182182
}
183-
hir::RepetitionKind::ZeroOrMore => {
183+
(0, None) => {
184184
self.wtr.write_str("*")?;
185185
}
186-
hir::RepetitionKind::OneOrMore => {
186+
(1, None) => {
187187
self.wtr.write_str("+")?;
188188
}
189-
hir::RepetitionKind::Range(ref x) => match *x {
190-
hir::RepetitionRange::Exactly(m) => {
191-
write!(self.wtr, "{{{}}}", m)?;
192-
}
193-
hir::RepetitionRange::AtLeast(m) => {
194-
write!(self.wtr, "{{{},}}", m)?;
195-
}
196-
hir::RepetitionRange::Bounded(m, n) => {
197-
write!(self.wtr, "{{{},{}}}", m, n)?;
198-
}
199-
},
189+
(1, Some(1)) => {
190+
// 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
191+
return Ok(());
192+
}
193+
(m, None) => {
194+
write!(self.wtr, "{{{},}}", m)?;
195+
}
196+
(m, Some(n)) if m == n => {
197+
write!(self.wtr, "{{{}}}", m)?;
198+
// a{m} and a{m}? are always exactly equivalent.
199+
return Ok(());
200+
}
201+
(m, Some(n)) => {
202+
write!(self.wtr, "{{{},{}}}", m, n)?;
203+
}
200204
}
201205
if !x.greedy {
202206
self.wtr.write_str("?")?;
@@ -241,7 +245,10 @@ impl<W: fmt::Write> Writer<W> {
241245

242246
#[cfg(test)]
243247
mod tests {
244-
use alloc::string::String;
248+
use alloc::{
249+
boxed::Box,
250+
string::{String, ToString},
251+
};
245252

246253
use crate::ParserBuilder;
247254

@@ -338,14 +345,17 @@ mod tests {
338345
roundtrip("a+?", "a+?");
339346
roundtrip("(?U)a+", "a+?");
340347

341-
roundtrip("a{1}", "a{1}");
342-
roundtrip("a{1,}", "a{1,}");
348+
roundtrip("a{1}", "a");
349+
roundtrip("a{2}", "a{2}");
350+
roundtrip("a{1,}", "a+");
343351
roundtrip("a{1,5}", "a{1,5}");
344-
roundtrip("a{1}?", "a{1}?");
345-
roundtrip("a{1,}?", "a{1,}?");
352+
roundtrip("a{1}?", "a");
353+
roundtrip("a{2}?", "a{2}");
354+
roundtrip("a{1,}?", "a+?");
346355
roundtrip("a{1,5}?", "a{1,5}?");
347-
roundtrip("(?U)a{1}", "a{1}?");
348-
roundtrip("(?U)a{1,}", "a{1,}?");
356+
roundtrip("(?U)a{1}", "a");
357+
roundtrip("(?U)a{2}", "a{2}");
358+
roundtrip("(?U)a{1,}", "a+?");
349359
roundtrip("(?U)a{1,5}", "a{1,5}?");
350360
}
351361

@@ -371,4 +381,85 @@ mod tests {
371381
roundtrip("a|b|c", "a|b|c");
372382
roundtrip("foo|bar|quux", "foo|bar|quux");
373383
}
384+
385+
// This is a regression test that stresses a peculiarity of how the HIR
386+
// is both constructed and printed. Namely, it is legal for a repetition
387+
// to directly contain a concatenation. This particular construct isn't
388+
// really possible to build from the concrete syntax directly, since you'd
389+
// be forced to put the concatenation into (at least) a non-capturing
390+
// group. Concurrently, the printer doesn't consider this case and just
391+
// kind of naively prints the child expression and tacks on the repetition
392+
// operator.
393+
//
394+
// As a result, if you attached '+' to a 'concat(a, b)', the printer gives
395+
// you 'ab+', but clearly it really should be '(?:ab)+'.
396+
//
397+
// This bug isn't easy to surface because most ways of building an HIR
398+
// come directly from the concrete syntax, and as mentioned above, it just
399+
// isn't possible to build this kind of HIR from the concrete syntax.
400+
// Nevertheless, this is definitely a bug.
401+
//
402+
// See: https://github.com/rust-lang/regex/issues/731
403+
#[test]
404+
fn regression_repetition_concat() {
405+
let expr = Hir::concat(alloc::vec![
406+
Hir::literal(hir::Literal::Unicode('x')),
407+
Hir::repetition(hir::Repetition {
408+
min: 1,
409+
max: None,
410+
greedy: true,
411+
hir: Box::new(Hir::concat(alloc::vec![
412+
Hir::literal(hir::Literal::Unicode('a')),
413+
Hir::literal(hir::Literal::Unicode('b')),
414+
])),
415+
}),
416+
Hir::literal(hir::Literal::Unicode('y')),
417+
]);
418+
assert_eq!(r"x(?:ab)+y", expr.to_string());
419+
}
420+
421+
// Just like regression_repetition_concat, but with the repetition using
422+
// an alternation as a child expression instead.
423+
//
424+
// See: https://github.com/rust-lang/regex/issues/731
425+
#[test]
426+
fn regression_repetition_alternation() {
427+
let expr = Hir::concat(alloc::vec![
428+
Hir::literal(hir::Literal::Unicode('x')),
429+
Hir::repetition(hir::Repetition {
430+
min: 1,
431+
max: None,
432+
greedy: true,
433+
hir: Box::new(Hir::alternation(alloc::vec![
434+
Hir::literal(hir::Literal::Unicode('a')),
435+
Hir::literal(hir::Literal::Unicode('b')),
436+
])),
437+
}),
438+
Hir::literal(hir::Literal::Unicode('y')),
439+
]);
440+
assert_eq!(r"x(?:a|b)+y", expr.to_string());
441+
}
442+
443+
// This regression test is very similar in flavor to
444+
// regression_repetition_concat in that the root of the issue lies in a
445+
// peculiarity of how the HIR is represented and how the printer writes it
446+
// out. Like the other regression, this one is also rooted in the fact that
447+
// you can't produce the peculiar HIR from the concrete syntax. Namely, you
448+
// just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
449+
// be in (at least) a non-capturing group. Why? Because the '|' has very
450+
// low precedence (lower that concatenation), and so something like 'ab|c'
451+
// is actually 'alt(ab, c)'.
452+
//
453+
// See: https://github.com/rust-lang/regex/issues/516
454+
#[test]
455+
fn regression_alternation_concat() {
456+
let expr = Hir::concat(alloc::vec![
457+
Hir::literal(hir::Literal::Unicode('a')),
458+
Hir::alternation(alloc::vec![
459+
Hir::literal(hir::Literal::Unicode('b')),
460+
Hir::literal(hir::Literal::Unicode('c')),
461+
]),
462+
]);
463+
assert_eq!(r"a(?:b|c)", expr.to_string());
464+
}
374465
}

0 commit comments

Comments
 (0)