Skip to content

Commit b097839

Browse files
01mf02 authored and BurntSushi committed
syntax: support (?< syntax for named groups
It turns out that both '(?P<name>...)' and '(?<name>...)' are rather common among regex engines, and several engines support just one or the other. Until this commit, the regex crate only supported the former, as do RE2, RE2/J and Go's regexp package. There are also several regex engines that only support the latter, such as Onigmo, Oniguruma, Java, Ruby, Boost, .NET and JavaScript. To decrease friction, and because there is relatively little cost to doing so, we elect to support both. It looks like RE2 and Go's regexp package may go the same route, but that isn't fully decided yet: golang/go#58458. Closes #955, Closes #956
1 parent 0a281a8 commit b097839

File tree

5 files changed

+87
-42
lines changed

5 files changed

+87
-42
lines changed

regex-syntax/src/ast/mod.rs

+9-4
Original file line numberDiff line numberDiff line change
@@ -1162,7 +1162,7 @@ impl Group {
11621162
/// Returns true if and only if this group is capturing.
11631163
pub fn is_capturing(&self) -> bool {
11641164
match self.kind {
1165-
GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1165+
GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
11661166
GroupKind::NonCapturing(_) => false,
11671167
}
11681168
}
@@ -1173,7 +1173,7 @@ impl Group {
11731173
pub fn capture_index(&self) -> Option<u32> {
11741174
match self.kind {
11751175
GroupKind::CaptureIndex(i) => Some(i),
1176-
GroupKind::CaptureName(ref x) => Some(x.index),
1176+
GroupKind::CaptureName { ref name, .. } => Some(name.index),
11771177
GroupKind::NonCapturing(_) => None,
11781178
}
11791179
}
@@ -1184,8 +1184,13 @@ impl Group {
11841184
pub enum GroupKind {
11851185
/// `(a)`
11861186
CaptureIndex(u32),
1187-
/// `(?P<name>a)`
1188-
CaptureName(CaptureName),
1187+
/// `(?<name>a)` or `(?P<name>a)`
1188+
CaptureName {
1189+
/// True if the `?P<` syntax is used and false if the `?<` syntax is used.
1190+
starts_with_p: bool,
1191+
/// The capture name.
1192+
name: CaptureName,
1193+
},
11891194
/// `(?:a)` and `(?i:a)`
11901195
NonCapturing(Flags),
11911196
}

regex-syntax/src/ast/parse.rs

+70-33
Original file line numberDiff line numberDiff line change
@@ -1202,12 +1202,16 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
12021202
));
12031203
}
12041204
let inner_span = self.span();
1205-
if self.bump_if("?P<") {
1205+
let mut starts_with_p = true;
1206+
if self.bump_if("?P<") || {
1207+
starts_with_p = false;
1208+
self.bump_if("?<")
1209+
} {
12061210
let capture_index = self.next_capture_index(open_span)?;
1207-
let cap = self.parse_capture_name(capture_index)?;
1211+
let name = self.parse_capture_name(capture_index)?;
12081212
Ok(Either::Right(ast::Group {
12091213
span: open_span,
1210-
kind: ast::GroupKind::CaptureName(cap),
1214+
kind: ast::GroupKind::CaptureName { starts_with_p, name },
12111215
ast: Box::new(Ast::Empty(self.span())),
12121216
}))
12131217
} else if self.bump_if("?") {
@@ -2800,11 +2804,14 @@ bar
28002804
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
28012805
Ast::Group(ast::Group {
28022806
span: span_range(pat, 4..pat.len()),
2803-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
2804-
span: span_range(pat, 9..12),
2805-
name: s("foo"),
2806-
index: 1,
2807-
}),
2807+
kind: ast::GroupKind::CaptureName {
2808+
starts_with_p: true,
2809+
name: ast::CaptureName {
2810+
span: span_range(pat, 9..12),
2811+
name: s("foo"),
2812+
index: 1,
2813+
}
2814+
},
28082815
ast: Box::new(lit_with('a', span_range(pat, 14..15))),
28092816
}),
28102817
]
@@ -3819,27 +3826,48 @@ bar
38193826

38203827
#[test]
38213828
fn parse_capture_name() {
3829+
assert_eq!(
3830+
parser("(?<a>z)").parse(),
3831+
Ok(Ast::Group(ast::Group {
3832+
span: span(0..7),
3833+
kind: ast::GroupKind::CaptureName {
3834+
starts_with_p: false,
3835+
name: ast::CaptureName {
3836+
span: span(3..4),
3837+
name: s("a"),
3838+
index: 1,
3839+
}
3840+
},
3841+
ast: Box::new(lit('z', 5)),
3842+
}))
3843+
);
38223844
assert_eq!(
38233845
parser("(?P<a>z)").parse(),
38243846
Ok(Ast::Group(ast::Group {
38253847
span: span(0..8),
3826-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3827-
span: span(4..5),
3828-
name: s("a"),
3829-
index: 1,
3830-
}),
3848+
kind: ast::GroupKind::CaptureName {
3849+
starts_with_p: true,
3850+
name: ast::CaptureName {
3851+
span: span(4..5),
3852+
name: s("a"),
3853+
index: 1,
3854+
}
3855+
},
38313856
ast: Box::new(lit('z', 6)),
38323857
}))
38333858
);
38343859
assert_eq!(
38353860
parser("(?P<abc>z)").parse(),
38363861
Ok(Ast::Group(ast::Group {
38373862
span: span(0..10),
3838-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3839-
span: span(4..7),
3840-
name: s("abc"),
3841-
index: 1,
3842-
}),
3863+
kind: ast::GroupKind::CaptureName {
3864+
starts_with_p: true,
3865+
name: ast::CaptureName {
3866+
span: span(4..7),
3867+
name: s("abc"),
3868+
index: 1,
3869+
}
3870+
},
38433871
ast: Box::new(lit('z', 8)),
38443872
}))
38453873
);
@@ -3848,11 +3876,14 @@ bar
38483876
parser("(?P<a_1>z)").parse(),
38493877
Ok(Ast::Group(ast::Group {
38503878
span: span(0..10),
3851-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3852-
span: span(4..7),
3853-
name: s("a_1"),
3854-
index: 1,
3855-
}),
3879+
kind: ast::GroupKind::CaptureName {
3880+
starts_with_p: true,
3881+
name: ast::CaptureName {
3882+
span: span(4..7),
3883+
name: s("a_1"),
3884+
index: 1,
3885+
}
3886+
},
38563887
ast: Box::new(lit('z', 8)),
38573888
}))
38583889
);
@@ -3861,11 +3892,14 @@ bar
38613892
parser("(?P<a.1>z)").parse(),
38623893
Ok(Ast::Group(ast::Group {
38633894
span: span(0..10),
3864-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3865-
span: span(4..7),
3866-
name: s("a.1"),
3867-
index: 1,
3868-
}),
3895+
kind: ast::GroupKind::CaptureName {
3896+
starts_with_p: true,
3897+
name: ast::CaptureName {
3898+
span: span(4..7),
3899+
name: s("a.1"),
3900+
index: 1,
3901+
}
3902+
},
38693903
ast: Box::new(lit('z', 8)),
38703904
}))
38713905
);
@@ -3874,11 +3908,14 @@ bar
38743908
parser("(?P<a[1]>z)").parse(),
38753909
Ok(Ast::Group(ast::Group {
38763910
span: span(0..11),
3877-
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3878-
span: span(4..8),
3879-
name: s("a[1]"),
3880-
index: 1,
3881-
}),
3911+
kind: ast::GroupKind::CaptureName {
3912+
starts_with_p: true,
3913+
name: ast::CaptureName {
3914+
span: span(4..8),
3915+
name: s("a[1]"),
3916+
index: 1,
3917+
}
3918+
},
38823919
ast: Box::new(lit('z', 9)),
38833920
}))
38843921
);

regex-syntax/src/ast/print.rs

+5-3
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,10 @@ impl<W: fmt::Write> Writer<W> {
160160
use crate::ast::GroupKind::*;
161161
match ast.kind {
162162
CaptureIndex(_) => self.wtr.write_str("("),
163-
CaptureName(ref x) => {
164-
self.wtr.write_str("(?P<")?;
165-
self.wtr.write_str(&x.name)?;
163+
CaptureName { ref name, starts_with_p } => {
164+
let start = if starts_with_p { "(?P<" } else { "(?<" };
165+
self.wtr.write_str(start)?;
166+
self.wtr.write_str(&name.name)?;
166167
self.wtr.write_str(">")?;
167168
Ok(())
168169
}
@@ -505,6 +506,7 @@ mod tests {
505506
fn print_group() {
506507
roundtrip("(?i:a)");
507508
roundtrip("(?P<foo>a)");
509+
roundtrip("(?<foo>a)");
508510
roundtrip("(a)");
509511
}
510512

regex-syntax/src/hir/translate.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -905,8 +905,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
905905
fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
906906
let (index, name) = match group.kind {
907907
ast::GroupKind::CaptureIndex(index) => (index, None),
908-
ast::GroupKind::CaptureName(ref cap) => {
909-
(cap.index, Some(cap.name.clone().into_boxed_str()))
908+
ast::GroupKind::CaptureName { ref name, .. } => {
909+
(name.index, Some(name.name.clone().into_boxed_str()))
910910
}
911911
// The HIR doesn't need to use non-capturing groups, since the way
912912
// in which the data type is defined handles this automatically.

src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
361361
<pre class="rust">
362362
(exp) numbered capture group (indexed by opening parenthesis)
363363
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
364+
(?&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
364365
(?:exp) non-capturing group
365366
(?flags) set flags within current group
366367
(?flags:exp) set flags for exp (non-capturing)

0 commit comments

Comments
 (0)