Closed
Description
This program
extern crate regex; // 1.0.5
use regex::Regex;
fn main() {
let text = "foo\nbar\nbaz\n";
let re = Regex::new(r#"(?m)^[ \n]*[a-z]+[ \n]*$"#).unwrap();
for c in re.captures_iter(text) {
println!("{:?}", (c.get(0).unwrap().start(), c.get(0).unwrap().end()));
}
println!("-----------------");
let re = Regex::new(r#"(?m)(^)[ \n]*[a-z]+[ \n]*$"#).unwrap();
for c in re.captures_iter(text) {
println!("{:?}", (c.get(0).unwrap().start(), c.get(0).unwrap().end()));
}
}
outputs
(0, 3)
(3, 7)
(7, 12)
-----------------
(0, 3)
(4, 7)
(8, 12)
but the outputs of the two loops should be identical. This is almost certainly a case where the DFA gets it right (the first case) but the Pike VM/backtracker gets it wrong (the second case). My bet is that the compiler is producing bad bytecode for the capture group. Taking a look at the bytecode:
[andrew@Cheetah regex-debug]$ regex-debug compile '(?m)^[ \n]*[a-z]+[ \n]*$'
0000 Save(0) (start)
0001 StartLine
0002 Split(3, 4)
0003 '\n'-'\n', ' '-' ' (goto: 2)
0004 'a'-'z'
0005 Split(4, 6)
0006 Split(7, 8)
0007 '\n'-'\n', ' '-' ' (goto: 6)
0008 EndLine
0009 Save(1)
0010 Match(0)
[andrew@Cheetah regex-debug]$ regex-debug compile '(?m)(^)[ \n]*[a-z]+[ \n]*$'
0000 Save(0) (start)
0001 Save(2)
0002 StartLine
0003 Save(3)
0004 Split(5, 6)
0005 '\n'-'\n', ' '-' ' (goto: 4)
0006 'a'-'z'
0007 Split(6, 8)
0008 Split(9, 10)
0009 '\n'-'\n', ' '-' ' (goto: 8)
0010 EndLine
0011 Save(1)
0012 Match(0)
If there's an error here, I don't think I see it.
cc @retep998