Closed
Description
A quick comparison with oniguruma (the "onig" crate) gives the following results:
extern crate regex;
extern crate onig;
use std::io::prelude::*;
use std::fs::File;
use std::time::SystemTime;
#[allow(non_snake_case)]
fn main() {
let ATTR_RE_STR: String = String::new() +
r"([^<>=\s/]+|/)" +
r"(?:" +
r"\s*=\s*" +
r"(?s:" +
r#""(.*?)""# +
r"|" +
r"'(.*?)'" +
r"|" +
r"([^>\s]*)" +
r")" +
r")?\s*";
let TOKEN_RE_STR: String = String::new() +
r"(?is)" +
r"([^<]+)?" +
r"(?:" +
r"<(?:" +
r"!(?:" +
r"DOCTYPE(\s+\w+.*?)" +
r"|" +
r"--(.*?)--\s*" +
r"|" +
r"\[CDATA\[(.*?)\]\]" +
r")" +
r"|" +
r"\?(.*?)\?" +
r"|" +
r"\s*([^<>\s]+\s*(?:" + &ATTR_RE_STR + r")*)" +
r")>" +
r"|" +
r"(<)" +
r")?" +
r"(.*)$";
let mut f = File::open("example.html").unwrap();
let mut buffer = String::new();
f.read_to_string(&mut buffer).unwrap();
{
use regex::Regex;
let re = Regex::new(&TOKEN_RE_STR).unwrap();
let now = SystemTime::now();
if let Some(caps) = re.captures(&buffer) {
let rest = caps.get(12).map(|c| c.as_str()).unwrap_or("");
let elapsed = now.elapsed().unwrap();
println!("[rust-regex] elapsed: {:.5} sec; rest = {}", elapsed.as_secs() as f64 + elapsed.subsec_nanos() as f64 * 1e-9, rest.len());
}
}
{
use onig::{Regex, Syntax, REGEX_OPTION_NONE};
let re = Regex::with_options(&TOKEN_RE_STR, REGEX_OPTION_NONE, Syntax::perl()).unwrap();
let now = SystemTime::now();
if let Some(caps) = re.captures(&buffer) {
let rest = caps.at(12).unwrap_or("");
let elapsed = now.elapsed().unwrap();
println!("[onig] elapsed: {:.5} sec; rest = {}", elapsed.as_secs() as f64 + elapsed.subsec_nanos() as f64 * 1e-9, rest.len());
}
}
}
example.html is just a saved copy of the https://www.bbc.co.uk page, but generally the same results can be reproduced on any input.
# cargo run --release
...
Compiling onig v1.5.0
Compiling regex v0.2.2
...
[rust-regex] elapsed: 0.40805 sec; rest = 292869
[onig] elapsed: 0.00644 sec; rest = 292869
Approximately 60 times slower. Is this by design, or is it a bug?
The regular expression above is used in my crate victoria-dom.
Metadata
Metadata
Assignees
Labels
No labels