From a08c4e552bfe4048cf55cb68621080697924a166 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 2 Jun 2021 15:41:44 +1200 Subject: [PATCH 1/3] Replace bencher with criterion Enable performance improvements to be tracked over time more easily. --- Cargo.toml | 2 +- benches/graphemes.rs | 59 ++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bf237cf..ae5116b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" -bencher = "0.1" +criterion = "0.3" [[bench]] name = "graphemes" diff --git a/benches/graphemes.rs b/benches/graphemes.rs index 5f14352..1641cfe 100644 --- a/benches/graphemes.rs +++ b/benches/graphemes.rs @@ -1,55 +1,54 @@ -#[macro_use] -extern crate bencher; -extern crate unicode_segmentation; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use unicode_segmentation; -use bencher::Bencher; -use unicode_segmentation::UnicodeSegmentation; use std::fs; +use unicode_segmentation::UnicodeSegmentation; -fn graphemes(bench: &mut Bencher, path: &str) { +fn graphemes(c: &mut Criterion, lang: &str, path: &str) { let text = fs::read_to_string(path).unwrap(); - bench.iter(|| { - for g in UnicodeSegmentation::graphemes(&*text, true) { - bencher::black_box(g); - } - }); - bench.bytes = text.len() as u64; + c.bench_function(&format!("grapheme {}",lang), |bench| { + bench.iter(|| { + for g in UnicodeSegmentation::graphemes(black_box(&*text), true) { + black_box(g); + } + }) + }); } -fn graphemes_arabic(bench: &mut Bencher) { - graphemes(bench, "benches/texts/arabic.txt"); +fn graphemes_arabic(c: &mut Criterion) { + graphemes(c, "arabic" ,"benches/texts/arabic.txt"); } -fn graphemes_english(bench: &mut Bencher) { - graphemes(bench, "benches/texts/english.txt"); +fn graphemes_english(c: &mut Criterion) { + graphemes(c, "english" ,"benches/texts/english.txt"); } -fn graphemes_hindi(bench: &mut Bencher) { - graphemes(bench, "benches/texts/hindi.txt"); +fn graphemes_hindi(c: &mut Criterion) { + graphemes(c, "hindi" ,"benches/texts/hindi.txt"); } -fn graphemes_japanese(bench: &mut Bencher) { - graphemes(bench, "benches/texts/japanese.txt"); +fn graphemes_japanese(c: &mut Criterion) { + graphemes(c, "japanese" ,"benches/texts/japanese.txt"); } -fn graphemes_korean(bench: &mut Bencher) { - graphemes(bench, "benches/texts/korean.txt"); +fn graphemes_korean(c: &mut Criterion) { + graphemes(c, "korean" ,"benches/texts/korean.txt"); } -fn graphemes_mandarin(bench: &mut Bencher) { - graphemes(bench, "benches/texts/mandarin.txt"); +fn graphemes_mandarin(c: &mut Criterion) { + graphemes(c, "mandarin" ,"benches/texts/mandarin.txt"); } -fn graphemes_russian(bench: &mut Bencher) { - graphemes(bench, "benches/texts/russian.txt"); +fn graphemes_russian(c: &mut Criterion) { + graphemes(c, "russian" ,"benches/texts/russian.txt"); } -fn graphemes_source_code(bench: &mut Bencher) { - graphemes(bench, "benches/texts/source_code.txt"); +fn graphemes_source_code(c: &mut Criterion) { + graphemes(c, "source_code","benches/texts/source_code.txt"); } -benchmark_group!( +criterion_group!( benches, graphemes_arabic, graphemes_english, @@ -61,4 +60,4 @@ benchmark_group!( graphemes_source_code, ); -benchmark_main!(benches); +criterion_main!(benches); From 86509eeeaa712e164072d959828ef22cb565b0f2 Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 2 Jun 2021 16:09:16 +1200 Subject: [PATCH 2/3] Increase inlining --- src/grapheme.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/grapheme.rs b/src/grapheme.rs index ef9e1a1..7fb5cc0 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -228,6 +228,7 @@ enum PairResult { Emoji, // a break if preceded by emoji base and (Extend)* } +#[inline] fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { use crate::tables::grapheme::GraphemeCat::*; use self::PairResult::*; @@ -407,6 +408,7 @@ impl GraphemeCursor { } } + #[inline] fn decide(&mut self, is_break: bool) { self.state = if is_break { GraphemeState::Break @@ -415,11 +417,13 @@ impl GraphemeCursor { }; } + #[inline] fn decision(&mut self, is_break: bool) -> Result { self.decide(is_break); Ok(is_break) } + #[inline] fn is_boundary_result(&self) -> Result { if self.state == GraphemeState::Break { Ok(true) @@ -432,6 +436,7 @@ impl GraphemeCursor { } } + #[inline] fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { use crate::tables::grapheme as gr; let mut ris_count = self.ris_count.unwrap_or(0); @@ -452,6 +457,7 @@ impl GraphemeCursor { self.state = GraphemeState::Regional; } + #[inline] fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { use crate::tables::grapheme as gr; let mut iter = chunk.chars().rev(); @@ -482,6 +488,7 @@ impl GraphemeCursor { self.state = GraphemeState::Emoji; } + #[inline] /// Determine whether the current cursor location is a grapheme cluster boundary. /// Only a part of the string need be supplied. If `chunk_start` is nonzero or /// the length of `chunk` is not equal to `len` on creation, then this method @@ -563,6 +570,7 @@ impl GraphemeCursor { } } + #[inline] /// Find the next boundary after the current cursor position. Only a part of /// the string need be supplied. If the chunk is incomplete, then this /// method might return `GraphemeIncomplete::PreContext` or From 9310f0f073c050b0c16aa72de3c75da168d918cd Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 2 Jun 2021 16:14:33 +1200 Subject: [PATCH 3/3] Make bench output same as function names --- benches/graphemes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/graphemes.rs b/benches/graphemes.rs index 1641cfe..8a7a379 100644 --- a/benches/graphemes.rs +++ b/benches/graphemes.rs @@ -7,7 +7,7 @@ use unicode_segmentation::UnicodeSegmentation; fn graphemes(c: &mut Criterion, lang: &str, path: &str) { let text = fs::read_to_string(path).unwrap(); - c.bench_function(&format!("grapheme {}",lang), |bench| { + c.bench_function(&format!("graphemes_{}",lang), |bench| { bench.iter(|| { for g in UnicodeSegmentation::graphemes(black_box(&*text), true) { black_box(g);