Skip to content

Commit 320e4d6

Browse files
authored
Optimize performance of initcap function (~2x faster) (#13691)
* Optimize performance of initcap (~2x faster)

  Signed-off-by: Tai Le Manh <[email protected]>

* format

---------

Signed-off-by: Tai Le Manh <[email protected]>
1 parent aeddbd9 commit 320e4d6

File tree

3 files changed

+112
-13
lines changed

3 files changed

+112
-13
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,8 @@ required-features = ["unicode_expressions"]
207207
harness = false
208208
name = "trunc"
209209
required-features = ["math_expressions"]
210+
211+
[[bench]]
212+
harness = false
213+
name = "initcap"
214+
required-features = ["string_expressions"]
datafusion/functions/benches/initcap.rs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::array::OffsetSizeTrait;
21+
use arrow::datatypes::DataType;
22+
use arrow::util::bench_util::{
23+
create_string_array_with_len, create_string_view_array_with_len,
24+
};
25+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
26+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
27+
use datafusion_functions::string;
28+
use std::sync::Arc;
29+
30+
fn create_args<O: OffsetSizeTrait>(
31+
size: usize,
32+
str_len: usize,
33+
force_view_types: bool,
34+
) -> Vec<ColumnarValue> {
35+
if force_view_types {
36+
let string_array =
37+
Arc::new(create_string_view_array_with_len(size, 0.2, str_len, false));
38+
39+
vec![ColumnarValue::Array(string_array)]
40+
} else {
41+
let string_array =
42+
Arc::new(create_string_array_with_len::<O>(size, 0.2, str_len));
43+
44+
vec![ColumnarValue::Array(string_array)]
45+
}
46+
}
47+
48+
fn criterion_benchmark(c: &mut Criterion) {
49+
let initcap = string::initcap();
50+
for size in [1024, 4096] {
51+
let args = create_args::<i32>(size, 8, true);
52+
c.bench_function(
53+
format!("initcap string view shorter than 12 [size={}]", size).as_str(),
54+
|b| {
55+
b.iter(|| {
56+
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
57+
args: args.clone(),
58+
number_rows: size,
59+
return_type: &DataType::Utf8View,
60+
}))
61+
})
62+
},
63+
);
64+
65+
let args = create_args::<i32>(size, 16, true);
66+
c.bench_function(
67+
format!("initcap string view longer than 12 [size={}]", size).as_str(),
68+
|b| {
69+
b.iter(|| {
70+
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
71+
args: args.clone(),
72+
number_rows: size,
73+
return_type: &DataType::Utf8View,
74+
}))
75+
})
76+
},
77+
);
78+
79+
let args = create_args::<i32>(size, 16, false);
80+
c.bench_function(format!("initcap string [size={}]", size).as_str(), |b| {
81+
b.iter(|| {
82+
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
83+
args: args.clone(),
84+
number_rows: size,
85+
return_type: &DataType::Utf8,
86+
}))
87+
})
88+
});
89+
}
90+
}
91+
92+
criterion_group!(benches, criterion_benchmark);
93+
criterion_main!(benches);

datafusion/functions/src/string/initcap.rs

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -132,21 +132,22 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
132132
Ok(Arc::new(result) as ArrayRef)
133133
}
134134

135-
/// Converts the first letter of every word to upper case and the remaining
/// letters to lower case.
///
/// A "word" is a maximal run of alphanumeric characters; any other character
/// is copied through and starts a new word. A `None` input yields `None`.
///
/// Pure-ASCII inputs take a fast path that uses the ASCII-only case mappings.
/// Inputs containing non-ASCII characters use the Unicode-aware mappings, so
/// letters such as `é` or `Л` are re-cased and are not mistaken for word
/// separators (previously `"élan"` became `"éLan"`).
fn initcap_string(input: Option<&str>) -> Option<String> {
    input.map(|s| {
        // For ASCII input the output has exactly the same byte length; for
        // other input this is only an initial capacity hint.
        let mut result = String::with_capacity(s.len());
        let mut prev_is_alphanumeric = false;

        if s.is_ascii() {
            for c in s.chars() {
                let transformed = if prev_is_alphanumeric {
                    c.to_ascii_lowercase()
                } else {
                    c.to_ascii_uppercase()
                };
                result.push(transformed);
                prev_is_alphanumeric = c.is_ascii_alphanumeric();
            }
        } else {
            for c in s.chars() {
                // Unicode case mappings may expand to several chars, hence
                // `extend` rather than `push`.
                if prev_is_alphanumeric {
                    result.extend(c.to_lowercase());
                } else {
                    result.extend(c.to_uppercase());
                }
                prev_is_alphanumeric = c.is_alphanumeric();
            }
        }

        result
    })
}
152153

0 commit comments

Comments
 (0)