Skip to content

Improve performance of reading int8/int16 Parquet data #7055

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions parquet/benches/arrow_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ fn build_test_schema() -> SchemaDescPtr {
OPTIONAL FIXED_LEN_BYTE_ARRAY (8) optional_flba8_leaf;
REQUIRED FIXED_LEN_BYTE_ARRAY (16) mandatory_flba16_leaf;
OPTIONAL FIXED_LEN_BYTE_ARRAY (16) optional_flba16_leaf;
REQUIRED INT32 mandatory_uint8_leaf (INTEGER(8, false));
OPTIONAL INT32 optional_uint8_leaf (INTEGER(8, false));
REQUIRED INT32 mandatory_uint16_leaf (INTEGER(16, false));
OPTIONAL INT32 optional_uint16_leaf (INTEGER(16, false));
REQUIRED INT32 mandatory_uint32_leaf (INTEGER(32, false));
OPTIONAL INT32 optional_uint32_leaf (INTEGER(32, false));
REQUIRED INT32 mandatory_int8_leaf (INTEGER(8, true));
OPTIONAL INT32 optional_int8_leaf (INTEGER(8, true));
REQUIRED INT32 mandatory_int16_leaf (INTEGER(16, true));
OPTIONAL INT32 optional_int16_leaf (INTEGER(16, true));
REQUIRED INT64 mandatory_uint64_leaf (INTEGER(64, false));
OPTIONAL INT64 optional_uint64_leaf (INTEGER(64, false));
}
";
parse_message_type(message_type)
Expand Down Expand Up @@ -1280,6 +1292,18 @@ fn add_benches(c: &mut Criterion) {
let string_list_desc = schema.column(14);
let mandatory_binary_column_desc = schema.column(15);
let optional_binary_column_desc = schema.column(16);
let mandatory_uint8_column_desc = schema.column(27);
Copy link
Contributor

@alamb alamb May 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For my own convenience I pulled the benchmark into its own PR to make it easier to compare this branch to main:

let optional_uint8_column_desc = schema.column(28);
let mandatory_uint16_column_desc = schema.column(29);
let optional_uint16_column_desc = schema.column(30);
let mandatory_uint32_column_desc = schema.column(31);
let optional_uint32_column_desc = schema.column(32);
let mandatory_int8_column_desc = schema.column(33);
let optional_int8_column_desc = schema.column(34);
let mandatory_int16_column_desc = schema.column(35);
let optional_int16_column_desc = schema.column(36);
let mandatory_uint64_column_desc = schema.column(37);
let optional_uint64_column_desc = schema.column(38);

// primitive / int32 benchmarks
// =============================
Expand All @@ -1294,6 +1318,61 @@ fn add_benches(c: &mut Criterion) {
);
group.finish();

// primitive int32 / logical uint8 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/UInt8Array");
bench_primitive::<Int32Type>(
&mut group,
&mandatory_uint8_column_desc,
&optional_uint8_column_desc,
0,
256,
);
group.finish();

// primitive int32 / logical int8 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/Int8Array");
bench_primitive::<Int32Type>(
&mut group,
&mandatory_int8_column_desc,
&optional_int8_column_desc,
0,
128,
);
group.finish();

// primitive int32 / logical uint16 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/UInt16Array");
bench_primitive::<Int32Type>(
&mut group,
&mandatory_uint16_column_desc,
&optional_uint16_column_desc,
0,
65536,
);
group.finish();

// primitive int32 / logical int16 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/Int16Array");
bench_primitive::<Int32Type>(
&mut group,
&mandatory_int16_column_desc,
&optional_int16_column_desc,
0,
32768,
);
group.finish();

// primitive int32 / logical uint32 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/UInt32Array");
bench_primitive::<Int32Type>(
&mut group,
&mandatory_uint32_column_desc,
&optional_uint32_column_desc,
0,
1000,
);
group.finish();

// primitive / int64 benchmarks
// =============================

Expand All @@ -1307,6 +1386,17 @@ fn add_benches(c: &mut Criterion) {
);
group.finish();

// primitive int64 / logical uint64 benchmarks
let mut group = c.benchmark_group("arrow_array_reader/UInt64Array");
bench_primitive::<Int64Type>(
&mut group,
&mandatory_uint64_column_desc,
&optional_uint64_column_desc,
0,
1000,
);
group.finish();

// string benchmarks
//==============================

Expand Down
44 changes: 42 additions & 2 deletions parquet/src/arrow/array_reader/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ use arrow_array::{
TimestampNanosecondBufferBuilder, TimestampSecondBufferBuilder,
},
ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Float32Array, Float64Array,
Int32Array, Int64Array, TimestampMicrosecondArray, TimestampMillisecondArray,
TimestampNanosecondArray, TimestampSecondArray, UInt32Array, UInt64Array,
Int16Array, Int32Array, Int64Array, Int8Array, TimestampMicrosecondArray,
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
UInt32Array, UInt64Array, UInt8Array,
};
use arrow_buffer::{i256, BooleanBuffer, Buffer};
use arrow_data::ArrayDataBuilder;
Expand Down Expand Up @@ -261,6 +262,45 @@ where
// - date64: cast int32 to date32, then date32 to date64.
// - decimal: cast int32 to decimal, int64 to decimal
let array = match target_type {
// Using `arrow_cast::cast` has been found to be very slow for converting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there other conversions that can avoid the type cast? i saw this being expensive in some other benchmarks as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I briefly looked at others and it wasn't super obvious

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what benchmark, btw? I am curious

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Several of the clickbench queries (not sure what data types, but it was spending like 20% of samples in casting during parquet reading).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try to burrow down into that code again early next week and see if there are any other obvious candidates. I did try signed->unsigned for 32 and 64 bit ints and there was no difference.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Several of the clickbench queries (not sure what data types, but it was spending like 20% of samples in casting during parquet reading).

FWIW many of the clickbench columns are Int16, as I found when working on #7470.

I started running some benchmarks on a draft update to parquet in this PR (hopefully it will show some improvements)

Copy link
Contributor Author

@etseidl etseidl May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did try signed->unsigned for 32 and 64 bit ints and there was no difference.

Ahh, the reason for this is that I32/64->U32/64 is handled above (around L171). I would think anything that falls through and relies on arrow_cast::cast is going to be potentially slow due to use of unary_opt, but a quick glance at the decimal code looks like it will figure out which casts are infallible and use unary instead. Perhaps other conversions do a similar optimization.

It might be worth exploring enumerating all of the allowed Parquet physical to logical type mappings and account for them here and not rely on arrow_cast machinery.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// INT32 physical type to lower bitwidth logical types. Since rust casts
// are infallible, instead use `unary` which is much faster (by up to 40%).
// One consequence of this approach is that some malformed integer columns
// will return (an arguably correct) result rather than null.
// See https://github.com/apache/arrow-rs/issues/7040 for a discussion of this
// issue.
ArrowType::UInt8 if *(array.data_type()) == ArrowType::Int32 => {
let array = array
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.unary(|i| i as u8) as UInt8Array;
Arc::new(array) as ArrayRef
}
ArrowType::Int8 if *(array.data_type()) == ArrowType::Int32 => {
let array = array
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.unary(|i| i as i8) as Int8Array;
Arc::new(array) as ArrayRef
}
ArrowType::UInt16 if *(array.data_type()) == ArrowType::Int32 => {
let array = array
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.unary(|i| i as u16) as UInt16Array;
Arc::new(array) as ArrayRef
}
ArrowType::Int16 if *(array.data_type()) == ArrowType::Int32 => {
let array = array
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.unary(|i| i as i16) as Int16Array;
Arc::new(array) as ArrayRef
}
ArrowType::Date64 if *(array.data_type()) == ArrowType::Int32 => {
// this is cheap as it internally reinterprets the data
let a = arrow_cast::cast(&array, &ArrowType::Date32)?;
Expand Down
Loading