Commits (51)
fc36ee6
fix: try merging list of dict if possible
duongcongtoai Oct 19, 2025
e18080c
feat: support interleaving list of struct with dict fields
duongcongtoai Oct 22, 2025
af5c581
test: add null test
duongcongtoai Oct 22, 2025
661a0ee
feat: handle other non merged fields
duongcongtoai Oct 22, 2025
c56354f
test: struct list with mergable dict field
duongcongtoai Oct 23, 2025
a0a607d
fix: handle all dict key size
duongcongtoai Oct 23, 2025
549b8da
fix: linting
duongcongtoai Oct 23, 2025
249422f
test: add bench
duongcongtoai Oct 25, 2025
2ca0ef7
fix: key overflow
duongcongtoai Oct 25, 2025
b0f5e9c
fix: naming
duongcongtoai Oct 25, 2025
8da747d
test: not used test
duongcongtoai Oct 25, 2025
f1b5e4d
chore: doc
duongcongtoai Oct 25, 2025
d18f6f0
chore: add runbench
duongcongtoai Oct 25, 2025
46554d3
chore: rm temp
duongcongtoai Oct 25, 2025
a5f192e
test: add bench for list
duongcongtoai Oct 25, 2025
96bef55
fix: data type
duongcongtoai Oct 25, 2025
77f5371
fix: lint
duongcongtoai Oct 27, 2025
eff7926
feat: best effort merge dictionary on error
duongcongtoai Oct 31, 2025
2331977
feat: simplify the fallback
duongcongtoai Oct 31, 2025
2ea49cc
chore: revert unrelated changes
duongcongtoai Oct 31, 2025
3e0fafb
chore: rm bench script
duongcongtoai Oct 31, 2025
01a7bd4
chore: lint
duongcongtoai Oct 31, 2025
319074e
chore: some more comments
duongcongtoai Oct 31, 2025
b393332
feat: let mutablearraydata handle fallback
duongcongtoai Nov 5, 2025
25e20de
fix: lint
duongcongtoai Nov 5, 2025
3c2d130
fix: handle when all keys are null
duongcongtoai Nov 7, 2025
24edfa7
fix: negative overflow
duongcongtoai Nov 8, 2025
68377e1
fix: minor comment
duongcongtoai Nov 8, 2025
17bc58a
fix: clippy
duongcongtoai Nov 8, 2025
797e236
chore: more comment
duongcongtoai Nov 8, 2025
14cda31
fix: clippy
duongcongtoai Nov 13, 2025
2a9b544
fix: add license
duongcongtoai Nov 13, 2025
407434d
fix: fmt
duongcongtoai Nov 18, 2025
eaab4ff
Merge branch 'main' into fix-overflow-on-interleave-list-of-dict
duongcongtoai Nov 30, 2025
89777ec
fix: more comments
duongcongtoai Dec 9, 2025
1d49d2c
fix: compile err
duongcongtoai Dec 9, 2025
a0e1e30
test: on more distinct keys
duongcongtoai Dec 9, 2025
1998486
test: a case when overflow happens
duongcongtoai Dec 9, 2025
a1b10c9
test: use larger key size
duongcongtoai Dec 9, 2025
042fca8
test: arrow-data
duongcongtoai Dec 10, 2025
b3124d4
Merge remote-tracking branch 'origin/main' into fix-overflow-on-inter…
duongcongtoai Dec 11, 2025
50df8f4
fix: test returned extends closure
duongcongtoai Dec 11, 2025
df12d54
fix: better comment
duongcongtoai Dec 11, 2025
7986107
fix: clippy
duongcongtoai Dec 11, 2025
bf1095e
fix: more cov
duongcongtoai Dec 11, 2025
8a5a028
chore: use different backed value
duongcongtoai Dec 11, 2025
1746bcf
Merge branch 'main' into fix-overflow-on-interleave-list-of-dict
duongcongtoai Dec 14, 2025
9903f59
chore: add some comments on fallback
duongcongtoai Dec 14, 2025
86677e6
Merge branch 'main' of https://github.com/apache/arrow-rs into fix-ov…
toaiduong-blip Apr 23, 2026
8f44ec1
chore: some more test case to track panic in concat
duongcongtoai Apr 23, 2026
8390530
fix: some more test to reproduce key overflow
duongcongtoai Apr 23, 2026
140 changes: 140 additions & 0 deletions arrow-data/src/transform/dictionary.rs
@@ -0,0 +1,140 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::collections::HashMap;

use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};

use crate::{
    ArrayData,
    transform::{_MutableArrayData, Extend, MutableArrayData, utils::iter_in_bytes},
};

pub(crate) fn merge_dictionaries<'a>(
Contributor:
Could we please leave some comments about what this function does and why it is needed? (aka explain the overflow backup case)
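For reference, one possible shape for such a comment (the wording here is only a sketch, not the PR's final text):

/// Merges the dictionaries of `dicts` into a single deduplicated values array and,
/// for each input, returns an `Extend` closure that writes its remapped keys.
/// Used as a fallback when plainly concatenating the dictionaries would overflow
/// the key type (e.g. more than 256 combined values for `UInt8` keys); it still
/// returns `DictionaryKeyOverflowError` if even the deduplicated values cannot fit.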

    key_data_type: &DataType,
    value_data_type: &DataType,
    dicts: &[&'a ArrayData],
) -> Result<(Vec<Extend<'a>>, ArrayData), ArrowError> {
    match key_data_type {
        DataType::UInt8 => merge_dictionaries_casted::<u8>(value_data_type, dicts),
        DataType::UInt16 => merge_dictionaries_casted::<u16>(value_data_type, dicts),
        DataType::UInt32 => merge_dictionaries_casted::<u32>(value_data_type, dicts),
        DataType::UInt64 => merge_dictionaries_casted::<u64>(value_data_type, dicts),
        DataType::Int8 => merge_dictionaries_casted::<i8>(value_data_type, dicts),
        DataType::Int16 => merge_dictionaries_casted::<i16>(value_data_type, dicts),
        DataType::Int32 => merge_dictionaries_casted::<i32>(value_data_type, dicts),
        DataType::Int64 => merge_dictionaries_casted::<i64>(value_data_type, dicts),
        _ => unreachable!(),
    }
}

fn merge_dictionaries_casted<'a, K: ArrowNativeType>(
    data_type: &DataType,
    dicts: &[&'a ArrayData],
) -> Result<(Vec<Extend<'a>>, ArrayData), ArrowError> {
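    // Deduplicate values across all input dictionaries by their raw bytes:
    // `dedup` maps value bytes -> merged key, `indices` records where each
    // first-seen value lives as (dict index, position in that dict's values)
    // so the merged values array can be assembled by `interleave`, and every
    // input dictionary gets a remapped key buffer.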
    let mut dedup = HashMap::new();
    let mut indices = vec![];
    let mut data_refs = vec![];
    let new_dict_keys = dicts
        .iter()
        .enumerate()
        .map(|(dict_idx, dict)| {
            let value_data = dict.child_data().first().unwrap();
            let old_keys = dict.buffer::<K>(0);
            data_refs.push(value_data);
            let mut new_keys = vec![K::usize_as(0); old_keys.len()];
            let values = iter_in_bytes(data_type, value_data);
            for (key_index, old_key) in old_keys.iter().enumerate() {
                if dict.is_valid(key_index) {
                    let value = values[old_key.as_usize()];
                    match K::from_usize(dedup.len()) {
                        Some(idx) => {
                            let idx_for_value = dedup.entry(value).or_insert(idx);
                            // a new entry
                            if *idx_for_value == idx {
                                indices.push((dict_idx, old_key.as_usize()));
                            }

                            new_keys[key_index] = *idx_for_value;
                        }
                        // the built dictionary has reached the cap of the key type
                        None => match dedup.get(value) {
                            // as long as this value has already been indexed,
                            // the merged dictionary is still valid
                            Some(previous_key) => {
                                new_keys[key_index] = *previous_key;
                            }
                            None => return Err(ArrowError::DictionaryKeyOverflowError),
Contributor:
I ran coverage of this code

cargo llvm-cov --html test -p arrow-select

And found that this error path (where the fallback also errors with DictionaryKeyOverflowError) appears not to be covered:

[screenshot: coverage report highlighting the uncovered error branch]

Can you please add a test that covers this?

Author:
let me add more coverage on this
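One way to cover it, sketched here under the assumption that the same imports as the test above are in scope (UInt16Array is assumed for the values so the two dictionaries can carry disjoint value sets):

    #[test]
    fn test_uint8_dictionary_overflow_even_after_merge() {
        // 256 + 256 disjoint values cannot fit into u8 keys even after
        // deduplication, so the fallback merge should also return
        // DictionaryKeyOverflowError.
        let keys = UInt8Array::from_iter_values(0..=255);
        let values_a = UInt16Array::from_iter_values(0..256);
        let values_b = UInt16Array::from_iter_values(256..512);
        let dict_a = DictionaryArray::new(keys.clone(), Arc::new(values_a));
        let dict_b = DictionaryArray::new(keys, Arc::new(values_b));

        let arr_a = Arc::new(dict_a) as ArrayRef;
        let arr_b = Arc::new(dict_b) as ArrayRef;
        let err = concat(&[&arr_a, &arr_b]).unwrap_err();
        assert!(matches!(err, ArrowError::DictionaryKeyOverflowError));
    }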

                        },
                    };
                }
            }

            Ok(new_keys)
        })
        .collect::<Result<Vec<Vec<K>>, ArrowError>>()?;
    let shared_value_data = if indices.is_empty() {
        ArrayData::new_empty(data_refs[0].data_type())
    } else {
        let new_values_data = MutableArrayData::new(data_refs, false, indices.len());
        interleave(new_values_data, indices)
    };

    Ok((
        new_dict_keys
            .into_iter()
            .map(|keys| {
                Box::new(
                    move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
                        mutable
                            .buffer1
                            .extend_from_slice::<K>(&keys[start..start + len]);
                    },
                ) as Extend
            })
            .collect::<Vec<Extend>>(),
        shared_value_data,
    ))
}

fn interleave(mut array_data: MutableArrayData, indices: Vec<(usize, usize)>) -> ArrayData {
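    // Coalesce runs of consecutive rows from the same source array into a
    // single `extend` call, e.g. indices [(0, 0), (0, 1), (0, 2), (1, 5)]
    // become extend(0, 0, 3) followed by extend(1, 5, 6).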
    let mut cur_array = indices[0].0;

    let mut start_row_idx = indices[0].1;
    let mut end_row_idx = start_row_idx + 1;

    for (array, row) in indices.iter().skip(1).copied() {
        if array == cur_array && row == end_row_idx {
            // subsequent row in same batch
            end_row_idx += 1;
            continue;
        }

        // emit current batch of rows for current buffer
        array_data.extend(cur_array, start_row_idx, end_row_idx);

        // start new batch of rows
        cur_array = array;
        start_row_idx = row;
        end_row_idx = start_row_idx + 1;
    }

    // emit final batch of rows
    array_data.extend(cur_array, start_row_idx, end_row_idx);
    array_data.freeze()
}
32 changes: 25 additions & 7 deletions arrow-data/src/transform/mod.rs
@@ -25,11 +25,13 @@ use crate::bit_mask::set_bits;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, bit_util, i256};
use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
use dictionary::merge_dictionaries;
use half::f16;
use num_integer::Integer;
use std::mem;

mod boolean;
mod dictionary;
mod fixed_binary;
mod fixed_size_list;
mod list;
@@ -604,7 +606,7 @@ impl<'a> MutableArrayData<'a> {
        };

        // Get the dictionary if any, and if it is a concatenation of multiple
        let (dictionary, dict_concat) = match &data_type {
        let (mut dictionary, dict_concat) = match &data_type {
            DataType::Dictionary(_, _) => {
                // If more than one dictionary, concatenate dictionaries together
                let dict_concat = !arrays
@@ -660,9 +662,9 @@ impl<'a> MutableArrayData<'a> {
        });

        let extend_values = match &data_type {
            DataType::Dictionary(_, _) => {
            DataType::Dictionary(key_data_type, value_data_type) => {
                let mut next_offset = 0;
                let extend_values: Result<Vec<_>, _> = arrays
                let result = arrays
                    .iter()
                    .map(|array| {
                        let offset = next_offset;
@@ -672,12 +674,24 @@
                            next_offset += dict_len;
                        }

                        build_extend_dictionary(array, offset, offset + dict_len)
                        // -1 since offset is exclusive
Contributor:
I don't understand this comment or change.

I reverted the change:

(venv) andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$ git diff
diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs
index 12b03bbdf0..76a116c4cd 100644
--- a/arrow-data/src/transform/mod.rs
+++ b/arrow-data/src/transform/mod.rs
@@ -674,8 +674,7 @@ impl<'a> MutableArrayData<'a> {
                             next_offset += dict_len;
                         }

-                        // -1 since offset is exclusive
-                        build_extend_dictionary(array, offset, 1.max(offset + dict_len) - 1)
+                        build_extend_dictionary(array, offset, offset + dict_len)
                             .ok_or(ArrowError::DictionaryKeyOverflowError)
                     })
                     .collect::<Result<Vec<_>, ArrowError>>();

And the tests still pass 🤔

Author:
This will trigger the error path:

    #[test]
    fn test_uint8_dictionary_overflow_with_256_items() {
        let dict_arr = {
            let input_1_keys = UInt8Array::from_iter_values(0..=255);
            let input_1_values = UInt8Array::from_iter_values(0..=255);
            let input_1 = DictionaryArray::new(input_1_keys, Arc::new(input_1_values));
            input_1
        };

        let arr1 = Arc::new(dict_arr) as ArrayRef;
        let arr2 = arr1.clone();

        concat(&[&arr1, &arr2]).unwrap();
    }

when it reaches this function

                        build_extend_dictionary(array, offset, offset + dict_len)
                            .ok_or(ArrowError::DictionaryKeyOverflowError)

offset will be 0 and dict_len will be 256, so build_extend_dictionary will try to cast 256 to u8, which returns DictionaryKeyOverflowError even though it shouldn't. The test passes anyway because we already added a fallback for this error.

                        build_extend_dictionary(array, offset, 1.max(offset + dict_len) - 1)
Contributor:
Rather than converting an Option --> Result, what about changing build_extend_dictionary to return the Result directly? (This would also make it easier to ensure you only return DictionaryKeyOverflowError when the key actually overflowed.)

Author:
comment addressed

                            .ok_or(ArrowError::DictionaryKeyOverflowError)
                    })
                    .collect();

                extend_values.expect("MutableArrayData::new is infallible")
                    .collect::<Result<Vec<_>, ArrowError>>();
                match result {
                    Err(_) => {
Contributor:
I think we should only retry when the Err is DictionaryKeyOverflowError -- this code retries regardless of the underlying error

Contributor:
I also think that it would help to add a comment explaining the rationale for this fallback -- namely something like "if the dictionary key overflows, it means there are too many keys in the concatenated dictionary -- in that case fall back to the slower path of merging (deduplicating) the dictionaries".

Contributor:
Also, I was confused for a while about how this could detect an error, since it happens when constructing the extend closures, not when actually running them.

I think I understand now (it hasn't changed in this PR) -- the maximum dictionary key is computed based on each dictionary's size, which makes sense.

Author:
I added some more comments.
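A minimal sketch of the narrower fallback described above, assuming the variables shown in this diff (`result`, `key_data_type`, `value_data_type`, `arrays`, `dictionary`); because `MutableArrayData::new` is infallible today, the non-overflow arm panics here rather than propagating:

match result {
    // Too many combined keys for the key type: fall back to the slower
    // merge (deduplicate) path.
    Err(ArrowError::DictionaryKeyOverflowError) => {
        let (extends, merged_dictionary_values) = merge_dictionaries(
            key_data_type.as_ref(),
            value_data_type.as_ref(),
            &arrays,
        )
        .expect("fail merging dictionary");
        dictionary = Some(merged_dictionary_values);
        extends
    }
    // Any other error is unexpected here and should not silently trigger a retry.
    Err(e) => panic!("unexpected error building dictionary extends: {e}"),
    Ok(extends) => extends,
}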

                        let (extends, merged_dictionary_values) = merge_dictionaries(
                            key_data_type.as_ref(),
                            value_data_type.as_ref(),
                            &arrays,
                        )
                        .expect("fail merging dictionary");
                        dictionary = Some(merged_dictionary_values);
                        extends
                    }
                    Ok(extends) => extends,
                }
            }
            DataType::BinaryView | DataType::Utf8View => {
                let mut next_offset = 0u32;
@@ -705,6 +719,7 @@ impl<'a> MutableArrayData<'a> {
            buffer2,
            child_data,
        };

        Self {
            arrays,
            data,
@@ -841,6 +856,9 @@ mod test {
    use arrow_schema::Field;
    use std::sync::Arc;

    #[test]
    fn test_dictionary_overflow() {}

    #[test]
    fn test_list_append_with_capacities() {
        let array = ArrayData::new_empty(&DataType::List(Arc::new(Field::new(
34 changes: 34 additions & 0 deletions arrow-data/src/transform/utils.rs
@@ -16,9 +16,12 @@
// under the License.

use arrow_buffer::{ArrowNativeType, MutableBuffer, bit_util};
use arrow_schema::DataType;
use num_integer::Integer;
use num_traits::CheckedAdd;

use crate::ArrayData;

/// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero.
#[inline]
pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) {
@@ -58,6 +61,37 @@ pub(super) unsafe fn get_last_offset<T: ArrowNativeType>(offset_buffer: &Mutable
    *unsafe { offsets.get_unchecked(offsets.len() - 1) }
}

fn iter_in_bytes_variable_sized<T: ArrowNativeType + Integer>(data: &ArrayData) -> Vec<&[u8]> {
    let offsets = data.buffer::<T>(0);

    // the offsets of the `ArrayData` are ignored as they are only applied to the offset buffer.
    let values = data.buffers()[1].as_slice();
    (0..data.len())
        .map(move |i| {
            let start = offsets[i].to_usize().unwrap();
            let end = offsets[i + 1].to_usize().unwrap();
            &values[start..end]
        })
        .collect::<Vec<_>>()
}

fn iter_in_bytes_fixed_sized(data: &ArrayData, size: usize) -> Vec<&[u8]> {
    let values = &data.buffers()[0].as_slice()[data.offset() * size..];
    values.chunks(size).collect::<Vec<_>>()
}

/// iterate values in raw bytes regardless of nullability
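/// (for example, a `Utf8` array `["a", "bc"]` yields `[b"a", b"bc"]`)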
pub(crate) fn iter_in_bytes<'a>(data_type: &DataType, data: &'a ArrayData) -> Vec<&'a [u8]> {
Contributor:
this is called iter_in_bytes... but it returns a vec 🤔

Author:
I renamed it.

    if data_type.is_primitive() {
        return iter_in_bytes_fixed_sized(data, data_type.primitive_width().unwrap());
    }
    match data_type {
        DataType::Utf8 | DataType::Binary => iter_in_bytes_variable_sized::<i32>(data),
        DataType::LargeUtf8 | DataType::LargeBinary => iter_in_bytes_variable_sized::<i64>(data),
        _ => unimplemented!("iter in bytes is not supported for {data_type}"),
    }
}

#[cfg(test)]
mod tests {
    use crate::transform::utils::extend_offsets;