Skip to content

Commit a3dbc15

Browse files
authored
feat: add has_non_empty_nulls helper function in OffsetBuffer (#9711)
# Which issue does this PR close? N/A # Rationale for this change In variable-length array types (e.g., `StringArray`, `ListArray`), null entries may have non-empty offset ranges, meaning the underlying data buffer contains data behind nulls. This matters when working on the underlying values of variable-length data, for example when unwrapping (flattening) a list array, as the child values are exposed, including those behind null entries. If null entries point to non-empty ranges, the unwrapped values will contain data that may not be meaningful to operate on and could cause errors (e.g., division by zero in the child values). Cases where this will be helpful: - flattening list array - casting lists/map - we don't want to cast values that are not used, so this is a check for whether there are any - explode on list - we don't want the values behind nulls, so this gives us a check for whether they exist (will have another PR to clean up empty values) - gc on lists/map/strings to remove unneeded data # What changes are included in this PR? Add `OffsetBuffer::has_non_empty_nulls` method that checks if any null positions correspond to non-empty offset ranges # Are these changes tested? Yes # Are there any user-facing changes? Yes, a new public method `OffsetBuffer::has_non_empty_nulls` is added. ------- Related to: - apache/datafusion#18921 as it needs to unwrap the list values and only get the reachable values
1 parent b00b5aa commit a3dbc15

1 file changed

Lines changed: 340 additions & 1 deletion

File tree

arrow-buffer/src/buffer/offset.rs

Lines changed: 340 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use crate::buffer::ScalarBuffer;
19-
use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder};
19+
use crate::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBufferBuilder};
2020
use std::ops::Deref;
2121

2222
/// A non-empty buffer of monotonically increasing, positive integers.
@@ -238,6 +238,94 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
238238
pub fn ptr_eq(&self, other: &Self) -> bool {
239239
self.0.ptr_eq(&other.0)
240240
}
241+
242+
/// Check if any null positions in the `null_buffer` correspond to
243+
/// non-empty ranges in this [`OffsetBuffer`].
244+
///
245+
/// In variable-length array types (e.g., `StringArray`, `ListArray`),
246+
/// null entries may or may not have empty offset ranges. This method
247+
/// detects cases where a null entry has a non-empty range
248+
/// (i.e., `offsets[i] != offsets[i+1]`), which means the underlying
249+
/// data buffer contains data behind nulls.
250+
///
251+
/// This matters because unwrapping (flattening) a list array exposes
252+
/// the child values, including those behind null entries. If null
253+
/// entries point to non-empty ranges, the unwrapped values will
254+
/// contain data that may not be meaningful to operate on and could
255+
/// cause errors (e.g., division by zero in the child values).
256+
///
257+
/// Returns `false` if `null_buffer` is `None` or contains no nulls.
258+
///
259+
/// # Example
260+
///
261+
/// ```
262+
/// # use arrow_buffer::{OffsetBuffer, ScalarBuffer, NullBuffer};
263+
/// // Offsets where null at index 1 has an empty range (3..3)
264+
/// let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6]));
265+
/// let nulls = NullBuffer::from(vec![true, false, true]);
266+
/// assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
267+
///
268+
/// // Offsets where null at index 1 has a non-empty range (3..7)
269+
/// let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10]));
270+
/// let nulls = NullBuffer::from(vec![true, false, true]);
271+
/// assert!(offsets.has_non_empty_nulls(Some(&nulls)));
272+
/// ```
273+
///
274+
/// # Panics
275+
///
276+
/// Panics if `null_buffer` is `Some` and its length does not equal `self.len() - 1`.
277+
pub fn has_non_empty_nulls(&self, null_buffer: Option<&NullBuffer>) -> bool {
278+
let Some(null_buffer) = null_buffer else {
279+
return false;
280+
};
281+
282+
assert_eq!(
283+
self.len() - 1,
284+
null_buffer.len(),
285+
"The length of the offsets should be 1 more than the length of the null buffer"
286+
);
287+
288+
if null_buffer.null_count() == 0 {
289+
return false;
290+
}
291+
292+
// Offsets always have at least 1 value, so indexing the first and last entries cannot fail
293+
let initial_offset = self[0];
294+
let last_offset = self[self.len() - 1];
295+
296+
// If all the values are null, any difference between the first and last offset is data behind nulls
297+
if null_buffer.null_count() == self.len() - 1 {
298+
return last_offset != initial_offset;
299+
}
300+
301+
let mut valid_slices_iter = null_buffer.valid_slices();
302+
303+
// This cannot fail: the all-null case returned above, so there is at least 1 valid value in the array
304+
let (start, end) = valid_slices_iter.next().unwrap();
305+
306+
// If the leading nulls (before the first valid slice) cover a non-empty range
307+
if self[start] != initial_offset {
308+
return true;
309+
}
310+
311+
// `end` is exclusive, so `self[end]` is already the end offset of the last valid value in the slice
312+
// Indexing is in bounds as the length of the array is always 1 less than the length of the offsets
313+
let mut end_offset_of_last_valid_value = self[end];
314+
315+
for (start, end) in valid_slices_iter {
316+
// If a null between two valid slices points to a non-empty range, then the start offset of this valid slice
317+
// will be different from the end offset of the previous valid slice
318+
if self[start] != end_offset_of_last_valid_value {
319+
return true;
320+
}
321+
322+
// `end` is exclusive, so `self[end]` is already the end offset of the last valid value in the slice
323+
// Indexing is in bounds as the length of the array is always 1 less than the length of the offsets
324+
end_offset_of_last_valid_value = self[end];
325+
}
326+
327+
end_offset_of_last_valid_value != last_offset
328+
}
241329
}
242330

243331
impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
@@ -471,4 +559,255 @@ mod tests {
471559
&[0, third_max as i32, (third_max * 2) as i32]
472560
);
473561
}
562+
563+
// ---------------------------------------------------------------
564+
// Tests for has_non_empty_nulls
565+
// ---------------------------------------------------------------
566+
567+
#[test]
568+
fn has_non_empty_nulls_none_null_buffer() {
569+
// No null buffer at all -> false
570+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8]));
571+
assert!(!offsets.has_non_empty_nulls(None));
572+
}
573+
574+
#[test]
575+
fn has_non_empty_nulls_all_valid() {
576+
// Null buffer with zero nulls -> false (early return via the `null_count() == 0` check)
577+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8]));
578+
let nulls = NullBuffer::new_valid(3);
579+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
580+
}
581+
582+
#[test]
583+
fn has_non_empty_nulls_all_null_empty_offsets() {
584+
// All values are null and all offsets are equal (no data behind nulls) -> false
585+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 0, 0]));
586+
let nulls = NullBuffer::new_null(3);
587+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
588+
}
589+
590+
#[test]
591+
fn has_non_empty_nulls_all_null_non_empty_offsets() {
592+
// All values are null but offsets span data -> true
593+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 5, 7]));
594+
let nulls = NullBuffer::new_null(3);
595+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
596+
}
597+
598+
#[test]
599+
fn has_non_empty_nulls_all_null_nonzero_but_equal_offsets() {
600+
// All null, offsets start at non-zero but are all equal -> false
601+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![5, 5, 5]));
602+
let nulls = NullBuffer::new_null(2);
603+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
604+
}
605+
606+
#[test]
607+
fn has_non_empty_nulls_leading_nulls_with_data() {
608+
// Nulls at the beginning that point to non-empty ranges -> true
609+
// offsets: [0, 3, 5, 8] nulls: [false, true, true]
610+
// Index 0 is null with range 0..3 (non-empty)
611+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8]));
612+
let nulls = NullBuffer::from(vec![false, true, true]);
613+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
614+
}
615+
616+
#[test]
617+
fn has_non_empty_nulls_leading_nulls_without_data() {
618+
// Nulls at the beginning with empty ranges -> continue checking
619+
// offsets: [0, 0, 3, 6] nulls: [false, true, true]
620+
// Index 0 is null with range 0..0 (empty)
621+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 6]));
622+
let nulls = NullBuffer::from(vec![false, true, true]);
623+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
624+
}
625+
626+
#[test]
627+
fn has_non_empty_nulls_only_trailing_null_has_data() {
628+
// Only the trailing null region has data, everything else is clean
629+
// offsets: [0, 0, 3, 6, 8] nulls: [false, true, true, false]
630+
// Null at 0 (0..0 empty), valid at 1,2 (0..3, 3..6), null at 3 (6..8 non-empty)
631+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 6, 8]));
632+
let nulls = NullBuffer::from(vec![false, true, true, false]);
633+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
634+
}
635+
636+
#[test]
637+
fn has_non_empty_nulls_trailing_nulls_without_data() {
638+
// Nulls at the end with empty ranges -> false
639+
// offsets: [0, 3, 6, 6] nulls: [true, true, false]
640+
// Index 2 is null with range 6..6 (empty)
641+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6, 6]));
642+
let nulls = NullBuffer::from(vec![true, true, false]);
643+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
644+
}
645+
646+
#[test]
647+
fn has_non_empty_nulls_middle_nulls_with_data() {
648+
// Null in the middle with non-empty range -> true
649+
// offsets: [0, 3, 7, 10] nulls: [true, false, true]
650+
// Index 1 is null with range 3..7 (non-empty)
651+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10]));
652+
let nulls = NullBuffer::from(vec![true, false, true]);
653+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
654+
}
655+
656+
#[test]
657+
fn has_non_empty_nulls_middle_nulls_without_data() {
658+
// Null in the middle with empty range -> false
659+
// offsets: [0, 3, 3, 6] nulls: [true, false, true]
660+
// Index 1 is null with range 3..3 (empty)
661+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6]));
662+
let nulls = NullBuffer::from(vec![true, false, true]);
663+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
664+
}
665+
666+
#[test]
667+
fn has_non_empty_nulls_alternating_null_valid_all_empty() {
668+
// Alternating null/valid where every null has an empty range -> false.
669+
670+
// Ends with null
671+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 6]));
672+
let nulls = NullBuffer::from(vec![false, true, false, true, false]);
673+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
674+
675+
// Ends with valid
676+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 6, 9]));
677+
let nulls = NullBuffer::from(vec![false, true, false, true, false, true]);
678+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
679+
}
680+
681+
#[test]
682+
fn has_non_empty_nulls_multiple_null_regions_second_has_data() {
683+
// Two null regions: first empty, second non-empty -> true
684+
// offsets: [0, 0, 3, 5, 6] nulls: [false, true, false, true]
685+
// Null at index 0 (0..0 empty), null at index 2 (3..5 non-empty)
686+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 5, 6]));
687+
let nulls = NullBuffer::from(vec![false, true, false, true]);
688+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
689+
}
690+
691+
#[test]
692+
fn has_non_empty_nulls_multiple_null_regions_later_gap_has_data() {
693+
// Three null regions: first two empty, third non-empty -> true
694+
// offsets: [0, 0, 3, 3, 6, 8, 10] nulls: [false, true, false, true, false, true]
695+
// valid_slices: (1,2), (3,4), (5,6)
696+
// first slice: start=1, self[1]=0 == initial_offset=0 OK, end_offset=self[2]=3
697+
// loop iter 1: start=3, self[3]=3 == 3 OK (first gap empty), end_offset=self[4]=6
698+
// loop iter 2: start=5, self[5]=8 != 6 -> true (second gap has data)
699+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 8, 10]));
700+
let nulls = NullBuffer::from(vec![false, true, false, true, false, true]);
701+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
702+
}
703+
704+
#[test]
705+
fn has_non_empty_nulls_single_element_null_empty() {
706+
// Single element, null with empty range -> false
707+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0]));
708+
let nulls = NullBuffer::new_null(1);
709+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
710+
}
711+
712+
#[test]
713+
fn has_non_empty_nulls_single_element_null_non_empty() {
714+
// Single element, null with non-empty range -> true
715+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 5]));
716+
let nulls = NullBuffer::new_null(1);
717+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
718+
}
719+
720+
#[test]
721+
fn has_non_empty_nulls_single_element_valid() {
722+
// Single element, valid -> false (no nulls at all)
723+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 5]));
724+
let nulls = NullBuffer::new_valid(1);
725+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
726+
}
727+
728+
#[test]
729+
fn has_non_empty_nulls_consecutive_nulls_between_valid_slices() {
730+
// Multiple consecutive nulls between valid regions
731+
// offsets: [0, 2, 2, 2, 5, 8] nulls: [true, false, false, true, true]
732+
// Valid: [0], nulls: [1,2], valid: [3,4]
733+
// Null region [1,2] has offsets 2..2..2 (empty) -> false
734+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 2, 2, 5, 8]));
735+
let nulls = NullBuffer::from(vec![true, false, false, true, true]);
736+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
737+
}
738+
739+
#[test]
740+
fn has_non_empty_nulls_consecutive_nulls_between_valid_slices_with_data() {
741+
// Multiple consecutive nulls between valid regions, nulls have data
742+
// offsets: [0, 2, 3, 4, 5, 8] nulls: [true, false, false, true, true]
743+
// valid_slices: (0,1), (3,5)
744+
// first slice: start=0, end=1 -> self[0]=0 == initial_offset=0 OK
745+
// end_offset_of_last_valid_value = self[1] = 2
746+
// second slice: start=3, end=5 -> self[3]=4 != 2 -> true
747+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5, 8]));
748+
let nulls = NullBuffer::from(vec![true, false, false, true, true]);
749+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
750+
}
751+
752+
#[test]
753+
fn has_non_empty_nulls_nonzero_initial_offset_all_null_equal() {
754+
// Non-zero starting offset, all null, all offsets equal -> false
755+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![10, 10, 10]));
756+
let nulls = NullBuffer::new_null(2);
757+
assert!(!offsets.has_non_empty_nulls(Some(&nulls)));
758+
}
759+
760+
#[test]
761+
fn has_non_empty_nulls_nonzero_initial_offset_with_data() {
762+
// Non-zero starting offset, null has data
763+
// offsets: [10, 15, 20] nulls: [false, true]
764+
// Null at index 0 with range 10..15 (non-empty) -> true
765+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![10, 15, 20]));
766+
let nulls = NullBuffer::from(vec![false, true]);
767+
assert!(offsets.has_non_empty_nulls(Some(&nulls)));
768+
}
769+
770+
#[test]
771+
fn has_non_empty_nulls_sliced_no_nulls_in_null_region() {
772+
// Original: [0, 3, 3, 6, 6, 9] -> slice(1, 3) -> [3, 3, 6, 6]
773+
// initial_offset=3, last_offset=6
774+
// nulls: [false, true, false] (nulls at index 0 and 2 have ranges 3..3 and 6..6 = empty)
775+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6, 6, 9]));
776+
let sliced = offsets.slice(1, 3);
777+
let nulls = NullBuffer::from(vec![false, true, false]);
778+
assert!(!sliced.has_non_empty_nulls(Some(&nulls)));
779+
}
780+
781+
#[test]
782+
fn has_non_empty_nulls_sliced_null_has_data() {
783+
// Original: [0, 3, 7, 10, 15] -> slice(1, 2) -> [3, 7, 10]
784+
// initial_offset=3, last_offset=10
785+
// nulls: [false, true] (null at index 0 has range 3..7 = non-empty)
786+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10, 15]));
787+
let sliced = offsets.slice(1, 2);
788+
let nulls = NullBuffer::from(vec![false, true]);
789+
assert!(sliced.has_non_empty_nulls(Some(&nulls)));
790+
}
791+
792+
#[test]
793+
#[should_panic(
794+
expected = "The length of the offsets should be 1 more than the length of the null buffer"
795+
)]
796+
fn has_non_empty_nulls_all_valid_mismatched_lengths_too_short() {
797+
// All-valid null buffer with wrong length should still panic
798+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8]));
799+
let nulls = NullBuffer::new_valid(2); // expects 3
800+
offsets.has_non_empty_nulls(Some(&nulls));
801+
}
802+
803+
#[test]
804+
#[should_panic(
805+
expected = "The length of the offsets should be 1 more than the length of the null buffer"
806+
)]
807+
fn has_non_empty_nulls_all_valid_mismatched_lengths_too_long() {
808+
// All-valid null buffer with wrong length should still panic
809+
let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8]));
810+
let nulls = NullBuffer::new_valid(5); // expects 3
811+
offsets.has_non_empty_nulls(Some(&nulls));
812+
}
474813
}

0 commit comments

Comments
 (0)