|
16 | 16 | // under the License. |
17 | 17 |
|
18 | 18 | use crate::buffer::ScalarBuffer; |
19 | | -use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder}; |
| 19 | +use crate::{ArrowNativeType, MutableBuffer, NullBuffer, OffsetBufferBuilder}; |
20 | 20 | use std::ops::Deref; |
21 | 21 |
|
22 | 22 | /// A non-empty buffer of monotonically increasing, positive integers. |
@@ -238,6 +238,94 @@ impl<O: ArrowNativeType> OffsetBuffer<O> { |
238 | 238 | pub fn ptr_eq(&self, other: &Self) -> bool { |
239 | 239 | self.0.ptr_eq(&other.0) |
240 | 240 | } |
| 241 | + |
| 242 | + /// Returns `true` if any null position in the `null_buffer` corresponds to |
| 243 | + /// a non-empty range in this [`OffsetBuffer`]. |
| 244 | + /// |
| 245 | + /// In variable-length array types (e.g., `StringArray`, `ListArray`), |
| 246 | + /// null entries may or may not have empty offset ranges. This method |
| 247 | + /// detects cases where a null entry has a non-empty range |
| 248 | + /// (i.e., `offsets[i] != offsets[i+1]`), which means the underlying |
| 249 | + /// data buffer contains data behind nulls. |
| 250 | + /// |
| 251 | + /// This matters because unwrapping (flattening) a list array exposes |
| 252 | + /// the child values, including those behind null entries. If null |
| 253 | + /// entries point to non-empty ranges, the unwrapped values will |
| 254 | + /// contain data that may not be meaningful to operate on and could |
| 255 | + /// cause errors (e.g., division by zero in the child values). |
| 256 | + /// |
| 257 | + /// Returns `false` if `null_buffer` is `None`, or if it contains no nulls (its length is still validated, see Panics). |
| 258 | + /// |
| 259 | + /// # Example |
| 260 | + /// |
| 261 | + /// ``` |
| 262 | + /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer, NullBuffer}; |
| 263 | + /// // Offsets where null at index 1 has an empty range (3..3) |
| 264 | + /// let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6])); |
| 265 | + /// let nulls = NullBuffer::from(vec![true, false, true]); |
| 266 | + /// assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 267 | + /// |
| 268 | + /// // Offsets where null at index 1 has a non-empty range (3..7) |
| 269 | + /// let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10])); |
| 270 | + /// let nulls = NullBuffer::from(vec![true, false, true]); |
| 271 | + /// assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 272 | + /// ``` |
| 273 | + /// |
| 274 | + /// # Panics |
| 275 | + /// |
| 276 | + /// Panics if `null_buffer` is `Some` and its length does not equal `self.len() - 1`. |
| 277 | + pub fn has_non_empty_nulls(&self, null_buffer: Option<&NullBuffer>) -> bool { |
| 278 | + let Some(null_buffer) = null_buffer else { |
| 279 | + return false; |
| 280 | + }; |
| 281 | + |
| 282 | + assert_eq!( |
| 283 | + self.len() - 1, |
| 284 | + null_buffer.len(), |
| 285 | + "The length of the offsets should be 1 more than the length of the null buffer" |
| 286 | + ); |
| 287 | + |
| 288 | + if null_buffer.null_count() == 0 { |
| 289 | + return false; |
| 290 | + } |
| 291 | + |
| 292 | + // Offsets always have at least 1 value, so both the first and the last index are in bounds |
| 293 | + let initial_offset = self[0]; |
| 294 | + let last_offset = self[self.len() - 1]; |
| 295 | + |
| 296 | + // If all the values are null (offsets have 1 more value than the length of the array), any growth between the first and last offset is data behind nulls |
| 297 | + if null_buffer.null_count() == self.len() - 1 { |
| 298 | + return last_offset != initial_offset; |
| 299 | + } |
| 300 | + |
| 301 | + let mut valid_slices_iter = null_buffer.valid_slices(); |
| 302 | + |
| 303 | + // This is safe as we validated above that there is at least 1 valid value in the array |
| 304 | + let (start, end) = valid_slices_iter.next().unwrap(); |
| 305 | + |
| 306 | + // If the nulls before the first valid slice cover a range with length greater than 0 |
| 307 | + if self[start] != initial_offset { |
| 308 | + return true; |
| 309 | + } |
| 310 | + |
| 311 | + // `end` is exclusive, so `self[end]` is already the end offset of the last valid value in the slice |
| 312 | + // Indexing with `end` is valid as the length of the array is always 1 less than the length of the offsets |
| 313 | + let mut end_offset_of_last_valid_value = self[end]; |
| 314 | + |
| 315 | + for (start, end) in valid_slices_iter { |
| 316 | + // If there is a null value that points to a non-empty range, then the start offset of this valid slice |
| 317 | + // will be different than the end offset of the last valid value |
| 318 | + if self[start] != end_offset_of_last_valid_value { |
| 319 | + return true; |
| 320 | + } |
| 321 | + |
| 322 | + // `end` is exclusive, so `self[end]` is already the end offset of the last valid value in the slice |
| 323 | + // Indexing with `end` is valid as the length of the array is always 1 less than the length of the offsets |
| 324 | + end_offset_of_last_valid_value = self[end]; |
| 325 | + } |
| 326 | + |
| 327 | + end_offset_of_last_valid_value != last_offset |
| 328 | + } |
241 | 329 | } |
242 | 330 |
|
243 | 331 | impl<T: ArrowNativeType> Deref for OffsetBuffer<T> { |
@@ -471,4 +559,255 @@ mod tests { |
471 | 559 | &[0, third_max as i32, (third_max * 2) as i32] |
472 | 560 | ); |
473 | 561 | } |
| 562 | + |
| 563 | + // --------------------------------------------------------------- |
| 564 | + // Tests for has_non_empty_nulls |
| 565 | + // --------------------------------------------------------------- |
| 566 | + |
| 567 | + #[test] |
| 568 | + fn has_non_empty_nulls_none_null_buffer() { |
| 569 | + // `None` passed as the null buffer (array has no validity information) -> false |
| 570 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8])); |
| 571 | + assert!(!offsets.has_non_empty_nulls(None)); |
| 572 | + } |
| 573 | + |
| 574 | + #[test] |
| 575 | + fn has_non_empty_nulls_all_valid() { |
| 576 | + // Null buffer with zero nulls -> false (early return on `null_count() == 0`) |
| 577 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8])); |
| 578 | + let nulls = NullBuffer::new_valid(3); |
| 579 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 580 | + } |
| 581 | + |
| 582 | + #[test] |
| 583 | + fn has_non_empty_nulls_all_null_empty_offsets() { |
| 584 | + // All values are null and every offset is equal (no data behind the nulls) -> false |
| 585 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 0, 0])); |
| 586 | + let nulls = NullBuffer::new_null(3); |
| 587 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 588 | + } |
| 589 | + |
| 590 | + #[test] |
| 591 | + fn has_non_empty_nulls_all_null_non_empty_offsets() { |
| 592 | + // All values are null but the offsets grow from 0 to 7, so there is data behind the nulls -> true |
| 593 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 5, 7])); |
| 594 | + let nulls = NullBuffer::new_null(3); |
| 595 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 596 | + } |
| 597 | + |
| 598 | + #[test] |
| 599 | + fn has_non_empty_nulls_all_null_nonzero_but_equal_offsets() { |
| 600 | + // All null, offsets start at non-zero but are all equal -> false (same scenario as `has_non_empty_nulls_nonzero_initial_offset_all_null_equal`) |
| 601 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![5, 5, 5])); |
| 602 | + let nulls = NullBuffer::new_null(2); |
| 603 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 604 | + } |
| 605 | + |
| 606 | + #[test] |
| 607 | + fn has_non_empty_nulls_leading_nulls_with_data() { |
| 608 | + // A null at the beginning that points to a non-empty range -> true |
| 609 | + // offsets: [0, 3, 5, 8] nulls: [false, true, true] |
| 610 | + // Index 0 is null with range 0..3 (non-empty); indices 1 and 2 are valid |
| 611 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8])); |
| 612 | + let nulls = NullBuffer::from(vec![false, true, true]); |
| 613 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 614 | + } |
| 615 | + |
| 616 | + #[test] |
| 617 | + fn has_non_empty_nulls_leading_nulls_without_data() { |
| 618 | + // A null at the beginning with an empty range, followed only by valid values -> false |
| 619 | + // offsets: [0, 0, 3, 6] nulls: [false, true, true] |
| 620 | + // Index 0 is null with range 0..0 (empty) |
| 621 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 6])); |
| 622 | + let nulls = NullBuffer::from(vec![false, true, true]); |
| 623 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 624 | + } |
| 625 | + |
| 626 | + #[test] |
| 627 | + fn has_non_empty_nulls_only_trailing_null_has_data() { |
| 628 | + // Only the trailing null has data, the leading null is empty -> true |
| 629 | + // offsets: [0, 0, 3, 6, 8] nulls: [false, true, true, false] |
| 630 | + // Null at 0 (0..0 empty), valid at 1,2 (0..3, 3..6), null at 3 (6..8 non-empty) |
| 631 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 6, 8])); |
| 632 | + let nulls = NullBuffer::from(vec![false, true, true, false]); |
| 633 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 634 | + } |
| 635 | + |
| 636 | + #[test] |
| 637 | + fn has_non_empty_nulls_trailing_nulls_without_data() { |
| 638 | + // A null at the end with an empty range -> false |
| 639 | + // offsets: [0, 3, 6, 6] nulls: [true, true, false] |
| 640 | + // Index 2 is null with range 6..6 (empty); indices 0 and 1 are valid |
| 641 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6, 6])); |
| 642 | + let nulls = NullBuffer::from(vec![true, true, false]); |
| 643 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 644 | + } |
| 645 | + |
| 646 | + #[test] |
| 647 | + fn has_non_empty_nulls_middle_nulls_with_data() { |
| 648 | + // A null between two valid values with a non-empty range -> true |
| 649 | + // offsets: [0, 3, 7, 10] nulls: [true, false, true] |
| 650 | + // Index 1 is null with range 3..7 (non-empty) |
| 651 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10])); |
| 652 | + let nulls = NullBuffer::from(vec![true, false, true]); |
| 653 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 654 | + } |
| 655 | + |
| 656 | + #[test] |
| 657 | + fn has_non_empty_nulls_middle_nulls_without_data() { |
| 658 | + // A null between two valid values with an empty range -> false |
| 659 | + // offsets: [0, 3, 3, 6] nulls: [true, false, true] |
| 660 | + // Index 1 is null with range 3..3 (empty) |
| 661 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6])); |
| 662 | + let nulls = NullBuffer::from(vec![true, false, true]); |
| 663 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 664 | + } |
| 665 | + |
| 666 | + #[test] |
| 667 | + fn has_non_empty_nulls_alternating_null_valid_all_empty() { |
| 668 | + // Alternating null/valid (starting with a null) where every null has an empty range -> false. |
| 669 | + |
| 670 | + // Ends with null: nulls at 0, 2, 4 have ranges 0..0, 3..3, 6..6 |
| 671 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 6])); |
| 672 | + let nulls = NullBuffer::from(vec![false, true, false, true, false]); |
| 673 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 674 | + |
| 675 | + // Ends with valid: same as above plus a valid value at index 5 with range 6..9 |
| 676 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 6, 9])); |
| 677 | + let nulls = NullBuffer::from(vec![false, true, false, true, false, true]); |
| 678 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 679 | + } |
| 680 | + |
| 681 | + #[test] |
| 682 | + fn has_non_empty_nulls_multiple_null_regions_second_has_data() { |
| 683 | + // Two null regions: the first is empty, the second is non-empty -> true |
| 684 | + // offsets: [0, 0, 3, 5, 6] nulls: [false, true, false, true] |
| 685 | + // Null at index 0 (0..0 empty), valid at index 1 (0..3), null at index 2 (3..5 non-empty), valid at index 3 (5..6) |
| 686 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 5, 6])); |
| 687 | + let nulls = NullBuffer::from(vec![false, true, false, true]); |
| 688 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 689 | + } |
| 690 | + |
| 691 | + #[test] |
| 692 | + fn has_non_empty_nulls_multiple_null_regions_later_gap_has_data() { |
| 693 | + // Three null regions (indices 0, 2 and 4): the first two are empty, the third is non-empty -> true |
| 694 | + // offsets: [0, 0, 3, 3, 6, 8, 10] nulls: [false, true, false, true, false, true] |
| 695 | + // valid_slices: (1,2), (3,4), (5,6) |
| 696 | + // first slice: start=1, self[1]=0 == initial_offset=0 OK (leading null empty), end_offset=self[2]=3 |
| 697 | + // loop iter 1: start=3, self[3]=3 == 3 OK (first gap between valid slices is empty), end_offset=self[4]=6 |
| 698 | + // loop iter 2: start=5, self[5]=8 != 6 -> true (second gap between valid slices has data) |
| 699 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0, 3, 3, 6, 8, 10])); |
| 700 | + let nulls = NullBuffer::from(vec![false, true, false, true, false, true]); |
| 701 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 702 | + } |
| 703 | + |
| 704 | + #[test] |
| 705 | + fn has_non_empty_nulls_single_element_null_empty() { |
| 706 | + // Single element that is null with an empty range (0..0) -> false |
| 707 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 0])); |
| 708 | + let nulls = NullBuffer::new_null(1); |
| 709 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 710 | + } |
| 711 | + |
| 712 | + #[test] |
| 713 | + fn has_non_empty_nulls_single_element_null_non_empty() { |
| 714 | + // Single element that is null with a non-empty range (0..5) -> true |
| 715 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 5])); |
| 716 | + let nulls = NullBuffer::new_null(1); |
| 717 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 718 | + } |
| 719 | + |
| 720 | + #[test] |
| 721 | + fn has_non_empty_nulls_single_element_valid() { |
| 722 | + // Single element that is valid -> false (the null buffer contains no nulls at all) |
| 723 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 5])); |
| 724 | + let nulls = NullBuffer::new_valid(1); |
| 725 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 726 | + } |
| 727 | + |
| 728 | + #[test] |
| 729 | + fn has_non_empty_nulls_consecutive_nulls_between_valid_slices() { |
| 730 | + // Multiple consecutive nulls between valid regions, all with empty ranges |
| 731 | + // offsets: [0, 2, 2, 2, 5, 8] nulls: [true, false, false, true, true] |
| 732 | + // Valid: [0], nulls: [1,2], valid: [3,4] |
| 733 | + // Null region [1,2] has ranges 2..2 and 2..2 (both empty) -> false |
| 734 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 2, 2, 5, 8])); |
| 735 | + let nulls = NullBuffer::from(vec![true, false, false, true, true]); |
| 736 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 737 | + } |
| 738 | + |
| 739 | + #[test] |
| 740 | + fn has_non_empty_nulls_consecutive_nulls_between_valid_slices_with_data() { |
| 741 | + // Multiple consecutive nulls between valid regions, and the nulls have data (ranges 2..3 and 3..4) |
| 742 | + // offsets: [0, 2, 3, 4, 5, 8] nulls: [true, false, false, true, true] |
| 743 | + // valid_slices: (0,1), (3,5) |
| 744 | + // first slice: start=0, end=1 -> self[0]=0 == initial_offset=0 OK |
| 745 | + // end_offset_of_last_valid_value = self[1] = 2 |
| 746 | + // second slice: start=3, end=5 -> self[3]=4 != 2 -> true |
| 747 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5, 8])); |
| 748 | + let nulls = NullBuffer::from(vec![true, false, false, true, true]); |
| 749 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 750 | + } |
| 751 | + |
| 752 | + #[test] |
| 753 | + fn has_non_empty_nulls_nonzero_initial_offset_all_null_equal() { |
| 754 | + // Non-zero starting offset, all null, all offsets equal (first offset == last offset) -> false |
| 755 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![10, 10, 10])); |
| 756 | + let nulls = NullBuffer::new_null(2); |
| 757 | + assert!(!offsets.has_non_empty_nulls(Some(&nulls))); |
| 758 | + } |
| 759 | + |
| 760 | + #[test] |
| 761 | + fn has_non_empty_nulls_nonzero_initial_offset_with_data() { |
| 762 | + // Non-zero starting offset, and the leading null has data |
| 763 | + // offsets: [10, 15, 20] nulls: [false, true] |
| 764 | + // Null at index 0 with range 10..15 (non-empty) -> true |
| 765 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![10, 15, 20])); |
| 766 | + let nulls = NullBuffer::from(vec![false, true]); |
| 767 | + assert!(offsets.has_non_empty_nulls(Some(&nulls))); |
| 768 | + } |
| 769 | + |
| 770 | + #[test] |
| 771 | + fn has_non_empty_nulls_sliced_no_nulls_in_null_region() { |
| 772 | + // Original: [0, 3, 3, 6, 6, 9] -> slice(1, 3) -> [3, 3, 6, 6] |
| 773 | + // initial_offset=3, last_offset=6 |
| 774 | + // nulls: [false, true, false] (nulls at index 0 and 2 have ranges 3..3 and 6..6 = empty) -> false |
| 775 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 3, 6, 6, 9])); |
| 776 | + let sliced = offsets.slice(1, 3); |
| 777 | + let nulls = NullBuffer::from(vec![false, true, false]); |
| 778 | + assert!(!sliced.has_non_empty_nulls(Some(&nulls))); |
| 779 | + } |
| 780 | + |
| 781 | + #[test] |
| 782 | + fn has_non_empty_nulls_sliced_null_has_data() { |
| 783 | + // Original: [0, 3, 7, 10, 15] -> slice(1, 2) -> [3, 7, 10] |
| 784 | + // initial_offset=3, last_offset=10 |
| 785 | + // nulls: [false, true] (null at index 0 has range 3..7 = non-empty) -> true |
| 786 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 7, 10, 15])); |
| 787 | + let sliced = offsets.slice(1, 2); |
| 788 | + let nulls = NullBuffer::from(vec![false, true]); |
| 789 | + assert!(sliced.has_non_empty_nulls(Some(&nulls))); |
| 790 | + } |
| 791 | + |
| 792 | + #[test] |
| 793 | + #[should_panic( |
| 794 | + expected = "The length of the offsets should be 1 more than the length of the null buffer" |
| 795 | + )] |
| 796 | + fn has_non_empty_nulls_all_valid_mismatched_lengths_too_short() { |
| 797 | + // All-valid null buffer that is too short should still panic: the length is asserted before the `null_count() == 0` early return |
| 798 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8])); |
| 799 | + let nulls = NullBuffer::new_valid(2); // expects 3 |
| 800 | + offsets.has_non_empty_nulls(Some(&nulls)); |
| 801 | + } |
| 802 | + |
| 803 | + #[test] |
| 804 | + #[should_panic( |
| 805 | + expected = "The length of the offsets should be 1 more than the length of the null buffer" |
| 806 | + )] |
| 807 | + fn has_non_empty_nulls_all_valid_mismatched_lengths_too_long() { |
| 808 | + // All-valid null buffer that is too long should still panic: the length is asserted before the `null_count() == 0` early return |
| 809 | + let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 8])); |
| 810 | + let nulls = NullBuffer::new_valid(5); // expects 3 |
| 811 | + offsets.has_non_empty_nulls(Some(&nulls)); |
| 812 | + } |
474 | 813 | } |
0 commit comments