Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 143 additions & 3 deletions arrow-array/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ pub unsafe fn export_array_into_raw(
Ok(())
}

// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
// This is set by the Arrow specification
/// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just nicer when working with IDEs.

/// This is set by the Arrow specification
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
if let Some(primitive) = data_type.primitive_width() {
return match i {
Expand Down Expand Up @@ -180,6 +180,10 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
| (DataType::List(_), 1)
| (DataType::Map(_, _), 1) => i32::BITS as _,
(DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _,
// List views have two i32 buffers, offsets and sizes
(DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS as _,
// Large list views have two i64 buffers, offsets and sizes
(DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) => i64::BITS as _,
(DataType::List(_), _) | (DataType::Map(_, _), _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented."
Expand Down Expand Up @@ -351,6 +355,8 @@ impl ImportedArrowArray<'_> {
DataType::List(field)
| DataType::FixedSizeList(field, _)
| DataType::LargeList(field)
| DataType::ListView(field)
| DataType::LargeListView(field)
| DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()),
DataType::Struct(fields) => {
assert!(fields.len() == self.array.num_children());
Expand Down Expand Up @@ -471,6 +477,14 @@ impl ImportedArrowArray<'_> {
debug_assert_eq!(bits % 8, 0);
(length + 1) * (bits / 8)
}
(DataType::ListView(_), 1)
| (DataType::ListView(_), 2)
| (DataType::LargeListView(_), 1)
| (DataType::LargeListView(_), 2) => {
let bits = bit_width(data_type, i)?;
debug_assert_eq!(bits % 8, 0);
length * (bits / 8)
}
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
if self.array.is_empty() {
return Ok(0);
Expand Down Expand Up @@ -553,7 +567,7 @@ mod tests_to_then_from_ffi {
use std::collections::HashMap;
use std::mem::ManuallyDrop;

use arrow_buffer::NullBuffer;
use arrow_buffer::{ArrowNativeType, NullBuffer};
use arrow_schema::Field;

use crate::builder::UnionBuilder;
Expand Down Expand Up @@ -783,6 +797,71 @@ mod tests_to_then_from_ffi {
test_generic_list::<i64>()
}

fn test_generic_list_view<Offset: OffsetSizeTrait + ArrowNativeType>() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7]]
let value_offsets = [0_usize, 3, 6]
.iter()
.map(|i| Offset::from_usize(*i).unwrap())
.collect::<Buffer>();

let sizes_buffer = [3_usize, 3, 2]
.iter()
.map(|i| Offset::from_usize(*i).unwrap())
.collect::<Buffer>();

// Construct a list array from the above two
let list_view_dt = GenericListViewArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new(
Field::new_list_field(DataType::Int16, false),
));

let list_data = ArrayData::builder(list_view_dt)
.len(3)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.build()
.unwrap();

let original = GenericListViewArray::<Offset>::from(list_data.clone());

// export it
let (array, schema) = to_ffi(&original.to_data())?;

// (simulate consumer) import it
let data = unsafe { from_ffi(array, &schema) }?;
let array = make_array(data);

// downcast
let array = array
.as_any()
.downcast_ref::<GenericListViewArray<Offset>>()
.unwrap();

assert_eq!(&array.value(0), &original.value(0));
assert_eq!(&array.value(1), &original.value(1));
assert_eq!(&array.value(2), &original.value(2));

Ok(())
}

#[test]
fn test_list_view() -> Result<()> {
test_generic_list_view::<i32>()
}

#[test]
fn test_large_list_view() -> Result<()> {
test_generic_list_view::<i64>()
}

fn test_generic_binary<Offset: OffsetSizeTrait>() -> Result<()> {
// create an array natively
let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
Expand Down Expand Up @@ -1315,6 +1394,7 @@ mod tests_from_ffi {
use std::ptr::NonNull;
use std::sync::Arc;

use arrow_buffer::NullBuffer;
#[cfg(not(feature = "force_validate"))]
use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer};
#[cfg(feature = "force_validate")]
Expand All @@ -1325,6 +1405,7 @@ mod tests_from_ffi {
use arrow_schema::{DataType, Field};

use super::Result;

use crate::builder::GenericByteViewBuilder;
use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType};
use crate::{
Expand Down Expand Up @@ -1528,6 +1609,65 @@ mod tests_from_ffi {
test_round_trip(&data)
}

#[test]
fn test_list_view() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7]]
let value_offsets = Buffer::from(vec![0_i32, 3, 6]);
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2]);

// Construct a list array from the above two
let list_view_dt =
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, false)));

let list_view_data = ArrayData::builder(list_view_dt)
.len(3)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.build()
.unwrap();

test_round_trip(&list_view_data)
}

#[test]
fn test_list_view_with_nulls() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7], null]
let value_offsets = Buffer::from(vec![0_i32, 3, 6, 8]);
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2, 0]);

// Construct a list array from the above two
let list_view_dt =
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, true)));

let list_view_data = ArrayData::builder(list_view_dt)
.len(4)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.nulls(Some(NullBuffer::from(vec![true, true, true, false])))
.build()
.unwrap();

test_round_trip(&list_view_data)
}

#[test]
#[cfg(not(feature = "force_validate"))]
fn test_empty_string_with_non_zero_offset() -> Result<()> {
Expand Down
2 changes: 1 addition & 1 deletion arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1783,7 +1783,7 @@ impl DataTypeLayout {
},
],
can_contain_null_mask: true,
variadic: true,
variadic: false,
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bug, see the spec, list views aren't represented by a variable number of buffers.

}
}
}
Expand Down
22 changes: 22 additions & 0 deletions arrow-schema/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
let c_child = c_schema.child(0);
DataType::LargeList(Arc::new(Field::try_from(c_child)?))
}
"+vl" => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let c_child = c_schema.child(0);
DataType::ListView(Arc::new(Field::try_from(c_child)?))
}
"+vL" => {
let c_child = c_schema.child(0);
DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
}
"+s" => {
let fields = c_schema.children().map(Field::try_from);
DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
Expand Down Expand Up @@ -657,6 +665,8 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
let children = match dtype {
DataType::List(child)
| DataType::LargeList(child)
| DataType::ListView(child)
| DataType::LargeListView(child)
| DataType::FixedSizeList(child, _)
| DataType::Map(child, _) => {
vec![FFI_ArrowSchema::try_from(child.as_ref())?]
Expand Down Expand Up @@ -746,6 +756,8 @@ fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError>
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
DataType::List(_) => Ok("+l".into()),
DataType::LargeList(_) => Ok("+L".into()),
DataType::ListView(_) => Ok("+vl".into()),
DataType::LargeListView(_) => Ok("+vL".into()),
DataType::Struct(_) => Ok("+s".into()),
DataType::Map(_, _) => Ok("+m".into()),
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
Expand Down Expand Up @@ -874,6 +886,16 @@ mod tests {
DataType::Int16,
false,
))));
round_trip_type(DataType::ListView(Arc::new(Field::new(
"a",
DataType::Int16,
false,
))));
round_trip_type(DataType::LargeListView(Arc::new(Field::new(
"a",
DataType::Int16,
false,
))));
round_trip_type(DataType::Struct(Fields::from(vec![Field::new(
"a",
DataType::Utf8,
Expand Down
Loading