
Commit b7a9167

Merge remote-tracking branch 'apache/main' into fix_uint_cast

2 parents 6f583c4 + 0e48877
File tree: 33 files changed, +1871 −212 lines

arrow-array/src/array/list_array.rs

Lines changed: 4 additions & 0 deletions
@@ -42,16 +42,20 @@ pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer {
     const IS_LARGE: bool;
     /// Prefix for the offset size
     const PREFIX: &'static str;
+    /// The max `usize` offset
+    const MAX_OFFSET: usize;
 }
 
 impl OffsetSizeTrait for i32 {
     const IS_LARGE: bool = false;
     const PREFIX: &'static str = "";
+    const MAX_OFFSET: usize = i32::MAX as usize;
 }
 
 impl OffsetSizeTrait for i64 {
     const IS_LARGE: bool = true;
     const PREFIX: &'static str = "Large";
+    const MAX_OFFSET: usize = i64::MAX as usize;
 }
 
 /// An array of [variable length lists], similar to JSON arrays
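
A minimal sketch of how downstream code might use the new `MAX_OFFSET` constant to guard against offset overflow when building variable-length arrays. `check_offset_capacity` is a hypothetical helper, not part of this commit:

use arrow_array::OffsetSizeTrait;
use arrow_schema::ArrowError;

// Hypothetical guard: reject building a (Large)ListArray whose flattened
// values region would overflow the offset type (i32 or i64).
fn check_offset_capacity<O: OffsetSizeTrait>(total_values: usize) -> Result<(), ArrowError> {
    if total_values > O::MAX_OFFSET {
        return Err(ArrowError::ComputeError(format!(
            "{} values exceed the maximum offset {} of a {}ListArray",
            total_values,
            O::MAX_OFFSET,
            O::PREFIX
        )));
    }
    Ok(())
}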

arrow-array/src/array/struct_array.rs

Lines changed: 85 additions & 5 deletions
@@ -91,6 +91,28 @@ impl StructArray {
         Self::try_new(fields, arrays, nulls).unwrap()
     }
 
+    /// Create a new [`StructArray`] from the provided parts, returning an error on failure
+    ///
+    /// The length will be inferred from the length of the child arrays. Returns an error if
+    /// there are no child arrays. Consider using [`Self::try_new_with_length`] if the length
+    /// is known to avoid this.
+    ///
+    /// # Errors
+    ///
+    /// Errors if
+    ///
+    /// * `fields.len() == 0`
+    /// * Any reason that [`Self::try_new_with_length`] would error
+    pub fn try_new(
+        fields: Fields,
+        arrays: Vec<ArrayRef>,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::try_new_with_length or StructArray::new_empty to create a struct array with no fields so that the length can be set correctly".to_string()))?;
+
+        Self::try_new_with_length(fields, arrays, nulls, len)
+    }
+
     /// Create a new [`StructArray`] from the provided parts, returning an error on failure
     ///
     /// # Errors
@@ -102,10 +124,11 @@ impl StructArray {
     /// * `arrays[i].len() != arrays[j].len()`
     /// * `arrays[i].len() != nulls.len()`
     /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
-    pub fn try_new(
+    pub fn try_new_with_length(
         fields: Fields,
         arrays: Vec<ArrayRef>,
         nulls: Option<NullBuffer>,
+        len: usize,
     ) -> Result<Self, ArrowError> {
         if fields.len() != arrays.len() {
             return Err(ArrowError::InvalidArgumentError(format!(
@@ -114,7 +137,6 @@ impl StructArray {
                 arrays.len()
             )));
         }
-        let len = arrays.first().map(|x| x.len()).unwrap_or_default();
 
         if let Some(n) = nulls.as_ref() {
             if n.len() != len {
@@ -181,6 +203,10 @@ impl StructArray {
 
     /// Create a new [`StructArray`] from the provided parts without validation
     ///
+    /// The length will be inferred from the length of the child arrays. Panics if there are no
+    /// child arrays. Consider using [`Self::new_unchecked_with_length`] if the length is known
+    /// to avoid this.
+    ///
     /// # Safety
     ///
     /// Safe if [`Self::new`] would not panic with the given arguments
@@ -193,7 +219,32 @@ impl StructArray {
             return Self::new(fields, arrays, nulls);
         }
 
-        let len = arrays.first().map(|x| x.len()).unwrap_or_default();
+        let len = arrays.first().map(|x| x.len()).expect(
+            "cannot use StructArray::new_unchecked if there are no fields, length is unknown",
+        );
+        Self {
+            len,
+            data_type: DataType::Struct(fields),
+            nulls,
+            fields: arrays,
+        }
+    }
+
+    /// Create a new [`StructArray`] from the provided parts without validation
+    ///
+    /// # Safety
+    ///
+    /// Safe if [`Self::new`] would not panic with the given arguments
+    pub unsafe fn new_unchecked_with_length(
+        fields: Fields,
+        arrays: Vec<ArrayRef>,
+        nulls: Option<NullBuffer>,
+        len: usize,
+    ) -> Self {
+        if cfg!(feature = "force_validate") {
+            return Self::try_new_with_length(fields, arrays, nulls, len).unwrap();
+        }
 
         Self {
             len,
             data_type: DataType::Struct(fields),
@@ -817,9 +868,38 @@ mod tests {
     }
 
     #[test]
+    #[should_panic(expected = "use StructArray::try_new_with_length")]
     fn test_struct_array_from_empty() {
-        let sa = StructArray::from(vec![]);
-        assert!(sa.is_empty())
+        // This can't work because we don't know how many rows the array should have.
+        // Previously we inferred 0, but that often led to bugs.
+        let _ = StructArray::from(vec![]);
+    }
+
+    #[test]
+    fn test_empty_struct_array() {
+        assert!(StructArray::try_new(Fields::empty(), vec![], None).is_err());
+
+        let arr = StructArray::new_empty_fields(10, None);
+        assert_eq!(arr.len(), 10);
+        assert_eq!(arr.null_count(), 0);
+        assert_eq!(arr.num_columns(), 0);
+
+        let arr2 = StructArray::try_new_with_length(Fields::empty(), vec![], None, 10).unwrap();
+        assert_eq!(arr2.len(), 10);
+
+        let arr = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10)));
+        assert_eq!(arr.len(), 10);
+        assert_eq!(arr.null_count(), 10);
+        assert_eq!(arr.num_columns(), 0);
+
+        let arr2 = StructArray::try_new_with_length(
+            Fields::empty(),
+            vec![],
+            Some(NullBuffer::new_null(10)),
+            10,
+        )
+        .unwrap();
+        assert_eq!(arr2.len(), 10);
     }
 
     #[test]
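
To illustrate the split between inferred and explicit lengths, a short sketch using the constructors added above; the behavior follows directly from this diff:

use std::sync::Arc;
use arrow_array::{Array, ArrayRef, Int32Array, StructArray};
use arrow_schema::{DataType, Field, Fields};

fn main() {
    // With child arrays present, try_new still infers the length (3 here).
    let fields = Fields::from(vec![Field::new("a", DataType::Int32, false)]);
    let arrays: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1, 2, 3]))];
    let inferred = StructArray::try_new(fields, arrays, None).unwrap();
    assert_eq!(inferred.len(), 3);

    // With no child arrays the length is ambiguous, so it must now be explicit.
    assert!(StructArray::try_new(Fields::empty(), vec![], None).is_err());
    let explicit = StructArray::try_new_with_length(Fields::empty(), vec![], None, 3).unwrap();
    assert_eq!(explicit.len(), 3);
}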

arrow-array/src/record_batch.rs

Lines changed: 37 additions & 4 deletions
@@ -211,10 +211,11 @@ impl RecordBatch {
     /// Creates a `RecordBatch` from a schema and columns.
     ///
     /// Expects the following:
-    /// * the vec of columns to not be empty
-    /// * the schema and column data types to have equal lengths
-    ///   and match
-    /// * each array in columns to have the same length
+    ///
+    /// * `!columns.is_empty()`
+    /// * `schema.fields.len() == columns.len()`
+    /// * `schema.fields[i].data_type() == columns[i].data_type()`
+    /// * `columns[i].len() == columns[j].len()`
     ///
     /// If the conditions are not met, an error is returned.
     ///
@@ -240,6 +241,33 @@ impl RecordBatch {
         Self::try_new_impl(schema, columns, &options)
     }
 
+    /// Creates a `RecordBatch` from a schema and columns, without validation.
+    ///
+    /// See [`Self::try_new`] for the checked version.
+    ///
+    /// # Safety
+    ///
+    /// Expects the following:
+    ///
+    /// * `schema.fields.len() == columns.len()`
+    /// * `schema.fields[i].data_type() == columns[i].data_type()`
+    /// * `columns[i].len() == row_count`
+    ///
+    /// Note: if the schema does not match the underlying data exactly, it can lead to undefined
+    /// behavior, for example, via conversion to a `StructArray`, which in turn could lead
+    /// to incorrect access.
+    pub unsafe fn new_unchecked(
+        schema: SchemaRef,
+        columns: Vec<Arc<dyn Array>>,
+        row_count: usize,
+    ) -> Self {
+        Self {
+            schema,
+            columns,
+            row_count,
+        }
+    }
+
     /// Creates a `RecordBatch` from a schema and columns, with additional options,
     /// such as whether to strictly validate field names.
     ///
@@ -340,6 +368,11 @@ impl RecordBatch {
         })
     }
 
+    /// Return the schema, columns and row count of this [`RecordBatch`]
+    pub fn into_parts(self) -> (SchemaRef, Vec<ArrayRef>, usize) {
+        (self.schema, self.columns, self.row_count)
+    }
+
     /// Override the schema of this [`RecordBatch`]
     ///
     /// Returns an error if `schema` is not a superset of the current schema
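
A sketch of how `into_parts` and the new unchecked constructor compose, assuming only the APIs added in this diff; this pattern avoids re-validating a batch that was already validated once:

use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let column: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Decompose the batch, then rebuild it without re-running validation.
    let (schema, columns, row_count) = batch.into_parts();
    // SAFETY: the parts came from a RecordBatch that was already validated.
    let rebuilt = unsafe { RecordBatch::new_unchecked(schema, columns, row_count) };
    assert_eq!(rebuilt.num_rows(), 3);
}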

arrow-avro/src/codec.rs

Lines changed: 39 additions & 2 deletions
@@ -51,10 +51,21 @@ impl AvroDataType {
         Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
     }
 
+    /// Returns a reference to the codec used by this data type
+    ///
+    /// The codec determines how Avro data is encoded and mapped to Arrow data types.
+    /// This is useful when we need to inspect or use the specific encoding of a field.
     pub fn codec(&self) -> &Codec {
         &self.codec
     }
 
+    /// Returns the nullability status of this data type
+    ///
+    /// In Avro, nullability is represented through unions with null types.
+    /// The returned value indicates how nulls are encoded in the Avro format:
+    /// - `Some(Nullability::NullFirst)` - Nulls are encoded as the first union variant
+    /// - `Some(Nullability::NullSecond)` - Nulls are encoded as the second union variant
+    /// - `None` - The type is not nullable
     pub fn nullability(&self) -> Option<Nullability> {
         self.nullability
     }
@@ -78,6 +89,10 @@ impl AvroField {
         &self.data_type
     }
 
+    /// Returns the name of this Avro field
+    ///
+    /// This is the field name as defined in the Avro schema.
+    /// It's used to identify fields within a record structure.
     pub fn name(&self) -> &str {
         &self.name
    }
@@ -108,24 +123,46 @@ impl<'a> TryFrom<&Schema<'a>> for AvroField {
 /// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
 #[derive(Debug, Clone)]
 pub enum Codec {
+    /// Represents Avro null type, maps to Arrow's Null data type
     Null,
+    /// Represents Avro boolean type, maps to Arrow's Boolean data type
     Boolean,
+    /// Represents Avro int type, maps to Arrow's Int32 data type
     Int32,
+    /// Represents Avro long type, maps to Arrow's Int64 data type
     Int64,
+    /// Represents Avro float type, maps to Arrow's Float32 data type
     Float32,
+    /// Represents Avro double type, maps to Arrow's Float64 data type
     Float64,
+    /// Represents Avro bytes type, maps to Arrow's Binary data type
     Binary,
+    /// String data represented as UTF-8 encoded bytes, corresponding to Arrow's StringArray
     Utf8,
+    /// Represents Avro date logical type, maps to Arrow's Date32 data type
     Date32,
+    /// Represents Avro time-millis logical type, maps to Arrow's Time32(TimeUnit::Millisecond) data type
     TimeMillis,
+    /// Represents Avro time-micros logical type, maps to Arrow's Time64(TimeUnit::Microsecond) data type
     TimeMicros,
-    /// TimestampMillis(is_utc)
+    /// Represents Avro timestamp-millis or local-timestamp-millis logical type
+    ///
+    /// Maps to Arrow's Timestamp(TimeUnit::Millisecond) data type.
+    /// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
     TimestampMillis(bool),
-    /// TimestampMicros(is_utc)
+    /// Represents Avro timestamp-micros or local-timestamp-micros logical type
+    ///
+    /// Maps to Arrow's Timestamp(TimeUnit::Microsecond) data type.
+    /// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
     TimestampMicros(bool),
+    /// Represents Avro fixed type, maps to Arrow's FixedSizeBinary data type.
+    /// The i32 parameter indicates the fixed binary size
     Fixed(i32),
+    /// Represents Avro array type, maps to Arrow's List data type
     List(Arc<AvroDataType>),
+    /// Represents Avro record type, maps to Arrow's Struct data type
     Struct(Arc<[AvroField]>),
+    /// Represents Avro duration logical type, maps to Arrow's Interval(IntervalUnit::MonthDayNano) data type
     Interval,
 }
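
As a sketch of the mapping these new doc comments describe, a hypothetical helper (not part of this commit) converting a few `Codec` variants to their Arrow types; the crate's real conversion logic is internal and may differ:

use arrow_schema::{DataType, TimeUnit};

// Hypothetical Codec -> Arrow mapping for a handful of variants; the
// remaining variants would follow the doc comments above.
fn arrow_type(codec: &Codec) -> DataType {
    match codec {
        Codec::Null => DataType::Null,
        Codec::Boolean => DataType::Boolean,
        Codec::Int32 => DataType::Int32,
        Codec::Int64 => DataType::Int64,
        Codec::Fixed(size) => DataType::FixedSizeBinary(*size),
        Codec::TimestampMillis(is_utc) => {
            // `true` means a UTC timezone, `false` a local (zoneless) timestamp.
            DataType::Timestamp(TimeUnit::Millisecond, is_utc.then(|| "+00:00".into()))
        }
        _ => todo!("remaining variants follow the same pattern"),
    }
}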

arrow-avro/src/compression.rs

Lines changed: 7 additions & 0 deletions
@@ -23,9 +23,16 @@ use std::io::Read;
 pub const CODEC_METADATA_KEY: &str = "avro.codec";
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
+/// Supported compression codecs for Avro data
+///
+/// Avro supports multiple compression formats for data blocks.
+/// This enum represents the compression codecs available in this implementation.
 pub enum CompressionCodec {
+    /// Deflate compression (RFC 1951)
     Deflate,
+    /// Snappy compression
     Snappy,
+    /// ZStandard compression
     ZStandard,
 }
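
For context, a hedged sketch of how the `avro.codec` metadata value named by `CODEC_METADATA_KEY` might be matched to a `CompressionCodec`; `parse_codec` is hypothetical, with codec names taken from the Avro specification:

// Hypothetical lookup from the raw `avro.codec` header value; "null"
// (or an absent key) means the data blocks are uncompressed.
fn parse_codec(value: &[u8]) -> Result<Option<CompressionCodec>, String> {
    match value {
        b"null" | b"" => Ok(None),
        b"deflate" => Ok(Some(CompressionCodec::Deflate)),
        b"snappy" => Ok(Some(CompressionCodec::Snappy)),
        b"zstandard" => Ok(Some(CompressionCodec::ZStandard)),
        other => Err(format!(
            "unsupported codec: {}",
            String::from_utf8_lossy(other)
        )),
    }
}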

arrow-avro/src/lib.rs

Lines changed: 15 additions & 0 deletions
@@ -28,11 +28,26 @@
 #![warn(missing_docs)]
 #![allow(unused)] // Temporary
 
+/// Core functionality for reading Avro data into Arrow arrays
+///
+/// Implements the primary reader interface and record decoding logic.
 pub mod reader;
+
+/// Avro schema parsing and representation
+///
+/// Provides types for parsing and representing Avro schema definitions.
 mod schema;
 
+/// Compression codec implementations for Avro
+///
+/// Provides support for various compression algorithms used in Avro files,
+/// including Deflate, Snappy, and ZStandard.
 mod compression;
 
+/// Data type conversions between Avro and Arrow types
+///
+/// This module contains the necessary types and functions to convert between
+/// Avro data types and Arrow data types.
 mod codec;
 
 #[cfg(test)]

arrow-avro/src/reader/record.rs

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ pub struct RecordDecoder {
 }
 
 impl RecordDecoder {
+    /// Create a new [`RecordDecoder`] from the provided [`AvroDataType`]
     pub fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> {
         match Decoder::try_new(data_type)? {
             Decoder::Record(fields, encodings) => Ok(Self {
