Skip to content

Commit 0a01037

Browse files
committed
Merge remote-tracking branch 'upstream/main' into speedup_filters
2 parents 6f9aa36 + 4491b17 commit 0a01037

File tree

10 files changed

+271
-11
lines changed

10 files changed

+271
-11
lines changed

arrow-array/src/array/struct_array.rs

Lines changed: 85 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,28 @@ impl StructArray {
9191
Self::try_new(fields, arrays, nulls).unwrap()
9292
}
9393

94+
/// Create a new [`StructArray`] from the provided parts, returning an error on failure
95+
///
96+
/// The length will be inferred from the length of the child arrays. Returns an error if
97+
/// there are no child arrays. Consider using [`Self::try_new_with_length`] if the length
98+
/// is known to avoid this.
99+
///
100+
/// # Errors
101+
///
102+
/// Errors if
103+
///
104+
/// * `fields.len() == 0`
105+
/// * Any reason that [`Self::try_new_with_length`] would error
106+
pub fn try_new(
107+
fields: Fields,
108+
arrays: Vec<ArrayRef>,
109+
nulls: Option<NullBuffer>,
110+
) -> Result<Self, ArrowError> {
111+
let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::try_new_with_length or StructArray::new_empty to create a struct array with no fields so that the length can be set correctly".to_string()))?;
112+
113+
Self::try_new_with_length(fields, arrays, nulls, len)
114+
}
115+
94116
/// Create a new [`StructArray`] from the provided parts, returning an error on failure
95117
///
96118
/// # Errors
@@ -102,10 +124,11 @@ impl StructArray {
102124
/// * `arrays[i].len() != arrays[j].len()`
103125
/// * `arrays[i].len() != nulls.len()`
104126
/// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
105-
pub fn try_new(
127+
pub fn try_new_with_length(
106128
fields: Fields,
107129
arrays: Vec<ArrayRef>,
108130
nulls: Option<NullBuffer>,
131+
len: usize,
109132
) -> Result<Self, ArrowError> {
110133
if fields.len() != arrays.len() {
111134
return Err(ArrowError::InvalidArgumentError(format!(
@@ -114,7 +137,6 @@ impl StructArray {
114137
arrays.len()
115138
)));
116139
}
117-
let len = arrays.first().map(|x| x.len()).unwrap_or_default();
118140

119141
if let Some(n) = nulls.as_ref() {
120142
if n.len() != len {
@@ -181,6 +203,10 @@ impl StructArray {
181203

182204
/// Create a new [`StructArray`] from the provided parts without validation
183205
///
206+
/// The length will be inferred from the length of the child arrays. Panics if there are no
207+
/// child arrays. Consider using [`Self::new_unchecked_with_length`] if the length is known
208+
/// to avoid this.
209+
///
184210
/// # Safety
185211
///
186212
/// Safe if [`Self::new`] would not panic with the given arguments
@@ -193,7 +219,32 @@ impl StructArray {
193219
return Self::new(fields, arrays, nulls);
194220
}
195221

196-
let len = arrays.first().map(|x| x.len()).unwrap_or_default();
222+
let len = arrays.first().map(|x| x.len()).expect(
223+
"cannot use StructArray::new_unchecked if there are no fields, length is unknown",
224+
);
225+
Self {
226+
len,
227+
data_type: DataType::Struct(fields),
228+
nulls,
229+
fields: arrays,
230+
}
231+
}
232+
233+
/// Create a new [`StructArray`] from the provided parts without validation
234+
///
235+
/// # Safety
236+
///
237+
/// Safe if [`Self::new`] would not panic with the given arguments
238+
pub unsafe fn new_unchecked_with_length(
239+
fields: Fields,
240+
arrays: Vec<ArrayRef>,
241+
nulls: Option<NullBuffer>,
242+
len: usize,
243+
) -> Self {
244+
if cfg!(feature = "force_validate") {
245+
return Self::try_new_with_length(fields, arrays, nulls, len).unwrap();
246+
}
247+
197248
Self {
198249
len,
199250
data_type: DataType::Struct(fields),
@@ -817,9 +868,38 @@ mod tests {
817868
}
818869

819870
#[test]
871+
#[should_panic(expected = "use StructArray::try_new_with_length")]
820872
fn test_struct_array_from_empty() {
821-
let sa = StructArray::from(vec![]);
822-
assert!(sa.is_empty())
873+
// This can't work because we don't know how many rows the array should have. Previously we inferred 0 but
874+
// that often led to bugs.
875+
let _ = StructArray::from(vec![]);
876+
}
877+
878+
#[test]
879+
fn test_empty_struct_array() {
880+
assert!(StructArray::try_new(Fields::empty(), vec![], None).is_err());
881+
882+
let arr = StructArray::new_empty_fields(10, None);
883+
assert_eq!(arr.len(), 10);
884+
assert_eq!(arr.null_count(), 0);
885+
assert_eq!(arr.num_columns(), 0);
886+
887+
let arr2 = StructArray::try_new_with_length(Fields::empty(), vec![], None, 10).unwrap();
888+
assert_eq!(arr2.len(), 10);
889+
890+
let arr = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10)));
891+
assert_eq!(arr.len(), 10);
892+
assert_eq!(arr.null_count(), 10);
893+
assert_eq!(arr.num_columns(), 0);
894+
895+
let arr2 = StructArray::try_new_with_length(
896+
Fields::empty(),
897+
vec![],
898+
Some(NullBuffer::new_null(10)),
899+
10,
900+
)
901+
.unwrap();
902+
assert_eq!(arr2.len(), 10);
823903
}
824904

825905
#[test]

arrow-avro/src/codec.rs

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,21 @@ impl AvroDataType {
5151
Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
5252
}
5353

54+
/// Returns a reference to the codec used by this data type
55+
///
56+
/// The codec determines how Avro data is encoded and mapped to Arrow data types.
57+
/// This is useful when we need to inspect or use the specific encoding of a field.
5458
pub fn codec(&self) -> &Codec {
5559
&self.codec
5660
}
5761

62+
/// Returns the nullability status of this data type
63+
///
64+
/// In Avro, nullability is represented through unions with null types.
65+
/// The returned value indicates how nulls are encoded in the Avro format:
66+
/// - `Some(Nullability::NullFirst)` - Nulls are encoded as the first union variant
67+
/// - `Some(Nullability::NullSecond)` - Nulls are encoded as the second union variant
68+
/// - `None` - The type is not nullable
5869
pub fn nullability(&self) -> Option<Nullability> {
5970
self.nullability
6071
}
@@ -78,6 +89,10 @@ impl AvroField {
7889
&self.data_type
7990
}
8091

92+
/// Returns the name of this Avro field
93+
///
94+
/// This is the field name as defined in the Avro schema.
95+
/// It's used to identify fields within a record structure.
8196
pub fn name(&self) -> &str {
8297
&self.name
8398
}
@@ -108,24 +123,46 @@ impl<'a> TryFrom<&Schema<'a>> for AvroField {
108123
/// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
109124
#[derive(Debug, Clone)]
110125
pub enum Codec {
126+
/// Represents Avro null type, maps to Arrow's Null data type
111127
Null,
128+
/// Represents Avro boolean type, maps to Arrow's Boolean data type
112129
Boolean,
130+
/// Represents Avro int type, maps to Arrow's Int32 data type
113131
Int32,
132+
/// Represents Avro long type, maps to Arrow's Int64 data type
114133
Int64,
134+
/// Represents Avro float type, maps to Arrow's Float32 data type
115135
Float32,
136+
/// Represents Avro double type, maps to Arrow's Float64 data type
116137
Float64,
138+
/// Represents Avro bytes type, maps to Arrow's Binary data type
117139
Binary,
140+
/// String data represented as UTF-8 encoded bytes, corresponding to Arrow's StringArray
118141
Utf8,
142+
/// Represents Avro date logical type, maps to Arrow's Date32 data type
119143
Date32,
144+
/// Represents Avro time-millis logical type, maps to Arrow's Time32(TimeUnit::Millisecond) data type
120145
TimeMillis,
146+
/// Represents Avro time-micros logical type, maps to Arrow's Time64(TimeUnit::Microsecond) data type
121147
TimeMicros,
122-
/// TimestampMillis(is_utc)
148+
/// Represents Avro timestamp-millis or local-timestamp-millis logical type
149+
///
150+
/// Maps to Arrow's Timestamp(TimeUnit::Millisecond) data type
151+
/// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
123152
TimestampMillis(bool),
124-
/// TimestampMicros(is_utc)
153+
/// Represents Avro timestamp-micros or local-timestamp-micros logical type
154+
///
155+
/// Maps to Arrow's Timestamp(TimeUnit::Microsecond) data type
156+
/// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
125157
TimestampMicros(bool),
158+
/// Represents Avro fixed type, maps to Arrow's FixedSizeBinary data type
159+
/// The i32 parameter indicates the fixed binary size
126160
Fixed(i32),
161+
/// Represents Avro array type, maps to Arrow's List data type
127162
List(Arc<AvroDataType>),
163+
/// Represents Avro record type, maps to Arrow's Struct data type
128164
Struct(Arc<[AvroField]>),
165+
/// Represents Avro duration logical type, maps to Arrow's Interval(IntervalUnit::MonthDayNano) data type
129166
Interval,
130167
}
131168

arrow-avro/src/compression.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@ use std::io::Read;
2323
pub const CODEC_METADATA_KEY: &str = "avro.codec";
2424

2525
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
26+
/// Supported compression codecs for Avro data
27+
///
28+
/// Avro supports multiple compression formats for data blocks.
29+
/// This enum represents the compression codecs available in this implementation.
2630
pub enum CompressionCodec {
31+
/// Deflate compression (RFC 1951)
2732
Deflate,
33+
/// Snappy compression
2834
Snappy,
35+
/// ZStandard compression
2936
ZStandard,
3037
}
3138

arrow-avro/src/lib.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,26 @@
2828
#![warn(missing_docs)]
2929
#![allow(unused)] // Temporary
3030

31+
/// Core functionality for reading Avro data into Arrow arrays
32+
///
33+
/// Implements the primary reader interface and record decoding logic.
3134
pub mod reader;
35+
36+
/// Avro schema parsing and representation
37+
///
38+
/// Provides types for parsing and representing Avro schema definitions.
3239
mod schema;
3340

41+
/// Compression codec implementations for Avro
42+
///
43+
/// Provides support for various compression algorithms used in Avro files,
44+
/// including Deflate, Snappy, and ZStandard.
3445
mod compression;
3546

47+
/// Data type conversions between Avro and Arrow types
48+
///
49+
/// This module contains the necessary types and functions to convert between
50+
/// Avro data types and Arrow data types.
3651
mod codec;
3752

3853
#[cfg(test)]

arrow-avro/src/reader/record.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ pub struct RecordDecoder {
3737
}
3838

3939
impl RecordDecoder {
40+
/// Create a new [`RecordDecoder`] from the provided [`AvroDataType`]
4041
pub fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> {
4142
match Decoder::try_new(data_type)? {
4243
Decoder::Record(fields, encodings) => Ok(Self {

0 commit comments

Comments
 (0)