
Upgrade toolchain to Rust-1.86 #15625

Merged · 1 commit · Apr 8, 2025
datafusion-examples/examples/parquet_index.rs (+1 −1)

@@ -685,7 +685,7 @@ fn make_demo_file(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<()>
 
     let num_values = value_range.len();
     let file_names =
-        StringArray::from_iter_values(std::iter::repeat(&filename).take(num_values));
+        StringArray::from_iter_values(std::iter::repeat_n(&filename, num_values));
     let values = Int32Array::from_iter_values(value_range);
     let batch = RecordBatch::try_from_iter(vec![
         ("file_name", Arc::new(file_names) as ArrayRef),
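Nearly every Rust hunk in this PR is the same mechanical rewrite: `std::iter::repeat(x).take(n)` becomes `std::iter::repeat_n(x, n)`, presumably to satisfy the `manual_repeat_n` clippy lint in the newer toolchain (lint name from memory, not shown in this diff). A minimal sketch of the idiom, separate from the DataFusion code:

```rust
use std::iter::repeat_n;

fn main() {
    // Old form: an unbounded iterator truncated with `take`.
    let old: Vec<i32> = std::iter::repeat(7).take(3).collect();

    // New form: the count is part of the constructor, and `RepeatN`
    // implements ExactSizeIterator, so collectors can pre-allocate.
    let new: Vec<i32> = repeat_n(7, 3).collect();

    assert_eq!(old, new);
    assert_eq!(new, vec![7, 7, 7]);
}
```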
datafusion/common/src/scalar/mod.rs (+25 −34)

@@ -27,7 +27,7 @@ use std::convert::Infallible;
 use std::fmt;
 use std::hash::Hash;
 use std::hash::Hasher;
-use std::iter::repeat;
+use std::iter::repeat_n;
 use std::mem::{size_of, size_of_val};
 use std::str::FromStr;
 use std::sync::Arc;

@@ -802,12 +802,14 @@ fn dict_from_scalar<K: ArrowDictionaryKeyType>(
     let values_array = value.to_array_of_size(1)?;
 
     // Create a key array with `size` elements, each of 0
-    let key_array: PrimitiveArray<K> = repeat(if value.is_null() {
-        None
-    } else {
-        Some(K::default_value())
-    })
-    .take(size)
+    let key_array: PrimitiveArray<K> = repeat_n(
+        if value.is_null() {
+            None
+        } else {
+            Some(K::default_value())
+        },
+        size,
+    )
     .collect();
 
     // create a new DictionaryArray

@@ -2189,8 +2191,7 @@ impl ScalarValue {
         scale: i8,
         size: usize,
     ) -> Result<Decimal256Array> {
-        Ok(repeat(value)
-            .take(size)
+        Ok(repeat_n(value, size)
             .collect::<Decimal256Array>()
             .with_precision_and_scale(precision, scale)?)
     }

@@ -2416,69 +2417,59 @@ impl ScalarValue {
             }
             ScalarValue::Utf8(e) => match e {
                 Some(value) => {
-                    Arc::new(StringArray::from_iter_values(repeat(value).take(size)))
+                    Arc::new(StringArray::from_iter_values(repeat_n(value, size)))
                 }
                 None => new_null_array(&DataType::Utf8, size),
             },
             ScalarValue::Utf8View(e) => match e {
                 Some(value) => {
-                    Arc::new(StringViewArray::from_iter_values(repeat(value).take(size)))
+                    Arc::new(StringViewArray::from_iter_values(repeat_n(value, size)))
                 }
                 None => new_null_array(&DataType::Utf8View, size),
             },
             ScalarValue::LargeUtf8(e) => match e {
                 Some(value) => {
-                    Arc::new(LargeStringArray::from_iter_values(repeat(value).take(size)))
+                    Arc::new(LargeStringArray::from_iter_values(repeat_n(value, size)))
                 }
                 None => new_null_array(&DataType::LargeUtf8, size),
             },
             ScalarValue::Binary(e) => match e {
                 Some(value) => Arc::new(
-                    repeat(Some(value.as_slice()))
-                        .take(size)
-                        .collect::<BinaryArray>(),
+                    repeat_n(Some(value.as_slice()), size).collect::<BinaryArray>(),
                 ),
-                None => {
-                    Arc::new(repeat(None::<&str>).take(size).collect::<BinaryArray>())
-                }
+                None => Arc::new(repeat_n(None::<&str>, size).collect::<BinaryArray>()),
             },
             ScalarValue::BinaryView(e) => match e {
                 Some(value) => Arc::new(
-                    repeat(Some(value.as_slice()))
-                        .take(size)
-                        .collect::<BinaryViewArray>(),
+                    repeat_n(Some(value.as_slice()), size).collect::<BinaryViewArray>(),
                 ),
                 None => {
-                    Arc::new(repeat(None::<&str>).take(size).collect::<BinaryViewArray>())
+                    Arc::new(repeat_n(None::<&str>, size).collect::<BinaryViewArray>())
                 }
             },
             ScalarValue::FixedSizeBinary(s, e) => match e {
                 Some(value) => Arc::new(
                     FixedSizeBinaryArray::try_from_sparse_iter_with_size(
-                        repeat(Some(value.as_slice())).take(size),
+                        repeat_n(Some(value.as_slice()), size),
                         *s,
                     )
                     .unwrap(),
                 ),
                 None => Arc::new(
                     FixedSizeBinaryArray::try_from_sparse_iter_with_size(
-                        repeat(None::<&[u8]>).take(size),
+                        repeat_n(None::<&[u8]>, size),
                         *s,
                     )
                     .unwrap(),
                 ),
             },
             ScalarValue::LargeBinary(e) => match e {
                 Some(value) => Arc::new(
-                    repeat(Some(value.as_slice()))
-                        .take(size)
-                        .collect::<LargeBinaryArray>(),
-                ),
-                None => Arc::new(
-                    repeat(None::<&str>)
-                        .take(size)
-                        .collect::<LargeBinaryArray>(),
+                    repeat_n(Some(value.as_slice()), size).collect::<LargeBinaryArray>(),
                 ),
+                None => {
+                    Arc::new(repeat_n(None::<&str>, size).collect::<LargeBinaryArray>())
+                }
             },
             ScalarValue::List(arr) => {
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?

@@ -2606,7 +2597,7 @@ impl ScalarValue {
                 child_arrays.push(ar);
                 new_fields.push(field.clone());
             }
-            let type_ids = repeat(*v_id).take(size);
+            let type_ids = repeat_n(*v_id, size);
             let type_ids = ScalarBuffer::<i8>::from_iter(type_ids);
             let value_offsets = match mode {
                 UnionMode::Sparse => None,

@@ -2674,7 +2665,7 @@ impl ScalarValue {
     }
 
     fn list_to_array_of_size(arr: &dyn Array, size: usize) -> Result<ArrayRef> {
-        let arrays = repeat(arr).take(size).collect::<Vec<_>>();
+        let arrays = repeat_n(arr, size).collect::<Vec<_>>();
         let ret = match !arrays.is_empty() {
             true => arrow::compute::concat(arrays.as_slice())?,
             false => arr.slice(0, 0),
datafusion/common/src/utils/memory.rs (+1 −1)

@@ -25,7 +25,7 @@ use std::mem::size_of;
 /// # Parameters
 /// - `num_elements`: The number of elements expected in the hash table.
 /// - `fixed_size`: A fixed overhead size associated with the collection
-///     (e.g., HashSet or HashTable).
+///   (e.g., HashSet or HashTable).
 /// - `T`: The type of elements stored in the hash table.
 ///
 /// # Details
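The hunk above only re-indents a doc-comment list continuation; the function it documents is `estimate_memory_size`. A hedged usage sketch, assuming the `datafusion_common` API matches the documented parameters:

```rust
use std::collections::HashSet;
use std::mem::size_of;

use datafusion_common::utils::memory::estimate_memory_size;

fn main() -> datafusion_common::Result<()> {
    // Estimate the bytes a HashSet<u32> holding 100 elements may need:
    // the collection's fixed overhead plus per-element bucket space.
    let num_elements = 100;
    let fixed_size = size_of::<HashSet<u32>>();
    let estimate = estimate_memory_size::<u32>(num_elements, fixed_size)?;
    println!("estimated bytes: {estimate}");
    Ok(())
}
```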
datafusion/core/src/datasource/listing/table.rs (+1 −1)

@@ -1183,7 +1183,7 @@ impl ListingTable {
     /// # Arguments
     /// * `files` - A stream of `Result<PartitionedFile>` items to process
     /// * `limit` - An optional row count limit. If provided, the function will stop collecting files
-    ///     once the accumulated number of rows exceeds this limit
+    ///   once the accumulated number of rows exceeds this limit
     /// * `collect_stats` - Whether to collect and accumulate statistics from the files
     ///
     /// # Returns
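The doc-comment hunks here and below all fix the same nit: bullet-list continuation lines indented deeper than the bullet's own text. The trigger is most likely clippy's `doc_overindented_list_items` lint (an assumption; the diff itself does not name it). An illustration with a hypothetical helper:

```rust
/// Collects items, honoring an optional limit.
///
/// - `limit`: an optional element count; collection stops once the limit
///   is reached. This continuation is aligned with the bullet text above,
///   which is the indentation the lint accepts.
fn take_up_to(items: &[i32], limit: Option<usize>) -> Vec<i32> {
    items.iter().copied().take(limit.unwrap_or(items.len())).collect()
}

fn main() {
    assert_eq!(take_up_to(&[1, 2, 3], Some(2)), vec![1, 2]);
}
```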
datafusion/core/src/physical_planner.rs (+2 −8)

@@ -1023,18 +1023,12 @@ impl DefaultPhysicalPlanner {
             // Collect left & right field indices, the field indices are sorted in ascending order
             let left_field_indices = cols
                 .iter()
-                .filter_map(|c| match left_df_schema.index_of_column(c) {
-                    Ok(idx) => Some(idx),
-                    _ => None,
-                })
+                .filter_map(|c| left_df_schema.index_of_column(c).ok())
                 .sorted()
                 .collect::<Vec<_>>();
             let right_field_indices = cols
                 .iter()
-                .filter_map(|c| match right_df_schema.index_of_column(c) {
-                    Ok(idx) => Some(idx),
-                    _ => None,
-                })
+                .filter_map(|c| right_df_schema.index_of_column(c).ok())
                 .sorted()
                 .collect::<Vec<_>>();
 
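This hunk collapses a manual `match` over a `Result` into `Result::ok`, likely prompted by the newer clippy's `manual_ok_err` lint (again, the lint name is an assumption). The idiom in isolation:

```rust
fn main() {
    let inputs = ["1", "two", "3"];

    // Manual match over each parse result...
    let verbose: Vec<i32> = inputs
        .iter()
        .filter_map(|s| match s.parse::<i32>() {
            Ok(idx) => Some(idx),
            _ => None,
        })
        .collect();

    // ...is equivalent to converting the Result into an Option with `ok()`.
    let concise: Vec<i32> = inputs
        .iter()
        .filter_map(|s| s.parse::<i32>().ok())
        .collect();

    assert_eq!(verbose, concise);
    assert_eq!(concise, vec![1, 3]);
}
```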
@@ -43,7 +43,7 @@ use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset;
 /// - `skip_partial parameters`
 /// - hint `sorted` or not
 /// - `spilling` or not (TODO, I think a special `MemoryPool` may be needed
-///     to support this)
+///   to support this)
 ///
 pub struct SessionContextGenerator {
     /// Current testing dataset
@@ -33,12 +33,12 @@ use crate::fuzz_cases::record_batch_generator::{ColumnDescr, RecordBatchGenerato
 /// when you call `generate` function
 ///
 /// - `rows_num_range`, the number of rows in the datasets will be randomly generated
-///     within this range
+///   within this range
 ///
 /// - `sort_keys`, if `sort_keys` are defined, when you call the `generate` function, the generator
-///     will generate one `base dataset` firstly. Then the `base dataset` will be sorted
-///     based on each `sort_key` respectively. And finally `len(sort_keys) + 1` datasets
-///     will be returned
+///   will generate one `base dataset` firstly. Then the `base dataset` will be sorted
+///   based on each `sort_key` respectively. And finally `len(sort_keys) + 1` datasets
+///   will be returned
 ///
 #[derive(Debug, Clone)]
 pub struct DatasetGeneratorConfig {
@@ -270,7 +270,7 @@ impl AggregationFuzzer {
 /// - `sql`, the selected test sql
 ///
 /// - `dataset_ref`, the input dataset, store it for error reported when found
-///     the inconsistency between the one for `ctx` and `expected results`.
+///   the inconsistency between the one for `ctx` and `expected results`.
 ///
 struct AggregationFuzzTestTask {
     /// Generated session context in current test case
datafusion/core/tests/memory_limit/mod.rs (+4 −5)

@@ -863,11 +863,10 @@ impl Scenario {
                 single_row_batches,
             } => {
                 use datafusion::physical_expr::expressions::col;
-                let batches: Vec<Vec<_>> = std::iter::repeat(maybe_split_batches(
-                    dict_batches(),
-                    *single_row_batches,
-                ))
-                .take(*partitions)
+                let batches: Vec<Vec<_>> = std::iter::repeat_n(
+                    maybe_split_batches(dict_batches(), *single_row_batches),
+                    *partitions,
+                )
                 .collect();
 
                 let schema = batches[0][0].schema();
datafusion/core/tests/parquet/mod.rs (+6 −6)

@@ -611,7 +611,7 @@ fn make_bytearray_batch(
     large_binary_values: Vec<&[u8]>,
 ) -> RecordBatch {
     let num_rows = string_values.len();
-    let name: StringArray = std::iter::repeat(Some(name)).take(num_rows).collect();
+    let name: StringArray = std::iter::repeat_n(Some(name), num_rows).collect();
     let service_string: StringArray = string_values.iter().map(Some).collect();
     let service_binary: BinaryArray = binary_values.iter().map(Some).collect();
     let service_fixedsize: FixedSizeBinaryArray = fixedsize_values

@@ -659,7 +659,7 @@ fn make_bytearray_batch(
 /// name | service.name
 fn make_names_batch(name: &str, service_name_values: Vec<&str>) -> RecordBatch {
     let num_rows = service_name_values.len();
-    let name: StringArray = std::iter::repeat(Some(name)).take(num_rows).collect();
+    let name: StringArray = std::iter::repeat_n(Some(name), num_rows).collect();
     let service_name: StringArray = service_name_values.iter().map(Some).collect();
 
     let schema = Schema::new(vec![

@@ -698,31 +698,31 @@ fn make_int_batches_with_null(
         Int8Array::from_iter(
             v8.into_iter()
                 .map(Some)
-                .chain(std::iter::repeat(None).take(null_values)),
+                .chain(std::iter::repeat_n(None, null_values)),
         )
         .to_data(),
     ),
     make_array(
         Int16Array::from_iter(
             v16.into_iter()
                 .map(Some)
-                .chain(std::iter::repeat(None).take(null_values)),
+                .chain(std::iter::repeat_n(None, null_values)),
         )
         .to_data(),
     ),
     make_array(
         Int32Array::from_iter(
             v32.into_iter()
                 .map(Some)
-                .chain(std::iter::repeat(None).take(null_values)),
+                .chain(std::iter::repeat_n(None, null_values)),
         )
         .to_data(),
     ),
     make_array(
         Int64Array::from_iter(
             v64.into_iter()
                 .map(Some)
-                .chain(std::iter::repeat(None).take(null_values)),
+                .chain(std::iter::repeat_n(None, null_values)),
         )
         .to_data(),
     ),
@@ -633,7 +633,7 @@ fn odd_count(arr: &Int64Array) -> i64 {
 
 /// returns an array of num_rows that has the number of odd values in `arr`
 fn odd_count_arr(arr: &Int64Array, num_rows: usize) -> ArrayRef {
-    let array: Int64Array = std::iter::repeat(odd_count(arr)).take(num_rows).collect();
+    let array: Int64Array = std::iter::repeat_n(odd_count(arr), num_rows).collect();
     Arc::new(array)
 }
 
datafusion/datasource/src/schema_adapter.rs (+1 −1)

@@ -42,7 +42,7 @@ pub trait SchemaAdapterFactory: Debug + Send + Sync + 'static {
    /// Arguments:
    ///
    /// * `projected_table_schema`: The schema for the table, projected to
-   ///     include only the fields being output (projected) by the this mapping.
+   ///   include only the fields being output (projected) by the this mapping.
    ///
    /// * `table_schema`: The entire table schema for the table
    fn create(
datafusion/datasource/src/url.rs (+3 −3)

@@ -209,10 +209,10 @@ impl ListingTableUrl {
     /// assert_eq!(url.file_extension(), None);
     /// ```
     pub fn file_extension(&self) -> Option<&str> {
-        if let Some(segments) = self.url.path_segments() {
-            if let Some(last_segment) = segments.last() {
+        if let Some(mut segments) = self.url.path_segments() {
+            if let Some(last_segment) = segments.next_back() {
                 if last_segment.contains(".") && !last_segment.ends_with(".") {
-                    return last_segment.split('.').last();
+                    return last_segment.split('.').next_back();
                 }
             }
         }
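`Iterator::last` walks the entire iterator, while `DoubleEndedIterator::next_back` can jump straight to the final element; newer clippy points this out (I believe via the `double_ended_iterator_last` lint). `next_back` takes `&mut self`, hence the change to `if let Some(mut segments)`. A standalone sketch:

```rust
fn main() {
    // `split` yields a DoubleEndedIterator, so the last piece can be
    // fetched from the back without scanning forward.
    let mut parts = "archive.tar.gz".split('.');
    assert_eq!(parts.next_back(), Some("gz"));

    // `last()` returns the same value but consumes the whole iterator.
    assert_eq!("archive.tar.gz".split('.').last(), Some("gz"));
}
```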
datafusion/expr-common/src/interval_arithmetic.rs (+1 −1)

@@ -174,7 +174,7 @@ macro_rules! value_transition {
 /// - `INF` values are converted to `NULL`s while constructing an interval to
 ///   ensure consistency, with other data types.
 /// - `NaN` (Not a Number) results are conservatively result in unbounded
-///     endpoints.
+///   endpoints.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Interval {
     lower: ScalarValue,
datafusion/expr-common/src/signature.rs (+6 −6)

@@ -391,10 +391,11 @@ impl TypeSignature {
                 vec![format!("{}, ..", Self::join_types(types, "/"))]
             }
             TypeSignature::Uniform(arg_count, valid_types) => {
-                vec![std::iter::repeat(Self::join_types(valid_types, "/"))
-                    .take(*arg_count)
-                    .collect::<Vec<String>>()
-                    .join(", ")]
+                vec![
+                    std::iter::repeat_n(Self::join_types(valid_types, "/"), *arg_count)
+                        .collect::<Vec<String>>()
+                        .join(", "),
+                ]
             }
             TypeSignature::String(num) => {
                 vec![format!("String({num})")]

@@ -412,8 +413,7 @@
                 vec![Self::join_types(types, ", ")]
             }
             TypeSignature::Any(arg_count) => {
-                vec![std::iter::repeat("Any")
-                    .take(*arg_count)
+                vec![std::iter::repeat_n("Any", *arg_count)
                     .collect::<Vec<&str>>()
                     .join(", ")]
             }
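Both `TypeSignature` arms render `n` copies of a type name as a comma-separated list; `repeat_n` keeps the count explicit. A standalone check of the rendering logic, with a plain string standing in for the `join_types` output:

```rust
use std::iter::repeat_n;

fn main() {
    // Mirrors the `TypeSignature::Any(3)` rendering above.
    let rendered = repeat_n("Any", 3).collect::<Vec<&str>>().join(", ");
    assert_eq!(rendered, "Any, Any, Any");
}
```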
datafusion/expr/src/logical_plan/builder.rs (+1 −1)

@@ -2192,7 +2192,7 @@ pub fn unnest_with_options(
 
     // new columns dependent on the same original index
     dependency_indices
-        .extend(std::iter::repeat(index).take(transformed_columns.len()));
+        .extend(std::iter::repeat_n(index, transformed_columns.len()));
     Ok(transformed_columns
         .iter()
         .map(|(col, field)| (col.relation.to_owned(), field.to_owned()))
datafusion/ffi/src/table_provider.rs (+2 −2)

@@ -110,8 +110,8 @@ pub struct FFI_TableProvider {
     /// * `session_config` - session configuration
     /// * `projections` - if specified, only a subset of the columns are returned
     /// * `filters_serialized` - filters to apply to the scan, which are a
-    ///     [`LogicalExprList`] protobuf message serialized into bytes to pass
-    ///     across the FFI boundary.
+    ///   [`LogicalExprList`] protobuf message serialized into bytes to pass
+    ///   across the FFI boundary.
     /// * `limit` - if specified, limit the number of rows returned
     pub scan: unsafe extern "C" fn(
         provider: &Self,
datafusion/functions-aggregate/src/array_agg.rs (+2 −2)

@@ -289,7 +289,7 @@ impl Accumulator for ArrayAggAccumulator {
         }
 
         let val = Arc::clone(&values[0]);
-        if val.len() > 0 {
+        if !val.is_empty() {
             self.values.push(val);
         }
         Ok(())

@@ -310,7 +310,7 @@
             match Self::get_optional_values_to_merge_as_is(list_arr) {
                 Some(values) => {
                     // Make sure we don't insert empty lists
-                    if values.len() > 0 {
+                    if !values.is_empty() {
                         self.values.push(values);
                     }
                 }
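These last two hunks swap `len() > 0` for `!is_empty()`, the form clippy's long-standing `len_zero` lint prefers; Arrow's `Array` trait exposes `is_empty` just as the std collections do. The equivalence, shown on a plain `Vec`:

```rust
fn main() {
    let vals = vec![1, 2, 3];
    // Same truth value, but `is_empty` states the intent directly.
    assert_eq!(vals.len() > 0, !vals.is_empty());
}
```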