fix: correct accounting in DictEncoder::estimated_memory_size, Interner::estimated_memory_size (#9720)

mzabaluev · web-flow · commit cb8d4c025b65 · 2026-04-27T15:11:32.000-04:00
# Which issue does this PR close? - Closes #9719, #9744. # Rationale for this change The returned value should estimate the actual memory usage, but instead it reports the encoded size of the dictionary data and omits the memory used by the hash table in the `Interner` member. The `Storage::estimated_memory_size` implementation for the unique key storage was also incorrect, but it was unused. # What changes are included in this PR? Correct both problems by making `KeyStorage`'s implementation of `estimated_memory_size` return the size of the allocated `uniques` vector plus, where applicable, the sizes of the values themselves, and by making `DictEncoder::estimated_memory_size` delegate to the `interner`, which calls the `KeyStorage` method and adds accounting for its own data structure. # Are these changes tested? Added tests verifying that at least the expected amounts are accounted for when values are added. Overreporting is hard to disprove because it depends on allocation behavior internal to other libraries. # Are there any user-facing changes? No.
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -64,7 +64,12 @@ impl<T: DataType> Storage for KeyStorage<T> {
     }
 
     fn estimated_memory_size(&self) -> usize {
-        self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
+        let uniques_heap_bytes = match T::get_physical_type() {
+            Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(),
+            _ => <Self::Value as ParquetValueType>::variable_length_bytes(&self.uniques)
+                .unwrap_or(0) as usize,
+        };
+        self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes
     }
 }
 
@@ -183,6 +188,281 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
     ///
     /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
     fn estimated_memory_size(&self) -> usize {
-        self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()
+        self.interner.estimated_memory_size() + self.indices.capacity() * std::mem::size_of::<u64>()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use crate::data_type::{
+        ByteArray, ByteArrayType, FixedLenByteArray, FixedLenByteArrayType, Int32Type,
+    };
+    use crate::encodings::encoding::Encoder;
+    use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};
+
+    fn make_col_desc<T: DataType>() -> ColumnDescPtr {
+        make_col_desc_with_length::<T>(-1)
+    }
+
+    fn make_col_desc_with_length<T: DataType>(type_length: i32) -> ColumnDescPtr {
+        let ty = SchemaType::primitive_type_builder("col", T::get_physical_type())
+            .with_length(type_length)
+            .build()
+            .unwrap();
+        Arc::new(ColumnDescriptor::new(
+            Arc::new(ty),
+            0,
+            0,
+            ColumnPath::new(vec![]),
+        ))
+    }
+
+    #[test]
+    fn test_estimated_memory_size_primitive_with_duplicates() {
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct values, repeated to produce 9 indices total.
+        encoder.put(&[1, 2, 3, 1, 2, 3, 1, 2, 3]).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries.
+        let dict_entry_size = 3 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_primitive_all_distinct() {
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+        let empty_size = encoder.estimated_memory_size();
+
+        let values: Vec<i32> = (0..100).collect();
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries.
+        let dict_entry_size = 100 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices.
+        let indices_size = 100 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_byte_array_with_duplicates() {
+        let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct byte strings ("foo", "bar", "baz" — 3 bytes each), repeated to produce
+        // 9 indices total.
+        let vals: Vec<ByteArray> = [
+            "foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz",
+        ]
+        .iter()
+        .map(|s| ByteArray::from(*s))
+        .collect();
+        encoder.put(&vals).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries, including their heap-allocated bytes.
+        let dict_entry_size = 3 * std::mem::size_of::<ByteArray>() + 3 * 3; // 3 values × 3 bytes each
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_byte_array_all_distinct() {
+        let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 100 distinct values: "0".."9" (1 byte each) and "10".."99" (2 bytes each).
+        let values: Vec<ByteArray> = (0..100_u32)
+            .map(|i| ByteArray::from(i.to_string().into_bytes()))
+            .collect();
+        let bytes_total: usize = values.iter().map(|v| v.len()).sum(); // 10×1 + 90×2 = 190
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries, including their heap-allocated bytes.
+        let dict_entry_size = 100 * std::mem::size_of::<ByteArray>() + bytes_total;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices.
+        let indices_size = 100 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_fixed_len_byte_array_with_duplicates() {
+        const TYPE_LEN: usize = 3;
+        let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
+            FixedLenByteArrayType,
+        >(TYPE_LEN as i32));
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct 3-byte values, repeated to produce 9 indices total.
+        let vals = [
+            b"foo", b"bar", b"baz", b"foo", b"bar", b"baz", b"foo", b"bar", b"baz",
+        ]
+        .iter()
+        .map(|b| FixedLenByteArray::from(b.to_vec()))
+        .collect::<Vec<_>>();
+        encoder.put(&vals).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries: struct overhead plus the
+        // fixed-length bytes allocated per entry.
+        let dict_entry_size = 3 * std::mem::size_of::<FixedLenByteArray>() + 3 * TYPE_LEN;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_fixed_len_byte_array_all_distinct() {
+        const TYPE_LEN: usize = 3;
+        let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
+            FixedLenByteArrayType,
+        >(TYPE_LEN as i32));
+        let empty_size = encoder.estimated_memory_size();
+
+        // 100 distinct 3-byte values: two zero bytes followed by the index `i` as the last byte.
+        let values = (0..100_u8)
+            .map(|i| FixedLenByteArray::from(vec![0u8, 0u8, i]))
+            .collect::<Vec<_>>();
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries: struct overhead plus the
+        // fixed-length bytes allocated per entry.
+        let dict_entry_size = 100 * std::mem::size_of::<FixedLenByteArray>() + 100 * TYPE_LEN;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices.
+        let indices_size = 100 * std::mem::size_of::<u64>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_includes_interner_dedup_table() {
+        // The dedup `HashTable` in `Interner` is preallocated with
+        // `DEFAULT_DEDUP_CAPACITY` slots at construction, independent of any
+        // values pushed.
+        let encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+
+        let size = encoder.estimated_memory_size();
+
+        assert!(
+            size > 0,
+            "memory size should include the preallocated dedup hash table"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_accounts_for_indices_capacity() {
+        // Exercises the `indices.capacity()` (not `.len()`) accounting.
+        // After a flush, `indices` is cleared but its capacity is retained; pushing a
+        // smaller batch afterwards leaves capacity strictly greater than length.
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+
+        let big: Vec<i32> = vec![0; 64];
+        encoder.put(&big).unwrap();
+        let _ = encoder.flush_buffer().unwrap();
+
+        let flushed_size = encoder.estimated_memory_size();
+
+        // Push a single value — indices.len() == 1 but indices.capacity() >= 64.
+        // No change on the key storage since the value is already interned.
+        encoder.put(&[0]).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        assert_eq!(
+            size, flushed_size,
+            "memory size should include retained indices capacity",
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_accounts_for_uniques_capacity() {
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+
+        let values: Vec<i32> = (0..64).collect();
+        encoder.put(&values).unwrap();
+        // Flush so the next put does not grow `indices`, which would mask the uniques growth.
+        let _ = encoder.flush_buffer().unwrap();
+
+        let size1 = encoder.estimated_memory_size();
+
+        // Push more values to trigger uniques capacity growth.
+        // The pre-allocated dedup hash table is unlikely to be resized.
+        let values: Vec<i32> = (64..128).collect();
+        encoder.put(&values).unwrap();
+        // Flush again so both measurements are taken in the same post-flush state.
+        let _ = encoder.flush_buffer().unwrap();
+
+        let size2 = encoder.estimated_memory_size();
+
+        let min_uniques_bytes = 64 * std::mem::size_of::<i32>();
+        assert!(
+            size2 >= size1 + min_uniques_bytes,
+            "memory size {size2} should grow from {size1} by allocated uniques capacity \
+             (at least {min_uniques_bytes} bytes)"
+        );
     }
 }
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
@@ -77,9 +77,7 @@ impl<S: Storage> Interner<S> {
     /// Return estimate of the memory used, in bytes
     #[allow(dead_code)] // not used in parquet_derive, so is dead there
     pub fn estimated_memory_size(&self) -> usize {
-        self.storage.estimated_memory_size() +
-            // estimate size of dedup hashmap as just th size of the keys
-            self.dedup.capacity() + std::mem::size_of::<S::Key>()
+        self.storage.estimated_memory_size() + self.dedup.allocation_size()
     }
 
     /// Returns the storage for this interner