1515// specific language governing permissions and limitations
1616// under the License.
1717
18+ //! [`GroupValues`] trait for storing and interning group keys
19+
1820use arrow:: record_batch:: RecordBatch ;
1921use arrow_array:: { downcast_primitive, ArrayRef } ;
2022use arrow_schema:: { DataType , SchemaRef } ;
@@ -37,18 +39,61 @@ use datafusion_physical_expr::binary_map::OutputType;
3739
3840mod group_column;
3941
40- /// An interning store for group keys
42+ /// Stores the group values during hash aggregation.
43+ ///
44+ /// # Background
45+ ///
46+ /// In a query such as `SELECT a, b, count(*) FROM t GROUP BY a, b`, the group values
47+ /// identify each group, and correspond to all the distinct values of `(a,b)`.
48+ ///
49+ /// ```sql
50+ /// -- Input has 4 rows with 3 distinct combinations of (a,b) ("groups")
51+ /// create table t(a int, b varchar)
52+ /// as values (1, 'a'), (2, 'b'), (1, 'a'), (3, 'c');
53+ ///
54+ /// select a, b, count(*) from t group by a, b;
55+ /// ----
56+ /// 1 a 2
57+ /// 2 b 1
58+ /// 3 c 1
59+ /// ```
60+ ///
61+ /// # Design
62+ ///
63+ /// Managing group values is a performance critical operation in hash
64+ /// aggregation. The major operations are:
65+ ///
66+ /// 1. Intern: Quickly finding existing and adding new group values
67+ /// 2. Emit: Returning the group values as an array
68+ ///
69+ /// There are multiple specialized implementations of this trait, each tuned to
70+ /// make these operations fast for particular data types and numbers of columns.
71+ /// See [`new_group_values`] for details.
72+ ///
73+ /// # Group Ids
74+ ///
75+ /// Each distinct group in a hash aggregation is identified by a unique group id
76+ /// (usize) which is assigned by instances of this trait. Group ids are
77+ /// contiguous (assigned without gaps), starting from 0.
4178pub trait GroupValues : Send {
42- /// Calculates the `groups` for each input row of `cols`
79+ /// Calculates the group id for each input row of `cols`, assigning new
80+ /// group ids as necessary.
81+ ///
82+ /// When the function returns, `groups` must contain the group id for each
83+ /// row in `cols`.
84+ ///
85+ /// If a row has the same value as a previous row, the same group id is
86+ /// assigned. If a row has a new value, the next available group id is
87+ /// assigned.
4388 fn intern ( & mut self , cols : & [ ArrayRef ] , groups : & mut Vec < usize > ) -> Result < ( ) > ;
4489
45- /// Returns the number of bytes used by this [`GroupValues`]
90+ /// Returns the number of bytes of memory used by this [`GroupValues`]
4691 fn size ( & self ) -> usize ;
4792
4893 /// Returns true if this [`GroupValues`] is empty
4994 fn is_empty ( & self ) -> bool ;
5095
51- /// The number of values stored in this [`GroupValues`]
96+ /// The number of values (distinct group values) stored in this [`GroupValues`]
5297 fn len ( & self ) -> usize ;
5398
5499 /// Emits the group values
@@ -58,6 +103,7 @@ pub trait GroupValues: Send {
58103 fn clear_shrink ( & mut self , batch : & RecordBatch ) ;
59104}
60105
106+ /// Returns a specialized implementation of [`GroupValues`] for the given schema.
61107pub fn new_group_values ( schema : SchemaRef ) -> Result < Box < dyn GroupValues > > {
62108 if schema. fields . len ( ) == 1 {
63109 let d = schema. fields [ 0 ] . data_type ( ) ;
0 commit comments