22
22
is_scalar ,
23
23
)
24
24
from pandas .core .base import DataError
25
+ from typing import Type , Callable
25
26
import warnings
26
27
28
+
27
29
from modin .backends .base .query_compiler import BaseQueryCompiler
28
30
from modin .error_message import ErrorMessage
29
31
from modin .utils import try_cast_to_pandas , wrap_udf_function
30
32
from modin .data_management .functions import (
33
+ Function ,
31
34
FoldFunction ,
32
35
MapFunction ,
33
36
MapReduceFunction ,
@@ -150,6 +153,34 @@ def caller(df, *args, **kwargs):
150
153
return caller
151
154
152
155
156
+ def _numeric_only_reduce_fn (applier : Type [Function ], * funcs ) -> Callable :
157
+ """
158
+ Build reduce function for statistic operations with `numeric_only` parameter.
159
+
160
+ Parameters
161
+ ----------
162
+ applier: Callable
163
+ Function object to register `funcs`
164
+ *funcs: list
165
+ List of functions to register in `applier`
166
+
167
+ Returns
168
+ -------
169
+ callable
170
+ A callable function to be applied in the partitions
171
+ """
172
+
173
+ def caller (self , * args , ** kwargs ):
174
+ # If `numeric_only` is None then we don't know what columns/indices will
175
+ # be dropped at the result of reduction function, and so can't preserve labels
176
+ preserve_index = kwargs .get ("numeric_only" , None ) is not None
177
+ return applier .register (* funcs , preserve_index = preserve_index )(
178
+ self , * args , ** kwargs
179
+ )
180
+
181
+ return caller
182
+
183
+
153
184
class PandasQueryCompiler (BaseQueryCompiler ):
154
185
"""This class implements the logic necessary for operating on partitions
155
186
with a Pandas backend. This logic is specific to Pandas."""
@@ -625,29 +656,54 @@ def is_monotonic_decreasing(self):
625
656
is_monotonic = _is_monotonic
626
657
627
658
# MapReduce operations registered as query-compiler attributes.
# NOTE(review): these names intentionally mirror the pandas API even though
# several of them (`max`, `min`, `sum`, `any`, `all`) shadow builtins inside
# the class body.
count = MapReduceFunction.register(pandas.DataFrame.count, pandas.DataFrame.sum)
# Reductions accepting `numeric_only` go through `_numeric_only_reduce_fn` so
# that `preserve_index` is decided per call from the `numeric_only` argument.
max = _numeric_only_reduce_fn(MapReduceFunction, pandas.DataFrame.max)
min = _numeric_only_reduce_fn(MapReduceFunction, pandas.DataFrame.min)
sum = _numeric_only_reduce_fn(MapReduceFunction, pandas.DataFrame.sum)
prod = _numeric_only_reduce_fn(MapReduceFunction, pandas.DataFrame.prod)
any = MapReduceFunction.register(pandas.DataFrame.any, pandas.DataFrame.any)
all = MapReduceFunction.register(pandas.DataFrame.all, pandas.DataFrame.all)
memory_usage = MapReduceFunction.register(
    pandas.DataFrame.memory_usage,
    # The reduce step ignores the caller's arguments and sums the partial
    # results with pandas defaults.
    lambda x, *args, **kwargs: pandas.DataFrame.sum(x),
    axis=0,
)
639
def mean(self, axis, **kwargs):
    """
    Compute the mean along `axis` using a map-reduce over partitions.

    Parameters
    ----------
    axis : int
        Axis to reduce along; it is also used as ``axis ^ 1`` to pick the
        labels of the result, so it must be 0 or 1.
    **kwargs : dict
        Keyword arguments of the call. ``level``, ``skipna`` and
        ``numeric_only`` are read here; all of them are forwarded.

    Returns
    -------
    The result of the registered map-reduce function, or of
    ``self.default_to_pandas`` when ``level`` is not None.
    """
    if kwargs.get("level") is not None:
        return self.default_to_pandas(pandas.DataFrame.mean, axis=axis, **kwargs)

    skipna = kwargs.get("skipna", True)

    def build_applyier(func, **applyier_kwargs):
        """Wrap `func` into a frame-level function that keeps the labels."""

        def applyier(df, **_kwargs):
            result = df.apply(func, **applyier_kwargs)
            # Label the result with the axis that was not reduced.
            return result.set_axis(df.axes[axis ^ 1], axis=0)

        return applyier

    def map_apply_fn(ser, **_kwargs):
        """Return the partial ``(sum, count)`` pair, or None on TypeError."""
        try:
            sum_result = ser.sum(skipna=skipna)
            count_result = ser.count()
        except TypeError:
            # Data that cannot be summed is marked with None so the reduce
            # step can drop it.
            return None
        else:
            return (sum_result, count_result)

    def reduce_apply_fn(ser, **_kwargs):
        """Combine partial ``(sum, count)`` pairs into a single mean."""
        sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
        count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
        return sum_result / count_result

    def reduce_fn(df, **_kwargs):
        # Drop every column that holds a missing partial result. A new frame
        # is created instead of using `inplace=True`, so the frame handed in
        # by the caller is never mutated.
        df = df.dropna(axis=1, how="any")
        return build_applyier(reduce_apply_fn, axis=axis)(df)

    return MapReduceFunction.register(
        build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
        reduce_fn,
        # If `numeric_only` is None we cannot know which labels will be
        # dropped, so the index cannot be preserved.
        preserve_index=(kwargs.get("numeric_only") is not None),
    )(self, axis=axis, **kwargs)
651
707
652
708
def value_counts (self , ** kwargs ):
653
709
"""
@@ -664,7 +720,7 @@ def value_counts(self, **kwargs):
664
720
return self .__constructor__ (new_modin_frame )
665
721
666
722
def map_func (df , * args , ** kwargs ):
667
- return df .squeeze (axis = 1 ).value_counts (** kwargs )
723
+ return df .squeeze (axis = 1 ).value_counts (** kwargs ). to_frame ()
668
724
669
725
def reduce_func (df , * args , ** kwargs ):
670
726
normalize = kwargs .get ("normalize" , False )
@@ -735,28 +791,30 @@ def sort_index_for_equal_values(result, ascending):
735
791
else :
736
792
new_index [j ] = result .index [j ]
737
793
i += 1
738
- return pandas .DataFrame (result , index = new_index )
794
+ return pandas .DataFrame (
795
+ result , index = new_index , columns = ["__reduced__" ]
796
+ )
739
797
740
798
return sort_index_for_equal_values (result , ascending )
741
799
742
- return MapReduceFunction .register (map_func , reduce_func , preserve_index = False )(
743
- self , ** kwargs
744
- )
800
+ return MapReduceFunction .register (
801
+ map_func , reduce_func , axis = 0 , preserve_index = False
802
+ )( self , ** kwargs )
745
803
746
804
# END MapReduce operations
747
805
748
806
# Reduction operations
749
807
idxmax = ReductionFunction.register(pandas.DataFrame.idxmax)
idxmin = ReductionFunction.register(pandas.DataFrame.idxmin)
# Reductions accepting `numeric_only` go through `_numeric_only_reduce_fn` so
# that `preserve_index` is decided per call from the `numeric_only` argument.
median = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.median)
nunique = ReductionFunction.register(pandas.DataFrame.nunique)
skew = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.skew)
kurt = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.kurt)
sem = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.sem)
std = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.std)
var = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.var)
# `sum` and `prod` are registered a second time here as plain reductions
# under the `*_min_count` names.
sum_min_count = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.sum)
prod_min_count = _numeric_only_reduce_fn(ReductionFunction, pandas.DataFrame.prod)
quantile_for_single_value = ReductionFunction.register(pandas.DataFrame.quantile)
mad = ReductionFunction.register(pandas.DataFrame.mad)
762
820
to_datetime = ReductionFunction .register (
0 commit comments