diff --git a/databricks/koalas/generic.py b/databricks/koalas/generic.py
index 2d6ce32e25..ec857ec31f 100644
--- a/databricks/koalas/generic.py
+++ b/databricks/koalas/generic.py
@@ -1086,7 +1086,7 @@ def to_excel(
         )
 
     def mean(
-        self, axis: Union[int, str] = None, numeric_only: bool = True
+        self, axis: Union[int, str] = None, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return the mean of the values.
@@ -1095,7 +1095,7 @@ def mean(
         ----------
         axis : {index (0), columns (1)}
             Axis for the function to be applied on.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1128,6 +1128,10 @@ def mean(
         >>> df['a'].mean()
         2.0
         """
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
 
         def mean(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
@@ -1208,7 +1212,11 @@ def sum(
         nan
         """
         axis = validate_axis(axis)
-        numeric_only = True if (numeric_only is None and axis == 0) else numeric_only
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+        elif numeric_only is True and axis == 1:
+            numeric_only = None
 
         def sum(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
@@ -1288,7 +1296,11 @@ def product(
         nan
         """
         axis = validate_axis(axis)
-        numeric_only = True if (numeric_only is None and axis == 0) else numeric_only
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+        elif numeric_only is True and axis == 1:
+            numeric_only = None
 
         def prod(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
@@ -1321,7 +1333,7 @@ def prod(spark_column, spark_type):
     prod = product
 
     def skew(
-        self, axis: Union[int, str] = None, numeric_only: bool = True
+        self, axis: Union[int, str] = None, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return unbiased skew normalized by N-1.
@@ -1330,7 +1342,7 @@ def skew(
         ----------
         axis : {index (0), columns (1)}
             Axis for the function to be applied on.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1356,6 +1368,10 @@ def skew(
         >>> df['a'].skew()
         0.0
         """
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
 
         def skew(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
@@ -1373,7 +1389,7 @@ def skew(spark_column, spark_type):
         )
 
     def kurtosis(
-        self, axis: Union[int, str] = None, numeric_only: bool = True
+        self, axis: Union[int, str] = None, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
@@ -1383,7 +1399,7 @@ def kurtosis(
         ----------
         axis : {index (0), columns (1)}
             Axis for the function to be applied on.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1409,6 +1425,10 @@ def kurtosis(
         >>> df['a'].kurtosis()
         -1.5
         """
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
 
         def kurtosis(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
@@ -1472,7 +1492,11 @@ def min(
         1.0
         """
         axis = validate_axis(axis)
-        numeric_only = True if (numeric_only is None and axis == 0) else numeric_only
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+        elif numeric_only is True and axis == 1:
+            numeric_only = None
 
         return self._reduce_for_stat_function(
             F.min, name="min", axis=axis, numeric_only=numeric_only
@@ -1523,14 +1547,18 @@ def max(
         3.0
         """
         axis = validate_axis(axis)
-        numeric_only = True if (numeric_only is None and axis == 0) else numeric_only
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+        elif numeric_only is True and axis == 1:
+            numeric_only = None
 
         return self._reduce_for_stat_function(
             F.max, name="max", axis=axis, numeric_only=numeric_only
         )
 
     def count(
-        self, axis: Union[int, str] = None, numeric_only: bool = None
+        self, axis: Union[int, str] = None, numeric_only: bool = False
     ) -> Union[Scalar, "Series"]:
         """
         Count non-NA cells for each column.
@@ -1542,7 +1570,7 @@ def count(
         axis : {0 or ‘index’, 1 or ‘columns’}, default 0
             If 0 or ‘index’ counts are generated for each column.
             If 1 or ‘columns’ counts are generated for each row.
-        numeric_only : bool, default None
+        numeric_only : bool, default False
             If True, include only float, int, boolean columns. This parameter is mainly for
             pandas compatibility.
 
@@ -1604,7 +1632,7 @@ def count(
         )
 
     def std(
-        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = True
+        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return sample standard deviation.
@@ -1616,7 +1644,7 @@ def std(
         ddof : int, default 1
             Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
             where N represents the number of elements.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1659,6 +1687,11 @@ def std(
         """
         assert ddof in (0, 1)
 
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+
         def std(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
@@ -1678,7 +1711,7 @@ def std(spark_column, spark_type):
         )
 
     def var(
-        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = True
+        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return unbiased variance.
@@ -1690,7 +1723,7 @@ def var(
         ddof : int, default 1
             Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
             where N represents the number of elements.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1733,6 +1766,11 @@ def var(
         """
         assert ddof in (0, 1)
 
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+
         def var(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
@@ -1752,7 +1790,7 @@ def var(spark_column, spark_type):
         )
 
     def median(
-        self, axis: Union[int, str] = None, numeric_only: bool = True, accuracy: int = 10000
+        self, axis: Union[int, str] = None, numeric_only: bool = None, accuracy: int = 10000
     ) -> Union[Scalar, "Series"]:
         """
         Return the median of the values for the requested axis.
@@ -1765,7 +1803,7 @@ def median(
         ----------
         axis : {index (0), columns (1)}
             Axis for the function to be applied on.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
         accuracy : int, optional
@@ -1836,6 +1874,11 @@ def median(
         >>> (df[('y', 'b')] + 100).median()
         103.0
         """
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+
         if not isinstance(accuracy, int):
             raise ValueError(
                 "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
@@ -1856,7 +1899,7 @@ def median(spark_column, spark_type):
         )
 
     def sem(
-        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = True
+        self, axis: Union[int, str] = None, ddof: int = 1, numeric_only: bool = None
     ) -> Union[Scalar, "Series"]:
         """
         Return unbiased standard error of the mean over requested axis.
@@ -1868,7 +1911,7 @@ def sem(
         ddof : int, default 1
             Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
             where N represents the number of elements.
-        numeric_only : bool, default True
+        numeric_only : bool, default None
             Include only float, int, boolean columns. False is not supported. This parameter
             is mainly for pandas compatibility.
 
@@ -1918,6 +1961,11 @@ def sem(
         """
         assert ddof in (0, 1)
 
+        axis = validate_axis(axis)
+
+        if numeric_only is None and axis == 0:
+            numeric_only = True
+
         def std(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
diff --git a/databricks/koalas/tests/test_stats.py b/databricks/koalas/tests/test_stats.py
index f04c56d971..4336f03f70 100644
--- a/databricks/koalas/tests/test_stats.py
+++ b/databricks/koalas/tests/test_stats.py
@@ -181,6 +181,43 @@ def test_axis_on_dataframe(self):
         self.assert_eq(kdf.sem(axis=1), pdf.sem(axis=1))
         self.assert_eq(kdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))
 
+        self.assert_eq(
+            kdf.count(axis=1, numeric_only=True), pdf.count(axis=1, numeric_only=True)
+        )
+        self.assert_eq(kdf.var(axis=1, numeric_only=True), pdf.var(axis=1, numeric_only=True))
+        self.assert_eq(
+            kdf.var(axis=1, ddof=0, numeric_only=True),
+            pdf.var(axis=1, ddof=0, numeric_only=True),
+        )
+        self.assert_eq(kdf.std(axis=1, numeric_only=True), pdf.std(axis=1, numeric_only=True))
+        self.assert_eq(
+            kdf.std(axis=1, ddof=0, numeric_only=True),
+            pdf.std(axis=1, ddof=0, numeric_only=True),
+        )
+        self.assert_eq(
+            kdf.max(axis=1, numeric_only=True), pdf.max(axis=1, numeric_only=True).astype(float)
+        )
+        self.assert_eq(
+            kdf.min(axis=1, numeric_only=True), pdf.min(axis=1, numeric_only=True).astype(float)
+        )
+        self.assert_eq(
+            kdf.sum(axis=1, numeric_only=True), pdf.sum(axis=1, numeric_only=True).astype(float)
+        )
+        self.assert_eq(
+            kdf.product(axis=1, numeric_only=True),
+            pdf.product(axis=1, numeric_only=True).astype(float),
+        )
+        self.assert_eq(
+            kdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True)
+        )
+        self.assert_eq(kdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True))
+        self.assert_eq(kdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, numeric_only=True))
+        self.assert_eq(kdf.sem(axis=1, numeric_only=True), pdf.sem(axis=1, numeric_only=True))
+        self.assert_eq(
+            kdf.sem(axis=1, ddof=0, numeric_only=True),
+            pdf.sem(axis=1, ddof=0, numeric_only=True),
+        )
+
     def test_corr(self):
         # Disable arrow execution since corr() is using UDT internally which is not supported.
         with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
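
A minimal usage sketch of the resolved semantics follows. It is illustrative only and not part of the patch; it assumes a koalas 1.x install with an active SparkSession, and the DataFrame contents are hypothetical example data.

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": ["x", "y", "z"]})

# With the new defaults, numeric_only=None on axis=0 resolves to True,
# so the string column 'c' is excluded, matching the old default behavior.
print(kdf.mean())  # means of 'a' and 'b' only

# numeric_only=True is now also accepted on axis=1 for pandas compatibility;
# per the elif branches in sum/product/min/max above, it is normalized back
# to None before the row-wise reduction runs over the numeric columns.
print(kdf.sum(axis=1, numeric_only=True))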