7 changes: 6 additions & 1 deletion databricks/koalas/frame.py
@@ -110,6 +110,7 @@
as_spark_type,
infer_return_type,
spark_type_to_pandas_dtype,
spark_type_to_python_type,
DataFrameType,
SeriesType,
)
@@ -10200,7 +10201,11 @@ def quantile(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

if isinstance(q, list):
# First calculate the percentiles from all columns and map it to each `quantiles`
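For orientation, a minimal sketch of the user-visible effect of this change (the sample DataFrame is hypothetical; the expected messages match the updated tests below):

```python
import databricks.koalas as ks

# A DataFrame with only a string column, so quantile() has nothing numeric to work on.
kdf = ks.DataFrame({"x": ["a", "b", "c"]})

try:
    kdf.quantile(0.5, numeric_only=False)
except TypeError as e:
    print(e)
# After this change: "Could not convert str to numeric"
# Before:            "Could not convert string to numeric" (Spark's simpleString())
```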
54 changes: 44 additions & 10 deletions databricks/koalas/generic.py
@@ -44,7 +44,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.typedef import Scalar, spark_type_to_python_type
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
@@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.mean(spark_column)

return self._reduce_for_stat_function(
@@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.coalesce(F.sum(spark_column), F.lit(0))

return self._reduce_for_stat_function(
@@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
if isinstance(spark_type, IntegralType):
scol = F.round(scol).cast(LongType())
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return F.coalesce(scol, F.lit(1))

@@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.skewness(spark_column)

return self._reduce_for_stat_function(
@@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.kurtosis(spark_column)

return self._reduce_for_stat_function(
@@ -1621,7 +1641,11 @@ def std(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.stddev(spark_column)

return self._reduce_for_stat_function(std, name="std", axis=axis, numeric_only=numeric_only)
@@ -1674,7 +1698,11 @@ def var(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.variance(spark_column)

return self._reduce_for_stat_function(var, name="var", axis=axis, numeric_only=numeric_only)
@@ -1773,7 +1801,11 @@ def median(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return self._reduce_for_stat_function(
median, name="median", numeric_only=numeric_only, axis=axis
@@ -1851,7 +1883,9 @@ def abs(kser):
return kser.spark.transform(F.abs)
else:
raise TypeError(
"bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
"bad operand type for abs(): {}".format(
spark_type_to_python_type(kser.spark.data_type).__name__
)
)

return self._apply_series_op(abs)
20 changes: 16 additions & 4 deletions databricks/koalas/series.py
@@ -82,7 +82,13 @@
from databricks.koalas.spark import functions as SF
from databricks.koalas.spark.accessors import SparkSeriesMethods
from databricks.koalas.strings import StringMethods
from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
from databricks.koalas.typedef import (
infer_return_type,
spark_type_to_python_type,
SeriesType,
ScalarType,
Scalar,
)


# This regular expression pattern is compiled and defined here to avoid compiling the same
@@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError(
"Could not convert {} to numeric".format(spark_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return self._reduce_for_stat_function(quantile, name="quantile")
@@ -5703,7 +5711,9 @@ def _cumsum(self, skipna, part_cols=()):
kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
elif not isinstance(kser.spark.data_type, NumericType):
raise TypeError(
"Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(kser.spark.data_type).__name__
)
)
return kser._cum(F.sum, skipna, part_cols)

@@ -5731,7 +5741,9 @@ def _cumprod(self, skipna, part_cols=()):
scol = F.round(scol).cast(LongType())
else:
raise TypeError(
"Could not convert {} to numeric".format(self.spark.data_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(self.spark.data_type).__name__
)
)

return self._with_new_scol(scol)
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
@@ -4283,9 +4283,9 @@ def test_quantile(self):
self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.quantile(0.5, numeric_only=False)
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)

def test_pct_change(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_series.py
@@ -1267,9 +1267,9 @@ def test_quantile(self):
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])

def test_idxmax(self):
@@ -2228,9 +2228,9 @@ def test_product(self):
kser = ks.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert datetime to numeric"):
ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()

def test_hasnans(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_stats.py
@@ -132,9 +132,9 @@ def test_abs(self):
self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())

with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
kdf.abs()
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
kdf.D.abs()

def test_axis_on_dataframe(self):
@@ -307,8 +307,8 @@ def test_numeric_only_unsupported(self):
pdf[["i", "b"]].sum(numeric_only=False).astype(int),
)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.sum(numeric_only=False)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.s.sum()
11 changes: 10 additions & 1 deletion databricks/koalas/tests/test_typedef.py
@@ -40,7 +40,7 @@
TimestampType,
)

from databricks.koalas.typedef import infer_return_type, as_spark_type
from databricks.koalas.typedef import infer_return_type, as_spark_type, spark_type_to_python_type
from databricks import koalas as ks


@@ -246,7 +246,15 @@ def test_as_spark_type(self):
# DecimalType
decimal.Decimal: DecimalType(38, 18),
# ArrayType
list: ArrayType(StringType()),
np.ndarray: ArrayType(StringType()),
}

for numpy_or_python_type, spark_type in type_mapper.items():
self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
self.assertEqual(as_spark_type(spark_type_to_python_type(spark_type)), spark_type)

type_mapper = {
List[bytes]: ArrayType(BinaryType()),
List[np.character]: ArrayType(BinaryType()),
List[np.bytes_]: ArrayType(BinaryType()),
@@ -274,3 +282,4 @@ def test_as_spark_type(self):

for numpy_or_python_type, spark_type in type_mapper.items():
self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
self.assertEqual(spark_type_to_python_type(spark_type), list)
43 changes: 41 additions & 2 deletions databricks/koalas/typedef/typehints.py
@@ -103,7 +103,7 @@ def as_spark_type(tpe) -> types.DataType:
"""
# TODO: Add "boolean" and "string" types.
# ArrayType
if tpe in (np.ndarray,):
if tpe in (list, np.ndarray,):
return types.ArrayType(types.StringType())
elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
return types.ArrayType(as_spark_type(tpe.__args__[0]))
@@ -142,7 +142,46 @@ def as_spark_type(tpe) -> types.DataType:
raise TypeError("Type %s was not understood." % tpe)


def spark_type_to_pandas_dtype(spark_type):
def spark_type_to_python_type(spark_type: types.DataType) -> type:
""" Return the given Spark DataType to Python type. """
# ArrayType
if isinstance(spark_type, types.ArrayType):
return list
# BinaryType
elif isinstance(spark_type, types.BinaryType):
return bytes
# BooleanType
elif isinstance(spark_type, types.BooleanType):
return bool
# DateType
elif isinstance(spark_type, types.DateType):
return datetime.date
# NumericType
elif isinstance(spark_type, types.ByteType):
return np.int8
elif isinstance(spark_type, types.DecimalType):
return decimal.Decimal
elif isinstance(spark_type, types.DoubleType):
return float
elif isinstance(spark_type, types.FloatType):
return np.float32
elif isinstance(spark_type, types.IntegerType):
return np.int32
elif isinstance(spark_type, types.LongType):
return int
elif isinstance(spark_type, types.ShortType):
return np.int16
# StringType
elif isinstance(spark_type, types.StringType):
return str
# TimestampType
elif isinstance(spark_type, types.TimestampType):
return datetime.datetime
else:
raise TypeError("Type %s was not understood." % spark_type.simpleString())


def spark_type_to_pandas_dtype(spark_type: types.DataType) -> np.dtype:
""" Return the given Spark DataType to pandas dtype. """
if isinstance(spark_type, (types.DateType, types.StructType, types.UserDefinedType)):
return np.dtype("object")
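A brief usage sketch of the new helper, with results read directly off the branches above (assuming a local PySpark installation):

```python
from pyspark.sql import types
from databricks.koalas.typedef import spark_type_to_python_type

spark_type_to_python_type(types.StringType())                   # str
spark_type_to_python_type(types.LongType())                     # int
spark_type_to_python_type(types.FloatType())                    # numpy.float32
spark_type_to_python_type(types.TimestampType())                # datetime.datetime
spark_type_to_python_type(types.ArrayType(types.StringType()))  # list
```

The `.__name__` of these returned types is what now appears in the error messages, e.g. `str` instead of Spark's `string` and `datetime` instead of `timestamp`.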
7 changes: 4 additions & 3 deletions docs/source/user_guide/types.rst
@@ -164,9 +164,9 @@ np.ndarray ArrayType(StringType())

The table below shows which Python data types are matched to which PySpark data types internally in Koalas.

================= ===================
================= =======================
Python PySpark
================= ===================
================= =======================
bytes BinaryType
int LongType
float DoubleType
@@ -175,7 +175,8 @@ bool BooleanType
datetime.datetime TimestampType
datetime.date DateType
decimal.Decimal DecimalType(38, 18)
================= ===================
list ArrayType(StringType())
================= =======================

For decimal type, Koalas uses Spark's system default precision and scale.

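Taken together with the new `list` row above, a quick sketch of the mapping (behaviour as documented in the table; `as_spark_type` lives in `databricks.koalas.typedef`):

```python
from databricks.koalas.typedef import as_spark_type

as_spark_type(int)    # LongType
as_spark_type(float)  # DoubleType
as_spark_type(list)   # ArrayType(StringType()), the mapping this PR adds
```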