7 changes: 6 additions & 1 deletion databricks/koalas/frame.py
@@ -110,6 +110,7 @@
as_spark_type,
infer_return_type,
spark_type_to_pandas_dtype,
spark_type_to_python_type,
DataFrameType,
SeriesType,
)
@@ -10200,7 +10201,11 @@ def quantile(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

if isinstance(q, list):
# First calculate the percentiles from all columns and map it to each `quantiles`
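For orientation, a minimal sketch of the user-visible effect of this change (the sample DataFrame is hypothetical; the expected messages match the updated tests below):

```python
import databricks.koalas as ks

# A DataFrame with only a string column, so quantile() has nothing numeric to work on.
kdf = ks.DataFrame({"x": ["a", "b", "c"]})

try:
    kdf.quantile(0.5, numeric_only=False)
except TypeError as e:
    print(e)
# After this change: "Could not convert str to numeric"
# Before:            "Could not convert string to numeric" (Spark's simpleString())
```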
54 changes: 44 additions & 10 deletions databricks/koalas/generic.py
@@ -44,7 +44,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.typedef import Scalar, spark_type_to_python_type
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
@@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.mean(spark_column)

return self._reduce_for_stat_function(
@@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.coalesce(F.sum(spark_column), F.lit(0))

return self._reduce_for_stat_function(
@@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
if isinstance(spark_type, IntegralType):
scol = F.round(scol).cast(LongType())
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return F.coalesce(scol, F.lit(1))

@@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.skewness(spark_column)

return self._reduce_for_stat_function(
@@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.kurtosis(spark_column)

return self._reduce_for_stat_function(
@@ -1621,7 +1641,11 @@ def std(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.stddev(spark_column)

return self._reduce_for_stat_function(std, name="std", axis=axis, numeric_only=numeric_only)
@@ -1674,7 +1698,11 @@ def var(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)
return F.variance(spark_column)

return self._reduce_for_stat_function(var, name="var", axis=axis, numeric_only=numeric_only)
@@ -1773,7 +1801,11 @@ def median(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return self._reduce_for_stat_function(
median, name="median", numeric_only=numeric_only, axis=axis
@@ -1851,7 +1883,9 @@ def abs(kser):
return kser.spark.transform(F.abs)
else:
raise TypeError(
"bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
"bad operand type for abs(): {}".format(
spark_type_to_python_type(kser.spark.data_type).__name__
)
)

return self._apply_series_op(abs)
20 changes: 16 additions & 4 deletions databricks/koalas/series.py
@@ -82,7 +82,13 @@
from databricks.koalas.spark import functions as SF
from databricks.koalas.spark.accessors import SparkSeriesMethods
from databricks.koalas.strings import StringMethods
from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
from databricks.koalas.typedef import (
infer_return_type,
spark_type_to_python_type,
SeriesType,
ScalarType,
Scalar,
)


# This regular expression pattern is compiled and defined here to avoid compiling the same
@@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError(
"Could not convert {} to numeric".format(spark_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(spark_type).__name__
)
)

return self._reduce_for_stat_function(quantile, name="quantile")
@@ -5703,7 +5711,9 @@ def _cumsum(self, skipna, part_cols=()):
kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
elif not isinstance(kser.spark.data_type, NumericType):
raise TypeError(
"Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(kser.spark.data_type).__name__
)
)
return kser._cum(F.sum, skipna, part_cols)

@@ -5731,7 +5741,9 @@ def _cumprod(self, skipna, part_cols=()):
scol = F.round(scol).cast(LongType())
else:
raise TypeError(
"Could not convert {} to numeric".format(self.spark.data_type.simpleString())
"Could not convert {} to numeric".format(
spark_type_to_python_type(self.spark.data_type).__name__
)
)

return self._with_new_scol(scol)
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
@@ -4283,9 +4283,9 @@ def test_quantile(self):
self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.quantile(0.5, numeric_only=False)
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)

def test_pct_change(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_series.py
@@ -1267,9 +1267,9 @@ def test_quantile(self):
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])

def test_idxmax(self):
@@ -2228,9 +2228,9 @@ def test_product(self):
kser = ks.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
ks.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert datetime to numeric"):
ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()

def test_hasnans(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_stats.py
@@ -132,9 +132,9 @@ def test_abs(self):
self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())

with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
kdf.abs()
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
kdf.D.abs()

def test_axis_on_dataframe(self):
@@ -307,8 +307,8 @@ def test_numeric_only_unsupported(self):
pdf[["i", "b"]].sum(numeric_only=False).astype(int),
)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.sum(numeric_only=False)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
kdf.s.sum()
11 changes: 10 additions & 1 deletion databricks/koalas/tests/test_typedef.py
@@ -40,7 +40,7 @@
TimestampType,
)

from databricks.koalas.typedef import infer_return_type, as_spark_type
from databricks.koalas.typedef import infer_return_type, as_spark_type, spark_type_to_python_type
from databricks import koalas as ks


@@ -246,7 +246,15 @@ def test_as_spark_type(self):
# DecimalType
decimal.Decimal: DecimalType(38, 18),
# ArrayType
list: ArrayType(StringType()),
np.ndarray: ArrayType(StringType()),
}

for numpy_or_python_type, spark_type in type_mapper.items():
self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
self.assertEqual(as_spark_type(spark_type_to_python_type(spark_type)), spark_type)

type_mapper = {
List[bytes]: ArrayType(BinaryType()),
List[np.character]: ArrayType(BinaryType()),
List[np.bytes_]: ArrayType(BinaryType()),
@@ -274,3 +282,4 @@ def test_as_spark_type(self):

for numpy_or_python_type, spark_type in type_mapper.items():
self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
self.assertEqual(spark_type_to_python_type(spark_type), list)
43 changes: 41 additions & 2 deletions databricks/koalas/typedef/typehints.py
@@ -103,7 +103,7 @@ def as_spark_type(tpe) -> types.DataType:
"""
# TODO: Add "boolean" and "string" types.
# ArrayType
if tpe in (np.ndarray,):
if tpe in (list, np.ndarray,):
return types.ArrayType(types.StringType())
elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
return types.ArrayType(as_spark_type(tpe.__args__[0]))
@@ -142,7 +142,46 @@ def as_spark_type(tpe) -> types.DataType:
raise TypeError("Type %s was not understood." % tpe)


def spark_type_to_pandas_dtype(spark_type):
def spark_type_to_python_type(spark_type: types.DataType) -> type:
""" Return the given Spark DataType to Python type. """
# ArrayType
if isinstance(spark_type, types.ArrayType):
return list
# BinaryType
elif isinstance(spark_type, types.BinaryType):
return bytes
# BooleanType
elif isinstance(spark_type, types.BooleanType):
return bool
# DateType
elif isinstance(spark_type, types.DateType):
return datetime.date
# NumericType
elif isinstance(spark_type, types.ByteType):
return np.int8
elif isinstance(spark_type, types.DecimalType):
return decimal.Decimal
elif isinstance(spark_type, types.DoubleType):
return float
elif isinstance(spark_type, types.FloatType):
return np.float32
elif isinstance(spark_type, types.IntegerType):
return np.int32
elif isinstance(spark_type, types.LongType):
return int
elif isinstance(spark_type, types.ShortType):
return np.int16
# StringType
elif isinstance(spark_type, types.StringType):
return str
# TimestampType
elif isinstance(spark_type, types.TimestampType):
return datetime.datetime
else:
raise TypeError("Type %s was not understood." % spark_type.simpleString())


def spark_type_to_pandas_dtype(spark_type: types.DataType) -> np.dtype:
""" Return the given Spark DataType to pandas dtype. """
if isinstance(spark_type, (types.DateType, types.StructType, types.UserDefinedType)):
return np.dtype("object")
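A brief usage sketch of the new helper, with results read directly off the branches above (assuming a local PySpark installation):

```python
from pyspark.sql import types
from databricks.koalas.typedef import spark_type_to_python_type

spark_type_to_python_type(types.StringType())                   # str
spark_type_to_python_type(types.LongType())                     # int
spark_type_to_python_type(types.FloatType())                    # numpy.float32
spark_type_to_python_type(types.TimestampType())                # datetime.datetime
spark_type_to_python_type(types.ArrayType(types.StringType()))  # list
```

The `.__name__` of these returned types is what now appears in the error messages, e.g. `str` instead of Spark's `string` and `datetime` instead of `timestamp`.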
7 changes: 4 additions & 3 deletions docs/source/user_guide/types.rst
@@ -164,9 +164,9 @@ np.ndarray ArrayType(StringType())

The table below shows which Python data types are matched to which PySpark data types internally in Koalas.

================= ===================
================= =======================
Python PySpark
================= ===================
================= =======================
bytes BinaryType
int LongType
float DoubleType
@@ -175,7 +175,8 @@ bool BooleanType
datetime.datetime TimestampType
datetime.date DateType
decimal.Decimal DecimalType(38, 18)
================= ===================
list ArrayType(StringType())
================= =======================

For decimal type, Koalas uses Spark's system default precision and scale.

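Taken together with the new `list` row above, a quick sketch of the mapping (behaviour as documented in the table; `as_spark_type` lives in `databricks.koalas.typedef`):

```python
from databricks.koalas.typedef import as_spark_type

as_spark_type(int)    # LongType
as_spark_type(float)  # DoubleType
as_spark_type(list)   # ArrayType(StringType()), the mapping this PR adds
```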