
Commit d9fb2c1

Address comments
1 parent efd8d5e commit d9fb2c1

13 files changed: +55 -46 lines changed


databricks/koalas/base.py

Lines changed: 9 additions & 9 deletions

@@ -109,7 +109,7 @@ def wrapper(self, *args):
         new_args = []
         for arg in args:
             # TODO: This is a quick hack to support NumPy type. We should revisit this.
-            if isinstance(self.spark.type, LongType) and isinstance(arg, np.timedelta64):
+            if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
                 new_args.append(float(arg / np.timedelta64(1, "s")))
             else:
                 new_args.append(arg)
@@ -152,9 +152,9 @@ def spark_column(self):
     __neg__ = column_op(Column.__neg__)

     def __add__(self, other):
-        if isinstance(self.spark.type, StringType):
+        if isinstance(self.spark.data_type, StringType):
             # Concatenate string columns
-            if isinstance(other, IndexOpsMixin) and isinstance(other.spark.type, StringType):
+            if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType):
                 return column_op(F.concat)(self, other)
             # Handle df['col'] + 'literal'
             elif isinstance(other, str):
@@ -167,12 +167,12 @@ def __add__(self, other):
     def __sub__(self, other):
         # Note that timestamp subtraction casts arguments to integer. This is to mimic Pandas's
         # behaviors. Pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
-        if isinstance(other, IndexOpsMixin) and isinstance(self.spark.type, TimestampType):
-            if not isinstance(other.spark.type, TimestampType):
+        if isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, TimestampType):
+            if not isinstance(other.spark.data_type, TimestampType):
                 raise TypeError("datetime subtraction can only be applied to datetime series.")
             return self.astype("bigint") - other.astype("bigint")
-        elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.type, DateType):
-            if not isinstance(other.spark.type, DateType):
+        elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, DateType):
+            if not isinstance(other.spark.data_type, DateType):
                 raise TypeError("date subtraction can only be applied to date series.")
             return column_op(F.datediff)(self, other)
         else:
@@ -215,7 +215,7 @@ def mod(left, right):

     def __radd__(self, other):
         # Handle 'literal' + df['col']
-        if isinstance(self.spark.type, StringType) and isinstance(other, str):
+        if isinstance(self.spark.data_type, StringType) and isinstance(other, str):
             return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
         else:
             return column_op(Column.__radd__)(self, other)
@@ -335,7 +335,7 @@ def dtype(self):
         >>> s.rename("a").to_frame().set_index("a").index.dtype
         dtype('<M8[ns]')
         """
-        return spark_type_to_pandas_dtype(self.spark.type)
+        return spark_type_to_pandas_dtype(self.spark.data_type)

     @property
     def empty(self):
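
A quick way to exercise the renamed accessor through the operators touched above; a minimal sketch, not part of the commit, assuming a default local Spark session started by Koalas:

    import databricks.koalas as ks
    from pyspark.sql.types import StringType

    s = ks.Series(["a", "b", "c"])
    # __add__ now consults spark.data_type; string Series + string literal concatenates.
    assert isinstance(s.spark.data_type, StringType)
    print((s + "!").to_pandas())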

databricks/koalas/datetimes.py

Lines changed: 4 additions & 2 deletions

@@ -34,8 +34,10 @@ class DatetimeMethods(object):
     """Date/Time methods for Koalas Series"""

     def __init__(self, series: "ks.Series"):
-        if not isinstance(series.spark.type, (DateType, TimestampType)):
-            raise ValueError("Cannot call DatetimeMethods on type {}".format(series.spark.type))
+        if not isinstance(series.spark.data_type, (DateType, TimestampType)):
+            raise ValueError(
+                "Cannot call DatetimeMethods on type {}".format(series.spark.data_type)
+            )
         self._data = series

     # Properties
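
The accessor guard is visible from user code; a hedged sketch, not part of the commit:

    import databricks.koalas as ks

    s = ks.Series([1, 2, 3])  # LongType column, not a date/timestamp
    try:
        s.dt  # constructing DatetimeMethods validates series.spark.data_type
    except ValueError as e:
        print(e)  # Cannot call DatetimeMethods on type LongType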

databricks/koalas/frame.py

Lines changed: 1 addition & 1 deletion

@@ -2742,7 +2742,7 @@ def pandas_frame_func(f):
            kser = kdf_or_kser
            pudf = pandas_udf(
                func if should_by_pass else pandas_series_func(func),
-               returnType=kser.spark.type,
+               returnType=kser.spark.data_type,
                functionType=PandasUDFType.SCALAR,
            )
            columns = self._internal.spark_columns
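
The pattern above passes a Series' Spark data type straight to pandas_udf as the return type. A hypothetical illustration of the same call shape (identity_udf is not a function in the codebase):

    from pyspark.sql.functions import pandas_udf, PandasUDFType

    def identity_udf(kser):
        # kser is assumed to be a Koalas Series; its Spark data type becomes the UDF return type.
        return pandas_udf(
            lambda s: s, returnType=kser.spark.data_type, functionType=PandasUDFType.SCALAR
        )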

databricks/koalas/groupby.py

Lines changed: 2 additions & 2 deletions

@@ -2067,7 +2067,7 @@ def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys
        if len(agg_columns) > 0:
            stat_exprs = []
            for kser, c in zip(agg_columns, agg_columns_scols):
-               spark_type = kser.spark.type
+               spark_type = kser.spark.data_type
                name = kser._internal.data_spark_column_names[0]
                label = kser._internal.column_labels[0]
                # TODO: we should have a function that takes dataframes and converts the numeric
@@ -2330,7 +2330,7 @@ def describe(self):

        """
        for col in self._agg_columns:
-           if isinstance(col.spark.type, StringType):
+           if isinstance(col.spark.data_type, StringType):
                raise NotImplementedError(
                    "DataFrameGroupBy.describe() doesn't support for string type for now"
                )
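
The describe() guard is observable from user code; a hedged sketch, not part of the commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({"key": [1, 1, 2], "val": ["x", "y", "z"]})
    try:
        kdf.groupby("key").describe()  # "val" is a StringType aggregation column
    except NotImplementedError as e:
        print(e)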

databricks/koalas/indexes.py

Lines changed: 1 addition & 1 deletion

@@ -420,7 +420,7 @@ def values(self):
    @property
    def spark_type(self):
        """ Returns the data type as defined by Spark, as a Spark DataType object."""
-       return self.to_series().spark.type
+       return self.to_series().spark.data_type

    @property
    def has_duplicates(self) -> bool:
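
Index.spark_type keeps its public name and simply delegates to the renamed accessor; a minimal sketch, not part of the commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
    print(kdf.index.spark_type)  # LongType, resolved via to_series().spark.data_type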

databricks/koalas/indexing.py

Lines changed: 4 additions & 4 deletions

@@ -834,7 +834,7 @@ def _NotImplemented(description):
    def _select_rows_by_series(
        self, rows_sel: "Series"
    ) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
-       assert isinstance(rows_sel.spark.type, BooleanType), rows_sel.spark.type
+       assert isinstance(rows_sel.spark.data_type, BooleanType), rows_sel.spark.data_type
        return rows_sel.spark.column, None, None

    def _select_rows_by_spark_column(
@@ -855,7 +855,7 @@ def _select_rows_by_slice(
            sdf = self._internal.spark_frame
            index = self._kdf_or_kser.index
            index_column = index.to_series()
-           index_data_type = index_column.spark.type
+           index_data_type = index_column.spark.data_type
            start = rows_sel.start
            stop = rows_sel.stop

@@ -912,7 +912,7 @@ def _select_rows_by_slice(
            return reduce(lambda x, y: x & y, cond), None, None
        else:
            index = self._kdf_or_kser.index
-           index_data_type = [f.dataType for f in index.to_series().spark.type]
+           index_data_type = [f.dataType for f in index.to_series().spark.data_type]

            start = rows_sel.start
            if start is not None:
@@ -974,7 +974,7 @@ def _select_rows_by_iterable(
            return F.lit(False), None, None
        elif len(self._internal.index_spark_column_names) == 1:
            index_column = self._kdf_or_kser.index.to_series()
-           index_data_type = index_column.spark.type
+           index_data_type = index_column.spark.data_type
            if len(rows_sel) == 1:
                return (
                    index_column.spark.column == F.lit(rows_sel[0]).cast(index_data_type),
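
These selectors sit behind .loc; a hedged usage sketch of the boolean-Series path touched in the first hunk, not part of the commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({"a": [1, 2, 3, 4]})
    # A boolean Series passed to .loc goes through _select_rows_by_series, which
    # now asserts on rows_sel.spark.data_type.
    print(kdf.loc[kdf.a > 2].to_pandas())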

databricks/koalas/internal.py

Lines changed: 1 addition & 1 deletion

@@ -889,7 +889,7 @@ def with_filter(self, pred: Union[spark.Column, "Series"]):
        from databricks.koalas.series import Series

        if isinstance(pred, Series):
-           assert isinstance(pred.spark.type, BooleanType), pred.spark.type
+           assert isinstance(pred.spark.data_type, BooleanType), pred.spark.data_type
            pred = pred.spark.column
        else:
            spark_type = self.spark_frame.select(pred).schema[0].dataType
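
with_filter is what plain boolean indexing lands on; a minimal sketch, not part of the commit:

    import databricks.koalas as ks

    kdf = ks.DataFrame({"a": [1, 2, 3]})
    # Filtering with a boolean Series routes through InternalFrame.with_filter,
    # which now asserts on pred.spark.data_type.
    print(kdf[kdf.a >= 2].to_pandas())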

databricks/koalas/series.py

Lines changed: 15 additions & 18 deletions

@@ -303,10 +303,6 @@
 str_type = str


-class SparkMethods(object):
-    pass
-
-
 class Series(Frame, IndexOpsMixin, Generic[T]):
     """
     Koalas Series that corresponds to Pandas Series logically. This holds Spark Column
@@ -399,11 +395,11 @@ def axes(self):
     @property
     def spark_type(self):
         warnings.warn(
-            "Series.spark_type is deprecated as of Series.spark.type. "
+            "Series.spark_type is deprecated as of Series.spark.data_type. "
             "Please use the API instead.",
             FutureWarning,
         )
-        return self.spark.type
+        return self.spark.data_type

     spark_type.__doc__ = SparkIndexOpsMethods.type.__doc__

@@ -924,7 +920,7 @@ def map(self, arg):
         if isinstance(arg, dict):
             is_start = True
             # In case dictionary is empty.
-            current = F.when(F.lit(False), F.lit(None).cast(self.spark.type))
+            current = F.when(F.lit(False), F.lit(None).cast(self.spark.data_type))

             for to_replace, value in arg.items():
                 if is_start:
@@ -938,7 +934,7 @@ def map(self, arg):
                 del arg[np._NoValue]  # Remove in case it's set in defaultdict.
                 current = current.otherwise(F.lit(tmp_val))
             else:
-                current = current.otherwise(F.lit(None).cast(self.spark.type))
+                current = current.otherwise(F.lit(None).cast(self.spark.data_type))
             return self._with_new_scol(current).rename(self.name)
         else:
             return self.apply(arg)
@@ -980,11 +976,11 @@ def astype(self, dtype) -> "Series":
         if not spark_type:
             raise ValueError("Type {} not understood".format(dtype))
         if isinstance(spark_type, BooleanType):
-            if isinstance(self.spark.type, StringType):
+            if isinstance(self.spark.data_type, StringType):
                 scol = F.when(self.spark.column.isNull(), F.lit(False)).otherwise(
                     F.length(self.spark.column) > 0
                 )
-            elif isinstance(self.spark.type, (FloatType, DoubleType)):
+            elif isinstance(self.spark.data_type, (FloatType, DoubleType)):
                 scol = F.when(
                     self.spark.column.isNull() | F.isnan(self.spark.column), F.lit(True)
                 ).otherwise(self.spark.column.cast(spark_type))
@@ -1745,7 +1741,7 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None)
         if lower is None and upper is None:
             return self

-        if isinstance(self.spark.type, NumericType):
+        if isinstance(self.spark.data_type, NumericType):
             scol = self.spark.column
             if lower is not None:
                 scol = F.when(scol < lower, lower).otherwise(scol)
@@ -2714,7 +2710,7 @@ def apply(self, func, args=(), **kwds):
             pser = self.head(limit)._to_internal_pandas()
             transformed = pser.apply(func, *args, **kwds)
             kser = Series(transformed)
-            return self._transform_batch(apply_each, kser.spark.type)
+            return self._transform_batch(apply_each, kser.spark.data_type)
         else:
             sig_return = infer_return_type(func)
             if not isinstance(sig_return, ScalarType):
@@ -3021,7 +3017,7 @@ def _transform_batch(self, func, return_schema):
             pser = self.head(limit)._to_internal_pandas()
             transformed = pser.transform(func)
             kser = Series(transformed)
-            spark_return_type = kser.spark.type
+            spark_return_type = kser.spark.data_type
         else:
             spark_return_type = return_schema

@@ -4987,7 +4983,7 @@ def _cumprod(self, skipna, part_cols=()):
         from pyspark.sql.functions import pandas_udf

         def cumprod(scol):
-            @pandas_udf(returnType=self.spark.type)
+            @pandas_udf(returnType=self.spark.data_type)
             def negative_check(s):
                 assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                     "values should be bigger than 0: %s" % s
@@ -5029,7 +5025,7 @@ def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=None):
             raise ValueError("Series does not support columns axis.")
         num_args = len(signature(sfun).parameters)
         col_sdf = self.spark.column
-        col_type = self.spark.type
+        col_type = self.spark.data_type
         if isinstance(col_type, BooleanType) and sfun.__name__ not in ("min", "max"):
             # Stat functions cannot be used with boolean values by default
             # Thus, cast to integer (true to 1 and false to 0)
@@ -5050,7 +5046,8 @@ def __len__(self):
     def __getitem__(self, key):
         try:
             if (isinstance(key, slice) and any(type(n) == int for n in [key.start, key.stop])) or (
-                type(key) == int and not isinstance(self.index.spark.type, (IntegerType, LongType))
+                type(key) == int
+                and not isinstance(self.index.spark.data_type, (IntegerType, LongType))
             ):
                 # Seems like pandas Series always uses int as positional search when slicing
                 # with ints, searches based on index values when the value is int.
@@ -5104,10 +5101,10 @@ def __repr__(self):
         return pser.to_string(name=self.name, dtype=self.dtype)

     def __dir__(self):
-        if not isinstance(self.spark.type, StructType):
+        if not isinstance(self.spark.data_type, StructType):
             fields = []
         else:
-            fields = [f for f in self.spark.type.fieldNames() if " " not in f]
+            fields = [f for f in self.spark.data_type.fieldNames() if " " not in f]
         return super(Series, self).__dir__() + fields

     def __iter__(self):
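
The user-facing effect of the rename in this file: the old Series.spark_type keeps working but warns, and Series.spark.data_type is the replacement. A minimal sketch, not part of the commit:

    import warnings
    import databricks.koalas as ks

    s = ks.Series([1.0, 2.0, 3.0])
    print(s.spark.data_type)  # DoubleType

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        _ = s.spark_type  # deprecated alias, still returns the same DataType
        print(w[-1].category)  # <class 'FutureWarning'>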

databricks/koalas/spark.py

Lines changed: 7 additions & 2 deletions

@@ -39,7 +39,7 @@ def __init__(self, data: Union["IndexOpsMixin"]):
        self._data = data

    @property
-   def type(self):
+   def data_type(self):
        """ Returns the data type as defined by Spark, as a Spark DataType object."""
        return self._data._internal.spark_type_for(self._data._internal.column_labels[0])

@@ -110,7 +110,12 @@ def transform(self, func):
                "The output of the function [%s] should be of a "
                "pyspark.sql.Column; however, got [%s]." % (func, type(output))
            )
-       return self._data._with_new_scol(scol=func(self._data.spark.column)).rename(self._data.name)
+       new_ser = self._data._with_new_scol(scol=output).rename(self._data.name)
+       # Trigger the resolution so it throws an exception if anything goes wrong
+       # within the function, for example,
+       # `df1.a.spark.transform(lambda _: F.col("non-existent"))`.
+       new_ser._internal.to_internal_spark_frame
+       return new_ser


class SparkFrameMethods(object):
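
The added resolution step means a bad expression now fails at transform() time rather than at a later action. A hedged sketch, not part of the commit, assuming a default Spark session:

    import databricks.koalas as ks
    from pyspark.sql import functions as F

    kdf = ks.DataFrame({"a": [1, 2, 3]})
    kdf.a.spark.transform(lambda c: c.cast("string"))  # resolves eagerly, succeeds

    try:
        kdf.a.spark.transform(lambda _: F.col("non-existent"))
    except Exception as e:  # Spark raises an analysis error during the eager resolution
        print(type(e).__name__)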

databricks/koalas/strings.py

Lines changed: 3 additions & 3 deletions

@@ -35,8 +35,8 @@ class StringMethods(object):
    """String methods for Koalas Series"""

    def __init__(self, series: "ks.Series"):
-       if not isinstance(series.spark.type, (StringType, BinaryType, ArrayType)):
-           raise ValueError("Cannot call StringMethods on type {}".format(series.spark.type))
+       if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
+           raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
        self._data = series
        self.name = self._data.name

@@ -1271,7 +1271,7 @@ def len(self) -> "ks.Series":
        1    0
        Name: 0, dtype: int64
        """
-       if isinstance(self._data.spark.type, (ArrayType, MapType)):
+       if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
            return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
                self._data.name
            )
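
str.len() is one place where behaviour branches on the column's Spark type; a hedged sketch, not part of the commit:

    import databricks.koalas as ks

    s = ks.Series([["x", "y"], ["z"]])  # ArrayType column
    # For ArrayType (and MapType) columns, len() counts elements via F.size.
    print(s.str.len().to_pandas())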
