Commit e1908ec

Add spark namespace in DataFrame, Series, Index and MultiIndex
1 parent 8530f1b · commit e1908ec

22 files changed: +1176 −756 lines
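At a glance, the change moves Spark-specific attributes of Series/Index (and, per the commit title, DataFrame and MultiIndex) under a dedicated `spark` accessor, keeping the old flat attributes as deprecated shims. A minimal usage sketch, based only on names that appear in this diff and assuming a working Spark session with koalas at this commit; the accessor class itself, SparkIndexOpsMethods, lives in databricks/koalas/spark.py and is not part of this excerpt:

import databricks.koalas as ks

kser = ks.Series([1, 2, 3], name="x")

scol = kser.spark.column   # underlying pyspark Column (was kser.spark_column)
stype = kser.spark.type    # underlying Spark DataType (used as self.spark.type throughout the diff)

# The old attribute still works in this commit but now emits a FutureWarning
# pointing at the new accessor:
old = kser.spark_column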

databricks/koalas/base.py

Lines changed: 67 additions & 65 deletions
@@ -20,12 +20,13 @@
 from collections import OrderedDict
 from functools import wraps, partial
 from typing import Union, Callable, Any
+import warnings

 import numpy as np
 import pandas as pd
 from pandas.api.types import is_list_like
-from pyspark import sql as spark
-from pyspark.sql import functions as F, Window
+from pandas.core.accessor import CachedAccessor
+from pyspark.sql import functions as F, Window, Column
 from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
@@ -35,6 +36,7 @@
     NATURAL_ORDER_COLUMN_NAME,
     SPARK_DEFAULT_INDEX_NAME,
 )
+from databricks.koalas.spark import SparkIndexOpsMethods
 from databricks.koalas.typedef import spark_type_to_pandas_dtype
 from databricks.koalas.utils import align_diff_series, same_anchor, scol_for, validate_axis
 from databricks.koalas.frame import DataFrame
@@ -45,19 +47,19 @@ def booleanize_null(left_scol, scol, f):
     Booleanize Null in Spark Column
     """
     comp_ops = [
-        getattr(spark.Column, "__{}__".format(comp_op))
+        getattr(Column, "__{}__".format(comp_op))
         for comp_op in ["eq", "ne", "lt", "le", "ge", "gt"]
     ]

     if f in comp_ops:
         # if `f` is "!=", fill null with True otherwise False
-        filler = f == spark.Column.__ne__
+        filler = f == Column.__ne__
         scol = F.when(scol.isNull(), filler).otherwise(scol)

-    elif f == spark.Column.__or__:
+    elif f == Column.__or__:
         scol = F.when(left_scol.isNull() | scol.isNull(), False).otherwise(scol)

-    elif f == spark.Column.__and__:
+    elif f == Column.__and__:
         scol = F.when(scol.isNull(), False).otherwise(scol)

     return scol
@@ -83,9 +85,9 @@ def wrapper(self, *args):
         cols = [arg for arg in args if isinstance(arg, IndexOpsMixin)]
         if all(same_anchor(self, col) for col in cols):
             # Same DataFrame anchors
-            args = [arg.spark_column if isinstance(arg, IndexOpsMixin) else arg for arg in args]
-            scol = f(self.spark_column, *args)
-            scol = booleanize_null(self.spark_column, scol, f)
+            args = [arg.spark.column if isinstance(arg, IndexOpsMixin) else arg for arg in args]
+            scol = f(self.spark.column, *args)
+            scol = booleanize_null(self.spark.column, scol, f)

             return self._with_new_scol(scol)
         else:
@@ -107,7 +109,7 @@ def wrapper(self, *args):
         new_args = []
         for arg in args:
             # TODO: This is a quick hack to support NumPy type. We should revisit this.
-            if isinstance(self.spark_type, LongType) and isinstance(arg, np.timedelta64):
+            if isinstance(self.spark.type, LongType) and isinstance(arg, np.timedelta64):
                 new_args.append(float(arg / np.timedelta64(1, "s")))
             else:
                 new_args.append(arg)
@@ -121,13 +123,10 @@ class IndexOpsMixin(object):

     Assuming there are following attributes or properties and function.

-    :ivar _scol: Spark Column instance
-    :type _scol: pyspark.Column
     :ivar _kdf: Parent's Koalas DataFrame
     :type _kdf: ks.DataFrame
-
-    :ivar spark_type: Spark data type
-    :type spark_type: spark.types.DataType
+    :ivar spark: Spark-related features
+    :type spark: SparkIndexOpsMethods
     """

     def __init__(self, internal: InternalFrame, kdf):
@@ -136,47 +135,50 @@ def __init__(self, internal: InternalFrame, kdf):
         self._internal = internal  # type: InternalFrame
         self._kdf = kdf

+    spark = CachedAccessor("spark", SparkIndexOpsMethods)
+
     @property
     def spark_column(self):
-        """
-        Spark Column object representing the Series/Index.
+        warnings.warn(
+            "Series.spark_column is deprecated as of Series.spark.column. "
+            "Please use the API instead.",
+            FutureWarning,
+        )
+        return self.spark.column

-        .. note:: This Spark Column object is strictly stick to its base DataFrame the Series/Index
-            was derived from.
-        """
-        return self._internal.spark_column
+    spark_column.__doc__ = SparkIndexOpsMethods.column.__doc__

     # arithmetic operators
-    __neg__ = column_op(spark.Column.__neg__)
+    __neg__ = column_op(Column.__neg__)

     def __add__(self, other):
-        if isinstance(self.spark_type, StringType):
+        if isinstance(self.spark.type, StringType):
             # Concatenate string columns
-            if isinstance(other, IndexOpsMixin) and isinstance(other.spark_type, StringType):
+            if isinstance(other, IndexOpsMixin) and isinstance(other.spark.type, StringType):
                 return column_op(F.concat)(self, other)
             # Handle df['col'] + 'literal'
             elif isinstance(other, str):
                 return column_op(F.concat)(self, F.lit(other))
             else:
                 raise TypeError("string addition can only be applied to string series or literals.")
         else:
-            return column_op(spark.Column.__add__)(self, other)
+            return column_op(Column.__add__)(self, other)

     def __sub__(self, other):
         # Note that timestamp subtraction casts arguments to integer. This is to mimic Pandas's
         # behaviors. Pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
-        if isinstance(other, IndexOpsMixin) and isinstance(self.spark_type, TimestampType):
-            if not isinstance(other.spark_type, TimestampType):
+        if isinstance(other, IndexOpsMixin) and isinstance(self.spark.type, TimestampType):
+            if not isinstance(other.spark.type, TimestampType):
                 raise TypeError("datetime subtraction can only be applied to datetime series.")
             return self.astype("bigint") - other.astype("bigint")
-        elif isinstance(other, IndexOpsMixin) and isinstance(self.spark_type, DateType):
-            if not isinstance(other.spark_type, DateType):
+        elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.type, DateType):
+            if not isinstance(other.spark.type, DateType):
                 raise TypeError("date subtraction can only be applied to date series.")
             return column_op(F.datediff)(self, other)
         else:
-            return column_op(spark.Column.__sub__)(self, other)
+            return column_op(Column.__sub__)(self, other)

-    __mul__ = column_op(spark.Column.__mul__)
+    __mul__ = column_op(Column.__mul__)

     def __truediv__(self, other):
         """
@@ -213,13 +215,13 @@ def mod(left, right):

     def __radd__(self, other):
         # Handle 'literal' + df['col']
-        if isinstance(self.spark_type, StringType) and isinstance(other, str):
-            return self._with_new_scol(F.concat(F.lit(other), self.spark_column))
+        if isinstance(self.spark.type, StringType) and isinstance(other, str):
+            return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
         else:
-            return column_op(spark.Column.__radd__)(self, other)
+            return column_op(Column.__radd__)(self, other)

-    __rsub__ = column_op(spark.Column.__rsub__)
-    __rmul__ = column_op(spark.Column.__rmul__)
+    __rsub__ = column_op(Column.__rsub__)
+    __rmul__ = column_op(Column.__rmul__)

     def __rtruediv__(self, other):
         def rtruediv(left, right):
@@ -274,24 +276,24 @@ def rmod(left, right):

         return column_op(rmod)(self, other)

-    __pow__ = column_op(spark.Column.__pow__)
-    __rpow__ = column_op(spark.Column.__rpow__)
+    __pow__ = column_op(Column.__pow__)
+    __rpow__ = column_op(Column.__rpow__)

     # comparison operators
-    __eq__ = column_op(spark.Column.__eq__)
-    __ne__ = column_op(spark.Column.__ne__)
-    __lt__ = column_op(spark.Column.__lt__)
-    __le__ = column_op(spark.Column.__le__)
-    __ge__ = column_op(spark.Column.__ge__)
-    __gt__ = column_op(spark.Column.__gt__)
+    __eq__ = column_op(Column.__eq__)
+    __ne__ = column_op(Column.__ne__)
+    __lt__ = column_op(Column.__lt__)
+    __le__ = column_op(Column.__le__)
+    __ge__ = column_op(Column.__ge__)
+    __gt__ = column_op(Column.__gt__)

     # `and`, `or`, `not` cannot be overloaded in Python,
     # so use bitwise operators as boolean operators
-    __and__ = column_op(spark.Column.__and__)
-    __or__ = column_op(spark.Column.__or__)
-    __invert__ = column_op(spark.Column.__invert__)
-    __rand__ = column_op(spark.Column.__rand__)
-    __ror__ = column_op(spark.Column.__ror__)
+    __and__ = column_op(Column.__and__)
+    __or__ = column_op(Column.__or__)
+    __invert__ = column_op(Column.__invert__)
+    __rand__ = column_op(Column.__rand__)
+    __ror__ = column_op(Column.__ror__)

     # NDArray Compat
     def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
@@ -333,7 +335,7 @@ def dtype(self):
         >>> s.rename("a").to_frame().set_index("a").index.dtype
         dtype('<M8[ns]')
         """
-        return spark_type_to_pandas_dtype(self.spark_type)
+        return spark_type_to_pandas_dtype(self.spark.type)

     @property
     def empty(self):
@@ -371,8 +373,8 @@ def hasnans(self):
         >>> ks.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
         False
         """
-        sdf = self._internal._sdf.select(self.spark_column)
-        col = self.spark_column
+        sdf = self._internal._sdf.select(self.spark.column)
+        col = self.spark.column

         ret = sdf.select(F.max(col.isNull() | F.isnan(col))).collect()[0][0]
         return ret
@@ -552,7 +554,7 @@ def _is_monotonic(self, order):
                     "__partition_id"
                 ),  # Make sure we use the same partition id in the whole job.
                 F.col(NATURAL_ORDER_COLUMN_NAME),
-                self.spark_column.alias("__origin"),
+                self.spark.column.alias("__origin"),
             )
             .select(
                 F.col("__partition_id"),
@@ -670,7 +672,7 @@ def astype(self, dtype):
         spark_type = as_spark_type(dtype)
         if not spark_type:
             raise ValueError("Type {} not understood".format(dtype))
-        return self._with_new_scol(self.spark_column.cast(spark_type))
+        return self._with_new_scol(self.spark.column.cast(spark_type))

     def isin(self, values):
         """
@@ -722,7 +724,7 @@ def isin(self, values):
                 " to isin(), you passed a [{values_type}]".format(values_type=type(values).__name__)
             )

-        return self._with_new_scol(self.spark_column.isin(list(values))).rename(self.name)
+        return self._with_new_scol(self.spark.column.isin(list(values))).rename(self.name)

     def isnull(self):
         """
@@ -757,10 +759,10 @@ def isnull(self):
             raise NotImplementedError("isna is not defined for MultiIndex")
         if isinstance(self.spark_type, (FloatType, DoubleType)):
             return self._with_new_scol(
-                self.spark_column.isNull() | F.isnan(self.spark_column)
+                self.spark.column.isNull() | F.isnan(self.spark.column)
             ).rename(self.name)
         else:
-            return self._with_new_scol(self.spark_column.isNull()).rename(self.name)
+            return self._with_new_scol(self.spark.column.isNull()).rename(self.name)

     isna = isnull
@@ -856,7 +858,7 @@ def all(self, axis: Union[int, str] = 0) -> bool:
         if axis != 0:
             raise NotImplementedError('axis should be either 0 or "index" currently.')

-        sdf = self._internal._sdf.select(self.spark_column)
+        sdf = self._internal._sdf.select(self.spark.column)
         col = scol_for(sdf, sdf.columns[0])

         # Note that we're ignoring `None`s here for now.
@@ -919,7 +921,7 @@ def any(self, axis: Union[int, str] = 0) -> bool:
         if axis != 0:
             raise NotImplementedError('axis should be either 0 or "index" currently.')

-        sdf = self._internal._sdf.select(self.spark_column)
+        sdf = self._internal._sdf.select(self.spark.column)
         col = scol_for(sdf, sdf.columns[0])

         # Note that we're ignoring `None`s here for now.
@@ -986,7 +988,7 @@ def _shift(self, periods, fill_value, part_cols=()):
         if not isinstance(periods, int):
             raise ValueError("periods should be an int; however, got [%s]" % type(periods))

-        col = self.spark_column
+        col = self.spark.column
         window = (
             Window.partitionBy(*part_cols)
             .orderBy(NATURAL_ORDER_COLUMN_NAME)
@@ -1152,9 +1154,9 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, d
             raise NotImplementedError("value_counts currently does not support bins")

         if dropna:
-            sdf_dropna = self._internal._sdf.select(self.spark_column).dropna()
+            sdf_dropna = self._internal._sdf.select(self.spark.column).dropna()
         else:
-            sdf_dropna = self._internal._sdf.select(self.spark_column)
+            sdf_dropna = self._internal._sdf.select(self.spark.column)
         index_name = SPARK_DEFAULT_INDEX_NAME
         column_name = self._internal.data_spark_column_names[0]
         sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count()
@@ -1244,12 +1246,12 @@ def _nunique(self, dropna=True, approx=False, rsd=0.05):
         colname = self._internal.data_spark_column_names[0]
         count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
         if dropna:
-            return count_fn(self.spark_column).alias(colname)
+            return count_fn(self.spark.column).alias(colname)
         else:
             return (
-                count_fn(self.spark_column)
+                count_fn(self.spark.column)
                 + F.when(
-                    F.count(F.when(self.spark_column.isNull(), 1).otherwise(None)) >= 1, 1
+                    F.count(F.when(self.spark.column.isNull(), 1).otherwise(None)) >= 1, 1
                 ).otherwise(0)
             ).alias(colname)

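The core of the change is the one-line registration in IndexOpsMixin: spark = CachedAccessor("spark", SparkIndexOpsMethods). CachedAccessor is pandas' lazy-accessor descriptor: on first attribute access it instantiates the accessor class with the owning object and caches that instance on the object. A minimal, self-contained sketch of the mechanism follows; the names DemoSparkMethods and DemoSeries are made up for illustration and are not koalas APIs, and the real SparkIndexOpsMethods body is in databricks/koalas/spark.py, outside this excerpt.

from pandas.core.accessor import CachedAccessor


class DemoSparkMethods:
    """Stands in for SparkIndexOpsMethods: a thin namespace over its owner."""

    def __init__(self, data):
        # CachedAccessor passes the owning object here on first access.
        self._data = data

    @property
    def column(self):
        return self._data._column  # whatever the owner stores internally


class DemoSeries:
    # Non-data descriptor: the first `obj.spark` lookup builds
    # DemoSparkMethods(obj) and caches it on the instance, so the accessor
    # is constructed at most once per object.
    spark = CachedAccessor("spark", DemoSparkMethods)

    def __init__(self, column):
        self._column = column


s = DemoSeries(column="a pyspark Column would live here")
assert s.spark.column == "a pyspark Column would live here"
assert s.spark is s.spark  # cached: same accessor instance on repeated access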
databricks/koalas/datetimes.py

Lines changed: 2 additions & 2 deletions
@@ -34,8 +34,8 @@ class DatetimeMethods(object):
     """Date/Time methods for Koalas Series"""

     def __init__(self, series: "ks.Series"):
-        if not isinstance(series.spark_type, (DateType, TimestampType)):
-            raise ValueError("Cannot call DatetimeMethods on type {}".format(series.spark_type))
+        if not isinstance(series.spark.type, (DateType, TimestampType)):
+            raise ValueError("Cannot call DatetimeMethods on type {}".format(series.spark.type))
         self._data = series

     # Properties

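The same accessor is what datetimes.py now consults for its type check. A short sketch of the resulting behaviour, assuming the `dt` accessor constructs DatetimeMethods lazily on first access (as pandas-style accessors do) and a working Spark session; the error text is the one in the diff above.

import pandas as pd
import databricks.koalas as ks

s_ts = ks.from_pandas(pd.Series(pd.to_datetime(["2020-01-01", "2020-01-02"])))
print(s_ts.dt.year)   # fine: s_ts.spark.type is a TimestampType

s_int = ks.Series([1, 2, 3])
try:
    s_int.dt          # DatetimeMethods now checks series.spark.type
except ValueError as err:
    print(err)        # "Cannot call DatetimeMethods on type LongType"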