Commit 6f450ba

Author: Shallow Copy Bot

Add basic extension dtypes support.
Original PR #2039 by ueshin: databricks/koalas#2039
1 parent 4723ddc

File tree: 15 files changed (+1210, -211 lines)

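For context: pandas "extension dtypes" are the nullable, pandas-native dtypes (Int64, boolean, string, and friends) that keep missing values as pd.NA instead of coercing a column to float64 or object. A minimal pandas-only illustration of the behavior this commit begins to carry over to Koalas:

    import pandas as pd

    # Nullable extension dtype: the missing value stays as pd.NA and
    # the integer dtype is kept.
    s = pd.Series([1, 2, None], dtype="Int64")
    print(s.dtype)    # Int64
    print(s.iloc[2])  # <NA>

    # The NumPy-backed default silently upcasts to float64 with NaN.
    t = pd.Series([1, 2, None])
    print(t.dtype)    # float64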

databricks/koalas/base.py

Lines changed: 121 additions & 42 deletions
@@ -25,7 +25,7 @@

 import numpy as np
 import pandas as pd  # noqa: F401
-from pandas.api.types import is_list_like
+from pandas.api.types import is_list_like, pandas_dtype
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Window, Column
 from pyspark.sql.types import (
@@ -51,7 +51,12 @@
 )
 from databricks.koalas.spark import functions as SF
 from databricks.koalas.spark.accessors import SparkIndexOpsMethods
-from databricks.koalas.typedef import as_spark_type, spark_type_to_pandas_dtype
+from databricks.koalas.typedef import (
+    Dtype,
+    as_spark_type,
+    extension_dtypes,
+    spark_type_to_pandas_dtype,
+)
 from databricks.koalas.utils import (
     combine_frames,
     same_anchor,
@@ -191,7 +196,7 @@ def align_diff_index_ops(func, this_index_ops: "IndexOpsMixin", *args) -> "Index
     ).rename(that_series.name)


-def booleanize_null(left_scol, scol, f) -> Column:
+def booleanize_null(scol, f) -> Column:
     """
     Booleanize Null in Spark Column
     """
@@ -205,12 +210,6 @@ def booleanize_null(left_scol, scol, f) -> Column:
         filler = f == Column.__ne__
         scol = F.when(scol.isNull(), filler).otherwise(scol)

-    elif f == Column.__or__:
-        scol = F.when(left_scol.isNull() | scol.isNull(), False).otherwise(scol)
-
-    elif f == Column.__and__:
-        scol = F.when(scol.isNull(), False).otherwise(scol)
-
     return scol
@@ -239,13 +238,23 @@ def wrapper(self, *args):
             # Same DataFrame anchors
             args = [arg.spark.column if isinstance(arg, IndexOpsMixin) else arg for arg in args]
             scol = f(self.spark.column, *args)
-            scol = booleanize_null(self.spark.column, scol, f)
+
+            spark_type = self._internal.spark_frame.select(scol).schema[0].dataType
+            use_extension_dtypes = any(
+                isinstance(col.dtype, extension_dtypes) for col in [self] + cols
+            )
+            dtype = spark_type_to_pandas_dtype(
+                spark_type, use_extension_dtypes=use_extension_dtypes
+            )
+
+            if not isinstance(dtype, extension_dtypes):
+                scol = booleanize_null(scol, f)

             if isinstance(self, Series) or not any(isinstance(col, Series) for col in cols):
-                index_ops = self._with_new_scol(scol)
+                index_ops = self._with_new_scol(scol, dtype=dtype)
             else:
                 kser = next(col for col in cols if isinstance(col, Series))
-                index_ops = kser._with_new_scol(scol)
+                index_ops = kser._with_new_scol(scol, dtype=dtype)
         elif get_option("compute.ops_on_diff_frames"):
             index_ops = align_diff_index_ops(f, self, *args)
         else:
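The operator wrapper now derives the result dtype before building the column, and skips booleanize_null whenever that dtype is a nullable extension dtype, so null comparison results survive as missing values instead of being coerced to False. This mirrors how the two dtype families already behave in plain pandas; a quick illustration (no Spark required):

    import pandas as pd

    # NumPy-backed column: comparing a missing value yields False.
    print((pd.Series([1.0, None]) > 0).tolist())               # [True, False]

    # Nullable extension dtype: the missing value propagates as <NA>.
    print((pd.Series([1, None], dtype="Int64") > 0).tolist())  # [True, <NA>]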
@@ -293,7 +302,7 @@ def _kdf(self) -> DataFrame:
         pass

     @abstractmethod
-    def _with_new_scol(self, scol: spark.Column):
+    def _with_new_scol(self, scol: spark.Column, *, dtype=None):
         pass

     @property
@@ -603,11 +612,63 @@ def rpow_func(left, right):

     # `and`, `or`, `not` cannot be overloaded in Python,
     # so use bitwise operators as boolean operators
-    __and__ = column_op(Column.__and__)
-    __or__ = column_op(Column.__or__)
+    def __and__(self, other) -> Union["Series", "Index"]:
+        if isinstance(self.dtype, extension_dtypes) or (
+            isinstance(other, IndexOpsMixin) and isinstance(other.dtype, extension_dtypes)
+        ):
+
+            def and_func(left, right):
+                if not isinstance(right, spark.Column):
+                    if pd.isna(right):
+                        right = F.lit(None)
+                    else:
+                        right = F.lit(right)
+                return left & right
+
+        else:
+
+            def and_func(left, right):
+                if not isinstance(right, spark.Column):
+                    if pd.isna(right):
+                        right = F.lit(None)
+                    else:
+                        right = F.lit(right)
+                scol = left & right
+                return F.when(scol.isNull(), False).otherwise(scol)
+
+        return column_op(and_func)(self, other)
+
+    def __or__(self, other) -> Union["Series", "Index"]:
+        if isinstance(self.dtype, extension_dtypes) or (
+            isinstance(other, IndexOpsMixin) and isinstance(other.dtype, extension_dtypes)
+        ):
+
+            def or_func(left, right):
+                if not isinstance(right, spark.Column):
+                    if pd.isna(right):
+                        right = F.lit(None)
+                    else:
+                        right = F.lit(right)
+                return left | right
+
+        else:
+
+            def or_func(left, right):
+                if not isinstance(right, spark.Column) and pd.isna(right):
+                    return F.lit(False)
+                else:
+                    scol = left | F.lit(right)
+                    return F.when(left.isNull() | scol.isNull(), False).otherwise(scol)
+
+        return column_op(or_func)(self, other)
+
     __invert__ = column_op(Column.__invert__)
-    __rand__ = column_op(Column.__rand__)
-    __ror__ = column_op(Column.__ror__)
+
+    def __rand__(self, other) -> Union["Series", "Index"]:
+        return self.__and__(other)
+
+    def __ror__(self, other) -> Union["Series", "Index"]:
+        return self.__or__(other)

     def __len__(self):
         return len(self._kdf)
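__and__ and __or__ are reimplemented so that, when either operand carries a nullable extension dtype, nulls are left to propagate, which lines up with pandas' three-valued (Kleene) logic for the boolean dtype; the legacy branch keeps the old null-to-False coercion. The pandas semantics being matched, for reference:

    import pandas as pd

    b = pd.Series([True, False, None], dtype="boolean")

    # Kleene logic: True | NA == True, False | NA == NA,
    #               True & NA == NA,  False & NA == False.
    print((b | pd.NA).tolist())  # [True, <NA>, <NA>]
    print((b & pd.NA).tolist())  # [<NA>, False, <NA>]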
@@ -632,7 +693,7 @@ def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs:
         raise NotImplementedError("Koalas objects currently do not support %s." % ufunc)

     @property
-    def dtype(self) -> np.dtype:
+    def dtype(self) -> Dtype:
         """Return the dtype object of the underlying data.

         Examples
@@ -652,7 +713,7 @@ def dtype(self) -> np.dtype:
         >>> s.rename("a").to_frame().set_index("a").index.dtype
         dtype('<M8[ns]')
         """
-        return spark_type_to_pandas_dtype(self.spark.data_type)
+        return self._internal.data_dtypes[0]

     @property
     def empty(self) -> bool:
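dtype is now read from the dtype metadata tracked on the internal frame rather than re-derived from the Spark schema. That matters because the mapping from Spark types back to pandas dtypes stops being one-to-one once extension dtypes exist; for example, a Spark LongType column may back either of these two pandas dtypes:

    import numpy as np
    import pandas as pd

    # Two distinct pandas dtypes that both map to Spark's LongType,
    # so the Spark schema alone cannot tell them apart.
    print(np.dtype("int64"))  # int64 (NumPy-backed)
    print(pd.Int64Dtype())    # Int64 (nullable extension dtype)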
@@ -955,7 +1016,7 @@ def ndim(self) -> int:
         """
         return 1

-    def astype(self, dtype) -> Union["Index", "Series"]:
+    def astype(self, dtype: Union[str, type, Dtype]) -> Union["Index", "Series"]:
         """
         Cast a Koalas object to a specified dtype ``dtype``.

@@ -989,37 +1050,55 @@ def astype(self, dtype) -> Union["Index", "Series"]:
         >>> ser.rename("a").to_frame().set_index("a").index.astype('int64')
         Int64Index([1, 2], dtype='int64', name='a')
         """
+        dtype = pandas_dtype(dtype)
         spark_type = as_spark_type(dtype)
         if not spark_type:
             raise ValueError("Type {} not understood".format(dtype))
         if isinstance(spark_type, BooleanType):
-            if isinstance(self.spark.data_type, StringType):
-                scol = F.when(self.spark.column.isNull(), F.lit(False)).otherwise(
-                    F.length(self.spark.column) > 0
-                )
-            elif isinstance(self.spark.data_type, (FloatType, DoubleType)):
-                scol = F.when(
-                    self.spark.column.isNull() | F.isnan(self.spark.column), F.lit(True)
-                ).otherwise(self.spark.column.cast(spark_type))
+            if isinstance(dtype, extension_dtypes):
+                scol = self.spark.column.cast(spark_type)
             else:
-                scol = F.when(self.spark.column.isNull(), F.lit(False)).otherwise(
-                    self.spark.column.cast(spark_type)
-                )
+                if isinstance(self.spark.data_type, StringType):
+                    scol = F.when(self.spark.column.isNull(), F.lit(False)).otherwise(
+                        F.length(self.spark.column) > 0
+                    )
+                elif isinstance(self.spark.data_type, (FloatType, DoubleType)):
+                    scol = F.when(
+                        self.spark.column.isNull() | F.isnan(self.spark.column), F.lit(True)
+                    ).otherwise(self.spark.column.cast(spark_type))
+                else:
+                    scol = F.when(self.spark.column.isNull(), F.lit(False)).otherwise(
+                        self.spark.column.cast(spark_type)
+                    )
         elif isinstance(spark_type, StringType):
-            if isinstance(self.spark.data_type, NumericType):
-                null_str = str(np.nan)
-            elif isinstance(self.spark.data_type, (DateType, TimestampType)):
-                null_str = str(pd.NaT)
-            else:
-                null_str = str(None)
-            if isinstance(self.spark.data_type, BooleanType):
-                casted = F.when(self.spark.column, "True").otherwise("False")
+            if isinstance(dtype, extension_dtypes):
+                if isinstance(self.spark.data_type, BooleanType):
+                    scol = F.when(
+                        self.spark.column.isNotNull(),
+                        F.when(self.spark.column, "True").otherwise("False"),
+                    )
+                elif isinstance(self.spark.data_type, TimestampType):
+                    # seems like a pandas' bug?
+                    scol = F.when(self.spark.column.isNull(), str(pd.NaT)).otherwise(
+                        self.spark.column.cast(spark_type)
+                    )
+                else:
+                    scol = self.spark.column.cast(spark_type)
             else:
-                casted = self.spark.column.cast(spark_type)
-            scol = F.when(self.spark.column.isNull(), null_str).otherwise(casted)
+                if isinstance(self.spark.data_type, NumericType):
+                    null_str = str(np.nan)
+                elif isinstance(self.spark.data_type, (DateType, TimestampType)):
+                    null_str = str(pd.NaT)
+                else:
+                    null_str = str(None)
+                if isinstance(self.spark.data_type, BooleanType):
+                    casted = F.when(self.spark.column, "True").otherwise("False")
+                else:
+                    casted = self.spark.column.cast(spark_type)
+                scol = F.when(self.spark.column.isNull(), null_str).otherwise(casted)
         else:
             scol = self.spark.column.cast(spark_type)
-        return self._with_new_scol(scol)
+        return self._with_new_scol(scol, dtype=dtype)

     def isin(self, values) -> Union["Series", "Index"]:
         """

databricks/koalas/frame.py

Lines changed: 9 additions & 12 deletions
@@ -795,7 +795,7 @@ def apply_op(kdf, this_column_labels, that_column_labels):
                     .alias(name_like_string(label))
                 )
                 column_labels.append(label)
-            internal = self._internal.with_new_columns(applied, column_labels)
+            internal = self._internal.with_new_columns(applied, column_labels=column_labels)
             return DataFrame(internal)
         else:
             return self._apply_series_op(lambda kser: getattr(kser, op)(other))
@@ -5504,7 +5504,7 @@ def head(self, n: int = 5) -> "DataFrame":
             sdf = self._internal.resolved_copy.spark_frame
             if get_option("compute.ordered_head"):
                 sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
-            return DataFrame(self._internal.with_new_sdf(sdf.limit(n)))
+            return DataFrame(self._internal.with_new_sdf(sdf.limit(n), preserve_dtypes=True))

     def pivot_table(
         self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
@@ -5958,17 +5958,12 @@ def columns(self, columns) -> None:
         else:
             column_label_names = None

-        data_columns = [name_like_string(label) for label in column_labels]
-        data_spark_columns = [
-            self._internal.spark_column_for(label).alias(name)
-            for label, name in zip(self._internal.column_labels, data_columns)
+        ksers = [
+            self._kser_for(label).rename(name)
+            for label, name in zip(self._internal.column_labels, column_labels)
         ]
         self._update_internal_frame(
-            self._internal.with_new_columns(
-                data_spark_columns,
-                column_labels=column_labels,
-                column_label_names=column_label_names,
-            )
+            self._internal.with_new_columns(ksers, column_label_names=column_label_names)
         )

@@ -6434,7 +6429,9 @@ def _sort(
         }
         by = [mapper[(asc, na_position)](scol) for scol, asc in zip(by, ascending)]
         sdf = self._internal.resolved_copy.spark_frame.sort(*(by + [NATURAL_ORDER_COLUMN_NAME]))
-        kdf = DataFrame(self._internal.with_new_sdf(sdf))  # type: ks.DataFrame
+        kdf = DataFrame(
+            self._internal.with_new_sdf(sdf, preserve_dtypes=True)
+        )  # type: ks.DataFrame
         if inplace:
             self._update_internal_frame(kdf._internal)
             return None
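head and _sort merely re-wrap the same columns over a new Spark plan, so they pass preserve_dtypes=True, a flag presumably added to InternalFrame.with_new_sdf elsewhere in this commit (internal.py is not shown above), to keep the tracked pandas dtypes instead of re-deriving them from the Spark schema. A hedged sketch of the intended effect, assuming a running Spark session:

    import databricks.koalas as ks

    # Illustrative only: Int64 is a nullable extension dtype.
    kdf = ks.DataFrame({"a": [1, 2, 3]}).astype("Int64")

    # Because the dtypes are preserved across the plan rebuild, these
    # should still report Int64 rather than falling back to int64.
    print(kdf.head(2).dtypes)
    print(kdf.sort_values("a").dtypes)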

databricks/koalas/indexes/base.py

Lines changed: 3 additions & 1 deletion
@@ -159,7 +159,7 @@ def _internal(self) -> InternalFrame:
     def _column_label(self):
         return self._kdf._internal.index_names[0]

-    def _with_new_scol(self, scol: spark.Column) -> "Index":
+    def _with_new_scol(self, scol: spark.Column, *, dtype=None) -> "Index":
         """
         Copy Koalas Index with the new Spark Column.

@@ -168,8 +168,10 @@ def _with_new_scol(self, scol: spark.Column) -> "Index":
         """
         internal = self._internal.copy(
             index_spark_columns=[scol.alias(SPARK_DEFAULT_INDEX_NAME)],
+            index_dtypes=[dtype],
             column_labels=[],
             data_spark_columns=[],
+            data_dtypes=[],
         )
         return DataFrame(internal).index
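Index._with_new_scol builds a fresh InternalFrame, so the new dtype has to be recorded in index_dtypes (and data_dtypes emptied alongside the cleared data columns) or the metadata would go stale. A sketch of the expected round-trip, again assuming a running Spark session:

    import databricks.koalas as ks

    idx = ks.Index([1, 2, 3])
    # astype routes through _with_new_scol(scol, dtype=...), so the
    # nullable dtype should be recorded on the resulting Index.
    print(idx.astype("Int64").dtype)  # Int64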

databricks/koalas/indexes/multi.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def _column_label(self):
     def __abs__(self):
         raise TypeError("TypeError: cannot perform __abs__ with this index type: MultiIndex")

-    def _with_new_scol(self, scol: spark.Column):
+    def _with_new_scol(self, scol: spark.Column, *, dtype=None):
         raise NotImplementedError("Not supported for type MultiIndex")

     def _align_and_column_op(self, f, *args) -> Index:
