Merge branch 'master' into Index.drop

RainFung · web-flow · commit 0979130597e0 · 2019-12-14T11:44:29.000+08:00
diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py
@@ -558,6 +558,9 @@ def isnull(self):
         >>> ser.rename("a").to_frame().set_index("a").index.isna()
         Index([False, False, True], dtype='object', name='a')
         """
+        from databricks.koalas.indexes import MultiIndex
+        if isinstance(self, MultiIndex):
+            raise NotImplementedError("isna is not defined for MultiIndex")
         if isinstance(self.spark_type, (FloatType, DoubleType)):
             return self._with_new_scol(self._scol.isNull() | F.isnan(self._scol)).rename(self.name)
         else:
@@ -599,6 +602,9 @@ def notnull(self):
         >>> ser.rename("a").to_frame().set_index("a").index.notna()
         Index([True, True, False], dtype='object', name='a')
         """
+        from databricks.koalas.indexes import MultiIndex
+        if isinstance(self, MultiIndex):
+            raise NotImplementedError("notna is not defined for MultiIndex")
         return (~self.isnull()).rename(self.name)
 
     notna = notnull
diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py
@@ -85,7 +85,31 @@ def _unfold(key, kseries):
     return rows_sel, cols_sel
 
 
-class AtIndexer(object):
+class _IndexerLike(object):
+    """Base for indexers: wraps a DataFrame or Series and exposes type/internal helpers."""
+    def __init__(self, kdf_or_kser):
+        from databricks.koalas.frame import DataFrame
+        from databricks.koalas.series import Series
+        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
+            'unexpected argument type: {}'.format(type(kdf_or_kser))
+        self._kdf_or_kser = kdf_or_kser
+
+    @property
+    def _is_df(self):
+        from databricks.koalas.frame import DataFrame
+        return isinstance(self._kdf_or_kser, DataFrame)
+
+    @property
+    def _is_series(self):
+        from databricks.koalas.series import Series
+        return isinstance(self._kdf_or_kser, Series)
+
+    @property
+    def _internal(self):
+        return self._kdf_or_kser._internal
+
+
+class AtIndexer(_IndexerLike):
     """
     Access a single value for a row/column label pair.
     If the index is not unique, all matching pairs are returned as an array.
@@ -122,26 +146,6 @@ class AtIndexer(object):
     >>> kdf.at[5, 'B']
     array([ 4, 20])
     """
-    def __init__(self, kdf_or_kser):
-        from databricks.koalas.frame import DataFrame
-        from databricks.koalas.series import Series
-        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(kdf_or_kser))
-        self._kdf_or_kser = kdf_or_kser
-
-    @property
-    def _is_df(self):
-        from databricks.koalas.frame import DataFrame
-        return isinstance(self._kdf_or_kser, DataFrame)
-
-    @property
-    def _is_series(self):
-        from databricks.koalas.series import Series
-        return isinstance(self._kdf_or_kser, Series)
-
-    @property
-    def _internal(self):
-        return self._kdf_or_kser._internal
 
     def __getitem__(self, key):
         if self._is_df:
@@ -181,7 +185,7 @@ def __getitem__(self, key):
                           or len(values) > 1) else values[0]
 
 
-class LocIndexer(object):
+class LocIndexer(_IndexerLike):
     """
     Access a group of rows and columns by label(s) or a boolean Series.
 
@@ -357,27 +361,6 @@ class LocIndexer(object):
     9          7       8
     """
 
-    def __init__(self, kdf_or_kser):
-        from databricks.koalas.frame import DataFrame
-        from databricks.koalas.series import Series
-        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(kdf_or_kser))
-        self._kdf_or_kser = kdf_or_kser
-
-    @property
-    def _is_df(self):
-        from databricks.koalas.frame import DataFrame
-        return isinstance(self._kdf_or_kser, DataFrame)
-
-    @property
-    def _is_series(self):
-        from databricks.koalas.series import Series
-        return isinstance(self._kdf_or_kser, Series)
-
-    @property
-    def _internal(self):
-        return self._kdf_or_kser._internal
-
     def __getitem__(self, key):
         from databricks.koalas.frame import DataFrame
         from databricks.koalas.series import Series
@@ -563,7 +546,7 @@ def __setitem__(self, key, value):
                     self._kdf_or_kser[col_sel] = value
 
 
-class ILocIndexer(object):
+class ILocIndexer(_IndexerLike):
     """
     Purely integer-location based indexing for selection by position.
 
@@ -677,27 +660,6 @@ class ILocIndexer(object):
     2  1000  3000
     """
 
-    def __init__(self, kdf_or_kser):
-        from databricks.koalas.frame import DataFrame
-        from databricks.koalas.series import Series
-        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(kdf_or_kser))
-        self._kdf_or_kser = kdf_or_kser
-
-    @property
-    def _is_df(self):
-        from databricks.koalas.frame import DataFrame
-        return isinstance(self._kdf_or_kser, DataFrame)
-
-    @property
-    def _is_series(self):
-        from databricks.koalas.series import Series
-        return isinstance(self._kdf_or_kser, Series)
-
-    @property
-    def _internal(self):
-        return self._kdf_or_kser._internal
-
     def __getitem__(self, key):
         from databricks.koalas.frame import DataFrame
         from databricks.koalas.indexes import Index
diff --git a/databricks/koalas/numpy_compat.py b/databricks/koalas/numpy_compat.py
@@ -17,7 +17,7 @@
 from typing import Callable, Any
 
 import numpy as np
-from pyspark.sql import functions as F
+from pyspark.sql import functions as F, Column
 from pyspark.sql.types import DoubleType, LongType, BooleanType
 
 
@@ -180,19 +180,23 @@ def maybe_dispatch_ufunc_to_spark_func(
     ser_or_index, ufunc: Callable, method: str, *inputs, **kwargs: Any
 ):
     from databricks.koalas import Series
+    from databricks.koalas.base import _column_op
 
     op_name = ufunc.__name__
 
     if (method == "__call__"
             and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings)
             and kwargs.get("out") is None):
-        inputs = [  # type: ignore
-            inp._scol if isinstance(inp, Series) else F.lit(inp) for inp in inputs]  # type: ignore
 
         np_spark_map_func = (
             unary_np_spark_mappings.get(op_name)
             or binary_np_spark_mappings.get(op_name))
 
-        return ser_or_index._with_new_scol(np_spark_map_func(*inputs))  # type: ignore
+        def convert_arguments(*args):
+            args = [  # type: ignore
+                F.lit(inp) if not isinstance(inp, Column) else inp for inp in args]  # type: ignore
+            return np_spark_map_func(*args)
+
+        return _column_op(convert_arguments)(*inputs)  # type: ignore
     else:
         return NotImplemented
diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -351,3 +351,26 @@ def test_multiindex_drop(self):
         self.assert_eq(pidx.drop(['a', 'b']), kidx.drop(['a', 'b']))
         self.assert_eq(pidx.drop(['x', 'y'], level='level2'),
                        kidx.drop(['x', 'y'], level='level2'))
+
+    def test_multiindex_isna(self):
+        kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
+        # isna/isnull and notna/notnull must each raise NotImplementedError for a MultiIndex.
+        with self.assertRaisesRegex(
+                NotImplementedError,
+                "isna is not defined for MultiIndex"):
+            kidx.isna()
+
+        with self.assertRaisesRegex(
+                NotImplementedError,
+                "isna is not defined for MultiIndex"):
+            kidx.isnull()
+
+        with self.assertRaisesRegex(
+                NotImplementedError,
+                "notna is not defined for MultiIndex"):
+            kidx.notna()
+
+        with self.assertRaisesRegex(
+                NotImplementedError,
+                "notna is not defined for MultiIndex"):
+            kidx.notnull()
diff --git a/databricks/koalas/tests/test_numpy_compat.py b/databricks/koalas/tests/test_numpy_compat.py
@@ -17,6 +17,7 @@
 import pandas as pd
 
 from databricks import koalas as ks
+from databricks.koalas import set_option, reset_option
 from databricks.koalas.numpy_compat import unary_np_spark_mappings, binary_np_spark_mappings
 from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
 
@@ -57,7 +58,10 @@ def test_np_spark_compat(self):
         # Use randomly generated dataFrame
         pdf = pd.DataFrame(
             np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=['a', 'b'])
+        pdf2 = pd.DataFrame(
+            np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=['a', 'b'])
         kdf = ks.from_pandas(pdf)
+        kdf2 = ks.from_pandas(pdf2)
 
         blacklist = [
             # Koalas does not currently support
@@ -103,3 +107,19 @@ def test_np_spark_compat(self):
                         np_func(pdf.a, 1), np_func(kdf.a, 1), almost=True)
                 except Exception as e:
                     raise AssertionError("Test in '%s' function was failed." % np_name) from e
+
+        # Test only the first 5 for now; 'compute.ops_on_diff_frames' makes this much slower.
+        try:
+            set_option('compute.ops_on_diff_frames', True)
+            for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
+                np_func = getattr(np, np_name)
+                if np_name not in blacklist:
+                    try:
+                        # binary ufunc
+                        self.assert_eq(
+                            np_func(pdf.a, pdf2.b).sort_index(),
+                            np_func(kdf.a, kdf2.b).sort_index(), almost=True)
+                    except Exception as e:
+                        raise AssertionError("Test in '%s' function was failed." % np_name) from e
+        finally:
+            reset_option('compute.ops_on_diff_frames')