Implemented intersection for Index & MultiIndex (#1747)

itholic · web-flow · commit bb22748361d8 · 2020-11-03T14:33:24.000+09:00
This PR proposes two new APIs: `Index.intersection()` and `MultiIndex.intersection()`.

```python
>>> idx1 = ks.Index([1, 2, 3, 4])
>>> idx2 = ks.Index([3, 4, 5, 6])
>>> idx1.intersection(idx2)
Int64Index([3, 4], dtype='int64')
```
diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py
@@ -2077,6 +2077,59 @@ def holds_integer(self):
         """
         return isinstance(self.spark.data_type, IntegralType)
 
+    def intersection(self, other) -> "Index":
+        """
+        Return a new Index holding the elements found in both this Index and `other`.
+
+        The common elements are computed with Spark's `DataFrame.intersect`. The result
+        keeps this Index's name only when `other` is an Index or Series carrying the
+        same name; otherwise the result is unnamed.
+
+        Parameters
+        ----------
+        other : Index, Series or array-like
+
+        Returns
+        -------
+        intersection : Index
+            An empty MultiIndex when `other` is, or converts to, a MultiIndex.
+
+        Raises
+        ------
+        ValueError
+            If `other` is a DataFrame.
+        TypeError
+            If `other` is neither an Index, a Series nor list-like.
+
+        Examples
+        --------
+        >>> idx1 = ks.Index([1, 2, 3, 4])
+        >>> idx2 = ks.Index([3, 4, 5, 6])
+        >>> idx1.intersection(idx2).sort_values()
+        Int64Index([3, 4], dtype='int64')
+        """
+        if isinstance(other, DataFrame):
+            raise ValueError("Index data must be 1-dimensional")
+        named_input = isinstance(other, (Index, Series))
+        if not named_input:
+            if not is_list_like(other):
+                raise TypeError("Input must be Index or array-like")
+            other = Index(other)
+        if isinstance(other, MultiIndex):
+            # Intersecting with a MultiIndex always yields an empty MultiIndex.
+            return other.to_frame().head(0).index
+
+        sdf_other = other.to_frame().to_spark()
+        # A plain list-like has no name, so only an Index or Series can preserve it.
+        same_name = named_input and self.name == other.name
+        sdf_self = self.to_frame(name=SPARK_DEFAULT_INDEX_NAME).to_spark()
+        if same_name:
+            index_map = self._internal.index_map
+        else:
+            index_map = OrderedDict([(SPARK_DEFAULT_INDEX_NAME, None)])
+        internal = InternalFrame(spark_frame=sdf_self.intersect(sdf_other), index_map=index_map)
+        return DataFrame(internal).index
+
     def item(self):
         """
         Return the first element of the underlying data as a python scalar.
@@ -3118,6 +3171,59 @@ def item(self):
         """
         return self._kdf.head(2)._to_internal_pandas().index.item()
 
+    def intersection(self, other):
+        """
+        Return a new MultiIndex with the entries common to this MultiIndex and `other`.
+
+        Parameters
+        ----------
+        other : MultiIndex or list-like of tuples
+
+        Returns
+        -------
+        intersection : MultiIndex
+            Empty when `other` is a single-level Index or has no items. Level names are
+            kept unless `other` is a MultiIndex with different names.
+
+        Raises
+        ------
+        TypeError
+            If `other` is a Series, is not list-like, or holds non-tuple items.
+        ValueError
+            If `other` is a DataFrame.
+
+        Examples
+        --------
+        >>> midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        >>> midx2 = ks.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
+        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
+        MultiIndex([('c', 'z')],
+                   )
+        """
+        keep_name = True
+        if isinstance(other, Series) or not is_list_like(other):
+            raise TypeError("other must be a MultiIndex or a list of tuples")
+        elif isinstance(other, DataFrame):
+            raise ValueError("Index data must be 1-dimensional")
+        elif isinstance(other, MultiIndex):
+            keep_name = self.names == other.names
+        elif not isinstance(other, Index):
+            # Materialize once: a one-shot iterator would be drained by the tuple check.
+            other = list(other)
+            if not all(isinstance(item, tuple) for item in other):
+                raise TypeError("other must be a MultiIndex or a list of tuples")
+            if other:  # `from_tuples` cannot build a MultiIndex from zero tuples.
+                other = MultiIndex.from_tuples(other)
+        if not isinstance(other, MultiIndex):
+            # A single-level Index or an empty list-like: the result is always empty.
+            return self.to_frame().head(0).index
+
+        default_name = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
+        sdf = self.to_frame(name=default_name).to_spark().intersect(other.to_frame().to_spark())
+        unnamed_map = OrderedDict((name, None) for name in default_name)
+        index_map = self._internal.index_map if keep_name else unnamed_map
+        return DataFrame(InternalFrame(spark_frame=sdf, index_map=index_map)).index
+
     @property
     def inferred_type(self):
         """
diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py
@@ -49,7 +49,6 @@ class MissingPandasLikeIndex(object):
     get_slice_bound = _unsupported_function("get_slice_bound")
     get_value = _unsupported_function("get_value")
     groupby = _unsupported_function("groupby")
-    intersection = _unsupported_function("intersection")
     is_ = _unsupported_function("is_")
     is_lexsorted_for_tuple = _unsupported_function("is_lexsorted_for_tuple")
     join = _unsupported_function("join")
@@ -116,7 +115,6 @@ class MissingPandasLikeMultiIndex(object):
     get_slice_bound = _unsupported_function("get_slice_bound")
     get_value = _unsupported_function("get_value")
     groupby = _unsupported_function("groupby")
-    intersection = _unsupported_function("intersection")
     is_ = _unsupported_function("is_")
     is_lexsorted = _unsupported_function("is_lexsorted")
     is_lexsorted_for_tuple = _unsupported_function("is_lexsorted_for_tuple")
diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -1447,6 +1447,124 @@ def test_hasnans(self):
         kser = ks.from_pandas(pser)
         self.assert_eq(pser.hasnans, kser.hasnans)
 
+    def test_intersection(self):
+        pidx = pd.Index([1, 2, 3, 4], name="Koalas")
+        kidx = ks.from_pandas(pidx)
+
+        # Every single-level case runs on the index itself and on a shifted (`+ 1`) copy.
+        # The Koalas result is always sorted before it is compared with the pandas one.
+        index_pairs = [(pidx, kidx), (pidx + 1, kidx + 1)]
+
+        # Index & Index with the same name
+        pidx_same_name = pd.Index([3, 4, 5, 6], name="Koalas")
+        kidx_same_name = ks.from_pandas(pidx_same_name)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(
+                pobj.intersection(pidx_same_name),
+                kobj.intersection(kidx_same_name).sort_values(),
+            )
+
+        # Index & Index with a different name
+        pidx_renamed = pd.Index([3, 4, 5, 6], name="Databricks")
+        kidx_renamed = ks.from_pandas(pidx_renamed)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(
+                pobj.intersection(pidx_renamed),
+                kobj.intersection(kidx_renamed).sort_values(),
+            )
+
+        # Index & Index taken from a DataFrame that was indexed by its column "a"
+        pidx_from_frame = pd.DataFrame({"a": [3, 4, 5, 6]}).set_index("a").index
+        kidx_from_frame = ks.from_pandas(pidx_from_frame)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(
+                pobj.intersection(pidx_from_frame),
+                kobj.intersection(kidx_from_frame).sort_values(),
+            )
+
+        # Index & MultiIndex
+        pmidx = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
+        kmidx = ks.from_pandas(pmidx)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(
+                pobj.intersection(pmidx),
+                kobj.intersection(kmidx).sort_values(),
+                almost=True,
+            )
+
+        # Index & unnamed Series
+        pser = pd.Series([3, 4, 5, 6])
+        kser = ks.from_pandas(pser)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(pobj.intersection(pser), kobj.intersection(kser).sort_values())
+
+        # Index & Series whose name differs from the index's
+        pser_renamed = pd.Series([3, 4, 5, 6], name="Databricks")
+        kser_renamed = ks.from_pandas(pser_renamed)
+        for pobj, kobj in index_pairs:
+            self.assert_eq(
+                pobj.intersection(pser_renamed),
+                kobj.intersection(kser_renamed).sort_values(),
+            )
+
+        # Index & plain Python containers: list, tuple and dict
+        flat_containers = [
+            [3, 4, 5, 6],
+            (3, 4, 5, 6),
+            {3: None, 4: None, 5: None, 6: None},
+        ]
+        for other in flat_containers:
+            for pobj, kobj in index_pairs:
+                self.assert_eq(
+                    pobj.intersection(other),
+                    kobj.intersection(other).sort_values(),
+                )
+
+        # MultiIndex & single-level Index
+        single_level_others = [(pidx, kidx), (pidx_from_frame, kidx_from_frame)]
+        for pother, kother in single_level_others:
+            self.assert_eq(
+                pmidx.intersection(pother),
+                kmidx.intersection(kother).sort_values(),
+                almost=True,
+            )
+
+        # MultiIndex & MultiIndex
+        pmidx_other = pd.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
+        kmidx_other = ks.from_pandas(pmidx_other)
+        self.assert_eq(
+            pmidx.intersection(pmidx_other),
+            kmidx.intersection(kmidx_other).sort_values(),
+        )
+
+        # MultiIndex & plain Python containers of tuples: list, tuple and dict
+        tuple_containers = [
+            [("c", "z"), ("d", "w")],
+            (("c", "z"), ("d", "w")),
+            {("c", "z"): None, ("d", "w"): None},
+        ]
+        for other in tuple_containers:
+            self.assert_eq(
+                pmidx.intersection(other),
+                kmidx.intersection(other).sort_values(),
+            )
+
+        # Unsupported `other`: the expected messages are matched as regular expressions.
+        scalar_msg = "Input must be Index or array-like"
+        tuples_msg = "other must be a MultiIndex or a list of tuples"
+        two_dim_msg = "Index data must be 1-dimensional"
+
+        with self.assertRaisesRegex(TypeError, scalar_msg):
+            kidx.intersection(4)
+        with self.assertRaisesRegex(TypeError, tuples_msg):
+            kmidx.intersection(4)
+        with self.assertRaisesRegex(TypeError, tuples_msg):
+            kmidx.intersection(ks.Series([3, 4, 5, 6]))
+        with self.assertRaisesRegex(ValueError, two_dim_msg):
+            kidx.intersection(ks.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
+        with self.assertRaisesRegex(ValueError, two_dim_msg):
+            kmidx.intersection(ks.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
+
     def test_item(self):
         pidx = pd.Index([10])
         kidx = ks.from_pandas(pidx)
diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst
@@ -139,6 +139,7 @@ Combining / joining / set operations
    :toctree: api/
 
    Index.append
+   Index.intersection
    Index.union
    Index.difference
    Index.symmetric_difference
@@ -236,6 +237,7 @@ MultiIndex Combining / joining / set operations
    :toctree: api/
 
    MultiIndex.append
+   MultiIndex.intersection
    MultiIndex.union
    MultiIndex.difference
    MultiIndex.symmetric_difference