Add asi8 for Index & MultiIndex (#1764)

itholic · web-flow · commit 5dc34a64c2ff · 2020-09-17T13:28:41.000-07:00
This PR proposes `asi8` for Index & MultiIndex.

```python
>>> ks.Index([1, 2, 3]).asi8
array([1, 2, 3])

>>> ks.Index(['a', 'b', 'c']).asi8 is None
True
```
diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py
@@ -439,6 +439,39 @@ def values(self):
         warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
         return self.to_numpy()
 
+    @property
+    def asi8(self):
+        """
+        Integer representation of the values.
+
+        .. warning:: We recommend using `Index.to_numpy()` instead.
+
+        .. note:: This method should only be used if the resulting NumPy ndarray is expected
+            to be small, as all the data is loaded into the driver's memory.
+
+        Returns
+        -------
+        numpy.ndarray or None
+            Integer ndarray for integral or timestamp data (timestamps as int64), else None.
+
+        Examples
+        --------
+        >>> ks.Index([1, 2, 3]).asi8
+        array([1, 2, 3])
+
+        Returns None if the values are neither integers nor timestamps
+
+        >>> ks.Index(['a', 'b', 'c']).asi8 is None
+        True
+        """
+        warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
+        if isinstance(self.spark.data_type, IntegralType):
+            return self.to_numpy()
+        elif isinstance(self.spark.data_type, TimestampType):
+            return self.to_numpy().astype(np.int64)  # vectorized; stays int64 even when empty
+        else:
+            return None
+
     @property
     def spark_type(self):
         """ Returns the data type as defined by Spark, as a Spark DataType object."""
@@ -2865,8 +2898,16 @@ def inferred_type(self):
         """
         Return a string of the type inferred from the values.
         """
-        # It's always 'mixed' for MultiIndex
+        # Always returns "mixed" for MultiIndex
         return "mixed"
 
+    @property
+    def asi8(self):
+        """
+        Integer representation of the values; always None for a MultiIndex.
+        """
+        # No type check or data collection is performed here: the result is unconditionally None.
+        return None
+
     def __iter__(self):
         return MissingPandasLikeMultiIndex.__iter__(self)
diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -1446,6 +1446,45 @@ def test_inferred_type(self):
         kmidx = ks.from_pandas(pmidx)
         self.assert_eq(pmidx.inferred_type, kmidx.inferred_type)
 
+    def test_asi8(self):
+        # Integer: koalas must produce the same array as pandas, also after astype() casts
+        pidx = pd.Index([1, 2, 3])
+        kidx = ks.from_pandas(pidx)
+        self.assert_array_eq(pidx.asi8, kidx.asi8)
+        self.assert_array_eq(pidx.astype("int").asi8, kidx.astype("int").asi8)
+        self.assert_array_eq(pidx.astype("int16").asi8, kidx.astype("int16").asi8)
+        self.assert_array_eq(pidx.astype("int8").asi8, kidx.astype("int8").asi8)
+
+        # Integer with missing value: checked with assert_eq (presumably None on both sides)
+        pidx = pd.Index([1, 2, None, 4, 5])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.asi8, kidx.asi8)
+
+        # Datetime: compared as arrays, like the plain integer case
+        pidx = pd.date_range(end="1/1/2018", periods=3)
+        kidx = ks.from_pandas(pidx)
+        self.assert_array_eq(pidx.asi8, kidx.asi8)
+
+        # Floating: checked with assert_eq (presumably None on both sides)
+        pidx = pd.Index([1.0, 2.0, 3.0])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.asi8, kidx.asi8)
+
+        # String: checked with assert_eq (presumably None on both sides)
+        pidx = pd.Index(["a", "b", "c"])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.asi8, kidx.asi8)
+
+        # Boolean: checked with assert_eq (presumably None on both sides)
+        pidx = pd.Index([True, False, True, False])
+        kidx = ks.from_pandas(pidx)
+        self.assert_eq(pidx.asi8, kidx.asi8)
+
+        # MultiIndex: koalas result must equal whatever pandas reports for a MultiIndex
+        pmidx = pd.MultiIndex.from_tuples([(1, 2)])
+        kmidx = ks.from_pandas(pmidx)
+        self.assert_eq(pmidx.asi8, kmidx.asi8)
+
     def test_index_is_unique(self):
         indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
         names = [None, "ks", "ks", None]