Skip to content

Commit 5dc34a6

Browse files
authored
Add asi8 for Index & MultiIndex (#1764)
The PR proposes `asi8` for Index & MultiIndex.

```python
>>> ks.Index([1, 2, 3]).asi8
array([1, 2, 3])
>>> ks.Index(['a', 'b', 'c']).asi8 is None
True
```
1 parent 9b69568 commit 5dc34a6

File tree

2 files changed

+81
-1
lines changed

2 files changed

+81
-1
lines changed

databricks/koalas/indexes.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,39 @@ def values(self):
439439
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
440440
return self.to_numpy()
441441

442+
@property
def asi8(self):
    """
    Integer representation of the values.

    .. warning:: We recommend using `Index.to_numpy()` instead.

    .. note:: This method should only be used if the resulting NumPy ndarray is expected
        to be small, as all the data is loaded into the driver's memory.

    Returns
    -------
    numpy.ndarray or None
        For an integral Spark data type, the result of ``to_numpy()``.
        For a timestamp Spark data type, the result of ``to_numpy()`` cast to int64.
        ``None`` for any other data type.

    Examples
    --------
    >>> ks.Index([1, 2, 3]).asi8
    array([1, 2, 3])

    Returns None for non-int64 dtype

    >>> ks.Index(['a', 'b', 'c']).asi8 is None
    True
    """
    warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
    data_type = self.spark.data_type
    if isinstance(data_type, IntegralType):
        return self.to_numpy()
    if isinstance(data_type, TimestampType):
        # Cast the whole array in one vectorized call instead of element by element;
        # this also keeps the int64 dtype when the index is empty.
        return self.to_numpy().astype(np.int64)
    return None
474+
442475
@property
443476
def spark_type(self):
444477
""" Returns the data type as defined by Spark, as a Spark DataType object."""
@@ -2865,8 +2898,16 @@ def inferred_type(self):
28652898
"""
28662899
Return a string of the type inferred from the values.
28672900
"""
2868-
# It's always 'mixed' for MultiIndex
2901+
# Always returns "mixed" for MultiIndex
28692902
return "mixed"
28702903

2904+
@property
def asi8(self):
    """
    Integer representation of the values.

    Returns
    -------
    None
        Always ``None`` for MultiIndex.
    """
    return None
2911+
28712912
def __iter__(self):
28722913
return MissingPandasLikeMultiIndex.__iter__(self)

databricks/koalas/tests/test_indexes.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,6 +1446,45 @@ def test_inferred_type(self):
14461446
kmidx = ks.from_pandas(pmidx)
14471447
self.assert_eq(pmidx.inferred_type, kmidx.inferred_type)
14481448

1449+
def test_asi8(self):
    """Check that `asi8` on Koalas indexes agrees with pandas for several index kinds."""
    # Integer, including explicit casts to narrower integer dtypes
    int_pidx = pd.Index([1, 2, 3])
    int_kidx = ks.from_pandas(int_pidx)
    self.assert_array_eq(int_pidx.asi8, int_kidx.asi8)
    for dtype in ("int", "int16", "int8"):
        self.assert_array_eq(int_pidx.astype(dtype).asi8, int_kidx.astype(dtype).asi8)

    # Remaining cases: (pandas object, assertion helper used to compare the results)
    cases = [
        # Integer with missing value
        (pd.Index([1, 2, None, 4, 5]), self.assert_eq),
        # Datetime
        (pd.date_range(end="1/1/2018", periods=3), self.assert_array_eq),
        # Floating
        (pd.Index([1.0, 2.0, 3.0]), self.assert_eq),
        # String
        (pd.Index(["a", "b", "c"]), self.assert_eq),
        # Boolean
        (pd.Index([True, False, True, False]), self.assert_eq),
        # MultiIndex
        (pd.MultiIndex.from_tuples([(1, 2)]), self.assert_eq),
    ]
    for pobj, check in cases:
        kobj = ks.from_pandas(pobj)
        check(pobj.asi8, kobj.asi8)
1487+
14491488
def test_index_is_unique(self):
14501489
indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
14511490
names = [None, "ks", "ks", None]

0 commit comments

Comments
 (0)