Skip to content

Commit 3d7427f

Browse files
authored
Implements asof() for Index (#1350)
This PR implements `Index.asof` (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.asof.html#pandas.Index.asof).

```python
>>> idx = ks.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
>>> idx.asof('2014-01-01')
'2013-12-31'
>>> idx.asof('2014-01-02')
'2014-01-02'
>>> idx.asof('1999-01-02')
nan
```
1 parent 663680b commit 3d7427f

File tree

5 files changed

+87
-5
lines changed

5 files changed

+87
-5
lines changed

databricks/koalas/indexes.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1632,6 +1632,58 @@ def repeat(self, repeats: int) -> "Index":
16321632
else:
16331633
return ks.concat([kdf] * repeats).index
16341634

1635+
def asof(self, label):
    """
    Look up `label` in a sorted index, falling back to the nearest earlier entry.

    When `label` is one of the index labels it is returned unchanged.
    Otherwise, for a monotonically increasing index, the largest label that
    is less than or equal to `label` is returned; for a monotonically
    decreasing index, the smallest label that is greater than or equal to
    `label` is returned.

    .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
        (and, when the index is not increasing, on
        :meth:`Index.is_monotonic_decreasing`) which can be expensive.

    Parameters
    ----------
    label : object
        The label up to which the method returns the latest index label.

    Returns
    -------
    object
        `label` itself if it is in the index, otherwise the neighbouring
        label selected as described above, or `NaN` when no label
        qualifies.

    Raises
    ------
    ValueError
        If the index is neither monotonically increasing nor
        monotonically decreasing.

    Examples
    --------
    `Index.asof` returns the latest index label up to the passed label.

    >>> idx = ks.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
    >>> idx.asof('2014-01-01')
    '2013-12-31'

    If the label is in the index, the method returns the passed label.

    >>> idx.asof('2014-01-02')
    '2014-01-02'

    If all of the labels in the index are later than the passed label,
    NaN is returned.

    >>> idx.asof('1999-01-02')
    nan
    """
    frame = self._internal._sdf
    if self.is_monotonic_increasing:
        # Ascending order: keep labels <= `label` and take the largest one.
        candidates = frame.select(self._scol).where(self._scol <= label)
        picked = candidates.select(F.max(self._scol))
    elif self.is_monotonic_decreasing:
        # Descending order: keep labels >= `label` and take the smallest one.
        candidates = frame.select(self._scol).where(self._scol >= label)
        picked = candidates.select(F.min(self._scol))
    else:
        raise ValueError("index must be monotonic increasing or decreasing")

    # The aggregate comes back as None when no label passed the filter;
    # that case is reported to the caller as NaN.
    found = picked.head()[0]
    if found is None:
        return np.nan
    return found
1686+
16351687
def union(self, other, sort=None):
16361688
"""
16371689
Form the union of two Index objects.
@@ -2366,6 +2418,11 @@ def argmax(self):
23662418
def argmin(self):
    """
    Reject the ``argmin`` reduction for this index type.

    Raises
    ------
    TypeError
        Always; the method body consists solely of this raise.
    """
    message = "reduction operation 'argmin' not allowed for this dtype"
    raise TypeError(message)
23682420

2421+
def asof(self, label):
    """
    Reject ``asof`` lookups on this index type.

    Parameters
    ----------
    label : object
        Ignored; the method raises unconditionally.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # NOTE(review): the message names `get_loc` rather than `asof` --
    # presumably kept to mirror the wording pandas uses for the same call;
    # confirm before rewording.
    message = "only the default get_loc method is currently supported for MultiIndex"
    raise NotImplementedError(message)
2425+
23692426
@property
23702427
def is_all_dates(self):
23712428
"""

databricks/koalas/missing/indexes.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ class _MissingPandasLikeIndex(object):
3939

4040
# Functions
4141
argsort = unsupported_function("argsort")
42-
asof = unsupported_function("asof")
4342
asof_locs = unsupported_function("asof_locs")
4443
delete = unsupported_function("delete")
4544
factorize = unsupported_function("factorize")
@@ -111,7 +110,6 @@ class _MissingPandasLikeMultiIndex(object):
111110

112111
# Functions
113112
argsort = unsupported_function("argsort")
114-
asof = unsupported_function("asof")
115113
asof_locs = unsupported_function("asof_locs")
116114
delete = unsupported_function("delete")
117115
equal_levels = unsupported_function("equal_levels")

databricks/koalas/series.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3551,9 +3551,8 @@ def truncate(self, before=None, after=None, copy=True):
35513551
Truncates a sorted Series before and/or after some particular index value.
35523552
Series should have sorted index.
35533553
3554-
.. note:: the current implementation of truncate uses is_monotonic_increasing internally
3555-
This leads to move all data into single partition in single machine and could cause
3556-
serious performance degradation. Avoid this method against very large dataset.
3554+
.. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`
3555+
which can be expensive.
35573556
35583557
Parameters
35593558
----------

databricks/koalas/tests/test_indexes.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,33 @@ def test_unique(self):
10121012
self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values())
10131013
self.assert_eq(kmidx.unique().sort_values(), pmidx.unique().sort_values())
10141014

1015+
def test_asof(self):
    """Compare ``asof`` on koalas indexes with pandas, plus the error paths."""
    # Increasing values
    pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"])
    kidx = ks.from_pandas(pidx)

    for label in ("2014-01-01", "2014-01-02"):
        self.assert_eq(kidx.asof(label), pidx.asof(label))
    # This label sorts before every entry, so both libraries must yield NaN.
    for idx in (kidx, pidx):
        self.assert_eq(np.isnan(idx.asof("1999-01-02")), True)

    # Decreasing values
    pidx = pd.Index(["2014-01-03", "2014-01-02", "2013-12-31"])
    kidx = ks.from_pandas(pidx)

    for label in ("2014-01-01", "2014-01-02", "1999-01-02"):
        self.assert_eq(kidx.asof(label), pidx.asof(label))
    # This label sorts after every entry, so both libraries must yield NaN.
    for idx in (kidx, pidx):
        self.assert_eq(np.isnan(idx.asof("2015-01-02")), True)

    # Not increasing, neither decreasing (ValueError)
    unsorted_kidx = ks.Index(["2013-12-31", "2015-01-02", "2014-01-03"])
    with self.assertRaises(ValueError):
        unsorted_kidx.asof("2013-12-31")

    # MultiIndex.asof is expected to raise NotImplementedError.
    kmidx = ks.MultiIndex.from_tuples([("a", "a"), ("a", "b"), ("a", "c")])
    with self.assertRaises(NotImplementedError):
        kmidx.asof(("a", "b"))
1041+
10151042
def test_union(self):
10161043
# Index
10171044
pidx1 = pd.Index([1, 2, 3, 4])

docs/source/reference/indexing.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ Selecting
130130
.. autosummary::
131131
:toctree: api/
132132

133+
Index.asof
133134
Index.isin
134135

135136
.. _api.multiindex:

0 commit comments

Comments
 (0)