Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ class _MissingPandasLikeSeries(object):
to_period = unsupported_function('to_period')
to_sql = unsupported_function('to_sql')
to_timestamp = unsupported_function('to_timestamp')
truncate = unsupported_function('truncate')
tshift = unsupported_function('tshift')
tz_convert = unsupported_function('tz_convert')
tz_localize = unsupported_function('tz_localize')
Expand Down
94 changes: 94 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3224,6 +3224,100 @@ def copy(self) -> 'Series':
"""
return _col(DataFrame(self._internal.copy()))

def truncate(self, before=None, after=None, copy=True):
    """
    Truncate a Series before and/or after some particular index value.

    The Series must have a sorted index, either monotonically increasing or
    monotonically decreasing.

    .. note:: The current implementation of truncate uses is_monotonic_increasing
        internally. This moves all data into a single partition on a single machine
        and could cause serious performance degradation. Avoid this method against
        very large datasets.

    Parameters
    ----------
    before : string, int
        Truncate all rows before this index value.
    after : string, int
        Truncate all rows after this index value.
    copy : boolean, default True
        Return a copy of the truncated section.

    Returns
    -------
    truncated : Series

    Raises
    ------
    ValueError
        If the index is not sorted, or if both `before` and `after` are given
        and `before` is greater than `after`.

    Examples
    --------
    A Series whose index consists of sorted integers.

    >>> s = ks.Series([10, 20, 30, 40, 50, 60, 70],
    ...               index=[1, 2, 3, 4, 5, 6, 7])
    >>> s
    1    10
    2    20
    3    30
    4    40
    5    50
    6    60
    7    70
    Name: 0, dtype: int64

    >>> s.truncate(2, 5)
    2    20
    3    30
    4    40
    5    50
    Name: 0, dtype: int64

    A Series whose index consists of sorted strings.

    >>> s = ks.Series([10, 20, 30, 40, 50, 60, 70],
    ...               index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
    >>> s
    a    10
    b    20
    c    30
    d    40
    e    50
    f    60
    g    70
    Name: 0, dtype: int64

    >>> s.truncate('b', 'e')
    b    20
    c    30
    d    40
    e    50
    Name: 0, dtype: int64
    """
    indexes = self.index
    indexes_increasing = indexes.is_monotonic_increasing
    # Only evaluate the decreasing check when the increasing one has failed.
    if not indexes_increasing and not indexes.is_monotonic_decreasing:
        raise ValueError("truncate requires a sorted index")
    if (before is None) and (after is None):
        return self.copy() if copy else self

    # Validate only the bounds the caller actually supplied; a bound that is
    # filled in below must never trigger this error.
    if (before is not None) and (after is not None) and before > after:
        raise ValueError("Truncate: %s must be after %s" % (after, before))

    if before is None or after is None:
        # Exactly one bound is missing here. Regardless of the sort direction,
        # `before` is the lower label bound and `after` the upper one, so a
        # missing `before` becomes the smallest index value and a missing
        # `after` the largest. An explicit ordering is used instead of relying
        # on the physical row order of the Spark DataFrame.
        sdf = indexes._internal.sdf
        idx_col_name = self._internal.index_columns[0]
        idx_scol = self._internal.scol_for(idx_col_name)
        if before is None:
            before = sdf.orderBy(idx_scol.asc()).first()[idx_col_name]
        else:
            after = sdf.orderBy(idx_scol.desc()).first()[idx_col_name]

    if indexes_increasing:
        result = _col(self.to_frame()[before:after])
    else:
        # A decreasing index runs from the larger label down to the smaller one.
        result = _col(self.to_frame()[after:before])

    return result.copy() if copy else result

def keys(self):
"""
Return alias for index.
Expand Down
24 changes: 24 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,30 @@ def test_duplicates(self):
self.assert_eq(pser.drop_duplicates().sort_values(),
kser.drop_duplicates().sort_values())

def test_truncate(self):
    """Compare Series.truncate with pandas and check its error messages."""
    increasing_pser = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
    increasing_kser = ks.Series(increasing_pser)
    decreasing_pser = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[7, 6, 5, 4, 3, 2, 1])
    decreasing_kser = ks.Series(decreasing_pser)

    # Each entry is (positional args, keyword args) passed to truncate on both sides.
    increasing_cases = [
        ((), {}),
        ((), {'before': 2}),
        ((), {'after': 5}),
        ((), {'copy': False}),
        ((2, 5), {'copy': False}),
    ]
    for args, kwargs in increasing_cases:
        self.assert_eq(increasing_kser.truncate(*args, **kwargs),
                       increasing_pser.truncate(*args, **kwargs))

    decreasing_cases = [
        ((4, 6), {}),
        ((4, 6), {'copy': False}),
    ]
    for args, kwargs in decreasing_cases:
        self.assert_eq(decreasing_kser.truncate(*args, **kwargs),
                       decreasing_pser.truncate(*args, **kwargs))

    # An index that is neither increasing nor decreasing must be rejected.
    unsorted_kser = ks.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1])
    with self.assertRaisesRegex(ValueError, "truncate requires a sorted index"):
        unsorted_kser.truncate()

    # Passing before > after must be rejected.
    sorted_kser = ks.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
    with self.assertRaisesRegex(ValueError, "Truncate: 2 must be after 5"):
        sorted_kser.truncate(5, 2)

def test_getitem(self):
pser = pd.Series([10, 20, 15, 30, 45], ['A', 'A', 'B', 'C', 'D'])
kser = ks.Series(pser)
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ Reindexing / Selection / Label manipulation
Series.rename
Series.reset_index
Series.sample
Series.truncate

Missing data handling
---------------------
Expand Down