Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ class _MissingPandasLikeSeries(object):
to_period = unsupported_function('to_period')
to_sql = unsupported_function('to_sql')
to_timestamp = unsupported_function('to_timestamp')
truncate = unsupported_function('truncate')
tshift = unsupported_function('tshift')
tz_convert = unsupported_function('tz_convert')
tz_localize = unsupported_function('tz_localize')
Expand Down
84 changes: 84 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3223,6 +3223,90 @@ def copy(self) -> 'Series':
"""
return _col(DataFrame(self._internal.copy()))

def truncate(self, before=None, after=None, copy=True):
"""
Truncate a sorted Series before and/or after some particular index value.
The Series must have a sorted index.

Parameters
----------
before : string, int
Truncate all rows before this index value
after : string, int
Truncate all rows after this index value
copy : boolean, default True
Return a copy of the truncated section.

Returns
-------
truncated : Series

Examples
--------


A Series whose index is sorted integers.

>>> s = ks.Series([10, 20, 30, 40, 50, 60, 70],
... index=[1, 2, 3, 4, 5, 6, 7])
>>> s
1 10
2 20
3 30
4 40
5 50
6 60
7 70
Name: 0, dtype: int64

>>> s.truncate(2, 5)
2 20
3 30
4 40
5 50
Name: 0, dtype: int64

A Series whose index is sorted strings.

>>> s = ks.Series([10, 20, 30, 40, 50, 60, 70],
... index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
>>> s
a 10
b 20
c 30
d 40
e 50
f 60
g 70
Name: 0, dtype: int64

>>> s.truncate('b', 'e')
b 20
c 30
d 40
e 50
Name: 0, dtype: int64
"""
indexes = self.index.to_pandas()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@itholic, this will collect the entire index into the driver's memory. We should avoid this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Koalas Index implements is_monotonic_increasing and is_monotonic_decreasing. The problem is that these are expensive operations, as warned in their documentation, because they move all the data onto a single node.

Can you add a note that this API will be expensive (see is_monotonic_increasing as an example)? We might even have to explicitly not implement it. cc @ueshin

Copy link
Contributor Author

@itholic itholic Oct 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HyukjinKwon Thanks for commenting!

Yeah, you're right. I used pandas' is_monotonic_increasing because there was a bug in the Koalas one (which I fixed in #930).

Now we can use the Koalas one, but there is still the cost issue you mentioned.

As a first step, I added a note warning about this to the API docstring, as you suggested.

indexes_increasing = indexes.is_monotonic_increasing
if not any([indexes_increasing, indexes.is_monotonic_decreasing]):
raise ValueError("truncate requires a sorted index")
if (before is None) and (after is None):
return self.copy() if copy else self

if before is None:
before = indexes[0] if indexes_increasing else indexes[-1]
if after is None:
after = indexes[-1] if indexes_increasing else indexes[0]
if before > after:
raise ValueError("Truncate: %s must be after %s" % (after, before))
if indexes_increasing:
result = _col(self.to_frame()[before:after])
else:
result = _col(self.to_frame()[after:before])

return result.copy() if copy else result

# TODO: 'regex', 'method' parameter
def replace(self, to_replace=None, value=None, regex=False) -> 'Series':
"""
Expand Down
22 changes: 22 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,3 +728,25 @@ def test_duplicates(self):

self.assert_eq(pser.drop_duplicates().sort_values(),
kser.drop_duplicates().sort_values())

def test_truncate(self):
    """Check that Koalas ``Series.truncate`` matches pandas and raises on bad input.

    Covers an increasing and a decreasing integer index, with and without
    bounds and with ``copy`` both left at its default and set to ``False``,
    then the two ``ValueError`` paths: an unsorted index, and ``before``
    greater than ``after``.
    """
    data = [10, 20, 30, 40, 50, 60, 70]
    increasing_labels = [1, 2, 3, 4, 5, 6, 7]

    p_increasing = pd.Series(data, index=increasing_labels)
    k_increasing = ks.Series(p_increasing)
    p_decreasing = pd.Series(data, index=[7, 6, 5, 4, 3, 2, 1])
    k_decreasing = ks.Series(p_decreasing)

    # (koalas series, pandas series, positional bounds, keyword options)
    cases = [
        (k_increasing, p_increasing, (), {}),
        (k_increasing, p_increasing, (), {"copy": False}),
        (k_increasing, p_increasing, (2, 5), {"copy": False}),
        (k_decreasing, p_decreasing, (4, 6), {}),
        (k_decreasing, p_decreasing, (4, 6), {"copy": False}),
    ]
    for k_series, p_series, bounds, options in cases:
        self.assert_eq(
            k_series.truncate(*bounds, **options),
            p_series.truncate(*bounds, **options),
        )

    # An index that is neither increasing nor decreasing is rejected.
    unsorted = ks.Series(data, index=[1, 2, 3, 4, 3, 2, 1])
    with self.assertRaisesRegex(ValueError, "truncate requires a sorted index"):
        unsorted.truncate()

    # `before` greater than `after` is rejected.
    ordered = ks.Series(data, index=increasing_labels)
    with self.assertRaisesRegex(ValueError, "Truncate: 2 must be after 5"):
        ordered.truncate(5, 2)
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ Reindexing / Selection / Label manipulation
Series.rename
Series.reset_index
Series.sample
Series.truncate

Missing data handling
---------------------
Expand Down