Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4441,22 +4441,29 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None)

numeric_types = (DecimalType, DoubleType, FloatType, ByteType, IntegerType, LongType,
ShortType)
numeric_columns = [(c, self._internal.scol_for(c)) for c in self.columns
if isinstance(self._internal.spark_type_for(c), numeric_types)]
numeric_columns = [(idx, self._internal.scol_for(idx))
for idx in self._internal.column_index
if isinstance(self._internal.spark_type_for(idx), numeric_types)]

if lower is not None:
numeric_columns = [(c, F.when(scol < lower, lower).otherwise(scol).alias(c))
for c, scol in numeric_columns]
numeric_columns = [(idx, (F.when(scol < lower, lower).otherwise(scol)
.alias(name_like_string(idx))))
for idx, scol in numeric_columns]
if upper is not None:
numeric_columns = [(c, F.when(scol > upper, upper).otherwise(scol).alias(c))
for c, scol in numeric_columns]
numeric_columns = [(idx, (F.when(scol > upper, upper).otherwise(scol)
.alias(name_like_string(idx))))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm... should we avoid using alias on Spark columns altogether, wherever possible?

Copy link
Collaborator Author

@ueshin ueshin Jan 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we would avoid aliasing, but for now we still have to access columns by name, so the names must be valid. Otherwise, if a name accidentally becomes something like "xx`yy", Spark throws an error because the name breaks Spark's column naming rules.

for idx, scol in numeric_columns]

nonnumeric_columns = [self._internal.scol_for(c) for c in self.columns
if not isinstance(self._internal.spark_type_for(c), numeric_types)]
column_index = [idx for idx, _ in numeric_columns]
column_scols = [scol for _, scol in numeric_columns]

sdf = self._sdf.select([scol for _, scol in numeric_columns] + nonnumeric_columns)
for idx in self._internal.column_index:
if not isinstance(self._internal.spark_type_for(idx), numeric_types):
column_index.append(idx)
column_scols.append(self._internal.scol_for(idx))

return ks.DataFrame(sdf)[list(self.columns)]
internal = self._internal.with_new_columns(column_scols, column_index=column_index)
return DataFrame(internal)[list(self.columns)]

def head(self, n=5):
"""
Expand Down
5 changes: 3 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1139,7 +1139,8 @@ def test_append(self):
self.assert_eq(kdf.append(kdf), pdf.append(pdf))

def test_clip(self):
pdf = pd.DataFrame({'A': [0, 2, 4]})
pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [4, 2, 0], 'X': [-1, 10, 0]},
index=np.random.rand(3))
kdf = ks.from_pandas(pdf)

# Assert list-like values are not accepted for 'lower' and 'upper'
Expand All @@ -1159,7 +1160,7 @@ def test_clip(self):
self.assert_eq(kdf.clip(1, 3), pdf.clip(1, 3))

# Assert behavior on string values
str_kdf = ks.DataFrame({'A': ['a', 'b', 'c']})
str_kdf = ks.DataFrame({'A': ['a', 'b', 'c']}, index=np.random.rand(3))
self.assert_eq(str_kdf.clip(1, 3), str_kdf)

def test_binary_operators(self):
Expand Down
2 changes: 1 addition & 1 deletion databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def test_missing(self):
getattr(kser, name)

def test_clip(self):
pser = pd.Series([0, 2, 4])
pser = pd.Series([0, 2, 4], index=np.random.rand(3))
kser = ks.from_pandas(pser)

# Assert list-like values are not accepted for 'lower' and 'upper'
Expand Down