Skip to content

Commit 8367c93

Browse files
itholic authored and HyukjinKwon committed
Enable DataFrame setting value as list of labels. (Resolves #894) (#905)
Resolves #894.

For the DataFrame below,

```python
>>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
...                   index=['cobra', 'viper', 'sidewinder'],
...                   columns=['max_speed', 'shield'])
>>> df
            max_speed  shield
cobra               1       2
viper               4       5
sidewinder          7       8
```

we can now set a value for all items matching a list of labels, as in pandas.

```python
>>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
>>> df
            max_speed  shield
cobra               1       2
viper               4      50
sidewinder          7      50
```
1 parent b7f1fe0 commit 8367c93

File tree

3 files changed

+109
-17
lines changed

3 files changed

+109
-17
lines changed

databricks/koalas/indexing.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -283,12 +283,14 @@ class LocIndexer(object):
283283
284284
**Setting values**
285285
286-
Setting value for all items matching the list of labels is not allowed
286+
Setting value for all items matching the list of labels.
287287
288288
>>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
289-
Traceback (most recent call last):
290-
...
291-
databricks.koalas.exceptions.SparkPandasNotImplementedError: ...
289+
>>> df
290+
max_speed shield
291+
cobra 1 2
292+
viper 4 50
293+
sidewinder 7 50
292294
293295
Setting value for an entire row is not allowed
294296
@@ -303,17 +305,26 @@ class LocIndexer(object):
303305
>>> df
304306
max_speed shield
305307
cobra 30 2
306-
viper 30 5
307-
sidewinder 30 8
308+
viper 30 50
309+
sidewinder 30 50
310+
311+
Set value for an entire list of columns
312+
313+
>>> df.loc[:, ['max_speed', 'shield']] = 100
314+
>>> df
315+
max_speed shield
316+
cobra 100 100
317+
viper 100 100
318+
sidewinder 100 100
308319
309320
Set value with Series
310321
311322
>>> df.loc[:, 'shield'] = df['shield'] * 2
312323
>>> df
313324
max_speed shield
314-
cobra 30 4
315-
viper 30 10
316-
sidewinder 30 16
325+
cobra 100 200
326+
viper 100 200
327+
sidewinder 100 200
317328
318329
**Getting values on a DataFrame with an index that has integer labels**
319330
@@ -492,22 +503,44 @@ def __setitem__(self, key, value):
492503
rows_sel, cols_sel = key
493504

494505
if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
495-
raise SparkPandasNotImplementedError(
496-
description="""Can only assign value to the whole dataframe, the row index
497-
has to be `slice(None)` or `:`""",
498-
pandas_function=".loc[..., ...] = ...",
499-
spark_target_function="withColumn, select")
506+
if isinstance(rows_sel, list):
507+
if isinstance(cols_sel, str):
508+
cols_sel = [cols_sel]
509+
kdf = self._kdf.copy()
510+
for col_sel in cols_sel:
511+
# Uses `kdf` to allow operations on different DataFrames.
512+
# TODO: avoid temp column name or declare `__` prefix is
513+
# reserved for Koalas' internal columns.
514+
kdf["__indexing_temp_col__"] = value
515+
new_col = kdf["__indexing_temp_col__"]._scol
516+
kdf[col_sel] = Series(kdf[col_sel]._internal.copy(
517+
scol=F.when(
518+
kdf._internal.index_scols[0].isin(rows_sel), new_col
519+
).otherwise(kdf[col_sel]._scol)), anchor=kdf)
520+
kdf = kdf.drop(labels=['__indexing_temp_col__'])
521+
522+
self._kdf._internal = kdf._internal.copy()
523+
else:
524+
raise SparkPandasNotImplementedError(
525+
description="""Can only assign value to the whole dataframe, the row index
526+
has to be `slice(None)` or `:`""",
527+
pandas_function=".loc[..., ...] = ...",
528+
spark_target_function="withColumn, select")
500529

501-
if not isinstance(cols_sel, str):
502-
raise ValueError("""only column names can be assigned""")
530+
if not isinstance(cols_sel, (str, list)):
531+
raise ValueError("""only column names or list of column names can be assigned""")
503532

504533
if isinstance(value, DataFrame):
505534
if len(value.columns) == 1:
506535
self._kdf[cols_sel] = _col(value)
507536
else:
508537
raise ValueError("Only a dataframe with one column can be assigned")
509538
else:
510-
self._kdf[cols_sel] = value
539+
if isinstance(cols_sel, str):
540+
cols_sel = [cols_sel]
541+
if (not isinstance(rows_sel, list)) and (isinstance(cols_sel, list)):
542+
for col_sel in cols_sel:
543+
self._kdf[col_sel] = value
511544

512545

513546
class ILocIndexer(object):

databricks/koalas/tests/test_indexing.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,41 @@ def test_iloc_series(self):
507507
self.assert_eq(kseries.iloc[:1], pseries.iloc[:1])
508508
self.assert_eq(kseries.iloc[:-1], pseries.iloc[:-1])
509509

510+
def test_setitem(self):
511+
pdf = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
512+
index=['cobra', 'viper', 'sidewinder'],
513+
columns=['max_speed', 'shield'])
514+
kdf = ks.from_pandas(pdf)
515+
516+
pdf.loc[['viper', 'sidewinder'], ['shield', 'max_speed']] = 10
517+
kdf.loc[['viper', 'sidewinder'], ['shield', 'max_speed']] = 10
518+
self.assert_eq(kdf, pdf)
519+
520+
pdf.loc[['viper', 'sidewinder'], 'shield'] = 50
521+
kdf.loc[['viper', 'sidewinder'], 'shield'] = 50
522+
self.assert_eq(kdf, pdf)
523+
524+
with self.assertRaisesRegex(ValueError,
525+
'Only a dataframe with one column can be assigned'):
526+
kdf.loc[:, 'max_speed'] = kdf
527+
with self.assertRaisesRegex(ValueError,
528+
'only column names or list of column names can be assigned'):
529+
kdf.loc[['viper'], ('max_speed', 'shield')] = 10
530+
msg = """Can only assign value to the whole dataframe, the row index
531+
has to be `slice(None)` or `:`"""
532+
msg = ("Can only assign value to the whole dataframe, the row index")
533+
with self.assertRaisesRegex(SparkPandasNotImplementedError, msg):
534+
kdf.loc['viper', 'max_speed'] = 10
535+
536+
pdf = pd.DataFrame([[1], [4], [7]],
537+
index=['cobra', 'viper', 'sidewinder'],
538+
columns=['max_speed'])
539+
kdf = ks.from_pandas(pdf)
540+
541+
pdf.loc[:, 'max_speed'] = pdf
542+
kdf.loc[:, 'max_speed'] = kdf
543+
self.assert_eq(kdf, pdf)
544+
510545
def test_iloc_raises(self):
511546
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
512547
kdf = ks.from_pandas(pdf)

databricks/koalas/tests/test_ops_on_diff_frames.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,19 @@ def test_multi_index_assignment_frame(self):
395395

396396
self.assert_eq(kdf.sort_index(), pdf.sort_index())
397397

398+
def test_loc_setitem(self):
399+
pdf = pd.DataFrame(
400+
[[1, 2], [4, 5], [7, 8]],
401+
index=['cobra', 'viper', 'sidewinder'],
402+
columns=['max_speed', 'shield'])
403+
kdf = ks.DataFrame(pdf)
404+
another_kdf = ks.DataFrame(pdf)
405+
406+
kdf.loc[['viper', 'sidewinder'], ['shield']] = another_kdf.max_speed
407+
pdf.loc[['viper', 'sidewinder'], ['shield']] = pdf.max_speed
408+
409+
self.assert_eq(kdf.sort_index(), pdf.sort_index())
410+
398411

399412
class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
400413

@@ -447,3 +460,14 @@ def test_assignment(self):
447460
with self.assertRaisesRegex(ValueError, "Cannot combine column argument"):
448461
kdf = ks.from_pandas(self.pdf1)
449462
kdf['c'] = self.kdf1.a
463+
464+
def test_loc_setitem(self):
465+
pdf = pd.DataFrame(
466+
[[1, 2], [4, 5], [7, 8]],
467+
index=['cobra', 'viper', 'sidewinder'],
468+
columns=['max_speed', 'shield'])
469+
kdf = ks.DataFrame(pdf)
470+
another_kdf = ks.DataFrame(pdf)
471+
472+
with self.assertRaisesRegex(ValueError, "Cannot combine column argument"):
473+
kdf.loc[['viper', 'sidewinder'], ['shield']] = another_kdf.max_speed

0 commit comments

Comments
 (0)