Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 41 additions & 16 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6684,7 +6684,7 @@ def _reindex_columns(self, columns):

return self._internal.copy(sdf=sdf, data_columns=columns, column_index=idx)

def melt(self, id_vars=None, value_vars=None, var_name='variable',
def melt(self, id_vars=None, value_vars=None, var_name=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems the default value of `var_name` for `melt` in namespace.py should be changed as well.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, good catch!

value_name='value'):
"""
Unpivot a DataFrame from wide format to long format, optionally
Expand All @@ -6705,7 +6705,8 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable',
Column(s) to unpivot. If not specified, uses all columns that
are not set as `id_vars`.
var_name : scalar, default 'variable'
Name to use for the 'variable' column.
Name to use for the 'variable' column. If None, it uses `frame.columns.name` or
'variable'.
value_name : scalar, default 'value'
Name to use for the 'value' column.

Expand All @@ -6718,7 +6719,8 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable',
--------
>>> df = ks.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
... 'B': {0: 1, 1: 3, 2: 5},
... 'C': {0: 2, 1: 4, 2: 6}})
... 'C': {0: 2, 1: 4, 2: 6}},
... columns=['A', 'B', 'C'])
>>> df
A B C
0 a 1 2
Expand Down Expand Up @@ -6769,29 +6771,52 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable',
"""
if id_vars is None:
id_vars = []
if not isinstance(id_vars, (list, tuple, np.ndarray)):
id_vars = list(id_vars)
elif isinstance(id_vars, str):
id_vars = [(id_vars,)]
elif isinstance(id_vars, tuple):
Copy link
Member

@HyukjinKwon HyukjinKwon Oct 15, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems a tuple alone is not allowed when the columns are a MultiIndex:

>>> pdf.melt(id_vars=('X', 'A'))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", line 6500, in melt
    col_level=col_level,
  File "/usr/local/lib/python3.7/site-packages/pandas/core/reshape/melt.py", line 42, in melt
    "id_vars must be a list of tuples when columns" " are a MultiIndex"
ValueError: id_vars must be a list of tuples when columns are a MultiIndex

vs

>>> kdf.melt(id_vars=('X', 'A'))
   ('X', 'A') variable_0 variable_1  value
0           1          X          B      2
1           1          Y          C      7
2           3          X          B      4
3           3          Y          C      8
4           5          X          B      6
5           5          Y          C      9

Maybe we should check this via `len(self._internal.index_map) == 1` and disallow a tuple alone.

Also, I think it should be treated as multiple columns. See:

>>> kdf.melt(id_vars=('A', 'B'))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/.../koalas/databricks/koalas/frame.py", line 6818, in melt
    for name in var_name[:self._internal.column_index_level]] +
  File "/.../koalas/databricks/koalas/frame.py", line 6816, in <listcomp>
    for idx in id_vars] +
  File "/.../koalas/databricks/koalas/internal.py", line 534, in scol_for
    return scol_for(self._sdf, self.column_name_for(column_name_or_index))
  File "/.../koalas/databricks/koalas/internal.py", line 523, in column_name_for
    raise KeyError(column_name_or_index)
KeyError: ('A', 'B')

vs

>>> pdf.melt(id_vars=('A', 'B'))
   A  B variable  value
0  1  2        C      7
1  3  4        C      8
2  5  6        C      9

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I thought it was weird behavior, but let's follow pandas for now.

id_vars = [id_vars]
else:
id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars]

data_columns = self._internal.data_columns
column_index = self._internal.column_index

if value_vars is None:
value_vars = []
if not isinstance(value_vars, (list, tuple, np.ndarray)):
value_vars = list(value_vars)
elif isinstance(value_vars, str):
value_vars = [(value_vars,)]
elif isinstance(value_vars, tuple):
value_vars = [value_vars]
else:
value_vars = [valv if isinstance(valv, tuple) else (valv,) for valv in value_vars]
if len(value_vars) == 0:
value_vars = data_columns
value_vars = column_index

column_index = [idx for idx in column_index if idx not in id_vars]

data_columns = [data_column for data_column in data_columns if data_column not in id_vars]
sdf = self._sdf

if var_name is None:
if self._internal.column_index_names is not None:
var_name = self._internal.column_index_names
elif self._internal.column_index_level == 1:
var_name = ['variable']
else:
var_name = ['variable_{}'.format(i)
for i in range(self._internal.column_index_level)]
elif isinstance(var_name, str):
var_name = [var_name]

pairs = F.explode(F.array(*[
F.struct(*(
[F.lit(column).alias(var_name)] +
[self._internal.scol_for(column).alias(value_name)])
) for column in data_columns if column in value_vars]))

columns = (id_vars +
[F.col("pairs.%s" % var_name), F.col("pairs.%s" % value_name)])
[F.lit(c).alias(name) for c, name in zip(idx, var_name)] +
[self._internal.scol_for(idx).alias(value_name)])
) for idx in column_index if idx in value_vars]))

columns = ([self._internal.scol_for(idx).alias(str(idx) if len(idx) > 1 else idx[0])
for idx in id_vars] +
[F.col("pairs.%s" % name)
for name in var_name[:self._internal.column_index_level]] +
[F.col("pairs.%s" % value_name)])
exploded_df = sdf.withColumn("pairs", pairs).select(columns)

return DataFrame(exploded_df)
Expand Down
56 changes: 56 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1755,6 +1755,62 @@ def test_reindex(self):
self.assertRaises(TypeError, lambda: kdf.reindex(columns=['X']))
self.assertRaises(ValueError, lambda: kdf.reindex(columns=[('X',)]))

def test_melt(self):
    """Check that Koalas ``melt`` agrees with pandas ``melt``.

    Covers flat columns, multi-level columns without names, and
    multi-level columns with names. Both results are sorted before being
    compared, and the Koalas result additionally has its index reset.
    """
    pdf = pd.DataFrame({'A': [1, 3, 5],
                        'B': [2, 4, 6],
                        'C': [7, 8, 9]})
    kdf = ks.from_pandas(pdf)

    def check(sort_keys, melt_kwargs, **assert_kwargs):
        # Koalas side first (sorted, index reset), then the pandas side
        # (sorted only); extra keyword arguments go to ``assert_eq``.
        actual = (kdf.melt(**melt_kwargs)
                  .sort_values(list(sort_keys))
                  .reset_index(drop=True))
        expected = pdf.melt(**melt_kwargs).sort_values(list(sort_keys))
        self.assert_eq(actual, expected, **assert_kwargs)

    flat_keys = ('variable', 'value')
    check(flat_keys, {})
    check(flat_keys, {'id_vars': 'A'})
    check(flat_keys, {'id_vars': ['A', 'B']})
    check(flat_keys, {'id_vars': ['A'], 'value_vars': ['C']})
    check(('myVarname', 'myValname'),
          {'id_vars': ['A'], 'value_vars': ['B'],
           'var_name': 'myVarname', 'value_name': 'myValname'})

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')])
    pdf.columns = columns
    kdf.columns = columns

    multi_keys = ('variable_0', 'variable_1', 'value')
    check(multi_keys, {})
    check(multi_keys, {'id_vars': [('X', 'A')]}, almost=True)
    check(multi_keys,
          {'id_vars': [('X', 'A')], 'value_vars': [('Y', 'C')]},
          almost=True)
    check(('myV1', 'myV2', 'myValname'),
          {'id_vars': [('X', 'A')], 'value_vars': [('X', 'B')],
           'var_name': ['myV1', 'myV2'], 'value_name': 'myValname'},
          almost=True)

    # Named column levels supply the variable column names.
    columns.names = ['v0', 'v1']
    pdf.columns = columns
    kdf.columns = columns

    check(('v0', 'v1', 'value'), {})

def test_all(self):
pdf = pd.DataFrame({
'col1': [False, False, False],
Expand Down