-
Notifications
You must be signed in to change notification settings - Fork 367
Fix melt for multi-index columns support. #920
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6684,7 +6684,7 @@ def _reindex_columns(self, columns): | |
|
|
||
| return self._internal.copy(sdf=sdf, data_columns=columns, column_index=idx) | ||
|
|
||
| def melt(self, id_vars=None, value_vars=None, var_name='variable', | ||
| def melt(self, id_vars=None, value_vars=None, var_name=None, | ||
| value_name='value'): | ||
| """ | ||
| Unpivot a DataFrame from wide format to long format, optionally | ||
|
|
@@ -6705,7 +6705,8 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable', | |
| Column(s) to unpivot. If not specified, uses all columns that | ||
| are not set as `id_vars`. | ||
| var_name : scalar, default 'variable' | ||
| Name to use for the 'variable' column. | ||
| Name to use for the 'variable' column. If None it uses `frame.columns.name` or | ||
| ‘variable’. | ||
| value_name : scalar, default 'value' | ||
| Name to use for the 'value' column. | ||
|
|
||
|
|
@@ -6718,7 +6719,8 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable', | |
| -------- | ||
| >>> df = ks.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, | ||
| ... 'B': {0: 1, 1: 3, 2: 5}, | ||
| ... 'C': {0: 2, 1: 4, 2: 6}}) | ||
| ... 'C': {0: 2, 1: 4, 2: 6}}, | ||
| ... columns=['A', 'B', 'C']) | ||
| >>> df | ||
| A B C | ||
| 0 a 1 2 | ||
|
|
@@ -6769,29 +6771,52 @@ def melt(self, id_vars=None, value_vars=None, var_name='variable', | |
| """ | ||
| if id_vars is None: | ||
| id_vars = [] | ||
| if not isinstance(id_vars, (list, tuple, np.ndarray)): | ||
| id_vars = list(id_vars) | ||
| elif isinstance(id_vars, str): | ||
| id_vars = [(id_vars,)] | ||
| elif isinstance(id_vars, tuple): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
It seems a tuple alone is not allowed when the columns are a multi-index:
>>> pdf.melt(id_vars=('X', 'A'))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py", line 6500, in melt
col_level=col_level,
File "/usr/local/lib/python3.7/site-packages/pandas/core/reshape/melt.py", line 42, in melt
"id_vars must be a list of tuples when columns" " are a MultiIndex"
ValueError: id_vars must be a list of tuples when columns are a MultiIndex
vs
>>> kdf.melt(id_vars=('X', 'A'))
('X', 'A') variable_0 variable_1 value
0 1 X B 2
1 1 Y C 7
2 3 X B 4
3 3 Y C 8
4 5 X B 6
5 5 Y C 9
Maybe we should check if via and .. I think it should be considered as multiple columns. See:
>>> kdf.melt(id_vars=('A', 'B'))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/.../koalas/databricks/koalas/frame.py", line 6818, in melt
for name in var_name[:self._internal.column_index_level]] +
File "/.../koalas/databricks/koalas/frame.py", line 6816, in <listcomp>
for idx in id_vars] +
File "/.../koalas/databricks/koalas/internal.py", line 534, in scol_for
return scol_for(self._sdf, self.column_name_for(column_name_or_index))
File "/.../koalas/databricks/koalas/internal.py", line 523, in column_name_for
raise KeyError(column_name_or_index)
KeyError: ('A', 'B')
vs
>>> pdf.melt(id_vars=('A', 'B'))
A B variable value
0 1 2 C 7
1 3 4 C 8
2 5 6 C 9
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Actually, I thought it was weird behavior, but let's follow pandas for now. |
||
| id_vars = [id_vars] | ||
| else: | ||
| id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars] | ||
|
|
||
| data_columns = self._internal.data_columns | ||
| column_index = self._internal.column_index | ||
|
|
||
| if value_vars is None: | ||
| value_vars = [] | ||
| if not isinstance(value_vars, (list, tuple, np.ndarray)): | ||
| value_vars = list(value_vars) | ||
| elif isinstance(value_vars, str): | ||
| value_vars = [(value_vars,)] | ||
| elif isinstance(value_vars, tuple): | ||
| value_vars = [value_vars] | ||
| else: | ||
| value_vars = [valv if isinstance(valv, tuple) else (valv,) for valv in value_vars] | ||
| if len(value_vars) == 0: | ||
| value_vars = data_columns | ||
| value_vars = column_index | ||
|
|
||
| column_index = [idx for idx in column_index if idx not in id_vars] | ||
|
|
||
| data_columns = [data_column for data_column in data_columns if data_column not in id_vars] | ||
| sdf = self._sdf | ||
|
|
||
| if var_name is None: | ||
| if self._internal.column_index_names is not None: | ||
| var_name = self._internal.column_index_names | ||
| elif self._internal.column_index_level == 1: | ||
| var_name = ['variable'] | ||
| else: | ||
| var_name = ['variable_{}'.format(i) | ||
| for i in range(self._internal.column_index_level)] | ||
| elif isinstance(var_name, str): | ||
| var_name = [var_name] | ||
|
|
||
| pairs = F.explode(F.array(*[ | ||
| F.struct(*( | ||
| [F.lit(column).alias(var_name)] + | ||
| [self._internal.scol_for(column).alias(value_name)]) | ||
| ) for column in data_columns if column in value_vars])) | ||
|
|
||
| columns = (id_vars + | ||
| [F.col("pairs.%s" % var_name), F.col("pairs.%s" % value_name)]) | ||
| [F.lit(c).alias(name) for c, name in zip(idx, var_name)] + | ||
| [self._internal.scol_for(idx).alias(value_name)]) | ||
| ) for idx in column_index if idx in value_vars])) | ||
|
|
||
| columns = ([self._internal.scol_for(idx).alias(str(idx) if len(idx) > 1 else idx[0]) | ||
| for idx in id_vars] + | ||
| [F.col("pairs.%s" % name) | ||
| for name in var_name[:self._internal.column_index_level]] + | ||
| [F.col("pairs.%s" % value_name)]) | ||
| exploded_df = sdf.withColumn("pairs", pairs).select(columns) | ||
|
|
||
| return DataFrame(exploded_df) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems the default value of `melt`'s `var_name` in namespace.py should be changed as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, good catch!