-
Notifications
You must be signed in to change notification settings - Fork 367
Add basic merge functionality to DataFrame #264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
2c05bc1
a0b97d3
eca3989
5472af1
dde5b9c
1508281
1407922
65ce286
abfe0c9
df85c35
e2e835d
104d1ac
4e7c429
0ad39b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,7 @@ | |
| A wrapper class for Spark DataFrame to behave similar to pandas DataFrame. | ||
| """ | ||
| from functools import partial, reduce | ||
| from typing import Any, List, Union | ||
| from typing import Any, List, Tuple, Union | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
|
|
@@ -1467,6 +1467,101 @@ def shape(self): | |
| """ | ||
| return len(self), len(self.columns) | ||
|
|
||
| def merge(self, right: 'DataFrame', how: str = 'inner', on: str = None, | ||
| suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame': | ||
| """ | ||
| Perform a database (SQL) merge operation between two DataFrame objects. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| right: the DataFrame for the right side of the join operation | ||
| how: a join method out of ['inner', 'left', 'right', 'full', 'outer'] | ||
| on: the column name to be joined on. Defaults to the index if not specified | ||
| suffixes: suffix to apply to overlapping column names in the left and right side, | ||
| respectively | ||
|
|
||
| Returns | ||
| ------- | ||
| DataFrame | ||
| The joined DataFrame | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> left_kdf = koalas.DataFrame({'A': [1, 2]}) | ||
| >>> right_kdf = koalas.DataFrame({'B': ['x', 'y']}, index=[1, 2]) | ||
|
|
||
| >>> left_kdf.merge(right_kdf) | ||
|
||
| A B | ||
| 0 2 x | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='left') | ||
| A B | ||
| 0 1 None | ||
| 1 2 x | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='right') | ||
| A B | ||
| 0 2.0 x | ||
| 1 NaN y | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='outer') | ||
| A B | ||
| 0 1.0 None | ||
| 1 2.0 x | ||
| 2 NaN y | ||
|
|
||
| Notes | ||
| ----- | ||
| As described in #263, joining string columns currently returns None for missing values | ||
| instead of NaN. | ||
| """ | ||
| if how == 'full': | ||
| print("Warning: While Koalas will accept 'full', you should use 'outer' instead to", | ||
|
||
| "be compatible with the pandas merge API") | ||
| pass | ||
HyukjinKwon marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if how == 'outer': | ||
| # 'outer' in pandas equals 'full' in Spark | ||
| how = 'full' | ||
| if how not in ('inner', 'left', 'right', 'full'): | ||
| raise ValueError("The 'how' parameter has to be amongst the following values: ", | ||
| "['inner', 'left', 'right', 'full', 'outer']") | ||
|
||
|
|
||
| if on is None: | ||
| # FIXME Move index string to constant? | ||
| on = '__index_level_0__' | ||
|
|
||
| left_table = self._sdf.alias('left_table') | ||
| right_table = right._sdf.alias('right_table') | ||
|
|
||
| # Unpack suffixes tuple for convenience | ||
| left_suffix = suffixes[0] | ||
| right_suffix = suffixes[1] | ||
|
|
||
| # Append suffixes to columns with the same name to avoid conflicts later | ||
| duplicate_columns = list(self.columns & right.columns) | ||
| if duplicate_columns: | ||
| for duplicate_column_name in duplicate_columns: | ||
| left_table = left_table.withColumnRenamed(duplicate_column_name, | ||
| duplicate_column_name + left_suffix) | ||
| right_table = right_table.withColumnRenamed(duplicate_column_name, | ||
| duplicate_column_name + right_suffix) | ||
|
|
||
| join_condition = (left_table[on] == right_table[on] if on not in duplicate_columns | ||
| else left_table[on + left_suffix] == right_table[on + right_suffix]) | ||
| joined_table = left_table.join(right_table, join_condition, how=how) | ||
|
|
||
| if on in duplicate_columns: | ||
| # Merge duplicate key columns | ||
| joined_table = joined_table.withColumnRenamed(on + left_suffix, on) | ||
| joined_table = joined_table.drop(on + right_suffix) | ||
|
|
||
| # Remove auxiliary index | ||
| # FIXME Move index string to constant? | ||
| joined_table = joined_table.drop('__index_level_0__') | ||
|
|
||
| kdf = DataFrame(joined_table) | ||
| return kdf | ||
|
|
||
| def _pd_getitem(self, key): | ||
| from databricks.koalas.series import Series | ||
| if key is None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,3 +380,41 @@ def test_isin(self): | |
| msg = "Values should be iterable, Series, DataFrame or dict." | ||
| with self.assertRaisesRegex(TypeError, msg): | ||
| kdf.isin(1) | ||
|
|
||
| def test_merge(self): | ||
| left_kdf = koalas.DataFrame({'A': [1, 2]}) | ||
| right_kdf = koalas.DataFrame({'B': ['x', 'y']}, index=[1, 2]) | ||
|
|
||
| msg = ("The 'how' parameter has to be amongst the following values: ['inner', 'left', " + | ||
| "'right', 'full', 'outer']") | ||
| with self.assertRaises(ValueError, msg=msg): | ||
| left_kdf.merge(right_kdf, how='foo') | ||
|
|
||
| # Assert inner join | ||
| res = left_kdf.merge(right_kdf) | ||
| self.assert_eq(res, pd.DataFrame({'A': [2], 'B': ['x']})) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this behavior following pandas'? Btw, I think basically we should use exactly the same pandas code as the counterpart of the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right. Without specifying any columns to join on, the current implementation defaults to the pandas equivalent of Regarding the tests, I totally agree with you. But because of #263, it currently fails as some data types mismatch.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we don't need to add the parameters here, but at least we should respect the default value of pandas, e.g., assuming As for the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've now added the |
||
|
|
||
| # Assert inner join on non-default column | ||
| left_kdf_with_id = koalas.DataFrame({'A': [1, 2], 'id': [0, 1]}) | ||
| right_kdf_with_id = koalas.DataFrame({'B': ['x', 'y'], 'id': [0, 1]}, index=[1, 2]) | ||
| res = left_kdf_with_id.merge(right_kdf_with_id, on='id') | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'id': [0, 1], 'B': ['x', 'y']})) | ||
|
|
||
| # Assert left join | ||
| res = left_kdf.merge(right_kdf, how='left') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'B': [None, 'x']})) | ||
|
|
||
| # Assert right join | ||
| res = left_kdf.merge(right_kdf, how='right') | ||
| self.assert_eq(res, pd.DataFrame({'A': [2, np.nan], 'B': ['x', 'y']})) | ||
|
|
||
| # Assert full outer join | ||
| res = left_kdf.merge(right_kdf, how='outer') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']})) | ||
|
|
||
| # Assert full outer join also works with 'full' keyword | ||
| res = left_kdf.merge(right_kdf, how='full') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']})) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add tests to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done with 1407922. |
||
Uh oh!
There was an error while loading. Please reload this page.