-
Notifications
You must be signed in to change notification settings - Fork 367
Add basic merge functionality to DataFrame #264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
2c05bc1
a0b97d3
eca3989
5472af1
dde5b9c
1508281
1407922
65ce286
abfe0c9
df85c35
e2e835d
104d1ac
4e7c429
0ad39b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,7 @@ | |
| A wrapper class for Spark DataFrame to behave similar to pandas DataFrame. | ||
| """ | ||
| from functools import partial, reduce | ||
| from typing import Any, List, Union | ||
| from typing import Any, List, Tuple, Union | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
|
|
@@ -1467,6 +1467,112 @@ def shape(self): | |
| """ | ||
| return len(self), len(self.columns) | ||
|
|
||
| def merge(self, right: 'DataFrame', how: str = 'inner', on: str = None, | ||
| suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame': | ||
| """ | ||
| Merge DataFrame objects with a database-style join. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| right: Object to merge with. | ||
| how: Type of merge to be performed. | ||
| {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’ | ||
|
|
||
| left: use only keys from left frame, similar to a SQL left outer join; preserve key | ||
| order. | ||
| right: use only keys from right frame, similar to a SQL right outer join; preserve key | ||
| order. | ||
| outer: use union of keys from both frames, similar to a SQL full outer join; sort keys | ||
| lexicographically. | ||
| inner: use intersection of keys from both frames, similar to a SQL inner join; | ||
| preserve the order of the left keys. | ||
| on: Column or index level names to join on. These must be found in both DataFrames. If on | ||
| is None and not merging on indexes then this defaults to the intersection of the | ||
| columns in both DataFrames. | ||
| suffixes: Suffix to apply to overlapping column names in the left and right side, | ||
| respectively. | ||
|
|
||
| Returns | ||
| ------- | ||
| DataFrame | ||
| A DataFrame of the two merged objects. | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> left_kdf = ks.DataFrame({'A': [1, 2]}) | ||
| >>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2]) | ||
|
|
||
| >>> left_kdf.merge(right_kdf) | ||
| A B | ||
| 0 2 x | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='left') | ||
| A B | ||
| 0 1 None | ||
| 1 2 x | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='right') | ||
| A B | ||
| 0 2.0 x | ||
| 1 NaN y | ||
|
|
||
| >>> left_kdf.merge(right_kdf, how='outer') | ||
| A B | ||
| 0 1.0 None | ||
| 1 2.0 x | ||
| 2 NaN y | ||
|
|
||
| Notes | ||
| ----- | ||
| As described in #263, joining string columns currently returns None for missing values | ||
| instead of NaN. | ||
| """ | ||
| if how == 'full': | ||
| print("Warning: While Koalas will accept 'full', you should use 'outer' instead to", | ||
|
||
| "be compatible with the pandas merge API") | ||
| if how == 'outer': | ||
| # 'outer' in pandas equals 'full' in Spark | ||
| how = 'full' | ||
| if how not in ('inner', 'left', 'right', 'full'): | ||
| raise ValueError("The 'how' parameter has to be amongst the following values: ", | ||
| "['inner', 'left', 'right', 'full', 'outer']") | ||
|
||
|
|
||
| if on is None: | ||
| # FIXME Move index string to constant? | ||
| on = '__index_level_0__' | ||
|
|
||
| left_table = self._sdf.alias('left_table') | ||
| right_table = right._sdf.alias('right_table') | ||
|
|
||
| # Unpack suffixes tuple for convenience | ||
| left_suffix = suffixes[0] | ||
| right_suffix = suffixes[1] | ||
|
|
||
| # Append suffixes to columns with the same name to avoid conflicts later | ||
| duplicate_columns = list(self.columns & right.columns) | ||
| if duplicate_columns: | ||
| for duplicate_column_name in duplicate_columns: | ||
| left_table = left_table.withColumnRenamed(duplicate_column_name, | ||
| duplicate_column_name + left_suffix) | ||
| right_table = right_table.withColumnRenamed(duplicate_column_name, | ||
| duplicate_column_name + right_suffix) | ||
|
|
||
| join_condition = (left_table[on] == right_table[on] if on not in duplicate_columns | ||
| else left_table[on + left_suffix] == right_table[on + right_suffix]) | ||
| joined_table = left_table.join(right_table, join_condition, how=how) | ||
|
|
||
| if on in duplicate_columns: | ||
| # Merge duplicate key columns | ||
| joined_table = joined_table.withColumnRenamed(on + left_suffix, on) | ||
| joined_table = joined_table.drop(on + right_suffix) | ||
|
|
||
| # Remove auxiliary index | ||
| # FIXME Move index string to constant? | ||
| joined_table = joined_table.drop('__index_level_0__') | ||
|
|
||
| kdf = DataFrame(joined_table) | ||
| return kdf | ||
|
|
||
| def _pd_getitem(self, key): | ||
| from databricks.koalas.series import Series | ||
| if key is None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,3 +380,41 @@ def test_isin(self): | |
| msg = "Values should be iterable, Series, DataFrame or dict." | ||
| with self.assertRaisesRegex(TypeError, msg): | ||
| kdf.isin(1) | ||
|
|
||
| def test_merge(self): | ||
| left_kdf = koalas.DataFrame({'A': [1, 2]}) | ||
| right_kdf = koalas.DataFrame({'B': ['x', 'y']}, index=[1, 2]) | ||
|
|
||
| msg = ("The 'how' parameter has to be amongst the following values: ['inner', 'left', " + | ||
| "'right', 'full', 'outer']") | ||
| with self.assertRaises(ValueError, msg=msg): | ||
| left_kdf.merge(right_kdf, how='foo') | ||
|
|
||
| # Assert inner join | ||
| res = left_kdf.merge(right_kdf) | ||
| self.assert_eq(res, pd.DataFrame({'A': [2], 'B': ['x']})) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this behavior following pandas'? Btw, I think basically we should use exactly the same pandas code as the counterpart of the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right. Without specifying any columns to join on, the current implementation defaults to the pandas equivalent of Regarding the tests, I totally agree with you. But because of #263, it currently fails as some data types mismatch.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we don't need to add the parameters here, but at least we should respect the default value of pandas, e.g., assuming As for the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've now added the |
||
|
|
||
| # Assert inner join on non-default column | ||
| left_kdf_with_id = koalas.DataFrame({'A': [1, 2], 'id': [0, 1]}) | ||
| right_kdf_with_id = koalas.DataFrame({'B': ['x', 'y'], 'id': [0, 1]}, index=[1, 2]) | ||
| res = left_kdf_with_id.merge(right_kdf_with_id, on='id') | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'id': [0, 1], 'B': ['x', 'y']})) | ||
|
|
||
| # Assert left join | ||
| res = left_kdf.merge(right_kdf, how='left') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'B': [None, 'x']})) | ||
|
|
||
| # Assert right join | ||
| res = left_kdf.merge(right_kdf, how='right') | ||
| self.assert_eq(res, pd.DataFrame({'A': [2, np.nan], 'B': ['x', 'y']})) | ||
|
|
||
| # Assert full outer join | ||
| res = left_kdf.merge(right_kdf, how='outer') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']})) | ||
|
|
||
| # Assert full outer join also works with 'full' keyword | ||
| res = left_kdf.merge(right_kdf, how='full') | ||
| # FIXME Replace None with np.nan once #263 is solved | ||
| self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']})) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add tests to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done with 1407922. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in this case what is the join key?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is outdated. Now it should be
to join on the indices.