Skip to content

feat: df.join lsuffix and rsuffix support #1857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 85 additions & 11 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3412,37 +3412,72 @@ def join(
*,
on: Optional[str] = None,
how: str = "left",
lsuffix: str = "",
rsuffix: str = "",
) -> DataFrame:
if isinstance(other, bigframes.series.Series):
other = other.to_frame()

left, right = self, other

if not left.columns.intersection(right.columns).empty:
raise NotImplementedError(
f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}"
)
col_intersection = left.columns.intersection(right.columns)

if not col_intersection.empty:
if lsuffix == rsuffix == "":
raise ValueError(
f"columns overlap but no suffix specified: {col_intersection}"
)

if how == "cross":
if on is not None:
raise ValueError("'on' is not supported for cross join.")
result_block = left._block.merge(
right._block,
left_join_ids=[],
right_join_ids=[],
suffixes=("", ""),
suffixes=(lsuffix, rsuffix),
how="cross",
sort=True,
)
return DataFrame(result_block)

# Join left columns with right index
if on is not None:
if left._has_index and (on in left.index.names):
if on in left.columns:
raise ValueError(
f"'{on}' is both an index level and a column label, which is ambiguous."
)
else:
raise NotImplementedError(
f"Joining on index level '{on}' is not yet supported. {constants.FEEDBACK_LINK}"
)
if (left.columns == on).sum() > 1:
raise ValueError(f"The column label '{on}' is not unique.")

if other._block.index.nlevels != 1:
raise ValueError(
"Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported."
)

# Replace all columns names with unique names for reordering.
left_col_original_names = left.columns
on_col_name = "bigframes_left_col_on"
dup_on_col_name = "bigframes_left_col_on_dup"
left_col_temp_names = [
f"bigframes_left_col_name_{i}" if col_name != on else on_col_name
for i, col_name in enumerate(left_col_original_names)
]
left.columns = pandas.Index(left_col_temp_names)
# if on column is also in right df, we need to duplicate the column
# and set it to be the first column
if on in col_intersection:
left[dup_on_col_name] = left[on_col_name]
on_col_name = dup_on_col_name
left_col_temp_names = [on_col_name] + left_col_temp_names
left = left[left_col_temp_names]

# Switch left index with on column
left_columns = left.columns
left_idx_original_names = left.index.names if left._has_index else ()
left_idx_names_in_cols = [
f"bigframes_left_idx_name_{i}"
Expand All @@ -3451,11 +3486,18 @@ def join(
if left._has_index:
left.index.names = left_idx_names_in_cols
left = left.reset_index(drop=False)
left = left.set_index(on)
left = left.set_index(on_col_name)

right_col_original_names = right.columns
right_col_temp_names = [
f"bigframes_right_col_name_{i}"
for i in range(len(right_col_original_names))
]
right.columns = pandas.Index(right_col_temp_names)

# Join on index and switch back
combined_df = left._perform_join_by_index(right, how=how)
combined_df.index.name = on
combined_df.index.name = on_col_name
combined_df = combined_df.reset_index(drop=False)
combined_df = combined_df.set_index(left_idx_names_in_cols)

Expand All @@ -3468,14 +3510,22 @@ def join(
)

# Reorder columns
combined_df = combined_df[list(left_columns) + list(right.columns)]
return combined_df
combined_df = combined_df[left_col_temp_names + right_col_temp_names]
return combined_df._add_join_suffix(
left_col_original_names,
right_col_original_names,
lsuffix=lsuffix,
rsuffix=rsuffix,
extra_col=on if on_col_name == dup_on_col_name else None,
)

# Join left index with right index
if left._block.index.nlevels != right._block.index.nlevels:
raise ValueError("Index to join on must have the same number of levels.")

return left._perform_join_by_index(right, how=how)
return left._perform_join_by_index(right, how=how)._add_join_suffix(
left.columns, right.columns, lsuffix=lsuffix, rsuffix=rsuffix
)

def _perform_join_by_index(
self,
Expand All @@ -3489,6 +3539,30 @@ def _perform_join_by_index(
)
return DataFrame(block)

def _add_join_suffix(
self,
left_columns,
right_columns,
lsuffix: str = "",
rsuffix: str = "",
extra_col: typing.Optional[str] = None,
):
col_intersection = left_columns.intersection(right_columns)
final_col_names = [] if extra_col is None else [extra_col]
for col_name in left_columns:
if col_name in col_intersection:
final_col_names.append(f"{col_name}{lsuffix}")
else:
final_col_names.append(col_name)

for col_name in right_columns:
if col_name in col_intersection:
final_col_names.append(f"{col_name}{rsuffix}")
else:
final_col_names.append(col_name)
self.columns = pandas.Index(final_col_names)
return self

@validations.requires_ordering()
def rolling(
self,
Expand Down
99 changes: 93 additions & 6 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2816,12 +2816,99 @@ def test_join_different_table(
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)


def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
scalars_df, _ = scalars_dfs
df_a = scalars_df[["string_col", "float64_col"]]
df_b = scalars_df[["float64_col"]]
with pytest.raises(NotImplementedError):
df_a.join(df_b, how="outer").to_pandas()
@all_joins
def test_join_different_table_with_duplicate_column_name(
scalars_df_index, scalars_pandas_df_index, how
):
bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename(
columns={"int64_too": "int64_col"}
)
bf_df_b = scalars_df_index.dropna()[
["string_col", "int64_col", "int64_too"]
].rename(columns={"int64_too": "int64_col"})
bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas()
print(bf_result)
pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename(
columns={"int64_too": "int64_col"}
)
pd_df_b = scalars_pandas_df_index.dropna()[
["string_col", "int64_col", "int64_too"]
].rename(columns={"int64_too": "int64_col"})
pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r")
print(pd_result)

pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)


@all_joins
def test_join_param_on_with_duplicate_column_name_not_on_col(
scalars_df_index, scalars_pandas_df_index, how
):
# This test is for duplicate column names, but the 'on' column is not duplicated.
if how == "cross":
return
bf_df_a = scalars_df_index[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
bf_df_b = scalars_df_index.dropna()[
["string_col", "datetime_col", "timestamp_col"]
].rename(columns={"timestamp_col": "datetime_col"})
bf_result = bf_df_a.join(
bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
).to_pandas()
pd_df_a = scalars_pandas_df_index[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
pd_df_b = scalars_pandas_df_index.dropna()[
["string_col", "datetime_col", "timestamp_col"]
].rename(columns={"timestamp_col": "datetime_col"})
pd_result = pd_df_a.join(
pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
)
pd.testing.assert_frame_equal(
bf_result.sort_index(),
pd_result.sort_index(),
check_like=True,
check_index_type=False,
)


@pytest.mark.skipif(
pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x"
)
@all_joins
def test_join_param_on_with_duplicate_column_name_on_col(
scalars_df_index, scalars_pandas_df_index, how
):
# This test is for duplicate column names, and the 'on' column is duplicated.
if how == "cross":
return
bf_df_a = scalars_df_index[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
bf_df_b = scalars_df_index.dropna()[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
bf_result = bf_df_a.join(
bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
).to_pandas()
print(bf_result)
pd_df_a = scalars_pandas_df_index[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
pd_df_b = scalars_pandas_df_index.dropna()[
["string_col", "datetime_col", "timestamp_col", "int64_too"]
].rename(columns={"timestamp_col": "datetime_col"})
pd_result = pd_df_a.join(
pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
)
print(pd_result)
pd.testing.assert_frame_equal(
bf_result.sort_index(),
pd_result.sort_index(),
check_like=True,
check_index_type=False,
)


@all_joins
Expand Down
30 changes: 27 additions & 3 deletions tests/unit/test_dataframe_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -2443,12 +2443,36 @@ def test_join_different_table(
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)


def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
@all_joins
def test_join_raise_when_param_on_duplicate_with_column(scalars_df_index, how):
if how == "cross":
return
bf_df_a = scalars_df_index[["string_col", "int64_col"]].rename(
columns={"int64_col": "string_col"}
)
bf_df_a.index.name = "string_col"
bf_df_b = scalars_df_index.dropna()["string_col"]
with pytest.raises(ValueError):
bf_df_a.join(bf_df_b, on="string_col", how=how, lsuffix="_l", rsuffix="_r")


def test_join_duplicate_columns_raises_value_error(scalars_dfs):
scalars_df, _ = scalars_dfs
df_a = scalars_df[["string_col", "float64_col"]]
df_b = scalars_df[["float64_col"]]
with pytest.raises(NotImplementedError):
df_a.join(df_b, how="outer").to_pandas()
with pytest.raises(ValueError):
df_a.join(df_b, how="outer")


@all_joins
def test_join_param_on_duplicate_with_index_raises_value_error(scalars_df_index, how):
if how == "cross":
return
bf_df_a = scalars_df_index[["string_col"]]
bf_df_a.index.name = "string_col"
bf_df_b = scalars_df_index.dropna()["string_col"]
with pytest.raises(ValueError):
bf_df_a.join(bf_df_b, on="string_col", how=how, lsuffix="_l", rsuffix="_r")


@all_joins
Expand Down
31 changes: 30 additions & 1 deletion third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4536,7 +4536,15 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
# ----------------------------------------------------------------------
# Merging / joining methods

def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
def join(
self,
other,
*,
on: Optional[str] = None,
how: str,
lsuffix: str = "",
rsuffix: str = "",
) -> DataFrame:
"""Join columns of another DataFrame.

Join columns with `other` DataFrame on index
Expand Down Expand Up @@ -4609,6 +4617,19 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
<BLANKLINE>
[2 rows x 4 columns]

If there are overlapping columns, `lsuffix` and `rsuffix` can be used:

>>> df1 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']})
>>> df2 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['B0', 'B1', 'B2']})
>>> df1.set_index('key').join(df2.set_index('key'), lsuffix='_left', rsuffix='_right')
A_left A_right
key
K0 A0 B0
K1 A1 B1
K2 A2 B2
<BLANKLINE>
[3 rows x 2 columns]

Args:
other:
DataFrame or Series with an Index similar to the Index of this one.
Expand All @@ -4625,6 +4646,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
index, preserving the order of the calling's one.
``cross``: creates the cartesian product from both frames, preserves
the order of the left keys.
lsuffix(str, default ''):
Suffix to use from left frame's overlapping columns.
rsuffix(str, default ''):
Suffix to use from right frame's overlapping columns.

Returns:
bigframes.pandas.DataFrame:
Expand All @@ -4639,6 +4664,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
ValueError:
If left index to join on does not have the same number of levels
as the right index.
ValueError:
If columns overlap but no suffix is specified.
ValueError:
If `on` column is not unique.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

Expand Down