googleapis · Genesis929 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025
@@ -3412,37 +3412,72 @@ def join(
         *,
         on: Optional[str] = None,
         how: str = "left",
+        lsuffix: str = "",
+        rsuffix: str = "",
     ) -> DataFrame:
         if isinstance(other, bigframes.series.Series):
             other = other.to_frame()
 
         left, right = self, other
 
-        if not left.columns.intersection(right.columns).empty:
-            raise NotImplementedError(
-                f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}"
-            )
+        col_intersection = left.columns.intersection(right.columns)
+
+        if not col_intersection.empty:
+            if lsuffix == rsuffix == "":
+                raise ValueError(
+                    f"columns overlap but no suffix specified: {col_intersection}"
+                )
+
         if how == "cross":
             if on is not None:
                 raise ValueError("'on' is not supported for cross join.")
             result_block = left._block.merge(
                 right._block,
                 left_join_ids=[],
                 right_join_ids=[],
-                suffixes=("", ""),
+                suffixes=(lsuffix, rsuffix),
                 how="cross",
                 sort=True,
             )
             return DataFrame(result_block)
 
         # Join left columns with right index
         if on is not None:
+            if left._has_index and (on in left.index.names):
+                if on in left.columns:
+                    raise ValueError(
+                        f"'{on}' is both an index level and a column label, which is ambiguous."
+                    )
+                else:
+                    raise NotImplementedError(
+                        f"Joining on index level '{on}' is not yet supported. {constants.FEEDBACK_LINK}"
+                    )
+            if (left.columns == on).sum() > 1:
+                raise ValueError(f"The column label '{on}' is not unique.")
+
             if other._block.index.nlevels != 1:
                 raise ValueError(
                     "Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported."
                 )
+
+            # Replace all column names with unique names for reordering.
+            left_col_original_names = left.columns
+            on_col_name = "bigframes_left_col_on"
+            dup_on_col_name = "bigframes_left_col_on_dup"
+            left_col_temp_names = [
+                f"bigframes_left_col_name_{i}" if col_name != on else on_col_name
+                for i, col_name in enumerate(left_col_original_names)
+            ]
+            left.columns = pandas.Index(left_col_temp_names)
+            # if on column is also in right df, we need to duplicate the column
+            # and set it to be the first column
+            if on in col_intersection:
+                left[dup_on_col_name] = left[on_col_name]
+                on_col_name = dup_on_col_name
+                left_col_temp_names = [on_col_name] + left_col_temp_names
+                left = left[left_col_temp_names]
+
             # Switch left index with on column
-            left_columns = left.columns
             left_idx_original_names = left.index.names if left._has_index else ()
             left_idx_names_in_cols = [
                 f"bigframes_left_idx_name_{i}"
@@ -3451,11 +3486,18 @@ def join(
             if left._has_index:
                 left.index.names = left_idx_names_in_cols
             left = left.reset_index(drop=False)
-            left = left.set_index(on)
+            left = left.set_index(on_col_name)
+
+            right_col_original_names = right.columns
+            right_col_temp_names = [
+                f"bigframes_right_col_name_{i}"
+                for i in range(len(right_col_original_names))
+            ]
+            right.columns = pandas.Index(right_col_temp_names)
 
             # Join on index and switch back
             combined_df = left._perform_join_by_index(right, how=how)
-            combined_df.index.name = on
+            combined_df.index.name = on_col_name
             combined_df = combined_df.reset_index(drop=False)
             combined_df = combined_df.set_index(left_idx_names_in_cols)
 
@@ -3468,14 +3510,22 @@ def join(
                 )
 
             # Reorder columns
-            combined_df = combined_df[list(left_columns) + list(right.columns)]
-            return combined_df
+            combined_df = combined_df[left_col_temp_names + right_col_temp_names]
+            return combined_df._add_join_suffix(
+                left_col_original_names,
+                right_col_original_names,
+                lsuffix=lsuffix,
+                rsuffix=rsuffix,
+                extra_col=on if on_col_name == dup_on_col_name else None,
+            )
 
         # Join left index with right index
         if left._block.index.nlevels != right._block.index.nlevels:
             raise ValueError("Index to join on must have the same number of levels.")
 
-        return left._perform_join_by_index(right, how=how)
+        return left._perform_join_by_index(right, how=how)._add_join_suffix(
+            left.columns, right.columns, lsuffix=lsuffix, rsuffix=rsuffix
+        )
 
     def _perform_join_by_index(
         self,
@@ -3489,6 +3539,30 @@ def _perform_join_by_index(
         )
         return DataFrame(block)
 
+    def _add_join_suffix(
+        self,
+        left_columns,
+        right_columns,
+        lsuffix: str = "",
+        rsuffix: str = "",
+        extra_col: typing.Optional[str] = None,
+    ):
+        """Relabel this frame's columns after a join, suffixing overlapping names.
+
+        Labels present in both ``left_columns`` and ``right_columns`` get
+        ``lsuffix`` (left side) or ``rsuffix`` (right side) appended; every
+        other label is kept unchanged. The resulting labels are assigned to
+        this frame in the order: ``extra_col`` (when given), left labels,
+        right labels.
+
+        Args:
+            left_columns: Labels of the left frame, in order. Must provide
+                ``intersection`` (e.g. a pandas Index).
+            right_columns: Labels of the right frame, in order.
+            lsuffix: Suffix appended to overlapping labels from the left.
+            rsuffix: Suffix appended to overlapping labels from the right.
+            extra_col: Optional label placed ahead of all other labels.
+
+        Returns:
+            This same object, after its ``columns`` have been reassigned.
+        """
+        overlap = left_columns.intersection(right_columns)
+
+        def _relabel(labels, suffix):
+            # Only labels shared by both sides receive the suffix.
+            return [
+                f"{label}{suffix}" if label in overlap else label
+                for label in labels
+            ]
+
+        renamed = [extra_col] if extra_col is not None else []
+        renamed.extend(_relabel(left_columns, lsuffix))
+        renamed.extend(_relabel(right_columns, rsuffix))
+        self.columns = pandas.Index(renamed)
+        return self
+
     @validations.requires_ordering()
     def rolling(
         self,

@@ -2816,12 +2816,99 @@ def test_join_different_table(
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
-def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
-    scalars_df, _ = scalars_dfs
-    df_a = scalars_df[["string_col", "float64_col"]]
-    df_b = scalars_df[["float64_col"]]
-    with pytest.raises(NotImplementedError):
-        df_a.join(df_b, how="outer").to_pandas()
+@all_joins
+def test_join_different_table_with_duplicate_column_name(
+    scalars_df_index, scalars_pandas_df_index, how
+):
+    """Index join with overlapping column names matches pandas when suffixes are given."""
+    selected = ["string_col", "int64_col", "int64_too"]
+    # Renaming int64_too yields a second "int64_col" label on each side, so
+    # both frames share every column name with the other.
+    relabel = {"int64_too": "int64_col"}
+
+    bf_df_a = scalars_df_index[selected].rename(columns=relabel)
+    bf_df_b = scalars_df_index.dropna()[selected].rename(columns=relabel)
+    bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas()
+
+    pd_df_a = scalars_pandas_df_index[selected].rename(columns=relabel)
+    pd_df_b = scalars_pandas_df_index.dropna()[selected].rename(columns=relabel)
+    pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r")
+
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
+@all_joins
+def test_join_param_on_with_duplicate_column_name_not_on_col(
+    scalars_df_index, scalars_pandas_df_index, how
+):
+    """Join with ``on`` when other columns overlap but the ``on`` column does not."""
+    # Here only the left frame carries the 'on' column (int64_too).
+    if how == "cross":
+        return
+    left_cols = ["string_col", "datetime_col", "timestamp_col", "int64_too"]
+    right_cols = ["string_col", "datetime_col", "timestamp_col"]
+    relabel = {"timestamp_col": "datetime_col"}
+    join_kwargs = dict(on="int64_too", how=how, lsuffix="_l", rsuffix="_r")
+
+    bf_left = scalars_df_index[left_cols].rename(columns=relabel)
+    bf_right = scalars_df_index.dropna()[right_cols].rename(columns=relabel)
+    bf_result = bf_left.join(bf_right, **join_kwargs).to_pandas()
+
+    pd_left = scalars_pandas_df_index[left_cols].rename(columns=relabel)
+    pd_right = scalars_pandas_df_index.dropna()[right_cols].rename(columns=relabel)
+    pd_result = pd_left.join(pd_right, **join_kwargs)
+
+    # Row and column order are not compared: both sides are sorted by index
+    # and check_like ignores ordering of labels.
+    pd.testing.assert_frame_equal(
+        bf_result.sort_index(),
+        pd_result.sort_index(),
+        check_like=True,
+        check_index_type=False,
+    )
+
+
+@pytest.mark.skipif(
+    pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x"
+)
+@all_joins
+def test_join_param_on_with_duplicate_column_name_on_col(
+    scalars_df_index, scalars_pandas_df_index, how
+):
+    """Join with ``on`` when the ``on`` column itself also exists in the right frame."""
+    # Both frames carry the 'on' column (int64_too) in addition to the other
+    # overlapping labels.
+    if how == "cross":
+        return
+    selected = ["string_col", "datetime_col", "timestamp_col", "int64_too"]
+    relabel = {"timestamp_col": "datetime_col"}
+
+    bf_df_a = scalars_df_index[selected].rename(columns=relabel)
+    bf_df_b = scalars_df_index.dropna()[selected].rename(columns=relabel)
+    bf_result = bf_df_a.join(
+        bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
+    ).to_pandas()
+
+    pd_df_a = scalars_pandas_df_index[selected].rename(columns=relabel)
+    pd_df_b = scalars_pandas_df_index.dropna()[selected].rename(columns=relabel)
+    pd_result = pd_df_a.join(
+        pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
+    )
+
+    # Row and column order are not compared: both sides are sorted by index
+    # and check_like ignores ordering of labels.
+    pd.testing.assert_frame_equal(
+        bf_result.sort_index(),
+        pd_result.sort_index(),
+        check_like=True,
+        check_index_type=False,
+    )
 
 
 @all_joins

@@ -2443,12 +2443,36 @@ def test_join_different_table(
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
-def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
+@all_joins
+def test_join_raise_when_param_on_duplicate_with_column(scalars_df_index, how):
+    """``on`` naming both the index and a repeated column label raises ``ValueError``."""
+    if how == "cross":
+        return
+    left = scalars_df_index[["string_col", "int64_col"]]
+    # After the rename the left frame has two columns labelled "string_col".
+    left = left.rename(columns={"int64_col": "string_col"})
+    # Make the index name collide with the label passed as ``on``.
+    left.index.name = "string_col"
+    right = scalars_df_index.dropna()["string_col"]
+    join_kwargs = dict(on="string_col", how=how, lsuffix="_l", rsuffix="_r")
+    with pytest.raises(ValueError):
+        left.join(right, **join_kwargs)
+
+
+def test_join_duplicate_columns_raises_value_error(scalars_dfs):
+    # Both frames contain "float64_col" and no lsuffix/rsuffix is passed, so
+    # the join call itself is expected to raise ValueError.
     scalars_df, _ = scalars_dfs
     df_a = scalars_df[["string_col", "float64_col"]]
     df_b = scalars_df[["float64_col"]]
-    with pytest.raises(NotImplementedError):
-        df_a.join(df_b, how="outer").to_pandas()
+    with pytest.raises(ValueError):
+        df_a.join(df_b, how="outer")
+
+
+@all_joins
+def test_join_param_on_duplicate_with_index_raises_value_error(scalars_df_index, how):
+    """``on`` that is both the index name and a column label raises ``ValueError``."""
+    if how == "cross":
+        return
+    left = scalars_df_index[["string_col"]]
+    # Give the index the same name as the only column, which is also ``on``.
+    left.index.name = "string_col"
+    right = scalars_df_index.dropna()["string_col"]
+    join_kwargs = dict(on="string_col", how=how, lsuffix="_l", rsuffix="_r")
+    with pytest.raises(ValueError):
+        left.join(right, **join_kwargs)
 
 
 @all_joins

@@ -4536,7 +4536,15 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
     # ----------------------------------------------------------------------
     # Merging / joining methods
 
-    def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
+    def join(
+        self,
+        other,
+        *,
+        on: Optional[str] = None,
+        how: str,
+        lsuffix: str = "",
+        rsuffix: str = "",
+    ) -> DataFrame:
         """Join columns of another DataFrame.
 
         Join columns with `other` DataFrame on index
@@ -4609,6 +4617,19 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
             <BLANKLINE>
             [2 rows x 4 columns]
 
+        If there are overlapping columns, `lsuffix` and `rsuffix` can be used:
+
+            >>> df1 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']})
+            >>> df2 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['B0', 'B1', 'B2']})
+            >>> df1.set_index('key').join(df2.set_index('key'), lsuffix='_left', rsuffix='_right')
+                 A_left A_right
+            key
+            K0       A0      B0
+            K1       A1      B1
+            K2       A2      B2
+            <BLANKLINE>
+            [3 rows x 2 columns]
+
         Args:
             other:
                 DataFrame or Series with an Index similar to the Index of this one.
@@ -4625,6 +4646,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
                 index, preserving the order of the calling's one.
                 ``cross``: creates the cartesian product from both frames, preserves
                 the order of the left keys.
+            lsuffix(str, default ''):
+                Suffix to use from left frame's overlapping columns.
+            rsuffix(str, default ''):
+                Suffix to use from right frame's overlapping columns.
 
         Returns:
             bigframes.pandas.DataFrame:
@@ -4639,6 +4664,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
             ValueError:
                 If left index to join on does not have the same number of levels
                 as the right index.
+            ValueError:
+                If columns overlap but no suffix is specified.
+            ValueError:
+                If `on` column is not unique.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)