Skip to content

fix: detect duplicate column/index names in read_gbq before sending the query. #1615

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,31 @@ def _to_index_cols(
return index_cols


def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
index_cols_list = list(index_cols) if index_cols is not None else []
columns_list = list(columns) if columns is not None else []
set_index = set(index_cols_list)
set_columns = set(columns_list)

if len(index_cols_list) > len(set_index):
raise ValueError(
"The 'index_col' argument contains duplicate names. "
"All column names specified in 'index_col' must be unique."
)

if len(columns_list) > len(set_columns):
raise ValueError(
"The 'columns' argument contains duplicate names. "
"All column names specified in 'columns' must be unique."
)

if not set_index.isdisjoint(set_columns):
raise ValueError(
"Found column names that exist in both 'index_col' and 'columns' arguments. "
"These arguments must specify distinct sets of columns."
)


@dataclasses.dataclass
class GbqDataLoader:
"""
Expand Down Expand Up @@ -328,6 +353,7 @@ def read_gbq_table(
table=table,
index_col=index_col,
)
_check_column_duplicates(index_cols, columns)

for key in index_cols:
if key not in table_column_names:
Expand Down Expand Up @@ -569,6 +595,7 @@ def read_gbq_query(
)

index_cols = _to_index_cols(index_col)
_check_column_duplicates(index_cols, columns)

filters_copy1, filters_copy2 = itertools.tee(filters)
has_filters = len(list(filters_copy1)) != 0
Expand Down
53 changes: 53 additions & 0 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,3 +1627,56 @@ def test_read_gbq_test(test_session: bigframes.Session):
actual = test_session.read_gbq(table_id).to_pandas()

assert actual.shape == (1, 1)


# Each case is marked strict xfail with raises=ValueError, so the test passes
# only when read_gbq rejects the duplicate names (rather than sending a query).
@pytest.mark.parametrize(
    ("query_or_table", "index_col", "columns"),
    [
        # Duplicate name within 'index_col' itself (table-name input).
        pytest.param(
            "{scalars_table_id}",
            ("int64_col", "string_col", "int64_col"),
            ("float64_col", "bool_col"),
            id="table_input_index_col_dup",
            marks=pytest.mark.xfail(
                raises=ValueError,
                reason="ValueError: Duplicate names within 'index_col'.",
                strict=True,
            ),
        ),
        # Duplicate name within 'columns' itself (SQL-query input).
        pytest.param(
            """SELECT int64_col, string_col, float64_col, bool_col
                FROM `{scalars_table_id}`""",
            ("int64_col",),
            ("string_col", "float64_col", "string_col"),
            id="query_input_columns_dup",
            marks=pytest.mark.xfail(
                raises=ValueError,
                reason="ValueError: Duplicate names within 'columns'.",
                strict=True,
            ),
        ),
        # Same name requested in both 'index_col' and 'columns' (table input).
        pytest.param(
            "{scalars_table_id}",
            ("int64_col", "string_col"),
            ("float64_col", "string_col", "bool_col"),
            id="table_input_cross_dup",
            marks=pytest.mark.xfail(
                raises=ValueError,
                reason="ValueError: Overlap between 'index_col' and 'columns'.",
                strict=True,
            ),
        ),
    ],
)
def test_read_gbq_duplicate_columns_xfail(
    session: bigframes.Session,
    scalars_table_id: str,
    query_or_table: str,
    index_col: tuple,
    columns: tuple,
):
    """read_gbq must raise ValueError for duplicate or overlapping column names.

    The call itself is expected to fail; the strict xfail marks above turn the
    expected ValueError into a pass and an unexpected success into a failure.
    """
    session.read_gbq(
        # Substitute the concrete table id into the table-name / SQL template.
        query_or_table.format(scalars_table_id=scalars_table_id),
        index_col=index_col,
        columns=columns,
    )