Skip to content

Commit 5b0d0a0

Browse files
committed
test with cache and to_gbq
1 parent 79f4c58 commit 5b0d0a0

File tree

5 files changed

+65
-4
lines changed

5 files changed

+65
-4
lines changed

bigframes/core/array_value.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,11 @@ def from_table(
122122
# Scan all columns by default; we define this list because it can be pruned while preserving source_def.
123123
scan_list = nodes.ScanList(
124124
tuple(
125-
nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
125+
nodes.ScanItem(
126+
ids.ColumnId(bigframes.core.guid.generate_guid()),
127+
item.dtype,
128+
item.column,
129+
)
126130
for item in schema.items
127131
)
128132
)
@@ -143,7 +147,7 @@ def from_table(
143147
@property
144148
def column_ids(self) -> typing.Sequence[str]:
145149
"""Returns column ids as strings."""
146-
return self.schema.names
150+
return [id_.name for id_ in self.node.ids]
147151

148152
@property
149153
def session(self) -> Session:

bigframes/core/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
145145
elif identifier[0].isdigit():
146146
# first character must be letter or underscore
147147
identifier = "_" + identifier
148+
149+
# Except in special circumstances (true anonymous query results tables),
150+
# field names are not allowed to start with these (case-insensitive)
151+
# prefixes.
152+
# _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
153+
if any(
154+
identifier.casefold().startswith(invalid_prefix.casefold())
155+
for invalid_prefix in (
156+
"_PARTITION",
157+
"_TABLE_",
158+
"_FILE_",
159+
"_ROW_TIMESTAMP",
160+
"__ROOT__",
161+
"_COLIDENTIFIER",
162+
)
163+
):
164+
# Remove leading _ character(s) to avoid collisions with reserved
165+
# prefixes.
166+
identifier = re.sub("^_+", "", identifier)
167+
148168
return identifier
149169

150170

bigframes/session/loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -556,11 +556,11 @@ def read_gbq_table(
556556
index_cols = [sequential_index_col]
557557
index_names = [None]
558558

559-
value_columns = [col for col in array_value.column_ids if col not in index_cols]
559+
value_names = [col for col in schema.names if col not in index_names]
560560
block = blocks.Block(
561561
array_value,
562562
index_columns=index_cols,
563-
column_labels=value_columns,
563+
column_labels=value_names,
564564
index_labels=index_names,
565565
)
566566
if max_results:

tests/system/small/test_dataframe.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5051,6 +5051,23 @@ def test_df_cached(scalars_df_index):
50515051
pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas())
50525052

50535053

5054+
def test_df_cached_w_wildcard_table(session):
5055+
"""Test the `cache()` API with a DataFrame that contains pseudocolumns from wildcard tables.
5056+
5057+
Regression test for internal issue b/405773140.
5058+
"""
5059+
df = session.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
5060+
df = (
5061+
df[df["_TABLE_SUFFIX"] == "20161204"]
5062+
.groupby(
5063+
["visitorId", "visitNumber", "visitId", "_TABLE_SUFFIX"], as_index=False
5064+
)
5065+
.size()
5066+
)
5067+
df_cached_copy = df.cache()
5068+
pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas())
5069+
5070+
50545071
def test_assign_after_binop_row_joins():
50555072
pd_df = pd.DataFrame(
50565073
{

tests/system/small/test_dataframe_io.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,26 @@ def test_to_gbq_w_None_column_names(
656656
)
657657

658658

659+
def test_to_gbq_w_wildcard_table(session, dataset_id):
660+
"""Test the `to_gbq` API with a DataFrame that contains pseudocolumns from wildcard tables.
661+
662+
Regression test for internal issue b/405773140.
663+
"""
664+
destination_table = f"{dataset_id}.test_to_gbq_w_wildcard_table"
665+
df = session.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
666+
df = df[df["_TABLE_SUFFIX"] == "20161204"][
667+
["visitorId", "visitNumber", "visitId", "_TABLE_SUFFIX"]
668+
]
669+
df.to_gbq(destination_table, if_exists="replace")
670+
671+
bf_result = bpd.read_gbq(destination_table)
672+
pd.testing.assert_index_equal(
673+
bf_result.columns,
674+
# Remove leading _ to allow serialization.
675+
pd.Index(["visitorId", "visitNumber", "visitId", "TABLE_SUFFIX"]),
676+
)
677+
678+
659679
@pytest.mark.parametrize(
660680
"clustering_columns",
661681
[

0 commit comments

Comments
 (0)