test with cache and to_gbq

tswast · tswast · commit 5b0d0a0a78eb · 2025-04-28T21:21:07.000-05:00
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py
@@ -122,7 +122,11 @@ def from_table(
         # Scan all columns by default, we define this list as it can be pruned while preserving source_def
         scan_list = nodes.ScanList(
             tuple(
-                nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
+                nodes.ScanItem(
+                    ids.ColumnId(bigframes.core.guid.generate_guid()),
+                    item.dtype,
+                    item.column,
+                )
                 for item in schema.items
             )
         )
@@ -143,7 +147,7 @@ def from_table(
     @property
     def column_ids(self) -> typing.Sequence[str]:
         """Returns column ids as strings."""
-        return self.schema.names
+        return [id_.name for id_ in self.node.ids]
 
     @property
     def session(self) -> Session:
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
@@ -145,6 +145,26 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
         elif identifier[0].isdigit():
             # first character must be letter or underscore
             identifier = "_" + identifier
+
+    # Except in special circumstances (true anonymous query results tables),
+    # field names are not allowed to start with these (case-insensitive)
+    # prefixes.
+    # _PARTITION, _TABLE_, _FILE_, _ROW_TIMESTAMP, __ROOT__ and _COLIDENTIFIER
+    if any(
+        identifier.casefold().startswith(invalid_prefix.casefold())
+        for invalid_prefix in (
+            "_PARTITION",
+            "_TABLE_",
+            "_FILE_",
+            "_ROW_TIMESTAMP",
+            "__ROOT__",
+            "_COLIDENTIFIER",
+        )
+    ):
+        # Remove leading _ character(s) to avoid collisions with preserved
+        # prefixes.
+        identifier = re.sub("^_+", "", identifier)
+
     return identifier
 
 
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
@@ -556,11 +556,11 @@ def read_gbq_table(
             index_cols = [sequential_index_col]
             index_names = [None]
 
-        value_columns = [col for col in array_value.column_ids if col not in index_cols]
+        value_names = [col for col in schema.names if col not in index_names]
         block = blocks.Block(
             array_value,
             index_columns=index_cols,
-            column_labels=value_columns,
+            column_labels=value_names,
             index_labels=index_names,
         )
         if max_results:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -5051,6 +5051,23 @@ def test_df_cached(scalars_df_index):
     pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas())
 
 
+def test_df_cached_w_wildcard_table(session):
+    """Test the `cache()` API with a DataFrame that contains pseudocolumns from wildcard tables
+
+    Regression test for internal issue b/405773140.
+    """
+    df = session.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
+    df = (
+        df[df["_TABLE_SUFFIX"] == "20161204"]
+        .groupby(
+            ["visitorId", "visitNumber", "visitId", "_TABLE_SUFFIX"], as_index=False
+        )
+        .size()
+    )
+    df_cached_copy = df.cache()
+    pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas())
+
+
 def test_assign_after_binop_row_joins():
     pd_df = pd.DataFrame(
         {
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
@@ -656,6 +656,26 @@ def test_to_gbq_w_None_column_names(
     )
 
 
+def test_to_gbq_w_wildcard_table(session, dataset_id):
+    """Test the `to_gbq` API with a DataFrame that contains pseudocolumns from wildcard tables
+
+    Regression test for internal issue b/405773140.
+    """
+    destination_table = f"{dataset_id}.test_to_gbq_w_wildcard_table"
+    df = session.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
+    df = df[df["_TABLE_SUFFIX"] == "20161204"][
+        ["visitorId", "visitNumber", "visitId", "_TABLE_SUFFIX"]
+    ]
+    df.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table)
+    pd.testing.assert_index_equal(
+        bf_result.columns,
+        # Remove leading _ to allow serialization.
+        pd.Index(["visitorId", "visitNumber", "visitId", "TABLE_SUFFIX"]),
+    )
+
+
 @pytest.mark.parametrize(
     "clustering_columns",
     [