feat: Add ai_generate_bool to the bigframes.bigquery package (#2060)

sycai · tswast · web-flow · commit 70d6562df64b · 2025-09-15T11:58:44.000-07:00
* feat: Add ai_generate_bool to the bigframes.bigquery package

* fix stuffs

* Fix format

* fix doc format

* fix format

* fix code

* expose ai module and rename the function

* add ai module to doc

* fix test

* fix test

* Update bigframes/bigquery/_operations/ai.py

Co-authored-by: Tim Sweña (Swast) <swast@google.com>

---------

Co-authored-by: Tim Sweña (Swast) <swast@google.com>
diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
@@ -18,6 +18,7 @@
 
 import sys
 
+from bigframes.bigquery._operations import ai
 from bigframes.bigquery._operations.approx_agg import approx_top_count
 from bigframes.bigquery._operations.array import (
     array_agg,
@@ -98,7 +99,7 @@
     struct,
 ]
 
-__all__ = [f.__name__ for f in _functions]
+__all__ = [f.__name__ for f in _functions] + ["ai"]
 
 _module = sys.modules[__name__]
 for f in _functions:
diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
@@ -0,0 +1,171 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This module integrates BigQuery built-in AI functions for use with Series/DataFrame objects,
+such as AI.GENERATE_BOOL:
+https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-bool"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, List, Literal, Mapping, Tuple
+
+from bigframes import clients, dtypes, series
+from bigframes.core import log_adapter
+from bigframes.operations import ai_ops
+
+
+@log_adapter.method_logger(custom_base_name="bigquery_ai")
+def generate_bool(
+    prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
+    *,
+    connection_id: str | None = None,
+    endpoint: str | None = None,
+    request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified",
+    model_params: Mapping[Any, Any] | None = None,
+) -> series.Series:
+    """
+    Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+        >>> df = bpd.DataFrame({
+        ...     "col_1": ["apple", "bear", "pear"],
+        ...     "col_2": ["fruit", "animal", "animal"]
+        ... })
+        >>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"]))
+        0    {'result': True, 'full_response': '{"candidate...
+        1    {'result': True, 'full_response': '{"candidate...
+        2    {'result': False, 'full_response': '{"candidat...
+        dtype: struct<result: bool, full_response: string, status: string>[pyarrow]
+
+        >>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result")
+        0     True
+        1     True
+        2    False
+        Name: result, dtype: boolean
+
+        >>> model_params = {
+        ...     "generation_config": {
+        ...         "thinking_config": {
+        ...             "thinking_budget": 0
+        ...         }
+        ...     }
+        ... }
+        >>> bbq.ai.generate_bool(
+        ...     (df["col_1"], " is a ", df["col_2"]),
+        ...     endpoint="gemini-2.5-pro",
+        ...     model_params=model_params,
+        ... ).struct.field("result")
+        0     True
+        1     True
+        2    False
+        Name: result, dtype: boolean
+
+    Args:
+        prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series, ...]):
+            A mixture of Series and string literals that specifies the prompt to send to the model.
+        connection_id (str, optional):
+            Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`.
+            If not provided, the connection from the current session will be used.
+        endpoint (str, optional):
+            Specifies the Vertex AI endpoint to use for the model. For example `"gemini-2.5-flash"`. You can specify any
+            generally available or preview Gemini model. If you specify the model name, BigQuery ML automatically identifies and
+            uses the full endpoint of the model. If you don't specify an ENDPOINT value, BigQuery ML selects a recent stable
+            version of Gemini to use.
+        request_type (Literal["dedicated", "shared", "unspecified"]):
+            Specifies the type of inference request to send to the Gemini model. The request type determines what quota the request uses.
+            * "dedicated": the function only uses Provisioned Throughput quota. The function returns the error "Provisioned throughput is not
+            purchased or is not active" if Provisioned Throughput quota isn't available.
+            * "shared": the function only uses dynamic shared quota (DSQ), even if you have purchased Provisioned Throughput quota.
+            * "unspecified": If you haven't purchased Provisioned Throughput quota, the function uses DSQ quota.
+            If you have purchased Provisioned Throughput quota, the function uses the Provisioned Throughput quota first.
+            If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota.
+        model_params (Mapping[Any, Any]):
+            Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format.
+
+    Returns:
+        bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
+        * "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
+        * "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model.
+        The generated text is in the text element.
+        * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful.
+    """
+
+    prompt_context, series_list = _separate_context_and_series(prompt)
+    assert len(series_list) > 0
+
+    operator = ai_ops.AIGenerateBool(
+        prompt_context=tuple(prompt_context),
+        connection_id=_resolve_connection_id(series_list[0], connection_id),
+        endpoint=endpoint,
+        request_type=request_type,
+        model_params=json.dumps(model_params) if model_params else None,
+    )
+
+    return series_list[0]._apply_nary_op(operator, series_list[1:])
+
+
+def _separate_context_and_series(
+    prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
+) -> Tuple[List[str | None], List[series.Series]]:
+    """
+    Returns two values. The first value is the prompt with all series replaced by None. The second value is all the series
+    in the prompt. The original item order is kept.
+    For example:
+    Input: ("str1", series1, "str2", "str3", series2)
+    Output: ["str1", None, "str2", "str3", None], [series1, series2]
+    """
+    if not isinstance(prompt, (list, tuple, series.Series)):
+        raise ValueError(f"Unsupported prompt type: {type(prompt)}")
+
+    if isinstance(prompt, series.Series):
+        if prompt.dtype == dtypes.OBJ_REF_DTYPE:
+            # Multimodal support
+            return [None], [prompt.blob.read_url()]
+        return [None], [prompt]
+
+    prompt_context: List[str | None] = []
+    series_list: List[series.Series] = []
+
+    for item in prompt:
+        if isinstance(item, str):
+            prompt_context.append(item)
+
+        elif isinstance(item, series.Series):
+            prompt_context.append(None)
+
+            if item.dtype == dtypes.OBJ_REF_DTYPE:
+                # Multimodal support
+                item = item.blob.read_url()
+            series_list.append(item)
+
+        else:
+            raise TypeError(f"Unsupported type in prompt: {type(item)}")
+
+    if not series_list:
+        raise ValueError("Please provide at least one Series in the prompt")
+
+    return prompt_context, series_list
+
+
+def _resolve_connection_id(series: series.Series, connection_id: str | None):
+    return clients.get_canonical_bq_connection_id(
+        connection_id or series._session._bq_connection,
+        series._session._project,
+        series._session._location,
+    )
diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -17,8 +17,10 @@
 import functools
 import typing
 
+from bigframes_vendored import ibis
 import bigframes_vendored.ibis.expr.api as ibis_api
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
+import bigframes_vendored.ibis.expr.operations.ai_ops as ai_ops
 import bigframes_vendored.ibis.expr.operations.generic as ibis_generic
 import bigframes_vendored.ibis.expr.operations.udf as ibis_udf
 import bigframes_vendored.ibis.expr.types as ibis_types
@@ -1963,6 +1965,30 @@ def struct_op_impl(
     return ibis_types.struct(data)
 
 
+@scalar_op_compiler.register_nary_op(ops.AIGenerateBool, pass_op=True)
+def ai_generate_bool(
+    *values: ibis_types.Value, op: ops.AIGenerateBool
+) -> ibis_types.StructValue:
+
+    prompt: dict[str, ibis_types.Value | str] = {}
+    column_ref_idx = 0
+
+    for idx, elem in enumerate(op.prompt_context):
+        if elem is None:
+            prompt[f"_field_{idx + 1}"] = values[column_ref_idx]
+            column_ref_idx += 1
+        else:
+            prompt[f"_field_{idx + 1}"] = elem
+
+    return ai_ops.AIGenerateBool(
+        ibis.struct(prompt),  # type: ignore
+        op.connection_id,  # type: ignore
+        op.endpoint,  # type: ignore
+        op.request_type.upper(),  # type: ignore
+        op.model_params,  # type: ignore
+    ).to_expr()
+
+
 @scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True)
 def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value:
     return bigframes.core.compile.default_ordering.gen_row_key(values)
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+from bigframes.operations.ai_ops import AIGenerateBool
 from bigframes.operations.array_ops import (
     ArrayIndexOp,
     ArrayReduceOp,
@@ -408,6 +409,8 @@
     "geo_x_op",
     "geo_y_op",
     "GeoStDistanceOp",
+    # AI ops
+    "AIGenerateBool",
     # Numpy ops mapping
     "NUMPY_TO_BINOP",
     "NUMPY_TO_OP",
diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py
@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+from typing import ClassVar, Literal, Tuple
+
+import pandas as pd
+import pyarrow as pa
+
+from bigframes import dtypes
+from bigframes.operations import base_ops
+
+
+@dataclasses.dataclass(frozen=True)
+class AIGenerateBool(base_ops.NaryOp):
+    name: ClassVar[str] = "ai_generate_bool"
+
+    # None are the placeholders for column references.
+    prompt_context: Tuple[str | None, ...]
+    connection_id: str
+    endpoint: str | None
+    request_type: Literal["dedicated", "shared", "unspecified"]
+    model_params: str | None
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        return pd.ArrowDtype(
+            pa.struct(
+                (
+                    pa.field("result", pa.bool_()),
+                    pa.field("full_response", pa.string()),
+                    pa.field("status", pa.string()),
+                )
+            )
+        )
diff --git a/docs/reference/bigframes.bigquery/ai.rst b/docs/reference/bigframes.bigquery/ai.rst
@@ -0,0 +1,7 @@
+bigframes.bigquery.ai
+=============================
+
+.. automodule:: bigframes.bigquery._operations.ai
+    :members:
+    :inherited-members:
+    :undoc-members:
diff --git a/docs/reference/bigframes.bigquery/index.rst b/docs/reference/bigframes.bigquery/index.rst
@@ -5,5 +5,9 @@ BigQuery Built-in Functions
 
 .. automodule:: bigframes.bigquery
     :members:
-    :inherited-members:
     :undoc-members:
+
+.. toctree::
+    :maxdepth: 2
+
+    ai
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
@@ -218,6 +218,8 @@
   - items:
     - name: BigQuery built-in functions
       uid: bigframes.bigquery
+    - name: BigQuery AI Functions
+      uid: bigframes.bigquery.ai
     name: bigframes.bigquery
   - items:
     - name: GeoSeries
diff --git a/tests/system/large/bigquery/__init__.py b/tests/system/large/bigquery/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pandas.testing
+
+import bigframes.bigquery as bbq
+
+
+def test_ai_generate_bool_multi_model(session):
+    df = session.from_glob_path(
+        "gs://bigframes-dev-testing/a_multimodel/images/*", name="image"
+    )
+
+    result = bbq.ai.generate_bool((df["image"], " contains an animal")).struct.field(
+        "result"
+    )
+
+    pandas.testing.assert_series_equal(
+        result.to_pandas(),
+        pd.Series([True, True, False, False, False], name="result"),
+        check_dtype=False,
+        check_index=False,
+    )
diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py