docs: clarify that only NULL values are handled by fillna/isna, not NaN (#2176)

tswast · web-flow · commit 8f27e737fc78 · 2025-10-16T15:04:00.000-05:00
* docs: clarify that only NULL values are handled by fillna/isna, not NaN

* fix series fillna doctest
diff --git a/.gitignore b/.gitignore
@@ -62,3 +62,4 @@ system_tests/local_test_setup
 # Make sure a generated file isn't accidentally committed.
 pylintrc
 pylintrc.test
+dummy.pkl
diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,37 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+
+import bigframes._config
+import bigframes.pandas as bpd
+
+
+@pytest.fixture(autouse=True)
+def default_doctest_imports(doctest_namespace):
+    """
+    Add np, pd, pa and bpd to doctests and set progress_bar to None.
+
+    See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture
+    """
+    doctest_namespace["np"] = np
+    doctest_namespace["pd"] = pd
+    doctest_namespace["pa"] = pa
+    doctest_namespace["bpd"] = bpd
+    bigframes._config.options.display.progress_bar = None
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -6978,7 +6978,7 @@ def query(self, expr: str) -> DataFrame | None:
 
     def interpolate(self, method: str = "linear"):
         """
-        Fill NaN values using an interpolation method.
+        Fill NA (NULL in BigQuery) values using an interpolation method.
 
         **Examples:**
 
@@ -7028,35 +7028,39 @@ def interpolate(self, method: str = "linear"):
 
     def fillna(self, value):
         """
-        Fill NA/NaN values using the specified method.
+        Fill NA (NULL in BigQuery) values using the specified method.
 
-        **Examples:**
+        Note that empty strings ``''``, :attr:`numpy.inf`, and
+        :attr:`numpy.nan` are **not** considered NA values. This NA/NULL
+        logic differs from numpy, but it is the same as BigQuery and the
+        :class:`pandas.ArrowDtype`.
 
-            >>> import bigframes.pandas as bpd
-            >>> bpd.options.display.progress_bar = None
+        **Examples:**
 
-            >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
-            ...                     [3, 4, np.nan, 1],
-            ...                     [np.nan, np.nan, np.nan, np.nan],
-            ...                     [np.nan, 3, np.nan, 4]],
-            ...                    columns=list("ABCD")).astype("Float64")
+            >>> df = bpd.DataFrame(
+            ...     [
+            ...         pa.array([np.nan, 2, None, 0], type=pa.float64()),
+            ...         pa.array([3, np.nan, None, 1], type=pa.float64()),
+            ...         pa.array([None, None, np.nan, None], type=pa.float64()),
+            ...         pa.array([4, 5, None, np.nan], type=pa.float64()),
+            ...     ], columns=list("ABCD"), dtype=pd.ArrowDtype(pa.float64()))
             >>> df
-                A     B     C     D
-            0  <NA>   2.0  <NA>   0.0
-            1   3.0   4.0  <NA>   1.0
-            2  <NA>  <NA>  <NA>  <NA>
-            3  <NA>   3.0  <NA>   4.0
+                  A     B     C     D
+            0   NaN   2.0  <NA>   0.0
+            1   3.0   NaN  <NA>   1.0
+            2  <NA>  <NA>   NaN  <NA>
+            3   4.0   5.0  <NA>   NaN
             <BLANKLINE>
             [4 rows x 4 columns]
 
-        Replace all NA elements with 0s.
+        Replace all NA (NULL) elements with 0s.
 
             >>> df.fillna(0)
                  A    B    C    D
-            0  0.0  2.0  0.0  0.0
-            1  3.0  4.0  0.0  1.0
-            2  0.0  0.0  0.0  0.0
-            3  0.0  3.0  0.0  4.0
+            0  NaN  2.0  0.0  0.0
+            1  3.0  NaN  0.0  1.0
+            2  0.0  0.0  NaN  0.0
+            3  4.0  5.0  0.0  NaN
             <BLANKLINE>
             [4 rows x 4 columns]
 
@@ -7072,11 +7076,11 @@ def fillna(self, value):
             <BLANKLINE>
             [3 rows x 4 columns]
             >>> df.fillna(df_fill)
-                A    B     C     D
-            0   0.0  2.0   2.0   0.0
-            1   3.0  4.0   6.0   1.0
-            2   8.0  9.0  10.0  11.0
-            3  <NA>  3.0  <NA>   4.0
+                 A    B     C     D
+            0  NaN  2.0   2.0   0.0
+            1  3.0  NaN   6.0   1.0
+            2  8.0  9.0   NaN  11.0
+            3  4.0  5.0  <NA>   NaN
             <BLANKLINE>
             [4 rows x 4 columns]
 
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -816,75 +816,88 @@ def bfill(self, *, limit: Optional[int] = None):
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def isna(self) -> NDFrame:
-        """Detect missing values.
+        """Detect missing (NULL) values.
 
-        Return a boolean same-sized object indicating if the values are NA.
-        NA values get mapped to True values. Everything else gets mapped to
-        False values. Characters such as empty strings ``''`` or
-        :attr:`numpy.inf` are not considered NA values.
+        Return a boolean same-sized object indicating if the values are NA
+        (NULL in BigQuery). NA/NULL values get mapped to True values.
+        Everything else gets mapped to False values.
 
-        **Examples:**
+        Note that empty strings ``''``, :attr:`numpy.inf`, and
+        :attr:`numpy.nan` are **not** considered NA values. This NA/NULL
+        logic differs from numpy, but it is the same as BigQuery and the
+        :class:`pandas.ArrowDtype`.
 
-            >>> import bigframes.pandas as bpd
-            >>> bpd.options.display.progress_bar = None
-            >>> import numpy as np
+        **Examples:**
 
             >>> df = bpd.DataFrame(dict(
-            ...         age=[5, 6, np.nan],
-            ...         born=[bpd.NA, "1940-04-25", "1940-04-25"],
-            ...         name=['Alfred', 'Batman', ''],
-            ...         toy=[None, 'Batmobile', 'Joker'],
+            ...         age=pd.Series(pa.array(
+            ...             [5, 6, None, 4],
+            ...             type=pa.int64(),
+            ...         ), dtype=pd.ArrowDtype(pa.int64())),
+            ...         born=pd.to_datetime([pd.NA, "1940-04-25", "1940-04-25", "1941-08-25"]),
+            ...         name=['Alfred', 'Batman', '', 'Plastic Man'],
+            ...         toy=[None, 'Batmobile', 'Joker', 'Play dough'],
+            ...         height=pd.Series(pa.array(
+            ...             [6.1, 5.9, None, np.nan],
+            ...             type=pa.float64(),
+            ...         ), dtype=pd.ArrowDtype(pa.float64())),
             ... ))
             >>> df
-                age        born    name        toy
-            0   5.0        <NA>  Alfred       <NA>
-            1   6.0  1940-04-25  Batman  Batmobile
-            2  <NA>  1940-04-25              Joker
+                age                 born         name         toy  height
+            0     5                 <NA>       Alfred        <NA>     6.1
+            1     6  1940-04-25 00:00:00       Batman   Batmobile     5.9
+            2  <NA>  1940-04-25 00:00:00                    Joker    <NA>
+            3     4  1941-08-25 00:00:00  Plastic Man  Play dough     NaN
             <BLANKLINE>
-            [3 rows x 4 columns]
+            [4 rows x 5 columns]
 
-        Show which entries in a DataFrame are NA:
+        Show which entries in a DataFrame are NA (NULL in BigQuery):
 
             >>> df.isna()
-                age   born   name    toy
-            0  False   True  False   True
-            1  False  False  False  False
-            2   True  False  False  False
+                 age   born   name    toy  height
+            0  False   True  False   True   False
+            1  False  False  False  False   False
+            2   True  False  False  False    True
+            3  False  False  False  False   False
             <BLANKLINE>
-            [3 rows x 4 columns]
+            [4 rows x 5 columns]
 
             >>> df.isnull()
-                age   born   name    toy
-            0  False   True  False   True
-            1  False  False  False  False
-            2   True  False  False  False
+                 age   born   name    toy  height
+            0  False   True  False   True   False
+            1  False  False  False  False   False
+            2   True  False  False  False    True
+            3  False  False  False  False   False
             <BLANKLINE>
-            [3 rows x 4 columns]
+            [4 rows x 5 columns]
 
-        Show which entries in a Series are NA:
+        Show which entries in a Series are NA (NULL in BigQuery):
 
-            >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
+            >>> ser = bpd.Series(pa.array(
+            ...     [5, None, 6, np.nan, None],
+            ...     type=pa.float64(),
+            ... ), dtype=pd.ArrowDtype(pa.float64()))
             >>> ser
-            0       5
+            0     5.0
             1    <NA>
-            2       6
-            3    <NA>
+            2     6.0
+            3     NaN
             4    <NA>
-            dtype: Int64
+            dtype: Float64
 
             >>> ser.isna()
             0    False
             1     True
             2    False
-            3     True
+            3    False
             4     True
             dtype: boolean
 
             >>> ser.isnull()
             0    False
             1     True
             2    False
-            3     True
+            3    False
             4     True
             dtype: boolean
 
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -957,17 +957,23 @@ def value_counts(
 
     def fillna(self, value) -> Index:
         """
-        Fill NA/NaN values with the specified value.
+        Fill NA (NULL in BigQuery) values with the specified value.
 
-        **Examples:**
+        Note that empty strings ``''``, :attr:`numpy.inf`, and
+        :attr:`numpy.nan` are **not** considered NA values. This NA/NULL
+        logic differs from numpy, but it is the same as BigQuery and the
+        :class:`pandas.ArrowDtype`.
 
-            >>> import bigframes.pandas as bpd
-            >>> import numpy as np
-            >>> bpd.options.display.progress_bar = None
+        **Examples:**
 
-            >>> idx = bpd.Index([np.nan, np.nan, 3])
+            >>> idx = bpd.Index(
+            ...     pa.array([None, np.nan, 3, None], type=pa.float64()),
+            ...     dtype=pd.ArrowDtype(pa.float64()),
+            ... )
+            >>> idx
+            Index([<NA>, nan, 3.0, <NA>], dtype='Float64')
             >>> idx.fillna(0)
-            Index([0.0, 0.0, 3.0], dtype='Float64')
+            Index([0.0, nan, 3.0, 0.0], dtype='Float64')
 
         Args:
             value (scalar):
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
@@ -2410,26 +2410,30 @@ def fillna(
         value=None,
     ) -> Series | None:
         """
-        Fill NA/NaN values using the specified method.
+        Fill NA (NULL in BigQuery) values using the specified method.
 
-        **Examples:**
+        Note that empty strings ``''``, :attr:`numpy.inf`, and
+        :attr:`numpy.nan` are **not** considered NA values. This NA/NULL
+        logic differs from numpy, but it is the same as BigQuery and the
+        :class:`pandas.ArrowDtype`.
 
-            >>> import bigframes.pandas as bpd
-            >>> import numpy as np
-            >>> bpd.options.display.progress_bar = None
+        **Examples:**
 
-            >>> s = bpd.Series([np.nan, 2, np.nan, -1])
+            >>> s = bpd.Series(
+            ...     pa.array([np.nan, 2, None, -1], type=pa.float64()),
+            ...     dtype=pd.ArrowDtype(pa.float64()),
+            ... )
             >>> s
-            0    <NA>
+            0     NaN
             1     2.0
             2    <NA>
             3    -1.0
             dtype: Float64
 
-        Replace all NA elements with 0s.
+        Replace all NA (NULL) elements with 0s.
 
             >>> s.fillna(0)
-            0    0.0
+            0    NaN
             1    2.0
             2    0.0
             3   -1.0
@@ -2439,7 +2443,7 @@ def fillna(
 
             >>> s_fill = bpd.Series([11, 22, 33])
             >>> s.fillna(s_fill)
-            0    11.0
+            0     NaN
             1     2.0
             2    33.0
             3    -1.0
@@ -4482,7 +4486,7 @@ def update(self, other) -> None:
             2    6
             dtype: Int64
 
-            If ``other`` contains NaNs the corresponding values are not updated
+            If ``other`` contains NA (NULL) values, the corresponding values are not updated
             in the original Series.
 
             >>> s = bpd.Series([1, 2, 3])