Skip to content

Commit 8f27e73

Browse files
authored
docs: clarify that only NULL values are handled by fillna/isna, not NaN (#2176)
* docs: clarify that only NULL values are handled by fillna/isna, not NaN * fix series fillna doctest
1 parent 7ce0ac5 commit 8f27e73

File tree

6 files changed

+145
-80
lines changed

6 files changed

+145
-80
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,4 @@ system_tests/local_test_setup
6262
# Make sure a generated file isn't accidentally committed.
6363
pylintrc
6464
pylintrc.test
65+
dummy.pkl

conftest.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import numpy as np
18+
import pandas as pd
19+
import pyarrow as pa
20+
import pytest
21+
22+
import bigframes._config
23+
import bigframes.pandas as bpd
24+
25+
26+
@pytest.fixture(autouse=True)
27+
def default_doctest_imports(doctest_namespace):
28+
"""
29+
Avoid some boilerplate in pandas-inspired tests.
30+
31+
See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture
32+
"""
33+
doctest_namespace["np"] = np
34+
doctest_namespace["pd"] = pd
35+
doctest_namespace["pa"] = pa
36+
doctest_namespace["bpd"] = bpd
37+
bigframes._config.options.display.progress_bar = None

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6978,7 +6978,7 @@ def query(self, expr: str) -> DataFrame | None:
69786978

69796979
def interpolate(self, method: str = "linear"):
69806980
"""
6981-
Fill NaN values using an interpolation method.
6981+
Fill NA (NULL in BigQuery) values using an interpolation method.
69826982
69836983
**Examples:**
69846984
@@ -7028,35 +7028,39 @@ def interpolate(self, method: str = "linear"):
70287028

70297029
def fillna(self, value):
70307030
"""
7031-
Fill NA/NaN values using the specified method.
7031+
Fill NA (NULL in BigQuery) values using the specified method.
70327032
7033-
**Examples:**
7033+
Note that empty strings ``''``, :attr:`numpy.inf`, and
7034+
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
7035+
logic differs from numpy, but it is the same as BigQuery and the
7036+
:class:`pandas.ArrowDtype`.
70347037
7035-
>>> import bigframes.pandas as bpd
7036-
>>> bpd.options.display.progress_bar = None
7038+
**Examples:**
70377039
7038-
>>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
7039-
... [3, 4, np.nan, 1],
7040-
... [np.nan, np.nan, np.nan, np.nan],
7041-
... [np.nan, 3, np.nan, 4]],
7042-
... columns=list("ABCD")).astype("Float64")
7040+
>>> df = bpd.DataFrame(
7041+
... [
7042+
... pa.array([np.nan, 2, None, 0], type=pa.float64()),
7043+
... pa.array([3, np.nan, None, 1], type=pa.float64()),
7044+
... pa.array([None, None, np.nan, None], type=pa.float64()),
7045+
... pa.array([4, 5, None, np.nan], type=pa.float64()),
7046+
... ], columns=list("ABCD"), dtype=pd.ArrowDtype(pa.float64()))
70437047
>>> df
7044-
A B C D
7045-
0 <NA> 2.0 <NA> 0.0
7046-
1 3.0 4.0 <NA> 1.0
7047-
2 <NA> <NA> <NA> <NA>
7048-
3 <NA> 3.0 <NA> 4.0
7048+
A B C D
7049+
0 NaN 2.0 <NA> 0.0
7050+
1 3.0 NaN <NA> 1.0
7051+
2 <NA> <NA> NaN <NA>
7052+
3 4.0 5.0 <NA> NaN
70497053
<BLANKLINE>
70507054
[4 rows x 4 columns]
70517055
7052-
Replace all NA elements with 0s.
7056+
Replace all NA (NULL) elements with 0s.
70537057
70547058
>>> df.fillna(0)
70557059
A B C D
7056-
0 0.0 2.0 0.0 0.0
7057-
1 3.0 4.0 0.0 1.0
7058-
2 0.0 0.0 0.0 0.0
7059-
3 0.0 3.0 0.0 4.0
7060+
0 NaN 2.0 0.0 0.0
7061+
1 3.0 NaN 0.0 1.0
7062+
2 0.0 0.0 NaN 0.0
7063+
3 4.0 5.0 0.0 NaN
70607064
<BLANKLINE>
70617065
[4 rows x 4 columns]
70627066
@@ -7072,11 +7076,11 @@ def fillna(self, value):
70727076
<BLANKLINE>
70737077
[3 rows x 4 columns]
70747078
>>> df.fillna(df_fill)
7075-
A B C D
7076-
0 0.0 2.0 2.0 0.0
7077-
1 3.0 4.0 6.0 1.0
7078-
2 8.0 9.0 10.0 11.0
7079-
3 <NA> 3.0 <NA> 4.0
7079+
A B C D
7080+
0 NaN 2.0 2.0 0.0
7081+
1 3.0 NaN 6.0 1.0
7082+
2 8.0 9.0 NaN 11.0
7083+
3 4.0 5.0 <NA> NaN
70807084
<BLANKLINE>
70817085
[4 rows x 4 columns]
70827086

third_party/bigframes_vendored/pandas/core/generic.py

Lines changed: 50 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -816,75 +816,88 @@ def bfill(self, *, limit: Optional[int] = None):
816816
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
817817

818818
def isna(self) -> NDFrame:
819-
"""Detect missing values.
819+
"""Detect missing (NULL) values.
820820
821-
Return a boolean same-sized object indicating if the values are NA.
822-
NA values get mapped to True values. Everything else gets mapped to
823-
False values. Characters such as empty strings ``''`` or
824-
:attr:`numpy.inf` are not considered NA values.
821+
Return a boolean same-sized object indicating if the values are NA
822+
(NULL in BigQuery). NA/NULL values get mapped to True values.
823+
Everything else gets mapped to False values.
825824
826-
**Examples:**
825+
Note that empty strings ``''``, :attr:`numpy.inf`, and
826+
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
827+
logic differs from numpy, but it is the same as BigQuery and the
828+
:class:`pandas.ArrowDtype`.
827829
828-
>>> import bigframes.pandas as bpd
829-
>>> bpd.options.display.progress_bar = None
830-
>>> import numpy as np
830+
**Examples:**
831831
832832
>>> df = bpd.DataFrame(dict(
833-
... age=[5, 6, np.nan],
834-
... born=[bpd.NA, "1940-04-25", "1940-04-25"],
835-
... name=['Alfred', 'Batman', ''],
836-
... toy=[None, 'Batmobile', 'Joker'],
833+
... age=pd.Series(pa.array(
834+
... [5, 6, None, 4],
835+
... type=pa.int64(),
836+
... ), dtype=pd.ArrowDtype(pa.int64())),
837+
... born=pd.to_datetime([pd.NA, "1940-04-25", "1940-04-25", "1941-08-25"]),
838+
... name=['Alfred', 'Batman', '', 'Plastic Man'],
839+
... toy=[None, 'Batmobile', 'Joker', 'Play dough'],
840+
... height=pd.Series(pa.array(
841+
... [6.1, 5.9, None, np.nan],
842+
... type=pa.float64(),
843+
... ), dtype=pd.ArrowDtype(pa.float64())),
837844
... ))
838845
>>> df
839-
age born name toy
840-
0 5.0 <NA> Alfred <NA>
841-
1 6.0 1940-04-25 Batman Batmobile
842-
2 <NA> 1940-04-25 Joker
846+
age born name toy height
847+
0 5 <NA> Alfred <NA> 6.1
848+
1 6 1940-04-25 00:00:00 Batman Batmobile 5.9
849+
2 <NA> 1940-04-25 00:00:00 Joker <NA>
850+
3 4 1941-08-25 00:00:00 Plastic Man Play dough NaN
843851
<BLANKLINE>
844-
[3 rows x 4 columns]
852+
[4 rows x 5 columns]
845853
846-
Show which entries in a DataFrame are NA:
854+
Show which entries in a DataFrame are NA (NULL in BigQuery):
847855
848856
>>> df.isna()
849-
age born name toy
850-
0 False True False True
851-
1 False False False False
852-
2 True False False False
857+
age born name toy height
858+
0 False True False True False
859+
1 False False False False False
860+
2 True False False False True
861+
3 False False False False False
853862
<BLANKLINE>
854-
[3 rows x 4 columns]
863+
[4 rows x 5 columns]
855864
856865
>>> df.isnull()
857-
age born name toy
858-
0 False True False True
859-
1 False False False False
860-
2 True False False False
866+
age born name toy height
867+
0 False True False True False
868+
1 False False False False False
869+
2 True False False False True
870+
3 False False False False False
861871
<BLANKLINE>
862-
[3 rows x 4 columns]
872+
[4 rows x 5 columns]
863873
864-
Show which entries in a Series are NA:
874+
Show which entries in a Series are NA (NULL in BigQuery):
865875
866-
>>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
876+
>>> ser = bpd.Series(pa.array(
877+
... [5, None, 6, np.nan, None],
878+
... type=pa.float64(),
879+
... ), dtype=pd.ArrowDtype(pa.float64()))
867880
>>> ser
868-
0 5
881+
0 5.0
869882
1 <NA>
870-
2 6
871-
3 <NA>
883+
2 6.0
884+
3 NaN
872885
4 <NA>
873-
dtype: Int64
886+
dtype: Float64
874887
875888
>>> ser.isna()
876889
0 False
877890
1 True
878891
2 False
879-
3 True
892+
3 False
880893
4 True
881894
dtype: boolean
882895
883896
>>> ser.isnull()
884897
0 False
885898
1 True
886899
2 False
887-
3 True
900+
3 False
888901
4 True
889902
dtype: boolean
890903

third_party/bigframes_vendored/pandas/core/indexes/base.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -957,17 +957,23 @@ def value_counts(
957957

958958
def fillna(self, value) -> Index:
959959
"""
960-
Fill NA/NaN values with the specified value.
960+
Fill NA (NULL in BigQuery) values with the specified value.
961961
962-
**Examples:**
962+
Note that empty strings ``''``, :attr:`numpy.inf`, and
963+
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
964+
logic differs from numpy, but it is the same as BigQuery and the
965+
:class:`pandas.ArrowDtype`.
963966
964-
>>> import bigframes.pandas as bpd
965-
>>> import numpy as np
966-
>>> bpd.options.display.progress_bar = None
967+
**Examples:**
967968
968-
>>> idx = bpd.Index([np.nan, np.nan, 3])
969+
>>> idx = bpd.Index(
970+
... pa.array([None, np.nan, 3, None], type=pa.float64()),
971+
... dtype=pd.ArrowDtype(pa.float64()),
972+
... )
973+
>>> idx
974+
Index([<NA>, nan, 3.0, <NA>], dtype='Float64')
969975
>>> idx.fillna(0)
970-
Index([0.0, 0.0, 3.0], dtype='Float64')
976+
Index([0.0, nan, 3.0, 0.0], dtype='Float64')
971977
972978
Args:
973979
value (scalar):

third_party/bigframes_vendored/pandas/core/series.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2410,26 +2410,30 @@ def fillna(
24102410
value=None,
24112411
) -> Series | None:
24122412
"""
2413-
Fill NA/NaN values using the specified method.
2413+
Fill NA (NULL in BigQuery) values using the specified method.
24142414
2415-
**Examples:**
2415+
Note that empty strings ``''``, :attr:`numpy.inf`, and
2416+
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
2417+
logic differs from numpy, but it is the same as BigQuery and the
2418+
:class:`pandas.ArrowDtype`.
24162419
2417-
>>> import bigframes.pandas as bpd
2418-
>>> import numpy as np
2419-
>>> bpd.options.display.progress_bar = None
2420+
**Examples:**
24202421
2421-
>>> s = bpd.Series([np.nan, 2, np.nan, -1])
2422+
>>> s = bpd.Series(
2423+
... pa.array([np.nan, 2, None, -1], type=pa.float64()),
2424+
... dtype=pd.ArrowDtype(pa.float64()),
2425+
... )
24222426
>>> s
2423-
0 <NA>
2427+
0 NaN
24242428
1 2.0
24252429
2 <NA>
24262430
3 -1.0
24272431
dtype: Float64
24282432
2429-
Replace all NA elements with 0s.
2433+
Replace all NA (NULL) elements with 0s.
24302434
24312435
>>> s.fillna(0)
2432-
0 0.0
2436+
0 NaN
24332437
1 2.0
24342438
2 0.0
24352439
3 -1.0
@@ -2439,7 +2443,7 @@ def fillna(
24392443
24402444
>>> s_fill = bpd.Series([11, 22, 33])
24412445
>>> s.fillna(s_fill)
2442-
0 11.0
2446+
0 NaN
24432447
1 2.0
24442448
2 33.0
24452449
3 -1.0
@@ -4482,7 +4486,7 @@ def update(self, other) -> None:
44824486
2 6
44834487
dtype: Int64
44844488
4485-
If ``other`` contains NaNs the corresponding values are not updated
4489+
If ``other`` contains NA (NULL) values, the corresponding values are not updated
44864490
in the original Series.
44874491
44884492
>>> s = bpd.Series([1, 2, 3])

0 commit comments

Comments
 (0)