pandas-dev · mroeschke · May 24, 2023 · Mar 20, 2023 · Mar 20, 2023 · Mar 20, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -265,8 +265,10 @@ Deprecations
 - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
 - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
 - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
+- Deprecated behavior of :func:`assert_series_equal` and :func:`assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.performance:

diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
@@ -1,19 +1,25 @@
 import cmath
 import math
+import warnings
 
 import numpy as np
 
 from numpy cimport import_array
 
 import_array()
 
-from pandas._libs.missing cimport checknull
+from pandas._libs.missing cimport (
+    checknull,
+    is_matching_na,
+)
 from pandas._libs.util cimport (
     is_array,
     is_complex_object,
     is_real_number_object,
 )
 
+from pandas.util._exceptions import find_stack_level
+
 from pandas.core.dtypes.missing import array_equivalent
 
 
@@ -176,13 +182,23 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if checknull(a) and checknull(b):
-        # TODO: Should require same-dtype NA?
+    if checknull(a):
         # nan / None comparison
-        return True
-
-    if (checknull(a) and not checknull(b)) or (not checknull(a) and checknull(b)):
-        # boolean value of pd.NA is ambiguous
+        if is_matching_na(a, b, nan_matches_none=False):
+            return True
+        elif checknull(b):
+            # GH#18463
+            warnings.warn(
+                f"Mismatched null-like values {a} and {b} found. In a future "
+                "version, pandas equality-testing functions "
+                "(e.g. assert_frame_equal) will consider these not-matching "
+                "and raise.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+            return True
+        raise AssertionError(f"{a} != {b}")
+    elif checknull(b):
         raise AssertionError(f"{a} != {b}")
 
     if a == b:

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -9,6 +9,7 @@
     TYPE_CHECKING,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -573,17 +574,20 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo
             if not isinstance(right_value, float) or not np.isnan(right_value):
                 return False
         else:
-            try:
-                if np.any(np.asarray(left_value != right_value)):
+            with warnings.catch_warnings():
+                # suppress numpy's "elementwise comparison failed"
+                warnings.simplefilter("ignore", DeprecationWarning)
+                try:
+                    if np.any(np.asarray(left_value != right_value)):
+                        return False
+                except TypeError as err:
+                    if "boolean value of NA is ambiguous" in str(err):
+                        return False
+                    raise
+                except ValueError:
+                    # numpy can raise a ValueError if left and right cannot be
+                    # compared (e.g. nested arrays)
                     return False
-            except TypeError as err:
-                if "boolean value of NA is ambiguous" in str(err):
-                    return False
-                raise
-            except ValueError:
-                # numpy can raise a ValueError if left and right cannot be
-                # compared (e.g. nested arrays)
-                return False
     return True
 
 

diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
@@ -2037,6 +2037,10 @@ def test_td64arr_div_numeric_array(
         if box_with_array is DataFrame:
             expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))]
             expected = tm.box_expected(expected, xbox).astype(object)
+            # We specifically expect timedelta64("NaT") here, not pd.NA
+            expected[2] = expected[2].fillna(
+                np.timedelta64("NaT", "ns"), downcast=False
+            )
         else:
             expected = [tdser[n] / vector[n] for n in range(len(tdser))]
             expected = [
@@ -2113,9 +2117,12 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array):
         left = tm.box_expected(tdi, box_with_array)
         right = np.array([2, 2.0], dtype=object)
 
-        expected = Index([np.timedelta64("NaT", "ns")] * 2, dtype=object)
+        tdnat = np.timedelta64("NaT", "ns")
+        expected = Index([tdnat] * 2, dtype=object)
         if box_with_array is not Index:
             expected = tm.box_expected(expected, box_with_array).astype(object)
+            if box_with_array in [Series, DataFrame]:
+                expected = expected.fillna(tdnat, downcast=False)  # GH#18463
 
         result = left / right
         tm.assert_equal(result, expected)

diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -204,6 +204,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
     ]:  # (data[~data.isna()] >= 0).all():
         res = ops(str_ser)
         expected = pd.Series(["foo" * x for x in data], index=s.index)
+        expected = expected.fillna(np.nan)
+        # TODO: doing this fillna to keep tests passing as we make
+        #  assert_almost_equal stricter, but the expected with pd.NA seems
+        #  more-correct than np.nan here.
         tm.assert_series_equal(res, expected)
     else:
         with pytest.raises(TypeError, match=msg):

diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py
@@ -51,7 +51,7 @@ def test_conversions(data_missing):
     # astype to object series
     df = pd.DataFrame({"A": data_missing})
     result = df["A"].astype("object")
-    expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
+    expected = pd.Series(np.array([pd.NA, 1], dtype=object), name="A")
     tm.assert_series_equal(result, expected)
 
     # convert to object ndarray

diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -16,7 +16,10 @@
 import numpy as np
 import pytest
 
-from pandas.core.dtypes.common import is_bool_dtype
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 
 import pandas as pd
 import pandas._testing as tm
@@ -382,11 +385,18 @@ class TestUnaryOps(base.BaseUnaryOpsTests):
 
 class TestAccumulation(base.BaseAccumulateTests):
     def check_accumulate(self, s, op_name, skipna):
+        length = 64
+        if not IS64 or is_platform_windows():
+            if not s.dtype.itemsize == 8:
+                length = 32
+
         result = getattr(s, op_name)(skipna=skipna)
         expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna)
-        tm.assert_series_equal(result, expected, check_dtype=False)
-        if op_name in ("cummin", "cummax"):
-            assert is_bool_dtype(result)
+        if op_name not in ("cummin", "cummax"):
+            expected = expected.astype(f"Int{length}")
+        else:
+            expected = expected.astype("boolean")
+        tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("skipna", [True, False])
     def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):

diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py
@@ -265,6 +265,10 @@ def test_compare_ea_and_np_dtype(val1, val2):
             ("b", "other"): np.nan,
         }
     )
+    if val1 is pd.NA and val2 is pd.NA:
+        # GH#18463 TODO: is this really the desired behavior?
+        expected.loc[1, ("a", "self")] = np.nan
+
     if val1 is pd.NA and is_numpy_dev:
         # can't compare with numpy array if it contains pd.NA
         with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):

diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -734,6 +734,9 @@ def test_quantile_empty_no_rows_dt64(self, interp_method):
             0.5, numeric_only=False, interpolation=interpolation, method=method
         )
         exp = exp.astype(object)
+        if interpolation == "nearest":
+            # GH#18463 TODO: would we prefer NaTs here?
+            exp = exp.fillna(np.nan, downcast=False)
         tm.assert_series_equal(res, exp)
 
         # both dt64tz

diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
@@ -112,9 +112,13 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour):
             .set_index("index")
             .reindex(["1", "2"])
         )
+        exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index(
+            "index"
+        )
+        exp = exp.astype(object)
         tm.assert_frame_equal(
             df,
-            DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"),
+            exp,
         )
 
 
@@ -1191,7 +1195,7 @@ def test_reindex_empty_frame(self, kwargs):
         idx = date_range(start="2020", freq="30s", periods=3)
         df = DataFrame([], index=Index([], name="time"), columns=["a"])
         result = df.reindex(idx, **kwargs)
-        expected = DataFrame({"a": [pd.NA] * 3}, index=idx)
+        expected = DataFrame({"a": [np.nan] * 3}, index=idx, dtype=object)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -1245,19 +1245,19 @@ def test_operators_none_as_na(self, op):
         filled = df.fillna(np.nan)
         result = op(df, 3)
         expected = op(filled, 3).astype(object)
-        expected[pd.isna(expected)] = None
+        expected[pd.isna(expected)] = np.nan
         tm.assert_frame_equal(result, expected)
 
         result = op(df, df)
         expected = op(filled, filled).astype(object)
-        expected[pd.isna(expected)] = None
+        expected[pd.isna(expected)] = np.nan
         tm.assert_frame_equal(result, expected)
 
         result = op(df, df.fillna(7))
         tm.assert_frame_equal(result, expected)
 
         result = op(df.fillna(7), df)
-        tm.assert_frame_equal(result, expected, check_dtype=False)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
     # TODO: not sure what's correct here.

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -331,11 +331,14 @@ def wrapper(x):
             DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
         ],
     )
+    @pytest.mark.filterwarnings("ignore:Mismatched null-like values:FutureWarning")
     def test_stat_operators_attempt_obj_array(self, method, df, axis):
         # GH#676
         assert df.values.dtype == np.object_
         result = getattr(df, method)(axis=axis)
         expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
+        if axis in [1, "columns"] and method in ["min", "max"]:
+            expected[expected.isna()] = None
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])

diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1180,6 +1180,10 @@ def test_unstack_mixed_extension_types(self, level):
 
         result = df.unstack(level=level)
         expected = df.astype(object).unstack(level=level)
+        if level == 0:
+            expected[("A", "B")] = expected[("A", "B")].fillna(pd.NA)
+        else:
+            expected[("A", 0)] = expected[("A", 0)].fillna(pd.NA)
 
         expected_dtypes = Series(
             [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2994,7 +2994,7 @@ def test_groupby_sum_on_nan_should_return_nan(bug_var):
     dfgb = df.groupby(lambda x: x)
     result = dfgb.sum(min_count=1)
 
-    expected_df = DataFrame([bug_var, bug_var, bug_var, np.nan], columns=["A"])
+    expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
     tm.assert_frame_equal(result, expected_df)
 
 

diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
@@ -154,7 +154,7 @@ def test_first_last_nth_dtypes(df_mixed_floats):
 
 def test_first_last_nth_nan_dtype():
     # GH 33591
-    df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)})
+    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
     grouped = df.groupby("data")
 
     expected = df.set_index("data").nans

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -746,6 +746,11 @@ def test_cython_transform_frame(op, args, targop):
                 expected = gb.apply(targop)
 
             expected = expected.sort_index(axis=1)
+            if op == "shift":
+                expected["string_missing"] = expected["string_missing"].fillna(
+                    np.nan, downcast=False
+                )
+                expected["string"] = expected["string"].fillna(np.nan, downcast=False)
 
             result = gb[expected.columns].transform(op, *args).sort_index(axis=1)
             tm.assert_frame_equal(result, expected)
@@ -772,8 +777,13 @@ def test_cython_transform_frame(op, args, targop):
                 else:
                     expected = gb[c].apply(targop)
                     expected.name = c
-                    tm.assert_series_equal(expected, gb[c].transform(op, *args))
-                    tm.assert_series_equal(expected, getattr(gb[c], op)(*args))
+                    if c in ["string_missing", "string"]:
+                        expected = expected.fillna(np.nan, downcast=False)
+
+                    res = gb[c].transform(op, *args)
+                    tm.assert_series_equal(expected, res)
+                    res2 = getattr(gb[c], op)(*args)
+                    tm.assert_series_equal(expected, res2)
 
 
 def test_transform_with_non_scalar_group():

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -1163,7 +1163,9 @@ def test_loc_setitem_empty_append_expands_rows(self):
         # GH6173, various appends to an empty dataframe
 
         data = [1, 2, 3]
-        expected = DataFrame({"x": data, "y": [None] * len(data)})
+        expected = DataFrame(
+            {"x": data, "y": np.array([np.nan] * len(data), dtype=object)}
+        )
 
         # appends to fit length of data
         df = DataFrame(columns=["x", "y"])
@@ -1174,7 +1176,9 @@ def test_loc_setitem_empty_append_expands_rows_mixed_dtype(self):
         # GH#37932 same as test_loc_setitem_empty_append_expands_rows
         #  but with mixed dtype so we go through take_split_path
         data = [1, 2, 3]
-        expected = DataFrame({"x": data, "y": [None] * len(data)})
+        expected = DataFrame(
+            {"x": data, "y": np.array([np.nan] * len(data), dtype=object)}
+        )
 
         df = DataFrame(columns=["x", "y"])
         df["x"] = df["x"].astype(np.int64)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1149,13 +1149,14 @@ def test_excel_old_index_format(self, read_ext):
         # now be interpreted as rows that include null data.
         data = np.array(
             [
-                [None, None, None, None, None],
+                [np.nan, np.nan, np.nan, np.nan, np.nan],
                 ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
                 ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
                 ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
                 ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
                 ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
-            ]
+            ],
+            dtype=object,
         )
         columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
         mi = MultiIndex(

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -928,13 +928,15 @@ def test_doc_example(self):
         result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
         tm.assert_frame_equal(result, result)
 
-    def test_round_trip_exception_(self, datapath):
+    def test_round_trip_exception(self, datapath):
         # GH 3867
         path = datapath("io", "json", "data", "teams.csv")
         df = pd.read_csv(path)
         s = df.to_json()
         result = read_json(s)
-        tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df)
+        res = result.reindex(index=df.index, columns=df.columns)
+        res = res.fillna(np.nan, downcast=False)
+        tm.assert_frame_equal(res, df)
 
     @pytest.mark.network
     @tm.network(
@@ -1747,7 +1749,7 @@ def test_emca_262_nan_inf_support(self):
         data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
         result = read_json(data)
         expected = DataFrame(
-            ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
+            ["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
         )
         tm.assert_frame_equal(result, expected)