diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 8110d078ce041..4a32374fd33d7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -265,8 +265,10 @@ Deprecations - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`) - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`) - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`) +- Deprecated behavior of :func:`assert_series_equal` and :func:`assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None``) as equivalent (:issue:`52081`) - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) +- .. --------------------------------------------------------------------------- .. 
_whatsnew_210.performance: diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 2f3bb566cbcb0..4ba7bce51ed64 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,5 +1,6 @@ import cmath import math +import warnings import numpy as np @@ -7,13 +8,18 @@ from numpy cimport import_array import_array() -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) from pandas._libs.util cimport ( is_array, is_complex_object, is_real_number_object, ) +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.missing import array_equivalent @@ -176,13 +182,23 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if checknull(a) and checknull(b): - # TODO: Should require same-dtype NA? + if checknull(a): # nan / None comparison - return True - - if (checknull(a) and not checknull(b)) or (not checknull(a) and checknull(b)): - # boolean value of pd.NA is ambiguous + if is_matching_na(a, b, nan_matches_none=False): + return True + elif checknull(b): + # GH#18463 + warnings.warn( + f"Mismatched null-like values {a} and {b} found. In a future " + "version, pandas equality-testing functions " + "(e.g. 
assert_frame_equal) will consider these not-matching " + "and raise.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return True + raise AssertionError(f"{a} != {b}") + elif checknull(b): raise AssertionError(f"{a} != {b}") if a == b: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 65bbdb0e5df92..000c248beb999 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,6 +9,7 @@ TYPE_CHECKING, overload, ) +import warnings import numpy as np @@ -573,17 +574,20 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo if not isinstance(right_value, float) or not np.isnan(right_value): return False else: - try: - if np.any(np.asarray(left_value != right_value)): + with warnings.catch_warnings(): + # suppress numpy's "elementwise comparison failed" + warnings.simplefilter("ignore", DeprecationWarning) + try: + if np.any(np.asarray(left_value != right_value)): + return False + except TypeError as err: + if "boolean value of NA is ambiguous" in str(err): + return False + raise + except ValueError: + # numpy can raise a ValueError if left and right cannot be + # compared (e.g. nested arrays) return False - except TypeError as err: - if "boolean value of NA is ambiguous" in str(err): - return False - raise - except ValueError: - # numpy can raise a ValueError if left and right cannot be - # compared (e.g. 
nested arrays) - return False return True diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 33fc63938407c..7f44e329e6778 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -2037,6 +2037,10 @@ def test_td64arr_div_numeric_array( if box_with_array is DataFrame: expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] expected = tm.box_expected(expected, xbox).astype(object) + # We specifically expect timedelta64("NaT") here, not pd.NA + expected[2] = expected[2].fillna( + np.timedelta64("NaT", "ns"), downcast=False + ) else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] expected = [ @@ -2113,9 +2117,12 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array): left = tm.box_expected(tdi, box_with_array) right = np.array([2, 2.0], dtype=object) - expected = Index([np.timedelta64("NaT", "ns")] * 2, dtype=object) + tdnat = np.timedelta64("NaT", "ns") + expected = Index([tdnat] * 2, dtype=object) if box_with_array is not Index: expected = tm.box_expected(expected, box_with_array).astype(object) + if box_with_array in [Series, DataFrame]: + expected = expected.fillna(tdnat, downcast=False) # GH#18463 result = left / right tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index a6e91b05efbe9..ce6c245cd0f37 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -204,6 +204,10 @@ def test_error_invalid_values(data, all_arithmetic_operators): ]: # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) + expected = expected.fillna(np.nan) + # TODO: doing this fillna to keep tests passing as we make + # assert_almost_equal stricter, but the expected with pd.NA seems + # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) else: with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 48f247fb296ca..9ecfc51cb2208 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -51,7 +51,7 @@ def test_conversions(data_missing): # astype to object series df = pd.DataFrame({"A": data_missing}) result = df["A"].astype("object") - expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") + expected = pd.Series(np.array([pd.NA, 1], dtype=object), name="A") tm.assert_series_equal(result, expected) # convert to object ndarray diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 8c8cbfa5200b1..c9fa28a507745 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,7 +16,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_bool_dtype +from pandas.compat import ( + IS64, + is_platform_windows, +) import pandas as pd import pandas._testing as tm @@ -382,11 +385,18 @@ class TestUnaryOps(base.BaseUnaryOpsTests): class TestAccumulation(base.BaseAccumulateTests): def check_accumulate(self, s, op_name, skipna): + length = 64 + if not IS64 or is_platform_windows(): + if not s.dtype.itemsize == 8: + length = 32 + result = getattr(s, op_name)(skipna=skipna) expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) - tm.assert_series_equal(result, expected, check_dtype=False) - if op_name in ("cummin", "cummax"): - assert is_bool_dtype(result) + if op_name not in ("cummin", "cummax"): + expected = expected.astype(f"Int{length}") + else: + expected = expected.astype("boolean") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): diff --git 
a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index fe74ec8077bc9..6369a624bce71 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -265,6 +265,10 @@ def test_compare_ea_and_np_dtype(val1, val2): ("b", "other"): np.nan, } ) + if val1 is pd.NA and val2 is pd.NA: + # GH#18463 TODO: is this really the desired behavior? + expected.loc[1, ("a", "self")] = np.nan + if val1 is pd.NA and is_numpy_dev: # can't compare with numpy array if it contains pd.NA with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 87badfd710a62..d50b3e45969f0 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -734,6 +734,9 @@ def test_quantile_empty_no_rows_dt64(self, interp_method): 0.5, numeric_only=False, interpolation=interpolation, method=method ) exp = exp.astype(object) + if interpolation == "nearest": + # GH#18463 TODO: would we prefer NaTs here? 
+ exp = exp.fillna(np.nan, downcast=False) tm.assert_series_equal(res, exp) # both dt64tz diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 1ed0143e5b309..e8cebd5964236 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -112,9 +112,13 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): .set_index("index") .reindex(["1", "2"]) ) + exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( + "index" + ) + exp = exp.astype(object) tm.assert_frame_equal( df, - DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"), + exp, ) @@ -1191,7 +1195,7 @@ def test_reindex_empty_frame(self, kwargs): idx = date_range(start="2020", freq="30s", periods=3) df = DataFrame([], index=Index([], name="time"), columns=["a"]) result = df.reindex(idx, **kwargs) - expected = DataFrame({"a": [pd.NA] * 3}, index=idx) + expected = DataFrame({"a": [np.nan] * 3}, index=idx, dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c71ceae762e67..52b60a0b83025 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1245,19 +1245,19 @@ def test_operators_none_as_na(self, op): filled = df.fillna(np.nan) result = op(df, 3) expected = op(filled, 3).astype(object) - expected[pd.isna(expected)] = None + expected[pd.isna(expected)] = np.nan tm.assert_frame_equal(result, expected) result = op(df, df) expected = op(filled, filled).astype(object) - expected[pd.isna(expected)] = None + expected[pd.isna(expected)] = np.nan tm.assert_frame_equal(result, expected) result = op(df, df.fillna(7)) tm.assert_frame_equal(result, expected) result = op(df.fillna(7), df) - tm.assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected) 
@pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) # TODO: not sure what's correct here. diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 5b814ee785500..b4a4324593d22 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -331,11 +331,14 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) + @pytest.mark.filterwarnings("ignore:Mismatched null-like values:FutureWarning") def test_stat_operators_attempt_obj_array(self, method, df, axis): # GH#676 assert df.values.dtype == np.object_ result = getattr(df, method)(axis=axis) expected = getattr(df.astype("f8"), method)(axis=axis).astype(object) + if axis in [1, "columns"] and method in ["min", "max"]: + expected[expected.isna()] = None tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index bca4675e6b3e0..dd8e83295a43f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1180,6 +1180,10 @@ def test_unstack_mixed_extension_types(self, level): result = df.unstack(level=level) expected = df.astype(object).unstack(level=level) + if level == 0: + expected[("A", "B")] = expected[("A", "B")].fillna(pd.NA) + else: + expected[("A", 0)] = expected[("A", 0)].fillna(pd.NA) expected_dtypes = Series( [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3236b165e9444..583b66a99740d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2994,7 +2994,7 @@ def test_groupby_sum_on_nan_should_return_nan(bug_var): dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, 
np.nan], columns=["A"]) + expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index d2197ff202fb2..f744c5b741368 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -154,7 +154,7 @@ def test_first_last_nth_dtypes(df_mixed_floats): def test_first_last_nth_nan_dtype(): # GH 33591 - df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)}) + df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) grouped = df.groupby("data") expected = df.set_index("data").nans diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d0e1343fbeb54..09b24284d3b37 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -746,6 +746,11 @@ def test_cython_transform_frame(op, args, targop): expected = gb.apply(targop) expected = expected.sort_index(axis=1) + if op == "shift": + expected["string_missing"] = expected["string_missing"].fillna( + np.nan, downcast=False + ) + expected["string"] = expected["string"].fillna(np.nan, downcast=False) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -772,8 +777,13 @@ def test_cython_transform_frame(op, args, targop): else: expected = gb[c].apply(targop) expected.name = c - tm.assert_series_equal(expected, gb[c].transform(op, *args)) - tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) + if c in ["string_missing", "string"]: + expected = expected.fillna(np.nan, downcast=False) + + res = gb[c].transform(op, *args) + tm.assert_series_equal(expected, res) + res2 = getattr(gb[c], op)(*args) + tm.assert_series_equal(expected, res2) def test_transform_with_non_scalar_group(): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py 
index 19421345087fc..4017a0e3a2f80 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1163,7 +1163,9 @@ def test_loc_setitem_empty_append_expands_rows(self): # GH6173, various appends to an empty dataframe data = [1, 2, 3] - expected = DataFrame({"x": data, "y": [None] * len(data)}) + expected = DataFrame( + {"x": data, "y": np.array([np.nan] * len(data), dtype=object)} + ) # appends to fit length of data df = DataFrame(columns=["x", "y"]) @@ -1174,7 +1176,9 @@ def test_loc_setitem_empty_append_expands_rows_mixed_dtype(self): # GH#37932 same as test_loc_setitem_empty_append_expands_rows # but with mixed dtype so we go through take_split_path data = [1, 2, 3] - expected = DataFrame({"x": data, "y": [None] * len(data)}) + expected = DataFrame( + {"x": data, "y": np.array([np.nan] * len(data), dtype=object)} + ) df = DataFrame(columns=["x", "y"]) df["x"] = df["x"].astype(np.int64) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f6d6433cd0643..66dd090ec0783 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1149,13 +1149,14 @@ def test_excel_old_index_format(self, read_ext): # now be interpreted as rows that include null data. 
data = np.array( [ - [None, None, None, None, None], + [np.nan, np.nan, np.nan, np.nan, np.nan], ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], - ] + ], + dtype=object, ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] mi = MultiIndex( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 254b6f8dded57..a966ad1dabcaa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -928,13 +928,15 @@ def test_doc_example(self): result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) tm.assert_frame_equal(result, result) - def test_round_trip_exception_(self, datapath): + def test_round_trip_exception(self, datapath): # GH 3867 path = datapath("io", "json", "data", "teams.csv") df = pd.read_csv(path) s = df.to_json() result = read_json(s) - tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df) + res = result.reindex(index=df.index, columns=df.columns) + res = res.fillna(np.nan, downcast=False) + tm.assert_frame_equal(res, df) @pytest.mark.network @tm.network( @@ -1747,7 +1749,7 @@ def test_emca_262_nan_inf_support(self): data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' result = read_json(data) expected = DataFrame( - ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index c06a6fcc2a037..7d7614bc93845 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Iterator +import numpy as np import pytest import pandas as pd @@ -448,7 +449,7 @@ def 
test_to_json_append_output_inconsistent_columns(): { "col1": [1, 2, None, None], "col2": ["a", "b", "e", "f"], - "col3": [None, None, "!", "#"], + "col3": [np.nan, np.nan, "!", "#"], } ) with tm.ensure_clean("test.json") as path: @@ -473,8 +474,8 @@ def test_to_json_append_output_different_columns(): expected = DataFrame( { "col1": [1, 2, 3, 4, None, None, None, None], - "col2": ["a", "b", "c", "d", "e", "f", None, None], - "col3": [None, None, None, None, "!", "#", None, None], + "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan], + "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan], "col4": [None, None, None, None, None, None, True, False], } ).astype({"col4": "float"}) @@ -503,8 +504,8 @@ def test_to_json_append_output_different_columns_reordered(): expected = DataFrame( { "col4": [True, False, None, None, None, None, None, None], - "col2": [None, None, "e", "f", "c", "d", "a", "b"], - "col3": [None, None, "!", "#", None, None, None, None], + "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"], + "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan], "col1": [None, None, None, None, 3, 4, 1, 2], } ).astype({"col4": "float"}) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 647c1753cd660..9a16ec5a50d36 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -32,6 +32,9 @@ def test_string_nas(all_parsers): [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], columns=["A", "B", "C"], ) + if parser.engine == "pyarrow": + expected.loc[2, "A"] = None + expected.loc[1, "B"] = None tm.assert_frame_equal(result, expected) @@ -45,6 +48,9 @@ def test_detect_string_na(all_parsers): expected = DataFrame( [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] ) + if parser.engine == "pyarrow": + expected.loc[[1, 2], "A"] = None + expected.loc[2, "B"] = None result = parser.read_csv(StringIO(data)) 
tm.assert_frame_equal(result, expected) @@ -167,6 +173,9 @@ def test_bool_na_values(all_parsers): "C": [True, False, True], } ) + if parser.engine == "pyarrow": + expected.loc[1, "A"] = None + expected.loc[2, "B"] = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index c5bd8341e1a54..01e1be5529bad 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -40,6 +40,7 @@ def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs): to_feather(df, path, **write_kwargs) result = read_feather(path, **read_kwargs) + tm.assert_frame_equal(result, expected) def test_error(self): @@ -86,7 +87,10 @@ def test_basic(self): df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" - self.check_round_trip(df) + + expected = df.copy() + expected.loc[1, "bool_with_null"] = None + self.check_round_trip(df, expected=expected) def test_duplicate_columns(self): # https://github.com/wesm/feather/issues/53 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 5d9e4efd9ecf3..e54a23b1f8ef6 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1463,6 +1463,7 @@ def test_extract_links(self, arg): result = self.read_html(gh_13141_data, extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) + expected = expected.fillna(np.nan, downcast=False) tm.assert_frame_equal(result, expected) def test_extract_links_bad(self, spam_data): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 954ac25883e73..57ef03b380601 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas._config import get_option +from pandas._config import ( + get_option, + using_copy_on_write, +) from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( @@ 
-201,6 +204,8 @@ def compare(repeat): with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) + if "string_with_nan" in expected: + expected.loc[1, "string_with_nan"] = None tm.assert_frame_equal( expected, actual, @@ -689,6 +694,10 @@ def test_basic_subset_columns(self, pa, df_full): def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): # GH 37105 + msg = "Mismatched null-like values nan and None found" + warn = None + if using_copy_on_write(): + warn = FutureWarning buf_bytes = df_full.to_parquet(engine=pa) assert isinstance(buf_bytes, bytes) @@ -696,7 +705,10 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): buf_stream = BytesIO(buf_bytes) res = read_parquet(buf_stream) - tm.assert_frame_equal(df_full, res) + expected = df_full.copy(deep=False) + expected.loc[1, "string_with_nan"] = None + with tm.assert_produces_warning(warn, match=msg): + tm.assert_frame_equal(df_full, res) def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index b540cd514c0b5..fff10c7ea4bb3 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -326,20 +326,22 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df._append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected) - def test_append_empty_tz_frame_with_datetime64ns(self): + def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df._append({"a": pd.NaT}, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}).astype(object) + if using_array_manager: + expected = DataFrame({"a": [pd.NaT]}, dtype=object) + else: + expected = DataFrame({"a": [np.nan]}, 
dtype=object) tm.assert_frame_equal(result, expected) # also test with typed value to append df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") other = Series({"a": pd.NaT}, dtype="datetime64[ns]") result = df._append(other, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}).astype(object) tm.assert_frame_equal(result, expected) # mismatched tz @@ -352,7 +354,9 @@ def test_append_empty_tz_frame_with_datetime64ns(self): "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] ) @pytest.mark.parametrize("val", [1, "NaT"]) - def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): + def test_append_empty_frame_with_timedelta64ns_nat( + self, dtype_str, val, using_array_manager + ): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype(dtype_str) @@ -360,6 +364,12 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): result = df._append(other, ignore_index=True) expected = other.astype(object) + if isinstance(val, str) and dtype_str != "int64" and not using_array_manager: + # TODO: expected used to be `other.astype(object)` which is a more + # reasonable result. This was changed when tightening + # assert_frame_equal's treatment of mismatched NAs to match the + # existing behavior. 
+ expected = DataFrame({"a": [np.nan]}, dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index ef02e9f7a465a..dc14e6e74302e 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -681,7 +681,11 @@ def test_concat_null_object_with_dti(): exp_index = Index([None, dti[0]], dtype=object) expected = DataFrame( - {"A": [None, None], "B": [np.nan, np.nan], "C": [np.nan, 0.5274]}, + { + "A": np.array([None, np.nan], dtype=object), + "B": [np.nan, np.nan], + "C": [np.nan, 0.5274], + }, index=exp_index, ) tm.assert_frame_equal(result, expected) @@ -788,7 +792,7 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): with tm.assert_produces_warning(warn, match=msg): result = concat([empty, df], ignore_index=True) - expected = DataFrame({"foo": [None, 1, 2], "bar": [None, 1, 2]}, dtype=df_dtype) + expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 43c6bb03b6a9a..9ec0071ba9afa 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -216,19 +216,30 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("s", [pd.NaT, Timestamp("20150101")]) - def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + def test_concat_NaT_dataframes_all_NaT_axis_0( + self, tz1, tz2, item, using_array_manager + ): # GH 12396 # tz-naive first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1)) - second = DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) + 
second = DataFrame([item]).apply(lambda x: x.dt.tz_localize(tz2)) result = concat([first, second], axis=0) - expected = DataFrame(Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = DataFrame(Series([pd.NaT, pd.NaT, item], index=[0, 1, 0])) expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) + if item is pd.NaT and not using_array_manager: + # GH#18463 + # TODO: setting nan here is to keep the test passing as we + # make assert_frame_equal stricter, but is nan really the + # ideal behavior here? + if tz1 is not None: + expected.iloc[-1, 0] = np.nan + else: + expected.iloc[:-1, 0] = np.nan tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 37ed45f0094ec..b50035f2df6c9 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -479,7 +479,7 @@ def test_merge_datetime_multi_index_empty_df(self, merge_type): expected = DataFrame( data={ "data": [1.5, 1.5], - "state": [None, None], + "state": np.array([np.nan, np.nan], dtype=object), }, index=expected_index, ) @@ -488,7 +488,7 @@ def test_merge_datetime_multi_index_empty_df(self, merge_type): else: expected = DataFrame( data={ - "state": [None, None], + "state": np.array([np.nan, np.nan], dtype=object), "data": [1.5, 1.5], }, index=expected_index, diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 5a715fcda97f8..5cdeee20f3435 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -664,7 +664,7 @@ def test_valid_dt_with_missing_values(self): [ date(2013, 1, 1), date(2013, 1, 2), - np.nan, + pd.NaT, date(2013, 1, 4), date(2013, 1, 5), ], @@ -673,7 +673,7 @@ def test_valid_dt_with_missing_values(self): tm.assert_series_equal(result, expected) result = ser.dt.time - expected = 
Series([time(0), time(0), np.nan, time(0), time(0)], dtype="object") + expected = Series([time(0), time(0), pd.NaT, time(0), time(0)], dtype="object") tm.assert_series_equal(result, expected) def test_dt_accessor_api(self): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index ede59b3ac9cb9..6b0ff2d3e3704 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -353,7 +353,7 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self): mask = [False] * 3 + [True] * 5 + [False] * 2 ser[mask] = range(5) result = ser - expected = Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + expected = Series([None] * 3 + list(range(5)) + [None] * 2, dtype=object) tm.assert_series_equal(result, expected) def test_setitem_nan_with_bool(self): @@ -796,6 +796,13 @@ def test_series_where(self, obj, key, expected, val, is_inplace): arr = obj._values res = obj.where(~mask, val) + + if val is NA and res.dtype == object: + expected = expected.fillna(NA) + elif val is None and res.dtype == object: + assert expected.dtype == object + expected = expected.copy() + expected[expected.isna()] = None tm.assert_series_equal(res, expected) self._check_inplace(is_inplace, orig, arr, obj) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index eabaf23bd36f8..0c8cb493141b7 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -399,7 +399,7 @@ def test_where_datetimelike_coerce(dtype): tm.assert_series_equal(rs, expected) rs = ser.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype="object") + expected = Series([10, np.nan], dtype="object") tm.assert_series_equal(rs, expected) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 925384cac605e..00d1ad99332e9 100644 --- 
a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -92,7 +92,12 @@ def test_map_series_stringdtype(any_string_dtype): ) ser2 = Series(["id3", "id2", "id1", "id7000"], dtype=any_string_dtype) result = ser2.map(ser1) - expected = Series(["rabbit", "dog", "cat", pd.NA], dtype=any_string_dtype) + + item = pd.NA + if ser2.dtype == object: + item = np.nan + + expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index fcadb07a13b83..04a15b4e95e0a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -314,7 +314,7 @@ def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_m ser = Series([NaT], dtype=dtype) result = ser.reindex([0, 1], fill_value=fill_value) - expected = Series([None, fill_value], index=[0, 1], dtype=object) + expected = Series([NaT, fill_value], index=[0, 1], dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index e88dddb05eb51..ced941187f548 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -21,7 +21,7 @@ def test_title_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.title() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] ) tm.assert_almost_equal(result, expected) @@ -41,11 +41,11 @@ def test_lower_upper_mixed_object(): s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = s.str.upper() - expected = Series(["A", np.nan, "B", np.nan, np.nan, "FOO", np.nan, np.nan, np.nan]) + expected = Series(["A", np.nan, 
"B", np.nan, np.nan, "FOO", None, np.nan, np.nan]) tm.assert_series_equal(result, expected) result = s.str.lower() - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) tm.assert_series_equal(result, expected) @@ -71,7 +71,7 @@ def test_capitalize_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.capitalize() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @@ -87,7 +87,7 @@ def test_swapcase_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) result = s.str.swapcase() expected = Series( - ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan] + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @@ -138,19 +138,19 @@ def test_pad_mixed_object(): result = s.str.pad(5, side="left") expected = Series( - [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan] + [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="right") expected = Series( - ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan] + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="both") expected = Series( - [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan] + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @@ -235,7 +235,7 @@ def test_center_ljust_rjust_mixed_object(): np.nan, " c ", " eee ", - np.nan, + None, np.nan, np.nan, ] @@ -252,7 +252,7 @@ def 
test_center_ljust_rjust_mixed_object(): np.nan, "c ", "eee ", - np.nan, + None, np.nan, np.nan, ] @@ -269,7 +269,7 @@ def test_center_ljust_rjust_mixed_object(): np.nan, " c", " eee", - np.nan, + None, np.nan, np.nan, ] diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 22a5fd28efd8d..4773c13aad376 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -51,7 +51,7 @@ def test_extract_expand_False_mixed_object(): # single group result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) expected = Series( - ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 61f9289dd63f2..89718b1b35f12 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -103,7 +103,7 @@ def test_contains_object_mixed(): result = mixed.str.contains("o") expected = Series( np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan], dtype=np.object_, ) ) @@ -251,6 +251,11 @@ def test_startswith(pat, dtype, null_value, na): result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) + if dtype is None and null_value is pd.NA: + # GH#18463 + exp = exp.fillna(null_value) + elif dtype is None and null_value is None: + exp[exp.isna()] = None tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -263,7 +268,7 @@ def test_startswith(pat, dtype, null_value, na): dtype=np.object_, ) rs = Series(mixed).str.startswith("f") - xp = Series([False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan]) + xp = Series([False, np.nan, False, np.nan, np.nan, True, None, np.nan, 
np.nan]) tm.assert_series_equal(rs, xp) @@ -304,6 +309,11 @@ def test_endswith(pat, dtype, null_value, na): result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) + if dtype is None and null_value is pd.NA: + # GH#18463 + exp = exp.fillna(pd.NA) + elif dtype is None and null_value is None: + exp[exp.isna()] = None tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) @@ -316,7 +326,7 @@ def test_endswith(pat, dtype, null_value, na): dtype=object, ) rs = Series(mixed).str.endswith("f") - xp = Series([False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan]) + xp = Series([False, np.nan, False, np.nan, np.nan, False, None, np.nan, np.nan]) tm.assert_series_equal(rs, xp) @@ -369,7 +379,7 @@ def test_replace_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) tm.assert_series_equal(result, expected) @@ -466,7 +476,7 @@ def test_replace_compiled_regex_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) tm.assert_series_equal(result, expected) @@ -691,9 +701,7 @@ def test_match_mixed_object(): ] ) result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") - expected = Series( - [True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan] - ) + expected = Series([True, np.nan, True, np.nan, np.nan, False, None, np.nan, np.nan]) assert isinstance(result, Series) tm.assert_series_equal(result, expected) @@ -783,6 +791,9 @@ def test_findall(any_string_dtype): ser = 
Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) + if ser.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) tm.assert_series_equal(result, expected) @@ -810,7 +821,7 @@ def test_findall_mixed_object(): np.nan, np.nan, ["BAD"], - np.nan, + None, np.nan, np.nan, ] diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 74458c13e8df7..0298694ccaf71 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -20,6 +20,9 @@ def test_split(any_string_dtype, method): result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + if values.dtype != object: + # GH#18463 + exp = exp.fillna(pd.NA) tm.assert_series_equal(result, exp) @@ -29,6 +32,9 @@ def test_split_more_than_one_char(any_string_dtype, method): values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = getattr(values.str, method)("__") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + if values.dtype != object: + # GH#18463 + exp = exp.fillna(pd.NA) tm.assert_series_equal(result, exp) result = getattr(values.str, method)("__", expand=False) @@ -40,6 +46,9 @@ def test_split_more_regex_split(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + if values.dtype != object: + # GH#18463 + exp = exp.fillna(pd.NA) tm.assert_series_equal(result, exp) @@ -95,7 +104,7 @@ def test_split_object_mixed(expand, method): ["d", "e", "f"], np.nan, np.nan, - np.nan, + None, np.nan, np.nan, ] @@ -119,6 +128,9 @@ def test_rsplit(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = 
values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) + if values.dtype != object: + # GH#18463 + exp = exp.fillna(pd.NA) tm.assert_series_equal(result, exp) @@ -127,6 +139,9 @@ def test_rsplit_max_number(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) + if values.dtype != object: + # GH#18463 + exp = exp.fillna(pd.NA) tm.assert_series_equal(result, exp) @@ -144,9 +159,9 @@ def test_split_blank_string_with_non_empty(any_string_dtype): exp = DataFrame( [ ["a", "b", "c"], - ["a", "b", np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], + ["a", "b", None], + [None, None, None], + [None, None, None], ], dtype=any_string_dtype, ) @@ -228,9 +243,9 @@ def test_split_to_dataframe_unequal_splits(any_string_dtype): 0: ["some", "one"], 1: ["unequal", "of"], 2: ["splits", "these"], - 3: [np.nan, "things"], - 4: [np.nan, "is"], - 5: [np.nan, "not"], + 3: [None, "things"], + 4: [None, "is"], + 5: [None, "not"], }, dtype=any_string_dtype, ) @@ -440,6 +455,9 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype): s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) + if s.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) tm.assert_series_equal(result, expected) @@ -462,6 +480,9 @@ def test_partition_series_none(any_string_dtype, method, exp): s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) result = getattr(s.str, method)(expand=False) expected = Series(exp) + if s.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) tm.assert_series_equal(result, expected) @@ -484,6 +505,9 @@ def test_partition_series_not_split(any_string_dtype, method, exp): s = Series(["abc", "cde", np.nan, "fgh", None], 
dtype=any_string_dtype) result = getattr(s.str, method)("_", expand=False) expected = Series(exp) + if s.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) tm.assert_series_equal(result, expected) @@ -507,6 +531,9 @@ def test_partition_series_unicode(any_string_dtype, method, exp): result = getattr(s.str, method)("_", expand=False) expected = Series(exp) + if s.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) tm.assert_series_equal(result, expected) @@ -675,7 +702,7 @@ def test_get(): def test_get_mixed_object(): ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) result = ser.str.split("_").str.get(1) - expected = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan]) + expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 8628aafefa4b1..a88dcc8956931 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -4,6 +4,7 @@ from pandas._libs import lib from pandas import ( + NA, DataFrame, Series, _testing as tm, @@ -47,10 +48,16 @@ def test_string_array(nullable_string_dtype, any_string_method): assert result.dtype == "Int64" result = result.astype("float") + if expected.dtype == object: + # GH#18463 + expected[expected.isna()] = NA + elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) + expected[columns] = expected[columns].fillna(NA) # GH#18463 + tm.assert_equal(result, expected) @@ -96,6 +103,7 @@ def test_string_array_extract(nullable_string_dtype): result = a.str.extract(pat, expand=False) expected = b.str.extract(pat, expand=False) + expected = expected.fillna(NA) # GH#18463 assert all(result.dtypes == nullable_string_dtype) 
result = result.astype(object) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index b863425a24183..7719f13bf7494 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -68,7 +68,7 @@ def test_repeat_mixed_object(): ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @@ -78,7 +78,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): # GH: 31632 ser = Series(["a", arg], dtype=any_string_dtype) result = ser.str.repeat([3, repeat]) - expected = Series(["aaa", np.nan], dtype=any_string_dtype) + expected = Series(["aaa", None], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -260,7 +260,7 @@ def test_spilt_join_roundtrip_mixed_object(): ) result = ser.str.split("_").str.join("_") expected = Series( - ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", np.nan, np.nan, np.nan] + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @@ -381,8 +381,8 @@ def test_slice(start, stop, step, expected, any_string_dtype): @pytest.mark.parametrize( "start, stop, step, expected", [ - (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]), - (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]), + (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, None, np.nan, np.nan]), + (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, None, np.nan, np.nan]), ], ) def test_slice_mixed_object(start, stop, step, expected): @@ -443,7 +443,7 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() - 
expected = Series(exp + [np.nan, np.nan, np.nan, np.nan, np.nan]) + expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 58bdf3666caf4..af2cdb8cac9e8 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1324,7 +1324,7 @@ def test_value_counts_dropna(self): ) tm.assert_series_equal( Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False), - Series([5, 3, 2], index=[True, False, np.nan], name="count"), + Series([5, 3, 2], index=[True, False, None], name="count"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), @@ -1341,7 +1341,7 @@ def test_value_counts_dropna(self): ) result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan], name="count") + expected = Series([3, 2, 1], index=[5.0, 10.3, None], name="count") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]")) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 7d258033748b6..6c903611e212b 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -324,6 +324,16 @@ def check_fun_data( targ = bool(targ) res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + + if ( + isinstance(targ, np.complex_) + and isinstance(res, float) + and np.isnan(targ) + and np.isnan(res) + ): + # GH#18463 + targ = res + self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index ad54606547909..a86302f158005 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -2,8 +2,10 @@ import pytest from pandas import ( + NA, DataFrame, Index, + NaT, 
Series, Timestamp, ) @@ -302,16 +304,55 @@ def test_assert_not_almost_equal_null(a, b): (np.inf, np.inf), (np.inf, float("inf")), (np.array([np.inf, np.nan, -np.inf]), np.array([np.inf, np.nan, -np.inf])), - ( - np.array([np.inf, None, -np.inf], dtype=np.object_), - np.array([np.inf, np.nan, -np.inf], dtype=np.object_), - ), ], ) def test_assert_almost_equal_inf(a, b): _assert_almost_equal_both(a, b) +objs = [NA, np.nan, NaT, None, np.datetime64("NaT"), np.timedelta64("NaT")] + + +@pytest.mark.parametrize("left", objs) +@pytest.mark.parametrize("right", objs) +def test_mismatched_na_assert_almost_equal_deprecation(left, right): + left_arr = np.array([left], dtype=object) + right_arr = np.array([right], dtype=object) + + msg = "Mismatched null-like values" + + if left is right: + _assert_almost_equal_both(left, right, check_dtype=False) + tm.assert_numpy_array_equal(left_arr, right_arr) + tm.assert_index_equal( + Index(left_arr, dtype=object), Index(right_arr, dtype=object) + ) + tm.assert_series_equal( + Series(left_arr, dtype=object), Series(right_arr, dtype=object) + ) + tm.assert_frame_equal( + DataFrame(left_arr, dtype=object), DataFrame(right_arr, dtype=object) + ) + + else: + with tm.assert_produces_warning(FutureWarning, match=msg): + _assert_almost_equal_both(left, right, check_dtype=False) + + # TODO: to get the same deprecation in assert_numpy_array_equal we need + # to change/deprecate the default for strict_nan to become True + # TODO: to get the same deprecation in assert_index_equal we need to + # change/deprecate array_equivalent_object to be stricter, as + # assert_index_equal uses Index.equals which uses array_equivalent. 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal( + Series(left_arr, dtype=object), Series(right_arr, dtype=object) + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_frame_equal( + DataFrame(left_arr, dtype=object), DataFrame(right_arr, dtype=object) + ) + + def test_assert_not_almost_equal_inf(): _assert_not_almost_equal_both(np.inf, 0)