Skip to content

DEPR: be stricter in assert_almost_equal #52081

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
May 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,10 @@ Deprecations
- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
- Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, ``"infer"``, or a dict with these as values in :meth:`Series.fillna` and :meth:`DataFrame.fillna` (:issue:`40988`)
- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`. In a future version, the ``fill_value`` will need to be compatible with the ``dtype.subtype``: either a scalar that can be held by that subtype, or ``NaN`` for integer or bool subtypes (:issue:`23124`)
- Deprecated behavior of :func:`assert_series_equal` and :func:`assert_frame_equal` considering mismatched NA-like values (e.g. ``NaN`` vs ``None``) as equivalent; in a future version these will be considered not matching and the functions will raise (:issue:`52081`)
- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
- Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`. In a future version, ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_210.performance:
Expand Down
30 changes: 23 additions & 7 deletions pandas/_libs/testing.pyx
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
import cmath
import math
import warnings

import numpy as np

from numpy cimport import_array

import_array()

from pandas._libs.missing cimport checknull
from pandas._libs.missing cimport (
checknull,
is_matching_na,
)
from pandas._libs.util cimport (
is_array,
is_complex_object,
is_real_number_object,
)

from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.missing import array_equivalent


Expand Down Expand Up @@ -176,13 +182,23 @@ cpdef assert_almost_equal(a, b,
# classes can't be the same, to raise error
assert_class_equal(a, b, obj=obj)

if checknull(a) and checknull(b):
# TODO: Should require same-dtype NA?
if checknull(a):
# nan / None comparison
return True

if (checknull(a) and not checknull(b)) or (not checknull(a) and checknull(b)):
# boolean value of pd.NA is ambiguous
if is_matching_na(a, b, nan_matches_none=False):
return True
elif checknull(b):
# GH#18463
warnings.warn(
f"Mismatched null-like values {a} and {b} found. In a future "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need a whatsnew entry for this change since the testing functions are public?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, will update

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated+green

"version, pandas equality-testing functions "
"(e.g. assert_frame_equal) will consider these not-matching "
"and raise.",
FutureWarning,
stacklevel=find_stack_level(),
)
return True
raise AssertionError(f"{a} != {b}")
elif checknull(b):
raise AssertionError(f"{a} != {b}")

if a == b:
Expand Down
24 changes: 14 additions & 10 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
TYPE_CHECKING,
overload,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -573,17 +574,20 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo
if not isinstance(right_value, float) or not np.isnan(right_value):
return False
else:
try:
if np.any(np.asarray(left_value != right_value)):
with warnings.catch_warnings():
# suppress numpy's "elementwise comparison failed"
warnings.simplefilter("ignore", DeprecationWarning)
try:
if np.any(np.asarray(left_value != right_value)):
return False
except TypeError as err:
if "boolean value of NA is ambiguous" in str(err):
return False
raise
except ValueError:
# numpy can raise a ValueError if left and right cannot be
# compared (e.g. nested arrays)
return False
except TypeError as err:
if "boolean value of NA is ambiguous" in str(err):
return False
raise
except ValueError:
# numpy can raise a ValueError if left and right cannot be
# compared (e.g. nested arrays)
return False
return True


Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/arithmetic/test_timedelta64.py
Original file line number Diff line number Diff line change
Expand Up @@ -2037,6 +2037,10 @@ def test_td64arr_div_numeric_array(
if box_with_array is DataFrame:
expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))]
expected = tm.box_expected(expected, xbox).astype(object)
# We specifically expect timedelta64("NaT") here, not pd.NA
expected[2] = expected[2].fillna(
np.timedelta64("NaT", "ns"), downcast=False
)
else:
expected = [tdser[n] / vector[n] for n in range(len(tdser))]
expected = [
Expand Down Expand Up @@ -2113,9 +2117,12 @@ def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array):
left = tm.box_expected(tdi, box_with_array)
right = np.array([2, 2.0], dtype=object)

expected = Index([np.timedelta64("NaT", "ns")] * 2, dtype=object)
tdnat = np.timedelta64("NaT", "ns")
expected = Index([tdnat] * 2, dtype=object)
if box_with_array is not Index:
expected = tm.box_expected(expected, box_with_array).astype(object)
if box_with_array in [Series, DataFrame]:
expected = expected.fillna(tdnat, downcast=False) # GH#18463

result = left / right
tm.assert_equal(result, expected)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
]: # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
# TODO: doing this fillna to keep tests passing as we make
# assert_almost_equal stricter, but the expected with pd.NA seems
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
with pytest.raises(TypeError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_conversions(data_missing):
# astype to object series
df = pd.DataFrame({"A": data_missing})
result = df["A"].astype("object")
expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
expected = pd.Series(np.array([pd.NA, 1], dtype=object), name="A")
tm.assert_series_equal(result, expected)

# convert to object ndarray
Expand Down
18 changes: 14 additions & 4 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
import numpy as np
import pytest

from pandas.core.dtypes.common import is_bool_dtype
from pandas.compat import (
IS64,
is_platform_windows,
)

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -382,11 +385,18 @@ class TestUnaryOps(base.BaseUnaryOpsTests):

class TestAccumulation(base.BaseAccumulateTests):
def check_accumulate(self, s, op_name, skipna):
length = 64
if not IS64 or is_platform_windows():
if not s.dtype.itemsize == 8:
length = 32

result = getattr(s, op_name)(skipna=skipna)
expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna)
tm.assert_series_equal(result, expected, check_dtype=False)
if op_name in ("cummin", "cummax"):
assert is_bool_dtype(result)
if op_name not in ("cummin", "cummax"):
expected = expected.astype(f"Int{length}")
else:
expected = expected.astype("boolean")
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("skipna", [True, False])
def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/frame/methods/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def test_compare_ea_and_np_dtype(val1, val2):
("b", "other"): np.nan,
}
)
if val1 is pd.NA and val2 is pd.NA:
# GH#18463 TODO: is this really the desired behavior?
expected.loc[1, ("a", "self")] = np.nan

if val1 is pd.NA and is_numpy_dev:
# can't compare with numpy array if it contains pd.NA
with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,9 @@ def test_quantile_empty_no_rows_dt64(self, interp_method):
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = exp.astype(object)
if interpolation == "nearest":
# GH#18463 TODO: would we prefer NaTs here?
exp = exp.fillna(np.nan, downcast=False)
tm.assert_series_equal(res, exp)

# both dt64tz
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,13 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour):
.set_index("index")
.reindex(["1", "2"])
)
exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index(
"index"
)
exp = exp.astype(object)
tm.assert_frame_equal(
df,
DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"),
exp,
)


Expand Down Expand Up @@ -1191,7 +1195,7 @@ def test_reindex_empty_frame(self, kwargs):
idx = date_range(start="2020", freq="30s", periods=3)
df = DataFrame([], index=Index([], name="time"), columns=["a"])
result = df.reindex(idx, **kwargs)
expected = DataFrame({"a": [pd.NA] * 3}, index=idx)
expected = DataFrame({"a": [np.nan] * 3}, index=idx, dtype=object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,19 +1245,19 @@ def test_operators_none_as_na(self, op):
filled = df.fillna(np.nan)
result = op(df, 3)
expected = op(filled, 3).astype(object)
expected[pd.isna(expected)] = None
expected[pd.isna(expected)] = np.nan
tm.assert_frame_equal(result, expected)

result = op(df, df)
expected = op(filled, filled).astype(object)
expected[pd.isna(expected)] = None
expected[pd.isna(expected)] = np.nan
tm.assert_frame_equal(result, expected)

result = op(df, df.fillna(7))
tm.assert_frame_equal(result, expected)

result = op(df.fillna(7), df)
tm.assert_frame_equal(result, expected, check_dtype=False)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
# TODO: not sure what's correct here.
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,14 @@ def wrapper(x):
DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
],
)
@pytest.mark.filterwarnings("ignore:Mismatched null-like values:FutureWarning")
def test_stat_operators_attempt_obj_array(self, method, df, axis):
# GH#676
assert df.values.dtype == np.object_
result = getattr(df, method)(axis=axis)
expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
if axis in [1, "columns"] and method in ["min", "max"]:
expected[expected.isna()] = None
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,10 @@ def test_unstack_mixed_extension_types(self, level):

result = df.unstack(level=level)
expected = df.astype(object).unstack(level=level)
if level == 0:
expected[("A", "B")] = expected[("A", "B")].fillna(pd.NA)
else:
expected[("A", 0)] = expected[("A", 0)].fillna(pd.NA)

expected_dtypes = Series(
[df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2994,7 +2994,7 @@ def test_groupby_sum_on_nan_should_return_nan(bug_var):
dfgb = df.groupby(lambda x: x)
result = dfgb.sum(min_count=1)

expected_df = DataFrame([bug_var, bug_var, bug_var, np.nan], columns=["A"])
expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
tm.assert_frame_equal(result, expected_df)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_nth.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_first_last_nth_dtypes(df_mixed_floats):

def test_first_last_nth_nan_dtype():
# GH 33591
df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)})
df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
grouped = df.groupby("data")

expected = df.set_index("data").nans
Expand Down
14 changes: 12 additions & 2 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,11 @@ def test_cython_transform_frame(op, args, targop):
expected = gb.apply(targop)

expected = expected.sort_index(axis=1)
if op == "shift":
expected["string_missing"] = expected["string_missing"].fillna(
np.nan, downcast=False
)
expected["string"] = expected["string"].fillna(np.nan, downcast=False)

result = gb[expected.columns].transform(op, *args).sort_index(axis=1)
tm.assert_frame_equal(result, expected)
Expand All @@ -772,8 +777,13 @@ def test_cython_transform_frame(op, args, targop):
else:
expected = gb[c].apply(targop)
expected.name = c
tm.assert_series_equal(expected, gb[c].transform(op, *args))
tm.assert_series_equal(expected, getattr(gb[c], op)(*args))
if c in ["string_missing", "string"]:
expected = expected.fillna(np.nan, downcast=False)

res = gb[c].transform(op, *args)
tm.assert_series_equal(expected, res)
res2 = getattr(gb[c], op)(*args)
tm.assert_series_equal(expected, res2)


def test_transform_with_non_scalar_group():
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,9 @@ def test_loc_setitem_empty_append_expands_rows(self):
# GH6173, various appends to an empty dataframe

data = [1, 2, 3]
expected = DataFrame({"x": data, "y": [None] * len(data)})
expected = DataFrame(
{"x": data, "y": np.array([np.nan] * len(data), dtype=object)}
)

# appends to fit length of data
df = DataFrame(columns=["x", "y"])
Expand All @@ -1174,7 +1176,9 @@ def test_loc_setitem_empty_append_expands_rows_mixed_dtype(self):
# GH#37932 same as test_loc_setitem_empty_append_expands_rows
# but with mixed dtype so we go through take_split_path
data = [1, 2, 3]
expected = DataFrame({"x": data, "y": [None] * len(data)})
expected = DataFrame(
{"x": data, "y": np.array([np.nan] * len(data), dtype=object)}
)

df = DataFrame(columns=["x", "y"])
df["x"] = df["x"].astype(np.int64)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1149,13 +1149,14 @@ def test_excel_old_index_format(self, read_ext):
# now be interpreted as rows that include null data.
data = np.array(
[
[None, None, None, None, None],
[np.nan, np.nan, np.nan, np.nan, np.nan],
["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
]
],
dtype=object,
)
columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
mi = MultiIndex(
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,13 +928,15 @@ def test_doc_example(self):
result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
tm.assert_frame_equal(result, result)

def test_round_trip_exception_(self, datapath):
def test_round_trip_exception(self, datapath):
# GH 3867
path = datapath("io", "json", "data", "teams.csv")
df = pd.read_csv(path)
s = df.to_json()
result = read_json(s)
tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df)
res = result.reindex(index=df.index, columns=df.columns)
res = res.fillna(np.nan, downcast=False)
tm.assert_frame_equal(res, df)

@pytest.mark.network
@tm.network(
Expand Down Expand Up @@ -1747,7 +1749,7 @@ def test_emca_262_nan_inf_support(self):
data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
result = read_json(data)
expected = DataFrame(
["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
)
tm.assert_frame_equal(result, expected)

Expand Down
Loading