From 50891ae16e07cb194b45bb2e24cd04a3972d6dd9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 15:22:37 +0200 Subject: [PATCH] TST (string dtype): adjust pandas/tests/reshape tests --- pandas/tests/reshape/concat/test_concat.py | 11 ++------ pandas/tests/reshape/merge/test_merge_asof.py | 10 ++----- pandas/tests/reshape/test_get_dummies.py | 10 ++----- pandas/tests/reshape/test_melt.py | 25 ++++++----------- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++------- 5 files changed, 34 insertions(+), 50 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 8af224f1ad64f..d3edee17366f7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import InvalidIndexError import pandas as pd @@ -47,18 +45,11 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) - # These are actual copies. - result = concat([df, df2, df3], axis=1) - for block in result._mgr.blocks: - assert block.values.base is not None - - # These are the same. result = concat([df, df2, df3], axis=1) for block in result._mgr.blocks: @@ -69,6 +60,8 @@ def test_concat_copy(self): assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: assert arr.base is not None + elif arr.dtype == "string": + assert tm.shares_memory(arr, df3._mgr.blocks[0].values) # Float block was consolidated. 
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 8d972087b0dff..f7b0876c5a605 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -3064,12 +3062,8 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_merge_datatype_error_raises(self, using_infer_string): - if using_infer_string: - msg = "incompatible merge keys" - else: - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self): + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 27a34decae7b0..f07c6845366da 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", "B": "string"}) + df = df.astype({"A": "str", "B": 
any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index be4f2ab4d183d..4a12404f6775a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -21,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -83,7 +81,6 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -178,7 +174,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): 
result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -206,7 +201,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -236,7 +230,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -361,7 +354,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): df.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -369,6 +361,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1222,12 +1216,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": 
any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1243,12 +1235,13 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") @@ -1267,7 +1260,7 @@ def test_wide_to_long_pyarrow_string_columns(): ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8cfe565ebdd65..eccf676b87f89 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1068,7 +1068,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1078,7 +1077,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2570,13 +2569,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2658,7 +2660,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2674,7 +2678,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2688,7 +2694,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3})