From eed8b3322f3a1476f873ae9dbe207eda22702fc0 Mon Sep 17 00:00:00 2001 From: parthiban Date: Tue, 23 Jan 2024 15:23:08 +0530 Subject: [PATCH 1/4] BUG: Fix inconsistency when constructing a Series with large integers in an int64 masked array - Referred to code from PR#50757 (similar issue for non-masked ints) --- pandas/core/construction.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e6d99ab773db9..06f7f02db679a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -48,6 +48,8 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -502,11 +504,22 @@ def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray: Convert numpy MaskedArray to ensure mask is softened. """ mask = ma.getmaskarray(data) + original = data + original_dtype = data.dtype if mask.any(): dtype, fill_value = maybe_promote(data.dtype, np.nan) dtype = cast(np.dtype, dtype) data = ma.asarray(data.astype(dtype, copy=True)) data.soften_mask() # set hardmask False if it was True + if not mask.all(): + idx = np.unravel_index(np.nanargmax(data, axis=None), data.shape) + if not mask[idx] and int(data[idx]) != original[idx]: + if ( + is_integer_dtype(original_dtype) + and is_float_dtype(data.dtype) + and len(data) > 0 + ): + data = ma.asarray(original, "object") data[mask] = fill_value else: data = data.copy() From 0c6d59a00983f9fb37646551f08f1320d22fd897 Mon Sep 17 00:00:00 2001 From: parthiban Date: Tue, 23 Jan 2024 15:24:02 +0530 Subject: [PATCH 2/4] TST: Add test cases --- pandas/tests/series/test_constructors.py | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 68737e86f0c6a..11628d76536f4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ 
-2155,6 +2155,36 @@ def test_inference_on_pandas_objects(self): result = Series(idx) assert result.dtype != np.object_ + def test_series_constructor_maskedarray_int_overflow(self): + # GH#56566 + mx = ma.masked_array( + [ + 4873214862074861312, + 4875446630161458944, + 4824652147895424384, + 0, + 3526420114272476800, + ], + mask=[0, 0, 0, 1, 0], + ) + result = Series(mx, dtype="Int64") + expected = Series( + IntegerArray( + np.array( + [ + 4873214862074861312, + 4875446630161458944, + 4824652147895424384, + 0, + 3526420114272476800, + ], + dtype="int64", + ), + np.array([0, 0, 0, 1, 0], dtype=np.bool_), + ) + ) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 260fc18f9ab8c97c6e8794f784071d4205a58b6e Mon Sep 17 00:00:00 2001 From: parthiban Date: Tue, 9 Apr 2024 22:15:55 +0530 Subject: [PATCH 3/4] TST: Change expected --- pandas/tests/series/test_constructors.py | 25 ++++++++++-------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 11628d76536f4..58fdfd30dde90 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2168,22 +2168,17 @@ def test_series_constructor_maskedarray_int_overflow(self): mask=[0, 0, 0, 1, 0], ) result = Series(mx, dtype="Int64") - expected = Series( - IntegerArray( - np.array( - [ - 4873214862074861312, - 4875446630161458944, - 4824652147895424384, - 0, - 3526420114272476800, - ], - dtype="int64", - ), - np.array([0, 0, 0, 1, 0], dtype=np.bool_), - ) + expected = np.array( + [ + 4873214862074861312, + 4875446630161458944, + 4824652147895424384, + 3526420114272476800, + ], + dtype="int64", ) - tm.assert_series_equal(result, expected) + result = np.array(result.dropna(ignore_index=True).values) + assert np.all(expected == result) class TestSeriesConstructorIndexCoercion: From 
ab025e1f1e2e56bfc4c0984a56ed68b7534fbad1 Mon Sep 17 00:00:00 2001 From: parthiban Date: Tue, 9 Apr 2024 22:20:45 +0530 Subject: [PATCH 4/4] BUG: Remove len check --- pandas/core/construction.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 06f7f02db679a..09240ffc831a6 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -514,11 +514,7 @@ def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray: if not mask.all(): idx = np.unravel_index(np.nanargmax(data, axis=None), data.shape) if not mask[idx] and int(data[idx]) != original[idx]: - if ( - is_integer_dtype(original_dtype) - and is_float_dtype(data.dtype) - and len(data) > 0 - ): + if is_integer_dtype(original_dtype) and is_float_dtype(data.dtype): data = ma.asarray(original, "object") data[mask] = fill_value else: