From bc51b868475d3c813e109a71905c50f39ad9ed77 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 27 Feb 2021 10:17:04 -0800 Subject: [PATCH 1/6] REF: try_cast_integer_dtype --- pandas/core/construction.py | 34 +++++++++++++++++++++--- pandas/core/dtypes/cast.py | 7 +++++ pandas/core/internals/construction.py | 31 ++++++++++++++------- pandas/tests/frame/test_constructors.py | 19 ++++++++++++- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/series/test_constructors.py | 9 +++++++ 6 files changed, 87 insertions(+), 15 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f5f49e0e5fc20..6a1a7b6d529a0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -498,8 +498,12 @@ def sanitize_array( if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: + # Note: we could use try_cast_integer_dtype + # (or even maybe_cast_to_integer_array) but + # we can get non-numpy integer-dtypes here subarr = _try_cast(data, dtype, copy, True) except ValueError: + # raised if `not (data.astype(dtype) == data).all()` subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here @@ -615,6 +619,32 @@ def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike: return arr +def try_cast_integer_dtype( + arr: Union[list, np.ndarray], dtype: np.dtype, copy: bool, raise_cast_failure: bool +) -> np.ndarray: + # Caller is responsible for checking + # - is_integer_dtype(dtype) + + # GH#15832: Check if we are requesting a numeric dtype and + # that we can convert the data to the requested dtype. + try: + # this will raise if we have e.g. floats + return maybe_cast_to_integer_array(arr, dtype, copy=copy) + except OverflowError: + if not raise_cast_failure: + # i.e. reached from DataFrame constructor; ignore dtype + # and cast without silent overflow + return np.array(arr, copy=copy) + raise + except ValueError as err: + if "Trying to coerce float values to integers" in str(err): + if not raise_cast_failure: + # Just do it anyway, i.e. DataFrame(floats, dtype="int64") + # is equivalent to DataFrame(floats).astype("int64") + return construct_1d_ndarray_preserving_na(arr, dtype, copy=copy) + raise + + def _try_cast( arr: Union[list, np.ndarray], dtype: Optional[DtypeObj], @@ -664,9 +694,7 @@ def _try_cast( # GH#15832: Check if we are requesting a numeric dtype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): - # this will raise if we have e.g. floats - maybe_cast_to_integer_array(arr, dtype) - subarr = arr + return try_cast_integer_dtype(arr, dtype, copy, raise_cast_failure) else: subarr = maybe_cast_to_datetime(arr, dtype) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 06e71c18c55cc..51dd595415826 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1908,6 +1908,13 @@ def maybe_cast_to_integer_array( if is_float_dtype(arr) or is_object_dtype(arr): raise ValueError("Trying to coerce float values to integers") + if casted.dtype < arr.dtype: + # e.g. orig=[1, 200, 923442] and dtype='int8' + raise OverflowError(f"Trying to coerce too-large values to {dtype}") + + # Not sure if this can be reached, but covering our bases + raise ValueError(f"values cannot be losslessly cast to {dtype}") + def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 20536c7a94695..c4745d1a55e5b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -63,6 +63,7 @@ from pandas.core.construction import ( extract_array, sanitize_array, + try_cast_integer_dtype, ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( @@ -229,18 +230,28 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool # by definition an array here # the dtypes will be coerced to a single dtype + # TODO: the was_masked check is to avoid breaking a very sketchy-looking + # test_constructor_maskedarray + was_masked = isinstance(values, np.ma.MaskedArray) values = _prep_ndarray(values, copy=copy) - if dtype is not None and not is_dtype_equal(values.dtype, dtype): - try: - values = construct_1d_ndarray_preserving_na( - values.ravel(), dtype=dtype, copy=False - ).reshape(values.shape) - except Exception as orig: - # e.g. ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + shape = values.shape + flat = values.ravel() + + if not was_masked and is_integer_dtype(dtype): + values = try_cast_integer_dtype( + flat, dtype=dtype, copy=copy, raise_cast_failure=False + ) + else: + try: + values = construct_1d_ndarray_preserving_na( + flat, dtype=dtype, copy=False + ) + except Exception as err: + # e.g. ValueError when trying to cast object dtype to float64 + msg = f"failed to cast to '{dtype}' (Exception was: {err})" + raise ValueError(msg) from err + values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index afc7ccb516c7f..c841ddbdfde97 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -328,6 +328,18 @@ def test_constructor_int_overflow(self, values): assert result[0].dtype == object assert result[0][0] == value + def test_constructor_int8_overflow(self): + # we silently ignore casting errors as dtype may not apply to all cols + vals = [1, 200, 923442] + + result = DataFrame(vals, dtype="int8") + expected = DataFrame(vals) + tm.assert_frame_equal(result, expected) + + result = DataFrame({"A": vals}, dtype="int8") + expected = DataFrame({"A": vals}) + tm.assert_frame_equal(result, expected) + def test_constructor_ordereddict(self): import random @@ -877,7 +889,9 @@ def test_constructor_maskedarray(self): assert 1.0 == frame["A"][1] assert 2.0 == frame["C"][2] - # what is this even checking?? + def test_constructor_maskedarray2(self): + + # TODO: what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) @@ -904,6 +918,7 @@ def test_constructor_maskedarray_nonfloat(self): assert 1 == frame["A"][1] assert 2 == frame["C"][2] + def test_constructor_maskedarray_nonfloat2(self): # masked np.datetime64 stays (use NaT as null) mat = ma.masked_all((2, 3), dtype="M8[ns]") # 2-D input @@ -925,6 +940,8 @@ def test_constructor_maskedarray_nonfloat(self): assert 1 == frame["A"].view("i8")[1] assert 2 == frame["C"].view("i8")[2] + def test_constructor_maskedarray_nonfloat3(self): + # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9945b739f8a87..3b9cb42da5b33 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -355,7 +355,7 @@ def test_unstack_preserve_dtypes(self): "E": Series([1.0, 50.0, 100.0]).astype("float32"), "F": Series([3.0, 4.0, 5.0]).astype("float64"), "G": False, - "H": Series([1, 200, 923442], dtype="int8"), + "H": Series([1, -56, 50], dtype="int8"), } ) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c2d0bf5975059..a63583867aa8c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1563,6 +1563,15 @@ def test_construction_from_large_int_scalar_no_overflow(self): expected = Series(n) tm.assert_series_equal(result, expected) + def test_constructor_int8_overflow(self): + # see also: test_constructor_int8_overflow in frame tests; + # behavior is different here bc dtype is not ignorable + + vals = [1, 200, 923442] + msg = "Trying to coerce too-large values to int8" + with pytest.raises(OverflowError, match=msg): + Series(vals, dtype="int8") + def test_constructor_list_of_periods_infers_period_dtype(self): series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) assert series.dtype == "Period[D]" From 52432c76bf13ece8cf6814adfb33b4764ed64477 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 27 Feb 2021 19:32:42 -0800 Subject: [PATCH 2/6] 32bit compat --- pandas/tests/frame/test_constructors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 19cad92874db9..7a4f5b13ac3a7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -336,8 +336,10 @@ def test_constructor_int8_overflow(self): expected = DataFrame(vals) tm.assert_frame_equal(result, expected) + # TODO: these should either both come back as int64 or both as intp, + # not mixed-and-matched on 32bit result = DataFrame({"A": vals}, dtype="int8") - expected = DataFrame({"A": vals}) + expected = DataFrame({"A": vals}, dtype=np.intp) tm.assert_frame_equal(result, expected) def test_constructor_ordereddict(self): From e5413741182a9c939bf574370893da50504ad73f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Feb 2021 11:57:36 -0800 Subject: [PATCH 3/6] troubleshoot windows --- pandas/tests/frame/test_constructors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7a4f5b13ac3a7..e8e7466f94206 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -337,9 +337,10 @@ def test_constructor_int8_overflow(self): tm.assert_frame_equal(result, expected) # TODO: these should either both come back as int64 or both as intp, - # not mixed-and-matched on 32bit + # not mixed-and-matched on 32bit/windows result = DataFrame({"A": vals}, dtype="int8") expected = DataFrame({"A": vals}, dtype=np.intp) + assert (expected.dtypes == np.intp).all() # troubleshoot windows builds tm.assert_frame_equal(result, expected) def test_constructor_ordereddict(self): From ab109b6b1d4a3a1a5e892b3878f719aac362600d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Feb 2021 17:41:16 -0800 Subject: [PATCH 4/6] xfail on windows --- pandas/tests/frame/test_constructors.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e8e7466f94206..3d50c5d78d145 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -17,7 +17,10 @@ import pytest import pytz -from pandas.compat import np_version_under1p19 +from pandas.compat import ( + is_platform_windows, + np_version_under1p19, +) from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -328,6 +331,9 @@ def test_constructor_int_overflow(self, values): assert result[0].dtype == object assert result[0][0] == value + @pytest.mark.xfail( + is_platform_windows(), reason="dict case result is int32 but expected is int64" + ) def test_constructor_int8_overflow(self): # we silently ignore casting errors as dtype may not apply to all cols vals = [1, 200, 923442] @@ -340,7 +346,6 @@ def test_constructor_int8_overflow(self): # not mixed-and-matched on 32bit/windows result = DataFrame({"A": vals}, dtype="int8") expected = DataFrame({"A": vals}, dtype=np.intp) - assert (expected.dtypes == np.intp).all() # troubleshoot windows builds tm.assert_frame_equal(result, expected) def test_constructor_ordereddict(self): From b1e2bc27c88796bc23afb5e294bbe2a00225507b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 18:49:26 -0800 Subject: [PATCH 5/6] troubleshoot mypy --- pandas/core/construction.py | 9 ++++++++- pandas/core/internals/construction.py | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 212bad1a012b4..3cb2fbf8c7faf 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -658,7 +658,13 @@ def try_cast_integer_dtype( if not raise_cast_failure: # Just do it anyway, i.e. DataFrame(floats, dtype="int64") # is equivalent to DataFrame(floats).astype("int64") - return construct_1d_ndarray_preserving_na(arr, dtype, copy=copy) + + # error: Argument 1 to "construct_1d_ndarray_preserving_na" + # has incompatible type "Union[List[Any], ndarray]"; + # expected "Sequence[Any]" + return construct_1d_ndarray_preserving_na( # type: ignore[arg-type] + arr, dtype, copy=copy + ) raise @@ -711,6 +717,7 @@ def _try_cast( # GH#15832: Check if we are requesting a numeric dtype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): + dtype = cast(np.dtype, dtype) return try_cast_integer_dtype(arr, dtype, copy, raise_cast_failure) else: subarr = maybe_cast_to_datetime(arr, dtype) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c1b5e3e6c410a..fd45955fc7c66 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -45,6 +45,7 @@ is_named_tuple, is_object_dtype, ) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, @@ -250,7 +251,7 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - if is_extension_array_dtype(values) or is_extension_array_dtype(dtype): + if is_extension_array_dtype(values) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: From 39fcbb9b53c1c0a35f670f153a209b9cee0d5bdf Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 20:12:41 -0800 Subject: [PATCH 6/6] troubleshoot mypy --- pandas/core/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3cb2fbf8c7faf..db2a4c40e7536 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -662,8 +662,8 @@ def try_cast_integer_dtype( # error: Argument 1 to "construct_1d_ndarray_preserving_na" # has incompatible type "Union[List[Any], ndarray]"; # expected "Sequence[Any]" - return construct_1d_ndarray_preserving_na( # type: ignore[arg-type] - arr, dtype, copy=copy + return construct_1d_ndarray_preserving_na( + arr, dtype, copy=copy # type: ignore[arg-type] ) raise