diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 78a7f1890b5de..db2a4c40e7536 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -498,6 +498,9 @@ def sanitize_array( if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: + # Note: we could use try_cast_integer_dtype + # (or even maybe_cast_to_integer_array) but + # we can get non-numpy integer-dtypes here subarr = _try_cast(data, dtype, copy, True) except ValueError: subarr = np.array(data, copy=copy) @@ -633,6 +636,38 @@ def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike: return arr +def try_cast_integer_dtype( + arr: Union[list, np.ndarray], dtype: np.dtype, copy: bool, raise_cast_failure: bool +) -> np.ndarray: + # Caller is responsible for checking + # - is_integer_dtype(dtype) + + # GH#15832: Check if we are requesting a numeric dtype and + # that we can convert the data to the requested dtype. + try: + # this will raise if we have e.g. floats + return maybe_cast_to_integer_array(arr, dtype, copy=copy) + except OverflowError: + if not raise_cast_failure: + # i.e. reached from DataFrame constructor; ignore dtype + # and cast without silent overflow + return np.array(arr, copy=copy) + raise + except ValueError as err: + if "Trying to coerce float values to integers" in str(err): + if not raise_cast_failure: + # Just do it anyway, i.e. DataFrame(floats, dtype="int64") + # is equivalent to DataFrame(floats).astype("int64") + + # error: Argument 1 to "construct_1d_ndarray_preserving_na" + # has incompatible type "Union[List[Any], ndarray]"; + # expected "Sequence[Any]" + return construct_1d_ndarray_preserving_na( + arr, dtype, copy=copy # type: ignore[arg-type] + ) + raise + + def _try_cast( arr: Union[list, np.ndarray], dtype: Optional[DtypeObj], @@ -682,14 +717,8 @@ def _try_cast( # GH#15832: Check if we are requesting a numeric dtype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): - # this will raise if we have e.g. floats - - # error: Argument 2 to "maybe_cast_to_integer_array" has incompatible type - # "Union[dtype, ExtensionDtype, None]"; expected "Union[ExtensionDtype, str, - # dtype, Type[str], Type[float], Type[int], Type[complex], Type[bool], - # Type[object]]" - maybe_cast_to_integer_array(arr, dtype) # type: ignore[arg-type] - subarr = arr + dtype = cast(np.dtype, dtype) + return try_cast_integer_dtype(arr, dtype, copy, raise_cast_failure) else: subarr = maybe_cast_to_datetime(arr, dtype) if dtype is not None and dtype.kind == "M": diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 44650500e0f65..6683a08b84622 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2140,6 +2140,13 @@ def maybe_cast_to_integer_array( if is_float_dtype(arr) or is_object_dtype(arr): raise ValueError("Trying to coerce float values to integers") + if casted.dtype < arr.dtype: + # e.g. orig=[1, 200, 923442] and dtype='int8' + raise OverflowError(f"Trying to coerce too-large values to {dtype}") + + # Not sure if this can be reached, but covering our bases + raise ValueError(f"values cannot be losslessly cast to {dtype}") + def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 63a437a91f6e4..fd45955fc7c66 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -45,6 +45,7 @@ is_named_tuple, is_object_dtype, ) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, @@ -64,6 +65,7 @@ from pandas.core.construction import ( extract_array, sanitize_array, + try_cast_integer_dtype, ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( @@ -249,7 +251,7 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - if is_extension_array_dtype(values) or is_extension_array_dtype(dtype): + if is_extension_array_dtype(values) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: @@ -266,20 +268,27 @@ def ndarray_to_mgr( # by definition an array here # the dtypes will be coerced to a single dtype + # TODO: the was_masked check is to avoid breaking a very sketchy-looking + # test_constructor_maskedarray + was_masked = isinstance(values, np.ma.MaskedArray) values = _prep_ndarray(values, copy=copy) - if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() - if not is_integer_dtype(dtype): - # TODO: skipping integer_dtype is needed to keep the tests passing, - # not clear it is correct + to_int = is_integer_dtype(dtype) + if not was_masked and to_int: + values = try_cast_integer_dtype( + flat, dtype=dtype, copy=copy, raise_cast_failure=False + ) + elif not to_int: # Note: we really only need _try_cast, but keeping to exposed funcs values = sanitize_array( flat, None, dtype=dtype, copy=copy, raise_cast_failure=True ) else: + # TODO: we get here with test_constructor_maskedarray_nonfloat2 + # which looks like the test may be wrong try: values = construct_1d_ndarray_preserving_na( flat, dtype=dtype, copy=False diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 285d6286931af..993f6c2cf5fe9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -17,7 +17,10 @@ import pytest import pytz -from pandas.compat import np_version_under1p19 +from pandas.compat import ( + is_platform_windows, + np_version_under1p19, +) import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -338,6 +341,23 @@ def test_constructor_int_overflow(self, values): assert result[0].dtype == object assert result[0][0] == value + @pytest.mark.xfail( + is_platform_windows(), reason="dict case result is int32 but expected is int64" + ) + def test_constructor_int8_overflow(self): + # we silently ignore casting errors as dtype may not apply to all cols + vals = [1, 200, 923442] + + result = DataFrame(vals, dtype="int8") + expected = DataFrame(vals) + tm.assert_frame_equal(result, expected) + + # TODO: these should either both come back as int64 or both as intp, + # not mixed-and-matched on 32bit/windows + result = DataFrame({"A": vals}, dtype="int8") + expected = DataFrame({"A": vals}, dtype=np.intp) + tm.assert_frame_equal(result, expected) + def test_constructor_ordereddict(self): import random @@ -896,7 +916,9 @@ def test_constructor_maskedarray(self): assert 1.0 == frame["A"][1] assert 2.0 == frame["C"][2] - # what is this even checking?? + def test_constructor_maskedarray2(self): + + # TODO: what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) @@ -923,6 +945,7 @@ def test_constructor_maskedarray_nonfloat(self): assert 1 == frame["A"][1] assert 2 == frame["C"][2] + def test_constructor_maskedarray_nonfloat2(self): # masked np.datetime64 stays (use NaT as null) mat = ma.masked_all((2, 3), dtype="M8[ns]") # 2-D input @@ -944,6 +967,8 @@ def test_constructor_maskedarray_nonfloat(self): assert 1 == frame["A"].view("i8")[1] assert 2 == frame["C"].view("i8")[2] + def test_constructor_maskedarray_nonfloat3(self): + # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 253f416bd6f18..11bb1e3d949e2 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -358,7 +358,7 @@ def test_unstack_preserve_dtypes(self): "E": Series([1.0, 50.0, 100.0]).astype("float32"), "F": Series([3.0, 4.0, 5.0]).astype("float64"), "G": False, - "H": Series([1, 200, 923442], dtype="int8"), + "H": Series([1, -56, 50], dtype="int8"), } ) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ab484e7ae9d8a..698301374ba38 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1582,6 +1582,15 @@ def test_construction_from_large_int_scalar_no_overflow(self): expected = Series(n) tm.assert_series_equal(result, expected) + def test_constructor_int8_overflow(self): + # see also: test_constructor_int8_overflow in frame tests; + # behavior is different here bc dtype is not ignorable + + vals = [1, 200, 923442] + msg = "Trying to coerce too-large values to int8" + with pytest.raises(OverflowError, match=msg): + Series(vals, dtype="int8") + def test_constructor_list_of_periods_infers_period_dtype(self): series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) assert series.dtype == "Period[D]"