Skip to content

API: silent overflow in Series(bigints, dtype="int8") #40114

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
bc51b86
REF: try_cast_integer_dtype
jbrockmendel Feb 27, 2021
0e4e8af
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 27, 2021
7649eb4
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 27, 2021
fcab68e
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 27, 2021
8ee8032
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 27, 2021
d87363a
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 27, 2021
bccccd0
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 28, 2021
52432c7
32bit compat
jbrockmendel Feb 28, 2021
7a0e0d6
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Feb 28, 2021
e541374
troubleshoot windows
jbrockmendel Feb 28, 2021
ab109b6
xfail on windows
jbrockmendel Mar 1, 2021
785f828
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 3, 2021
710f31d
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 3, 2021
6c5cc5b
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 6, 2021
b7ee325
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 11, 2021
fcde0e1
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 11, 2021
c0ecaf2
Merge branch 'master' into bug-maybe_cast_to_integer_array
jbrockmendel Mar 12, 2021
b1e2bc2
troubleshoot mypy
jbrockmendel Mar 13, 2021
39fcbb9
troubleshoot mypy
jbrockmendel Mar 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,9 @@ def sanitize_array(
if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
# possibility of nan -> garbage
try:
# Note: we could use try_cast_integer_dtype
# (or even maybe_cast_to_integer_array) but
# we can get non-numpy integer-dtypes here
subarr = _try_cast(data, dtype, copy, True)
except ValueError:
subarr = np.array(data, copy=copy)
Expand Down Expand Up @@ -633,6 +636,38 @@ def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike:
return arr


def try_cast_integer_dtype(
    arr: Union[list, np.ndarray], dtype: np.dtype, copy: bool, raise_cast_failure: bool
) -> np.ndarray:
    """
    Cast ``arr`` to an integer ``dtype``, with optional fallbacks on failure.

    The caller is responsible for checking ``is_integer_dtype(dtype)``.

    Parameters
    ----------
    arr : list or np.ndarray
    dtype : np.dtype
    copy : bool
        Forwarded to whichever conversion ends up producing the result.
    raise_cast_failure : bool
        If True, every error raised by the cast propagates to the caller.
        If False (i.e. reached from the DataFrame constructor), an
        OverflowError makes us ignore ``dtype`` and convert with ``np.array``,
        and the "coerce float values to integers" ValueError makes us perform
        the cast anyway.

    Returns
    -------
    np.ndarray

    Raises
    ------
    OverflowError
        If the cast overflows and ``raise_cast_failure`` is True.
    ValueError
        If the cast fails for any reason other than float-to-integer coercion,
        or for that reason while ``raise_cast_failure`` is True.
    """
    # GH#15832: verify that the data can actually be converted to the
    # requested numeric dtype.
    try:
        # raises if we have e.g. floats
        return maybe_cast_to_integer_array(arr, dtype, copy=copy)
    except OverflowError:
        if raise_cast_failure:
            raise
        # DataFrame constructor path: drop the requested dtype rather than
        # overflow silently
        return np.array(arr, copy=copy)
    except ValueError as err:
        float_coercion = "Trying to coerce float values to integers" in str(err)
        if raise_cast_failure or not float_coercion:
            raise
        # Do the cast regardless, so that DataFrame(floats, dtype="int64")
        # matches DataFrame(floats).astype("int64")

        # mypy: Argument 1 to "construct_1d_ndarray_preserving_na" has
        # incompatible type "Union[List[Any], ndarray]"; expected
        # "Sequence[Any]"
        return construct_1d_ndarray_preserving_na(
            arr, dtype, copy=copy  # type: ignore[arg-type]
        )


def _try_cast(
arr: Union[list, np.ndarray],
dtype: Optional[DtypeObj],
Expand Down Expand Up @@ -682,14 +717,8 @@ def _try_cast(
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
if is_integer_dtype(dtype):
# this will raise if we have e.g. floats

# error: Argument 2 to "maybe_cast_to_integer_array" has incompatible type
# "Union[dtype, ExtensionDtype, None]"; expected "Union[ExtensionDtype, str,
# dtype, Type[str], Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]"
maybe_cast_to_integer_array(arr, dtype) # type: ignore[arg-type]
subarr = arr
dtype = cast(np.dtype, dtype)
return try_cast_integer_dtype(arr, dtype, copy, raise_cast_failure)
else:
subarr = maybe_cast_to_datetime(arr, dtype)
if dtype is not None and dtype.kind == "M":
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2140,6 +2140,13 @@ def maybe_cast_to_integer_array(
if is_float_dtype(arr) or is_object_dtype(arr):
raise ValueError("Trying to coerce float values to integers")

if casted.dtype < arr.dtype:
# e.g. orig=[1, 200, 923442] and dtype='int8'
raise OverflowError(f"Trying to coerce too-large values to {dtype}")

# Not sure if this can be reached, but covering our bases
raise ValueError(f"values cannot be losslessly cast to {dtype}")


def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar:
"""
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCDatetimeIndex,
Expand All @@ -64,6 +65,7 @@
from pandas.core.construction import (
extract_array,
sanitize_array,
try_cast_integer_dtype,
)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
Expand Down Expand Up @@ -249,7 +251,7 @@ def ndarray_to_mgr(
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)

if is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
if is_extension_array_dtype(values) or isinstance(dtype, ExtensionDtype):
# GH#19157

if isinstance(values, np.ndarray) and values.ndim > 1:
Expand All @@ -266,20 +268,27 @@ def ndarray_to_mgr(

# by definition an array here
# the dtypes will be coerced to a single dtype
# TODO: the was_masked check is to avoid breaking a very sketchy-looking
# test_constructor_maskedarray
was_masked = isinstance(values, np.ma.MaskedArray)
values = _prep_ndarray(values, copy=copy)

if dtype is not None and not is_dtype_equal(values.dtype, dtype):
shape = values.shape
flat = values.ravel()

if not is_integer_dtype(dtype):
# TODO: skipping integer_dtype is needed to keep the tests passing,
# not clear it is correct
to_int = is_integer_dtype(dtype)
if not was_masked and to_int:
values = try_cast_integer_dtype(
flat, dtype=dtype, copy=copy, raise_cast_failure=False
)
elif not to_int:
# Note: we really only need _try_cast, but we stick to the exposed functions
values = sanitize_array(
flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
)
else:
# TODO: we get here with test_constructor_maskedarray_nonfloat2
# which looks like the test may be wrong
try:
values = construct_1d_ndarray_preserving_na(
flat, dtype=dtype, copy=False
Expand Down
29 changes: 27 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
import pytest
import pytz

from pandas.compat import np_version_under1p19
from pandas.compat import (
is_platform_windows,
np_version_under1p19,
)
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer_dtype
Expand Down Expand Up @@ -338,6 +341,23 @@ def test_constructor_int_overflow(self, values):
assert result[0].dtype == object
assert result[0][0] == value

@pytest.mark.xfail(
    is_platform_windows(), reason="dict case result is int32 but expected is int64"
)
def test_constructor_int8_overflow(self):
    # Casting errors are silently ignored by the DataFrame constructor,
    # since the requested dtype may not apply to every column.
    too_big = [1, 200, 923442]

    # list input: compared against the frame built without any dtype
    tm.assert_frame_equal(DataFrame(too_big, dtype="int8"), DataFrame(too_big))

    # TODO: the list and dict cases should either both come back as int64
    # or both as intp, not mixed-and-matched on 32bit/windows
    as_dict = {"A": too_big}
    tm.assert_frame_equal(
        DataFrame(as_dict, dtype="int8"), DataFrame(as_dict, dtype=np.intp)
    )

def test_constructor_ordereddict(self):
import random

Expand Down Expand Up @@ -896,7 +916,9 @@ def test_constructor_maskedarray(self):
assert 1.0 == frame["A"][1]
assert 2.0 == frame["C"][2]

# what is this even checking??
def test_constructor_maskedarray2(self):

# TODO: what is this even checking??
mat = ma.masked_all((2, 3), dtype=float)
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
assert np.all(~np.asarray(frame == frame))
Expand All @@ -923,6 +945,7 @@ def test_constructor_maskedarray_nonfloat(self):
assert 1 == frame["A"][1]
assert 2 == frame["C"][2]

def test_constructor_maskedarray_nonfloat2(self):
# masked np.datetime64 stays (use NaT as null)
mat = ma.masked_all((2, 3), dtype="M8[ns]")
# 2-D input
Expand All @@ -944,6 +967,8 @@ def test_constructor_maskedarray_nonfloat(self):
assert 1 == frame["A"].view("i8")[1]
assert 2 == frame["C"].view("i8")[2]

def test_constructor_maskedarray_nonfloat3(self):

# masked bool promoted to object
mat = ma.masked_all((2, 3), dtype=bool)
# 2-D input
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def test_unstack_preserve_dtypes(self):
"E": Series([1.0, 50.0, 100.0]).astype("float32"),
"F": Series([3.0, 4.0, 5.0]).astype("float64"),
"G": False,
"H": Series([1, 200, 923442], dtype="int8"),
"H": Series([1, -56, 50], dtype="int8"),
}
)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,6 +1582,15 @@ def test_construction_from_large_int_scalar_no_overflow(self):
expected = Series(n)
tm.assert_series_equal(result, expected)

def test_constructor_int8_overflow(self):
    # Counterpart of test_constructor_int8_overflow in the frame tests;
    # the behavior differs here because the dtype is not ignorable.
    too_big = [1, 200, 923442]

    with pytest.raises(
        OverflowError, match="Trying to coerce too-large values to int8"
    ):
        Series(too_big, dtype="int8")

def test_constructor_list_of_periods_infers_period_dtype(self):
series = Series(list(period_range("2000-01-01", periods=10, freq="D")))
assert series.dtype == "Period[D]"
Expand Down