diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index af7706f624323..0fddbdd57a023 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -470,6 +470,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large too hold in int64 (:issue:`42201`) - Bug in :class:`Series` constructor returning 0 for missing values with dtype ``int64`` and ``False`` for dtype ``bool`` (:issue:`43017`, :issue:`43018`) +- Bug in :class:`IntegerDtype` not allowing coercion from string dtype (:issue:`25472`) - Bug in :func:`to_datetime` with ``arg:xr.DataArray`` and ``unit="ns"`` specified raises TypeError (:issue:`44053`) - diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4d59832655162..d8b7bf2b86d2c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -32,6 +32,7 @@ is_integer_dtype, is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -124,12 +125,10 @@ def safe_cast(values, dtype, copy: bool): Safely cast the values to the dtype if they are equivalent, meaning floats must be equivalent to the ints. - """ try: return values.astype(dtype, casting="safe", copy=copy) except TypeError as err: - casted = values.astype(dtype, copy=copy) if (casted == values).all(): return casted @@ -143,7 +142,7 @@ def coerce_to_array( values, dtype, mask=None, copy: bool = False ) -> tuple[np.ndarray, np.ndarray]: """ - Coerce the input values array to numpy arrays with a mask + Coerce the input values array to numpy arrays with a mask. Parameters ---------- @@ -187,7 +186,8 @@ def coerce_to_array( return values, mask values = np.array(values, copy=copy) - if is_object_dtype(values): + inferred_type = None + if is_object_dtype(values) or is_string_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": values = np.empty(len(values)) @@ -198,6 +198,8 @@ def coerce_to_array( "mixed-integer", "integer-na", "mixed-integer-float", + "string", + "unicode", ]: raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") @@ -230,7 +232,10 @@ def coerce_to_array( if mask.any(): values = values.copy() values[mask] = 1 - values = safe_cast(values, dtype, copy=False) + if inferred_type in ("string", "unicode"): + # casts from str are always safe since they raise + # a ValueError if the str cannot be parsed into an int + values = values.astype(dtype, copy=copy) else: values = safe_cast(values, dtype, copy=False) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a9c2b31849425..2e8641c281661 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -143,14 +143,14 @@ def ensure_python_int(value: int | np.integer) -> int: def classes(*klasses) -> Callable: - """evaluate if the tipo is a subclass of the klasses""" + """Evaluate if the tipo is a subclass of the klasses.""" return lambda tipo: issubclass(tipo, klasses) def classes_and_not_datetimelike(*klasses) -> Callable: """ - evaluate if the tipo is a subclass of the klasses - and not a datetimelike + Evaluate if the tipo is a subclass of the klasses + and not a datetimelike. """ return lambda tipo: ( issubclass(tipo, klasses) @@ -674,7 +674,7 @@ def is_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an integer dtype. - Unlike in `in_any_int_dtype`, timedelta64 instances will return False. + Unlike in `is_any_int_dtype`, timedelta64 instances will return False. The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered as integer by this function. @@ -726,7 +726,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a signed integer dtype. - Unlike in `in_any_int_dtype`, timedelta64 instances will return False. + Unlike in `is_any_int_dtype`, timedelta64 instances will return False. The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered as integer by this function. @@ -1521,7 +1521,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: def _is_dtype(arr_or_dtype, condition) -> bool: """ - Return a boolean if the condition is satisfied for the arr_or_dtype. + Return true if the condition is satisfied for the arr_or_dtype. Parameters ---------- @@ -1580,7 +1580,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj: def _is_dtype_type(arr_or_dtype, condition) -> bool: """ - Return a boolean if the condition is satisfied for the arr_or_dtype. + Return true if the condition is satisfied for the arr_or_dtype. Parameters ---------- diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index b48567d37ecaf..be9230175bb5d 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -44,7 +44,6 @@ def test_from_dtype_from_float(data): def test_conversions(data_missing): - # astype to object series df = pd.DataFrame({"A": data_missing}) result = df["A"].astype("object") @@ -123,7 +122,6 @@ def test_to_integer_array_none_is_nan(a, b): "values", [ ["foo", "bar"], - ["1", "2"], "foo", 1, 1.0, @@ -137,13 +135,14 @@ def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays msg = ( r"(:?.* cannot be converted to an IntegerDtype)" + r"|(invalid literal for int\(\) with base 10: .*)" r"|(:?values must be a 1D list-like)" r"|(Cannot pass scalar)" ) with pytest.raises((ValueError, TypeError), match=msg): pd.array(values, dtype="Int64") - with pytest.raises(TypeError, match=msg): + with pytest.raises((ValueError, TypeError), match=msg): IntegerArray._from_sequence(values) @@ -181,6 +180,22 @@ def test_to_integer_array_float(): assert result.dtype == Int64Dtype() +def test_to_integer_array_str(): + result = IntegerArray._from_sequence(["1", "2", None]) + expected = pd.array([1, 2, np.nan], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises( + ValueError, match=r"invalid literal for int\(\) with base 10: .*" + ): + IntegerArray._from_sequence(["1", "2", ""]) + + with pytest.raises( + ValueError, match=r"invalid literal for int\(\) with base 10: .*" + ): + IntegerArray._from_sequence(["1.5", "2.0"]) + + @pytest.mark.parametrize( "bool_values, int_values, target_dtype, expected_dtype", [ diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 726cd64c6dc23..cdf9c0a1784a4 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -314,3 +314,23 @@ def test_dtype_multi_index(all_parsers): ) tm.assert_frame_equal(result, expected) + + +def test_nullable_int_dtype(all_parsers, any_int_ea_dtype): + # GH 25472 + parser = all_parsers + dtype = any_int_ea_dtype + + data = """a,b,c +,3,5 +1,,6 +2,4,""" + expected = DataFrame( + { + "a": pd.array([pd.NA, 1, 2], dtype=dtype), + "b": pd.array([3, pd.NA, 4], dtype=dtype), + "c": pd.array([5, 6, pd.NA], dtype=dtype), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected)