diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 7ad7e8f5a27b0..e4ef752a33635 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -241,6 +241,9 @@ Backwards incompatible API changes
 - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
 - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
 - Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`)
+- Combining a ``Categorical`` that has integer categories and contains missing values
+  with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append`
+  will now result in a float column instead of an object dtype column (:issue:`33607`)
 
 ``MultiIndex.get_indexer`` interprets `method` argument differently
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
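As a quick illustration of the entry above (a sketch of the new behaviour; the expected dtype is taken from the updated tests at the end of this diff, and the snippet is not itself part of the patch):

```python
import numpy as np
import pandas as pd

# Categorical with integer categories that contains a missing value
s1 = pd.Series([1, 2, np.nan], dtype="category")
s2 = pd.Series([3.5, 4.5])  # plain float64 column

# Previously this produced an object column; the integer categories
# are now upcast to float so the NaN can be represented.
result = pd.concat([s1, s2], ignore_index=True)
print(result.dtype)  # float64
```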
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 7447d593a7ff0..bd903d9b1fae3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1004,7 +1004,7 @@ def _concat_same_type(
         cls, to_concat: Sequence["ExtensionArray"]
     ) -> "ExtensionArray":
         """
-        Concatenate multiple array.
+        Concatenate multiple arrays of this dtype.
 
         Parameters
         ----------
@@ -1014,6 +1014,11 @@ def _concat_same_type(
         -------
         ExtensionArray
         """
+        # Implementer note: this method will only be called with a sequence of
+        # ExtensionArrays of this class and with the same dtype as self. This
+        # should allow "easy" concatenation (no upcasting needed), and result
+        # in a new ExtensionArray of the same dtype.
+        # Note: this strict behaviour is only guaranteed starting with pandas 1.1.
         raise AbstractMethodError(cls)
 
     # The _can_hold_na attribute is set to True so that pandas internals
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index de401368d55d7..a7a0df3908268 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2296,9 +2296,9 @@ def _can_hold_na(self):
 
     @classmethod
     def _concat_same_type(self, to_concat):
-        from pandas.core.dtypes.concat import concat_categorical
+        from pandas.core.dtypes.concat import union_categoricals
 
-        return concat_categorical(to_concat)
+        return union_categoricals(to_concat)
 
     def isin(self, values):
         """
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 9d41071755e6f..743267534bfaa 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -1,11 +1,11 @@
 import numbers
-from typing import TYPE_CHECKING, Tuple, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
 import warnings
 
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
-from pandas._typing import ArrayLike
+from pandas._typing import ArrayLike, DtypeObj
 from pandas.compat import set_function_name
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import cache_readonly
@@ -96,6 +96,17 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
         """
         return IntegerArray
 
+    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        # for now only handle other integer types
+        if not all(isinstance(t, _IntegerDtype) for t in dtypes):
+            return None
+        np_dtype = np.find_common_type(
+            [t.numpy_dtype for t in dtypes], []  # type: ignore
+        )
+        if np.issubdtype(np_dtype, np.integer):
+            return _dtypes[str(np_dtype)]
+        return None
+
     def __from_arrow__(
         self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
     ) -> "IntegerArray":
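In practice, `_IntegerDtype._get_common_dtype` means concatenating nullable-integer columns now follows NumPy's promotion rules on the underlying dtypes; the expected dtypes below mirror the parametrization of the new `test_concat_series` added later in this diff:

```python
import pandas as pd

# Int8 and Int16 promote via NumPy's rules on the underlying dtypes
a = pd.Series([1, 2, pd.NA], dtype="Int8")
b = pd.Series([1, 2, pd.NA], dtype="Int16")
print(pd.concat([a, b]).dtype)  # Int16

# Int64/UInt64 has no common *integer* type: _get_common_dtype returns
# None and the concatenation falls back to object dtype.
c = pd.Series([1, 2, pd.NA], dtype="Int64")
d = pd.Series([1, 2, pd.NA], dtype="UInt64")
print(pd.concat([c, d]).dtype)  # object
```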
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 72b6e07942d5e..e327e11a17f4f 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -952,27 +952,7 @@ def copy(self):
 
     @classmethod
     def _concat_same_type(cls, to_concat):
-        fill_values = [x.fill_value for x in to_concat]
-
-        fill_value = fill_values[0]
-
-        # np.nan isn't a singleton, so we may end up with multiple
-        # NaNs here, so we ignore tha all NA case too.
-        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
-            warnings.warn(
-                "Concatenating sparse arrays with multiple fill "
-                f"values: '{fill_values}'. Picking the first and "
-                "converting the rest.",
-                PerformanceWarning,
-                stacklevel=6,
-            )
-            keep = to_concat[0]
-            to_concat2 = [keep]
-
-            for arr in to_concat[1:]:
-                to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
-
-            to_concat = to_concat2
+        fill_value = to_concat[0].fill_value
 
         values = []
         length = 0
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
index afa11586fda04..156a90f6ce600 100644
--- a/pandas/core/arrays/sparse/dtype.py
+++ b/pandas/core/arrays/sparse/dtype.py
@@ -1,11 +1,13 @@
 """Sparse Dtype"""
 
 import re
-from typing import TYPE_CHECKING, Any, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
+import warnings
 
 import numpy as np
 
-from pandas._typing import Dtype
+from pandas._typing import Dtype, DtypeObj
+from pandas.errors import PerformanceWarning
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
@@ -352,3 +354,23 @@ def _subtype_with_str(self):
         if isinstance(self.fill_value, str):
             return type(self.fill_value)
         return self.subtype
+
+    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+
+        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
+        fill_value = fill_values[0]
+
+        # np.nan isn't a singleton, so we may end up with multiple
+        # NaNs here, so we ignore the all-NA case too.
+        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
+            warnings.warn(
+                "Concatenating sparse arrays with multiple fill "
+                f"values: '{fill_values}'. Picking the first and "
+                "converting the rest.",
+                PerformanceWarning,
+                stacklevel=6,
+            )
+
+        # TODO also handle non-numpy other dtypes
+        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
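Because the fill-value reconciliation now happens in `SparseDtype._get_common_dtype`, concatenating sparse data with differing fill values still warns but resolves the dtype up front. A sketch of the expected behaviour (the resulting dtype is my reading of the code above, not taken from a test in this diff):

```python
import pandas as pd
from pandas.arrays import SparseArray

left = pd.Series(SparseArray([0, 1, 2], fill_value=0))
right = pd.Series(SparseArray([2, 1, 0], fill_value=2))

# Differing fill values still concatenate, but emit a
# PerformanceWarning and keep the first fill value.
result = pd.concat([left, right], ignore_index=True)
print(result.dtype)  # Sparse[int64, 0]
```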
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index a4f0ccc2016c0..2d81dd4d884a3 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 
+from pandas._typing import DtypeObj
 from pandas.errors import AbstractMethodError
 
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -33,11 +34,12 @@ class ExtensionDtype:
     * type
     * name
 
-    The following attributes influence the behavior of the dtype in
+    The following attributes and methods influence the behavior of the dtype in
     pandas operations
 
     * _is_numeric
     * _is_boolean
+    * _get_common_dtype
 
     Optionally one can override construct_array_type for construction
     with the name of this dtype via the Registry. See
@@ -322,3 +324,31 @@ def _is_boolean(self) -> bool:
         bool
         """
         return False
+
+    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        """
+        Return the common dtype, if one exists.
+
+        Used in the `find_common_type` implementation. This is for example
+        used to determine the resulting dtype in a concat operation.
+
+        If no common dtype exists, return None (which gives the other dtypes
+        the chance to determine a common dtype). If all dtypes in the list
+        return None, then the common dtype will be "object" dtype (this means
+        it is never necessary to return "object" dtype from this method
+        itself).
+
+        Parameters
+        ----------
+        dtypes : list of dtypes
+            The dtypes for which to determine a common dtype. This is a list
+            of np.dtype or ExtensionDtype instances.
+
+        Returns
+        -------
+        Common dtype (np.dtype or ExtensionDtype) or None
+        """
+        if len(set(dtypes)) == 1:
+            # only itself
+            return self
+        else:
+            return None
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index e50d635a1ba6c..ad307fd99ec9c 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -3,7 +3,7 @@
 """
 
 from datetime import date, datetime, timedelta
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
 
 import numpy as np
 
@@ -1423,7 +1423,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"):
     return value
 
 
-def find_common_type(types):
+def find_common_type(types: List[DtypeObj]) -> DtypeObj:
     """
     Find a common data type among the given dtypes.
 
@@ -1450,8 +1450,16 @@ def find_common_type(types):
     if all(is_dtype_equal(first, t) for t in types[1:]):
         return first
 
+    # get unique types (dict.fromkeys is used as an order-preserving set())
+    types = list(dict.fromkeys(types).keys())
+
     if any(isinstance(t, ExtensionDtype) for t in types):
-        return np.object
+        for t in types:
+            if isinstance(t, ExtensionDtype):
+                res = t._get_common_dtype(types)
+                if res is not None:
+                    return res
+        return np.dtype("object")
 
     # take lowest unit
     if all(is_datetime64_dtype(t) for t in types):
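`find_common_type` (a private helper in `pandas.core.dtypes.cast`, shown in the hunk above) now gives each extension dtype a chance to resolve the common dtype before the object fallback. A rough sketch of the dispatch (illustration only):

```python
import numpy as np
from pandas import CategoricalDtype
from pandas.core.dtypes.cast import find_common_type

# plain NumPy dtypes still go through np.find_common_type
print(find_common_type([np.dtype("int64"), np.dtype("float32")]))  # float64

# identical extension dtypes short-circuit to that dtype
dtype = CategoricalDtype(["a", "b"])
print(find_common_type([dtype, dtype]))  # category

# an extension dtype mixed with a NumPy dtype is resolved by the
# extension dtype's _get_common_dtype hook
print(find_common_type([CategoricalDtype([1, 2]), np.dtype("float64")]))  # float64
```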
+ """ + if ( + is_categorical_dtype(arr.dtype) + and isinstance(dtype, np.dtype) + and np.issubdtype(dtype, np.integer) + ): + # problem case: categorical of int -> gives int as result dtype, + # but categorical can contain NAs -> fall back to object dtype + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr.astype(object, copy=False) + + if ( + isinstance(arr, np.ndarray) + and arr.dtype.kind in ["m", "M"] + and dtype is np.dtype("object") + ): + # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta + # this can happen when concat_compat is called directly on arrays (when arrays + # are not coming from Index/Series._values), eg in BlockManager.quantile + arr = array(arr) + + if is_extension_array_dtype(dtype): + if isinstance(arr, np.ndarray): + # numpy's astype cannot handle ExtensionDtypes + return array(arr, dtype=dtype, copy=False) + return arr.astype(dtype, copy=False) + + def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single @@ -93,28 +133,25 @@ def is_nonempty(x) -> bool: typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) - _contains_period = any(typ.startswith("period") for typ in typs) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) - if any_ea and single_dtype and axis == 0: - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) + if any_ea and axis == 0: + if not single_dtype: + target_dtype = find_common_type([x.dtype for x in to_concat]) + to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - elif "category" in typs: - # this must be prior to concat_datetime, - # to support Categorical + datetime-like - return concat_categorical(to_concat, axis=axis) + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + else: + return np.concatenate(to_concat) - elif _contains_datetime or "timedelta" in typs or _contains_period: + elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) - # these are mandated to handle empties as well - elif "sparse" in typs: - return _concat_sparse(to_concat, axis=axis, typs=typs) - elif any_ea and axis == 1: to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] return np.concatenate(to_concat, axis=axis) @@ -136,53 +173,6 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis: int = 0): - """ - Concatenate an object/categorical array of arrays, each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : int - Axis to provide concatenation in the current implementation this is - always 0, e.g. 
@@ -136,53 +173,6 @@ def is_nonempty(x) -> bool:
     return np.concatenate(to_concat, axis=axis)
 
 
-def concat_categorical(to_concat, axis: int = 0):
-    """
-    Concatenate an object/categorical array of arrays, each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : int
-        Axis to provide concatenation in the current implementation this is
-        always 0, e.g. we only have 1D categoricals
-
-    Returns
-    -------
-    Categorical
-        A single array, preserving the combined dtypes
-    """
-    # we could have object blocks and categoricals here
-    # if we only have a single categoricals then combine everything
-    # else its a non-compat categorical
-    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
-
-    # validate the categories
-    if len(categoricals) != len(to_concat):
-        pass
-    else:
-        # when all categories are identical
-        first = to_concat[0]
-        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
-            return union_categoricals(categoricals)
-
-    # extract the categoricals & coerce to object if needed
-    to_concat = [
-        x._internal_get_values()
-        if is_categorical_dtype(x.dtype)
-        else np.asarray(x).ravel()
-        if not is_datetime64tz_dtype(x)
-        else np.asarray(x.astype(object))
-        for x in to_concat
-    ]
-    result = concat_compat(to_concat)
-    if axis == 1:
-        # TODO(EA2D): not necessary with 2D EAs
-        result = result.reshape(1, len(result))
-    return result
-
-
 def union_categoricals(
     to_union, sort_categories: bool = False, ignore_order: bool = False
 ):
@@ -415,34 +405,3 @@ def _wrap_datetimelike(arr):
     if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
         arr = pd_array(arr)
     return arr
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)
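`Categorical._concat_same_type` now defers to `union_categoricals`, and the identical-versus-mismatched categories logic from the removed `concat_categorical` lives in dtype resolution instead. A sketch of both outcomes (illustration, not part of the patch):

```python
import pandas as pd

s1 = pd.Series([1, 2], dtype="category")
s2 = pd.Series([2, 1], dtype="category")  # same categories -> same dtype

# identical categorical dtypes: the concatenation stays categorical
print(pd.concat([s1, s2], ignore_index=True).dtype)  # category

# mismatched categories: fall back to the common dtype of the categories
s3 = pd.Series([3, 4], dtype="category")
print(pd.concat([s1, s3], ignore_index=True).dtype)  # int64
```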
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 8fe2b3c60d6d0..ceed7e29e4a35 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -21,7 +21,7 @@
 
 from pandas._libs.interval import Interval
 from pandas._libs.tslibs import NaT, Period, Timestamp, timezones
-from pandas._typing import Ordered
+from pandas._typing import DtypeObj, Ordered
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass
@@ -640,6 +640,32 @@ def _is_boolean(self) -> bool:
 
         return is_bool_dtype(self.categories)
 
+    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        # check if we have all categorical dtypes with identical categories
+        if all(isinstance(x, CategoricalDtype) for x in dtypes):
+            first = dtypes[0]
+            if all(first == other for other in dtypes[1:]):
+                return first
+
+        # special case non-initialized categorical
+        # TODO we should figure out the expected return value in general
+        non_init_cats = [
+            isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
+        ]
+        if all(non_init_cats):
+            return self
+        elif any(non_init_cats):
+            return None
+
+        # extract the categories' dtype
+        non_cat_dtypes = [
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+        ]
+        # TODO should categorical always give an answer?
+        from pandas.core.dtypes.cast import find_common_type
+
+        return find_common_type(non_cat_dtypes)
+
 
 @register_extension_dtype
 class DatetimeTZDtype(PandasExtensionDtype):
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index e9bbd915df768..bd07fefd03d2a 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -24,6 +24,7 @@
 from pandas.core.dtypes.missing import isna
 
 import pandas.core.algorithms as algos
+from pandas.core.arrays import ExtensionArray
 from pandas.core.internals.blocks import make_block
 from pandas.core.internals.managers import BlockManager
 
@@ -65,13 +66,13 @@ def concatenate_block_managers(
             blk = join_units[0].block
             vals = [ju.block.values for ju in join_units]
 
-            if not blk.is_extension or blk.is_datetimetz or blk.is_categorical:
-                # datetimetz and categorical can have the same type but multiple
-                # dtypes, concatting does not necessarily preserve dtype
+            if not blk.is_extension:
                 values = concat_compat(vals, axis=blk.ndim - 1)
             else:
                 # TODO(EA2D): special-casing not needed with 2D EAs
                 values = concat_compat(vals)
+                if not isinstance(values, ExtensionArray):
+                    values = values.reshape(1, len(values))
 
             b = make_block(values, placement=placement, ndim=blk.ndim)
         else:
diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py
new file mode 100644
index 0000000000000..3ace35700bd3e
--- /dev/null
+++ b/pandas/tests/arrays/integer/test_concat.py
@@ -0,0 +1,26 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "to_concat_dtypes, result_dtype",
+    [
+        (["Int64", "Int64"], "Int64"),
+        (["UInt64", "UInt64"], "UInt64"),
+        (["Int8", "Int8"], "Int8"),
+        (["Int8", "Int16"], "Int16"),
+        (["UInt8", "Int8"], "Int16"),
+        (["Int32", "UInt32"], "Int64"),
+        # this still gives object (awaiting float extension dtype)
+        (["Int64", "UInt64"], "object"),
+    ],
+)
+def test_concat_series(to_concat_dtypes, result_dtype):
+
+    result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
+    expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
+        result_dtype
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py
index ee4e199fbfe45..65e32d716a4db 100644
--- a/pandas/tests/extension/base/dtype.py
+++ b/pandas/tests/extension/base/dtype.py
@@ -112,3 +112,10 @@ def test_construct_from_string_wrong_type_raises(self, dtype):
             match="'construct_from_string' expects a string, got ",
         ):
             type(dtype).construct_from_string(0)
+
+    def test_get_common_dtype(self, dtype):
+        # in practice we will not typically call this with a 1-length list
+        # (we shortcut to just use that dtype as the common dtype), but
+        # still testing it is good practice (and it is the only case we
+        # can test in general)
+        assert dtype._get_common_dtype([dtype]) == dtype
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index b2239c077bd69..5fd44d7cd74a9 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -584,7 +584,7 @@ def test_interleave_dtype(self, mgr_string, dtype):
         mgr = create_mgr("a: complex")
         assert mgr.as_array().dtype == "complex"
         mgr = create_mgr("a: f8; b: category")
-        assert mgr.as_array().dtype == "object"
+        assert mgr.as_array().dtype == "f8"
         mgr = create_mgr("a: M8[ns]; b: category")
         assert mgr.as_array().dtype == "object"
         mgr = create_mgr("a: M8[ns]; b: bool")
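The updated `test_interleave_dtype` expectation corresponds to user-facing behaviour when blocks are interleaved, e.g. via `DataFrame.to_numpy`. A sketch (assuming `to_numpy` goes through the same `as_array` interleaving exercised by the test):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"a": np.array([1.0, 2.0], dtype="f8"), "b": pd.Categorical([1, 2])}
)

# previously object; the common dtype of f8 and int categories is f8
print(df.to_numpy().dtype)  # float64
```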
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 7c01664df0607..ac3d83c29cdc4 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -610,11 +610,11 @@ def test_concat_categorical_3elem_coercion(self):
         s2 = pd.Series([2, 1, 2], dtype="category")
         s3 = pd.Series([1, 2, 1, 2, np.nan])
 
-        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object")
+        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float")
         tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
         tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
 
-        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object")
+        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float")
         tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
         tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
 
@@ -698,7 +698,7 @@ def test_concat_categorical_coercion_nan(self):
         s1 = pd.Series([1, np.nan], dtype="category")
         s2 = pd.Series([np.nan, np.nan])
 
-        exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object")
+        exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="float")
         tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
         tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)