ENH: Implement convert_dtypes (#30929)

Dr-Irv · jreback · commit 08f2d6411290 · 2020-01-23T22:24:15.000-05:00
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -43,6 +43,7 @@ Conversion
    :toctree: api/
 
    DataFrame.astype
+   DataFrame.convert_dtypes
    DataFrame.infer_objects
    DataFrame.copy
    DataFrame.isna
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -46,6 +46,7 @@ Conversion
    :toctree: api/
 
    Series.astype
+   Series.convert_dtypes
    Series.infer_objects
    Series.copy
    Series.bool
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``:
 
 Currently, pandas does not yet use those data types by default (when creating
 a DataFrame or Series, or when reading in data), so you need to specify
-the dtype explicitly.
+the dtype explicitly.  An easy way to convert to those dtypes is explained
+:ref:`here <missing_data.NA.conversion>`.
 
 Propagation in arithmetic and comparison operations
 ---------------------------------------------------
@@ -942,3 +943,29 @@ work with ``NA``, and generally return ``NA``:
    in the future.
 
 See :ref:`dsintro.numpy_interop` for more on ufuncs.
+
+.. _missing_data.NA.conversion:
+
+Conversion
+----------
+
+If you have a DataFrame or Series using traditional types that have missing data
+represented using ``np.nan``, there are convenience methods
+:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes`
+in DataFrame that can convert data to use the newer dtypes for integers, strings and
+booleans listed :ref:`here <basics.dtypes>`. This is especially helpful after reading
+in data sets when letting the readers such as :meth:`read_csv` and :meth:`read_excel`
+infer default dtypes.
+
+In this example, while the dtypes of all columns are changed, we show the results for
+the first 10 columns.
+
+.. ipython:: python
+
+   bb = pd.read_csv('data/baseball.csv', index_col='id')
+   bb[bb.columns[:10]].dtypes
+
+.. ipython:: python
+
+   bbn = bb.convert_dtypes()
+   bbn[bbn.columns[:10]].dtypes
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,36 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_100.convert_dtypes:
+
+``convert_dtypes`` method to ease use of supported extension dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to encourage use of the extension dtypes ``StringDtype``,
+``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the
+methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes`
+have been introduced. (:issue:`29752`) (:issue:`30929`)
+
+Example:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'x': ['abc', None, 'def'],
+                      'y': [1, 2, np.nan],
+                      'z': [True, False, True]})
+   df
+   df.dtypes
+
+.. ipython:: python
+
+   converted = df.convert_dtypes()
+   converted
+   converted.dtypes
+
+This is especially useful after reading in data using readers such as :func:`read_csv`
+and :func:`read_excel`.
+See :ref:`here <missing_data.NA.conversion>` for a description.
+
 .. _whatsnew_110.period_index_partial_string_slicing:
 
 Nonmonotonic PeriodIndex Partial String Slicing
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -7,6 +7,7 @@
 from pandas._libs import lib, tslib, tslibs
 from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
 from pandas._libs.tslibs.timezones import tz_compare
+from pandas._typing import Dtype
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.common import (
@@ -34,6 +35,7 @@
     is_float_dtype,
     is_integer,
     is_integer_dtype,
+    is_numeric_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -1018,6 +1020,80 @@ def soft_convert_objects(
     return values
 
 
+def convert_dtypes(
+    input_array,
+    convert_string: bool = True,
+    convert_integer: bool = True,
+    convert_boolean: bool = True,
+) -> Dtype:
+    """
+    Convert objects to best possible type, and optionally,
+    to types supporting ``pd.NA``.
+
+    Parameters
+    ----------
+    input_array : ExtensionArray or PandasArray
+    convert_string : bool, default True
+        Whether object dtypes should be converted to ``StringDtype()``.
+    convert_integer : bool, default True
+        Whether, if possible, conversion can be done to integer extension types.
+    convert_boolean : bool, defaults True
+        Whether object dtypes should be converted to ``BooleanDtypes()``.
+
+    Returns
+    -------
+    dtype
+        new dtype
+    """
+
+    if convert_string or convert_integer or convert_boolean:
+        try:
+            inferred_dtype = lib.infer_dtype(input_array)
+        except ValueError:
+            # Required to catch due to Period.  Can remove once GH 23553 is fixed
+            inferred_dtype = input_array.dtype
+
+        if not convert_string and is_string_dtype(inferred_dtype):
+            inferred_dtype = input_array.dtype
+
+        if convert_integer:
+            target_int_dtype = "Int64"
+
+            if isinstance(inferred_dtype, str) and (
+                inferred_dtype == "mixed-integer"
+                or inferred_dtype == "mixed-integer-float"
+            ):
+                inferred_dtype = target_int_dtype
+            if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype(
+                input_array.dtype
+            ):
+                from pandas.core.arrays.integer import _dtypes
+
+                inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype)
+            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
+                input_array.dtype
+            ):
+                inferred_dtype = target_int_dtype
+
+        else:
+            if is_integer_dtype(inferred_dtype):
+                inferred_dtype = input_array.dtype
+
+        if convert_boolean:
+            if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype(
+                input_array.dtype
+            ):
+                inferred_dtype = "boolean"
+        else:
+            if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
+                inferred_dtype = input_array.dtype
+
+    else:
+        inferred_dtype = input_array.dtype
+
+    return inferred_dtype
+
+
 def maybe_castable(arr) -> bool:
     # return False to force a non-fastpath
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -5702,6 +5702,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
         to_datetime : Convert argument to datetime.
         to_timedelta : Convert argument to timedelta.
         to_numeric : Convert argument to numeric type.
+        convert_dtypes : Convert argument to best possible dtype.
 
         Examples
         --------
@@ -5730,6 +5731,142 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
             )
         ).__finalize__(self)
 
+    def convert_dtypes(
+        self: FrameOrSeries,
+        infer_objects: bool_t = True,
+        convert_string: bool_t = True,
+        convert_integer: bool_t = True,
+        convert_boolean: bool_t = True,
+    ) -> FrameOrSeries:
+        """
+        Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        infer_objects : bool, default True
+            Whether object dtypes should be converted to the best possible types.
+        convert_string : bool, default True
+            Whether object dtypes should be converted to ``StringDtype()``.
+        convert_integer : bool, default True
+            Whether, if possible, conversion can be done to integer extension types.
+        convert_boolean : bool, defaults True
+            Whether object dtypes should be converted to ``BooleanDtypes()``.
+
+        Returns
+        -------
+        Series or DataFrame
+            Copy of input object with new dtype.
+
+        See Also
+        --------
+        infer_objects : Infer dtypes of objects.
+        to_datetime : Convert argument to datetime.
+        to_timedelta : Convert argument to timedelta.
+        to_numeric : Convert argument to a numeric type.
+
+        Notes
+        -----
+
+        By default, ``convert_dtypes`` will attempt to convert a Series (or each
+        Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
+        ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
+        possible to turn off individual conversions to ``StringDtype``, the integer
+        extension types or ``BooleanDtype``, respectively.
+
+        For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
+        rules as during normal Series/DataFrame construction.  Then, if possible,
+        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
+        type, otherwise leave as ``object``.
+
+        If the dtype is integer, convert to an appropriate integer extension type.
+
+        If the dtype is numeric, and consists of all integers, convert to an
+        appropriate integer extension type.
+
+        In the future, as new dtypes are added that support ``pd.NA``, the results
+        of this method will change to support those new dtypes.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
+        ...         "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
+        ...         "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
+        ...         "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
+        ...         "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
+        ...         "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
+        ...     }
+        ... )
+
+        Start with a DataFrame with default dtypes.
+
+        >>> df
+           a  b      c    d     e      f
+        0  1  x   True    h  10.0    NaN
+        1  2  y  False    i   NaN  100.5
+        2  3  z    NaN  NaN  20.0  200.0
+
+        >>> df.dtypes
+        a      int32
+        b     object
+        c     object
+        d     object
+        e    float64
+        f    float64
+        dtype: object
+
+        Convert the DataFrame to use best possible dtypes.
+
+        >>> dfn = df.convert_dtypes()
+        >>> dfn
+           a  b      c     d     e      f
+        0  1  x   True     h    10    NaN
+        1  2  y  False     i  <NA>  100.5
+        2  3  z   <NA>  <NA>    20  200.0
+
+        >>> dfn.dtypes
+        a      Int32
+        b     string
+        c    boolean
+        d     string
+        e      Int64
+        f    float64
+        dtype: object
+
+        Start with a Series of strings and missing data represented by ``np.nan``.
+
+        >>> s = pd.Series(["a", "b", np.nan])
+        >>> s
+        0      a
+        1      b
+        2    NaN
+        dtype: object
+
+        Obtain a Series with dtype ``StringDtype``.
+
+        >>> s.convert_dtypes()
+        0       a
+        1       b
+        2    <NA>
+        dtype: string
+        """
+        if self.ndim == 1:
+            return self._convert_dtypes(
+                infer_objects, convert_string, convert_integer, convert_boolean
+            )
+        else:
+            results = [
+                col._convert_dtypes(
+                    infer_objects, convert_string, convert_integer, convert_boolean
+                )
+                for col_name, col in self.items()
+            ]
+            result = pd.concat(results, axis=1, copy=False)
+            return result
+
     # ----------------------------------------------------------------------
     # Filling NA's
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -28,6 +28,7 @@
 from pandas.util._decorators import Appender, Substitution
 from pandas.util._validators import validate_bool_kwarg, validate_percentile
 
+from pandas.core.dtypes.cast import convert_dtypes
 from pandas.core.dtypes.common import (
     _is_unorderable_exception,
     ensure_platform_int,
@@ -4372,6 +4373,34 @@ def between(self, left, right, inclusive=True) -> "Series":
 
         return lmask & rmask
 
+    # ----------------------------------------------------------------------
+    # Convert to types that support pd.NA
+
+    def _convert_dtypes(
+        self: ABCSeries,
+        infer_objects: bool = True,
+        convert_string: bool = True,
+        convert_integer: bool = True,
+        convert_boolean: bool = True,
+    ) -> "Series":
+        input_series = self
+        if infer_objects:
+            input_series = input_series.infer_objects()
+            if is_object_dtype(input_series):
+                input_series = input_series.copy()
+
+        if convert_string or convert_integer or convert_boolean:
+            inferred_dtype = convert_dtypes(
+                input_series._values, convert_string, convert_integer, convert_boolean
+            )
+            try:
+                result = input_series.astype(inferred_dtype)
+            except TypeError:
+                result = input_series.copy()
+        else:
+            result = input_series.copy()
+        return result
+
     @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
     def isna(self) -> "Series":
         return super().isna()
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -628,6 +628,7 @@ def to_datetime(
     --------
     DataFrame.astype : Cast argument to a specified dtype.
     to_timedelta : Convert argument to timedelta.
+    convert_dtypes : Convert dtypes.
 
     Examples
     --------
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -70,6 +70,7 @@ def to_numeric(arg, errors="raise", downcast=None):
     to_datetime : Convert argument to datetime.
     to_timedelta : Convert argument to timedelta.
     numpy.ndarray.astype : Cast a numpy array to a specified type.
+    convert_dtypes : Convert dtypes.
 
     Examples
     --------
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/test_convert_dtypes.py