Skip to content

Commit 08f2d64

Browse files
Dr-Irv authored and jreback committed
ENH: Implement convert_dtypes (#30929)
1 parent 545381f commit 08f2d64

File tree

12 files changed

+574
-1
lines changed

12 files changed

+574
-1
lines changed

doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Conversion
4343
:toctree: api/
4444

4545
DataFrame.astype
46+
DataFrame.convert_dtypes
4647
DataFrame.infer_objects
4748
DataFrame.copy
4849
DataFrame.isna

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Conversion
4646
:toctree: api/
4747

4848
Series.astype
49+
Series.convert_dtypes
4950
Series.infer_objects
5051
Series.copy
5152
Series.bool

doc/source/user_guide/missing_data.rst

+28-1
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``:
806806
807807
Currently, pandas does not yet use those data types by default (when creating
808808
a DataFrame or Series, or when reading in data), so you need to specify
809-
the dtype explicitly.
809+
the dtype explicitly. An easy way to convert to those dtypes is explained
810+
:ref:`here <missing_data.NA.conversion>`.
810811

811812
Propagation in arithmetic and comparison operations
812813
---------------------------------------------------
@@ -942,3 +943,29 @@ work with ``NA``, and generally return ``NA``:
942943
in the future.
943944

944945
See :ref:`dsintro.numpy_interop` for more on ufuncs.
946+
947+
.. _missing_data.NA.conversion:
948+
949+
Conversion
950+
----------
951+
952+
If you have a DataFrame or Series using traditional types that have missing data
953+
represented using ``np.nan``, there are convenience methods
954+
:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes`
955+
in DataFrame that can convert data to use the newer dtypes for integers, strings and
956+
booleans listed :ref:`here <basics.dtypes>`. This is especially helpful after reading
957+
in data sets when letting the readers such as :func:`read_csv` and :func:`read_excel`
958+
infer default dtypes.
959+
960+
In this example, while the dtypes of all columns are changed, we show the results for
961+
the first 10 columns.
962+
963+
.. ipython:: python
964+
965+
bb = pd.read_csv('data/baseball.csv', index_col='id')
966+
bb[bb.columns[:10]].dtypes
967+
968+
.. ipython:: python
969+
970+
bbn = bb.convert_dtypes()
971+
bbn[bbn.columns[:10]].dtypes

doc/source/whatsnew/v1.1.0.rst

+30
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,36 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.convert_dtypes:
17+
18+
``convert_dtypes`` method to ease use of supported extension dtypes
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
In order to encourage use of the extension dtypes ``StringDtype``,
22+
``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the
23+
methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes`
24+
have been introduced. (:issue:`29752`) (:issue:`30929`)
25+
26+
Example:
27+
28+
.. ipython:: python
29+
30+
df = pd.DataFrame({'x': ['abc', None, 'def'],
31+
'y': [1, 2, np.nan],
32+
'z': [True, False, True]})
33+
df
34+
df.dtypes
35+
36+
.. ipython:: python
37+
38+
converted = df.convert_dtypes()
39+
converted
40+
converted.dtypes
41+
42+
This is especially useful after reading in data using readers such as :func:`read_csv`
43+
and :func:`read_excel`.
44+
See :ref:`here <missing_data.NA.conversion>` for a description.
45+
1646
.. _whatsnew_110.period_index_partial_string_slicing:
1747

1848
Nonmonotonic PeriodIndex Partial String Slicing

pandas/core/dtypes/cast.py

+76
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas._libs import lib, tslib, tslibs
88
from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
99
from pandas._libs.tslibs.timezones import tz_compare
10+
from pandas._typing import Dtype
1011
from pandas.util._validators import validate_bool_kwarg
1112

1213
from pandas.core.dtypes.common import (
@@ -34,6 +35,7 @@
3435
is_float_dtype,
3536
is_integer,
3637
is_integer_dtype,
38+
is_numeric_dtype,
3739
is_object_dtype,
3840
is_scalar,
3941
is_string_dtype,
@@ -1018,6 +1020,80 @@ def soft_convert_objects(
10181020
return values
10191021

10201022

1023+
def convert_dtypes(
    input_array,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or PandasArray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.

    Returns
    -------
    dtype
        The suggested new dtype.  When no rule below overrides it, this is
        the raw label produced by ``lib.infer_dtype`` (a string), so callers
        should be prepared for ``astype`` to reject the result.
    """
    original_dtype = input_array.dtype

    # Nothing was requested: the array keeps the dtype it already has.
    if not (convert_string or convert_integer or convert_boolean):
        return original_dtype

    try:
        inferred_dtype = lib.infer_dtype(input_array)
    except ValueError:
        # Required to catch due to Period.  Can remove once GH 23553 is fixed
        inferred_dtype = original_dtype

    if not convert_string and is_string_dtype(inferred_dtype):
        inferred_dtype = original_dtype

    if convert_integer:
        target_int_dtype = "Int64"

        if isinstance(inferred_dtype, str) and inferred_dtype in (
            "mixed-integer",
            "mixed-integer-float",
        ):
            inferred_dtype = target_int_dtype

        if is_integer_dtype(original_dtype):
            if not is_extension_array_dtype(original_dtype):
                from pandas.core.arrays.integer import _dtypes

                # Keep the width of the NumPy integer (int32 -> Int32, ...).
                inferred_dtype = _dtypes.get(original_dtype.name, target_int_dtype)
        elif is_float_dtype(original_dtype):
            import numpy as np

            # Only promote floats to the nullable integer dtype when every
            # non-missing value is a finite whole number; otherwise the cast
            # would fail (or lose data), so the float dtype is kept.
            values = np.asarray(input_array, dtype=float)
            present = values[~np.isnan(values)]
            if np.isfinite(present).all() and (present == np.trunc(present)).all():
                inferred_dtype = target_int_dtype
            else:
                inferred_dtype = original_dtype
        elif is_numeric_dtype(original_dtype) and not is_bool_dtype(original_dtype):
            # Other numeric data (e.g. complex) has no integer equivalent.
            # Boolean data is deliberately left to the convert_boolean step
            # below instead of being retargeted to an integer dtype.
            inferred_dtype = original_dtype
    else:
        if is_integer_dtype(inferred_dtype):
            inferred_dtype = original_dtype

    if convert_boolean:
        if is_bool_dtype(original_dtype) and not is_extension_array_dtype(
            original_dtype
        ):
            inferred_dtype = "boolean"
    else:
        if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
            inferred_dtype = original_dtype

    return inferred_dtype
1095+
1096+
10211097
def maybe_castable(arr) -> bool:
10221098
# return False to force a non-fastpath
10231099

pandas/core/generic.py

+137
Original file line numberDiff line numberDiff line change
@@ -5702,6 +5702,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
57025702
to_datetime : Convert argument to datetime.
57035703
to_timedelta : Convert argument to timedelta.
57045704
to_numeric : Convert argument to numeric type.
5705+
convert_dtypes : Convert argument to best possible dtype.
57055706
57065707
Examples
57075708
--------
@@ -5730,6 +5731,142 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
57305731
)
57315732
).__finalize__(self)
57325733

5734+
def convert_dtypes(
    self: FrameOrSeries,
    infer_objects: bool_t = True,
    convert_string: bool_t = True,
    convert_integer: bool_t = True,
    convert_boolean: bool_t = True,
) -> FrameOrSeries:
    """
    Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    infer_objects : bool, default True
        Whether object dtypes should be converted to the best possible types.
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.

    Returns
    -------
    Series or DataFrame
        Copy of input object with new dtype.

    See Also
    --------
    infer_objects : Infer dtypes of objects.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    to_numeric : Convert argument to a numeric type.

    Notes
    -----

    By default, ``convert_dtypes`` will attempt to convert a Series (or each
    Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
    ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
    possible to turn off individual conversions to ``StringDtype``, the integer
    extension types or ``BooleanDtype``, respectively.

    For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
    rules as during normal Series/DataFrame construction.  Then, if possible,
    convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
    type, otherwise leave as ``object``.

    If the dtype is integer, convert to an appropriate integer extension type.

    If the dtype is numeric, and consists of all integers, convert to an
    appropriate integer extension type.

    In the future, as new dtypes are added that support ``pd.NA``, the results
    of this method will change to support those new dtypes.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
    ...         "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
    ...         "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
    ...         "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
    ...         "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
    ...         "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
    ...     }
    ... )

    Start with a DataFrame with default dtypes.

    >>> df
       a  b      c    d     e      f
    0  1  x   True    h  10.0    NaN
    1  2  y  False    i   NaN  100.5
    2  3  z    NaN  NaN  20.0  200.0

    >>> df.dtypes
    a      int32
    b     object
    c     object
    d     object
    e    float64
    f    float64
    dtype: object

    Convert the DataFrame to use best possible dtypes.

    >>> dfn = df.convert_dtypes()
    >>> dfn
       a  b      c     d     e      f
    0  1  x   True     h    10    NaN
    1  2  y  False     i  <NA>  100.5
    2  3  z   <NA>  <NA>    20  200.0

    >>> dfn.dtypes
    a      Int32
    b     string
    c    boolean
    d     string
    e      Int64
    f    float64
    dtype: object

    Start with a Series of strings and missing data represented by ``np.nan``.

    >>> s = pd.Series(["a", "b", np.nan])
    >>> s
    0      a
    1      b
    2    NaN
    dtype: object

    Obtain a Series with dtype ``StringDtype``.

    >>> s.convert_dtypes()
    0       a
    1       b
    2    <NA>
    dtype: string
    """
    if self.ndim == 1:
        return self._convert_dtypes(
            infer_objects, convert_string, convert_integer, convert_boolean
        )

    # Convert column by column; each column is handled as a Series.
    results = [
        col._convert_dtypes(
            infer_objects, convert_string, convert_integer, convert_boolean
        )
        for _, col in self.items()
    ]
    if not results:
        # A frame without columns has nothing to convert, and concat
        # refuses an empty list, so hand back a copy as documented.
        return self.copy()
    return pd.concat(results, axis=1, copy=False)
5869+
57335870
# ----------------------------------------------------------------------
57345871
# Filling NA's
57355872

pandas/core/series.py

+29
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pandas.util._decorators import Appender, Substitution
2929
from pandas.util._validators import validate_bool_kwarg, validate_percentile
3030

31+
from pandas.core.dtypes.cast import convert_dtypes
3132
from pandas.core.dtypes.common import (
3233
_is_unorderable_exception,
3334
ensure_platform_int,
@@ -4372,6 +4373,34 @@ def between(self, left, right, inclusive=True) -> "Series":
43724373

43734374
return lmask & rmask
43744375

4376+
# ----------------------------------------------------------------------
4377+
# Convert to types that support pd.NA
4378+
4379+
def _convert_dtypes(
    self: ABCSeries,
    infer_objects: bool = True,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> "Series":
    """
    Return a new Series converted to a dtype chosen by ``convert_dtypes``.

    Parameters
    ----------
    infer_objects : bool, default True
        Whether to run ``infer_objects`` on the Series first.
    convert_string : bool, default True
        Forwarded to ``convert_dtypes``.
    convert_integer : bool, default True
        Forwarded to ``convert_dtypes``.
    convert_boolean : bool, default True
        Forwarded to ``convert_dtypes``.

    Returns
    -------
    Series
        The converted Series, or a plain copy when no conversion was
        requested or the cast raised ``TypeError``.
    """
    ser = self
    if infer_objects:
        ser = ser.infer_objects()
        if is_object_dtype(ser):
            ser = ser.copy()

    # No conversion requested: just return a copy.
    if not (convert_string or convert_integer or convert_boolean):
        return ser.copy()

    target_dtype = convert_dtypes(
        ser._values, convert_string, convert_integer, convert_boolean
    )
    try:
        return ser.astype(target_dtype)
    except TypeError:
        # The suggested dtype could not be applied; keep the data as it is.
        return ser.copy()
4403+
43754404
@Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
43764405
def isna(self) -> "Series":
43774406
return super().isna()

pandas/core/tools/datetimes.py

+1
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,7 @@ def to_datetime(
628628
--------
629629
DataFrame.astype : Cast argument to a specified dtype.
630630
to_timedelta : Convert argument to timedelta.
631+
convert_dtypes : Convert dtypes.
631632
632633
Examples
633634
--------

pandas/core/tools/numeric.py

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def to_numeric(arg, errors="raise", downcast=None):
7070
to_datetime : Convert argument to datetime.
7171
to_timedelta : Convert argument to timedelta.
7272
numpy.ndarray.astype : Cast a numpy array to a specified type.
73+
convert_dtypes : Convert dtypes.
7374
7475
Examples
7576
--------

0 commit comments

Comments
 (0)