From b534f5656335f7847d6c529b6a3eb502d72cd849 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Tue, 24 Aug 2021 19:10:29 +0530 Subject: [PATCH 01/28] Implemented the byte_string --- pandas/core/dtypes/cast.py | 8 ++++++++ pandas/tests/series/methods/test_convert_dtypes.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4f4276ceddcf9..d9e31cda4acd8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1410,6 +1410,14 @@ def convert_dtypes( """ inferred_dtype: str | DtypeObj + try: + # case of a byte_string + byte_check = input_array.decode() + inferred_dtype = type(input_array) + return inferred_dtype + except: + pass + if ( convert_string or convert_integer or convert_boolean or convert_floating ) and isinstance(input_array, np.ndarray): diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 81203b944fa92..ff29a5d500a97 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -226,3 +226,17 @@ def test_convert_bool_dtype(self): # GH32287 df = pd.DataFrame({"A": pd.array([True])}) tm.assert_frame_equal(df, df.convert_dtypes()) + + def test_convert_byte_string_dtype(self): + # https://github.com/pandas-dev/pandas/issues/43183 -> not recognising the byte_string dtype + byte_str = b'binary-string' + dataframe = pd.DataFrame( + data={ + "data": byte_str, + }, + index=[0] + ) + converted_dtypes = dataframe.convert_dtypes() + #print(converted_dtypes['data'][0]) + assert converted_dtypes['data'][0].decode('ascii') == "binary-string" + # no need for the tm module as we are just verifying the conversion of the string From 23583fc9747e79e752dee3e1714a2e9dc87563fa Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Tue, 24 Aug 2021 19:15:25 +0530 Subject: [PATCH 02/28] Update test_convert_dtypes.py --- pandas/tests/series/methods/test_convert_dtypes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index ff29a5d500a97..70917b141a177 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -228,7 +228,8 @@ def test_convert_bool_dtype(self): tm.assert_frame_equal(df, df.convert_dtypes()) def test_convert_byte_string_dtype(self): - # https://github.com/pandas-dev/pandas/issues/43183 -> not recognising the byte_string dtype + # https://github.com/pandas-dev/pandas/issues/43183 -> + # not recognising the byte_string dtype byte_str = b'binary-string' dataframe = pd.DataFrame( data={ @@ -239,4 +240,5 @@ def test_convert_byte_string_dtype(self): converted_dtypes = dataframe.convert_dtypes() #print(converted_dtypes['data'][0]) assert converted_dtypes['data'][0].decode('ascii') == "binary-string" - # no need for the tm module as we are just verifying the conversion of the string + # no need for the tm module as + # we are just verifying the conversion of the string From 96338fae919ce474ec60d83de77469138321df46 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Tue, 24 Aug 2021 19:17:29 +0530 Subject: [PATCH 03/28] Update test_convert_dtypes.py --- pandas/tests/series/methods/test_convert_dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 70917b141a177..30e0ef38a9e27 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -238,7 +238,8 @@ def test_convert_byte_string_dtype(self): index=[0] ) converted_dtypes = dataframe.convert_dtypes() - #print(converted_dtypes['data'][0]) + # print(converted_dtypes['data'][0]) + # to check whether the type continues to remain a byte_string assert converted_dtypes['data'][0].decode('ascii') == "binary-string" # no need for the tm module as # we are just verifying the conversion of the string From ad4189f55f22dc2988b5fac6c5bcd89d029e7ec7 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Tue, 24 Aug 2021 19:18:56 +0530 Subject: [PATCH 04/28] Update cast.py --- pandas/core/dtypes/cast.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d9e31cda4acd8..93048008a9b40 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1416,6 +1416,9 @@ def convert_dtypes( inferred_dtype = type(input_array) return inferred_dtype except: + # it is not a bare except, there is a pass statement + # In the event of an exception, it will not be + # a byte_string, so we process with other types pass if ( From e915bc0466db180a034895acff04c932be220777 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 25 Aug 2021 12:11:47 +0530 Subject: [PATCH 05/28] Passes the tests --- pandas/core/dtypes/cast.py | 9 ++++++--- pandas/tests/series/methods/test_convert_dtypes.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 93048008a9b40..9512e96aff152 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1412,9 +1412,12 @@ def convert_dtypes( try: # case of a byte_string - byte_check = input_array.decode() - inferred_dtype = type(input_array) - return inferred_dtype + # all(isinstance(x, bytes) for x in input_array) + + byte_list_set = list(set([type(x) is bytes for x in input_array])) + if len(byte_list_set) == 1 and byte_list_set[0] == True: + inferred_dtype = type(input_array[0]) + return inferred_dtype except: # it is not a bare except, there is a pass statement # In the event of an exception, it will not be diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 30e0ef38a9e27..d2f32634604cc 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -231,15 +231,18 @@ def test_convert_byte_string_dtype(self): # https://github.com/pandas-dev/pandas/issues/43183 -> # not recognising the byte_string dtype byte_str = b'binary-string' - dataframe = pd.DataFrame( + + df= pd.DataFrame( data={ - "data": byte_str, + "A": byte_str, }, index=[0] ) - converted_dtypes = dataframe.convert_dtypes() - # print(converted_dtypes['data'][0]) + + df_convert = df.convert_dtypes() + # to check whether the type continues to remain a byte_string - assert converted_dtypes['data'][0].decode('ascii') == "binary-string" + assert df_convert["A"][0].decode('ascii') == "binary-string" + # no need for the tm module as # we are just verifying the conversion of the string From 6d0a497095bf74ec1f56e038b0d68bb64021b839 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 25 Aug 2021 12:36:51 +0530 Subject: [PATCH 06/28] Removed PEP 8 issues --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9512e96aff152..f86530d9f5ab4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1415,7 +1415,7 @@ def convert_dtypes( # all(isinstance(x, bytes) for x in input_array) byte_list_set = list(set([type(x) is bytes for x in input_array])) - if len(byte_list_set) == 1 and byte_list_set[0] == True: + if len(byte_list_set) == 1 and byte_list_set[0]: inferred_dtype = type(input_array[0]) return inferred_dtype except: diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d2f32634604cc..ab9641b5b785d 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -232,7 +232,7 @@ def test_convert_byte_string_dtype(self): # not recognising the byte_string dtype byte_str = b'binary-string' - df= pd.DataFrame( + df = pd.DataFrame( data={ "A": byte_str, }, From 5ab2d79da40ba469739241086a9e00bbb53d8436 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 25 Aug 2021 15:18:15 +0530 Subject: [PATCH 07/28] Update cast.py --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f86530d9f5ab4..1f592c1f6bc18 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1418,7 +1418,7 @@ def convert_dtypes( if len(byte_list_set) == 1 and byte_list_set[0]: inferred_dtype = type(input_array[0]) return inferred_dtype - except: + except (UnicodeDecodeError, AttributeError): # it is not a bare except, there is a pass statement # In the event of an exception, it will not be # a byte_string, so we process with other types From 7921c6f2ffdf05b9dd50af50687997cf16ba6735 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sun, 29 Aug 2021 12:47:31 +0530 Subject: [PATCH 08/28] Removed Try Except Block --- pandas/core/dtypes/cast.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1f592c1f6bc18..65a4086401d5b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1410,20 +1410,6 @@ def convert_dtypes( """ inferred_dtype: str | DtypeObj - try: - # case of a byte_string - # all(isinstance(x, bytes) for x in input_array) - - byte_list_set = list(set([type(x) is bytes for x in input_array])) - if len(byte_list_set) == 1 and byte_list_set[0]: - inferred_dtype = type(input_array[0]) - return inferred_dtype - except (UnicodeDecodeError, AttributeError): - # it is not a bare except, there is a pass statement - # In the event of an exception, it will not be - # a byte_string, so we process with other types - pass - if ( convert_string or convert_integer or convert_boolean or convert_floating ) and isinstance(input_array, np.ndarray): @@ -1437,6 +1423,10 @@ def convert_dtypes( if not convert_string: return input_array.dtype else: + byte_list_set = list(set([type(x) is bytes for x in input_array])) + if len(byte_list_set) == 1 and byte_list_set[0]: + inferred_dtype = type(input_array[0]) + return inferred_dtype return pandas_dtype("string") if convert_integer: From 3e7a91fe0bad3ed9d982f1aa1ce133db16c58251 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sun, 29 Aug 2021 20:27:38 +0530 Subject: [PATCH 09/28] pre-commit changes added --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/series/methods/test_convert_dtypes.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 65a4086401d5b..668a6b7824c9c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1423,7 +1423,7 @@ def convert_dtypes( if not convert_string: return input_array.dtype else: - byte_list_set = list(set([type(x) is bytes for x in input_array])) + byte_list_set = list({type(x) is bytes for x in input_array}) if len(byte_list_set) == 1 and byte_list_set[0]: inferred_dtype = type(input_array[0]) return inferred_dtype diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index ab9641b5b785d..4cd399de9ea37 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -230,19 +230,19 @@ def test_convert_bool_dtype(self): def test_convert_byte_string_dtype(self): # https://github.com/pandas-dev/pandas/issues/43183 -> # not recognising the byte_string dtype - byte_str = b'binary-string' + byte_str = b"binary-string" df = pd.DataFrame( - data={ + data = { "A": byte_str, }, - index=[0] + index = [0], ) df_convert = df.convert_dtypes() # to check whether the type continues to remain a byte_string - assert df_convert["A"][0].decode('ascii') == "binary-string" + assert df_convert["A"][0].decode("ascii") == "binary-string" # no need for the tm module as # we are just verifying the conversion of the string From b5b0e272d41743fe657110fe398a31f8355cf7db Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 30 Aug 2021 11:21:40 +0530 Subject: [PATCH 10/28] mypy static error solved --- pandas/core/dtypes/cast.py | 4 ++-- pandas/tests/series/methods/test_convert_dtypes.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 668a6b7824c9c..bd309cfd99101 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1425,8 +1425,8 @@ def convert_dtypes( else: byte_list_set = list({type(x) is bytes for x in input_array}) if len(byte_list_set) == 1 and byte_list_set[0]: - inferred_dtype = type(input_array[0]) - return inferred_dtype + return pandas_dtype("bytes") + return pandas_dtype("string") if convert_integer: diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 4cd399de9ea37..33716a07cf05d 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -233,10 +233,10 @@ def test_convert_byte_string_dtype(self): byte_str = b"binary-string" df = pd.DataFrame( - data = { + data={ "A": byte_str, }, - index = [0], + index=[0], ) df_convert = df.convert_dtypes() From 694fb7adbbdcd6615d0b9bdd905a654582e4b64a Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 30 Aug 2021 19:54:00 +0530 Subject: [PATCH 11/28] Updated as requested --- pandas/core/dtypes/cast.py | 3 +-- .../tests/series/methods/test_convert_dtypes.py | 15 ++------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bd309cfd99101..bfdd595849810 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1423,8 +1423,7 @@ def convert_dtypes( if not convert_string: return input_array.dtype else: - byte_list_set = list({type(x) is bytes for x in input_array}) - if len(byte_list_set) == 1 and byte_list_set[0]: + if inferred_dtype is "bytes": return pandas_dtype("bytes") return pandas_dtype("string") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 33716a07cf05d..d851d435b4ddc 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -228,21 +228,10 @@ def test_convert_bool_dtype(self): tm.assert_frame_equal(df, df.convert_dtypes()) def test_convert_byte_string_dtype(self): - # https://github.com/pandas-dev/pandas/issues/43183 -> - # not recognising the byte_string dtype + # GH43183 byte_str = b"binary-string" - df = pd.DataFrame( - data={ - "A": byte_str, - }, - index=[0], - ) - + df = pd.DataFrame(data={"A": byte_str,},index=[0],) df_convert = df.convert_dtypes() - # to check whether the type continues to remain a byte_string assert df_convert["A"][0].decode("ascii") == "binary-string" - - # no need for the tm module as - # we are just verifying the conversion of the string From e12fd22beaf5069b8c243c44feaf0014efbeb0e6 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 30 Aug 2021 19:57:19 +0530 Subject: [PATCH 12/28] PEP-8 issues. --- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d851d435b4ddc..6ab95f1bfb960 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -231,7 +231,7 @@ def test_convert_byte_string_dtype(self): # GH43183 byte_str = b"binary-string" - df = pd.DataFrame(data={"A": byte_str,},index=[0],) + df = pd.DataFrame(data={"A": byte_str, }, index=[0],) df_convert = df.convert_dtypes() assert df_convert["A"][0].decode("ascii") == "binary-string" From b774cefbf558d5854e85347f74d5303417901324 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 30 Aug 2021 20:07:49 +0530 Subject: [PATCH 13/28] Update v1.3.3.rst - release note added --- doc/source/whatsnew/v1.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 1340188c3d609..47415f05d4956 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- +- Fixed regression in :meth:`core.dtypes.cast.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) .. --------------------------------------------------------------------------- From f867f7247c4856b45d37950674ae657fbdc7390c Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 30 Aug 2021 20:12:57 +0530 Subject: [PATCH 14/28] elif implemented --- pandas/core/dtypes/cast.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bfdd595849810..08316c07995f1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1422,10 +1422,9 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string: return input_array.dtype + elif inferred_dtype is "bytes": + return pandas_dtype("bytes") else: - if inferred_dtype is "bytes": - return pandas_dtype("bytes") - return pandas_dtype("string") if convert_integer: From 0cb3f081e4880c3e7804fcc5e0a6e1a175cef3e7 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Thu, 2 Sep 2021 08:52:06 +0530 Subject: [PATCH 15/28] Changes done --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08316c07995f1..0204526fe0884 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1422,7 +1422,7 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string: return input_array.dtype - elif inferred_dtype is "bytes": + elif inferred_dtype == "bytes": return pandas_dtype("bytes") else: return pandas_dtype("string") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 6ab95f1bfb960..81d605fc38b6f 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -231,7 +231,7 @@ def test_convert_byte_string_dtype(self): # GH43183 byte_str = b"binary-string" - df = pd.DataFrame(data={"A": byte_str, }, index=[0],) + df = pd.DataFrame(data={"A": byte_str}, index=[0]) df_convert = df.convert_dtypes() assert df_convert["A"][0].decode("ascii") == "binary-string" From 2e18ebc1132f4c86a543b393ea2861454367edc6 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Thu, 9 Sep 2021 18:17:14 +0530 Subject: [PATCH 16/28] Changes done --- doc/source/whatsnew/v1.3.3.rst | 3 +-- pandas/tests/series/methods/test_convert_dtypes.py | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 41588a60b0d59..324f5d0d5758c 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,8 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) - -- Fixed regression in :meth:`core.dtypes.cast.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) +- Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) - Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) - Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 81d605fc38b6f..9e4754ac41a41 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -234,4 +234,7 @@ def test_convert_byte_string_dtype(self): df = pd.DataFrame(data={"A": byte_str}, index=[0]) df_convert = df.convert_dtypes() - assert df_convert["A"][0].decode("ascii") == "binary-string" + result = pd.DataFrame(data={"A": df_convert["A"][0].decode("ascii")}, index=[0]) + expected = pd.DataFrame(data={"A": "binary-string"}, index=[0]) + + tm.assert_frame_equal(result, expected) From 3193de93edd0552861c583242fdcde5b4cac0f8b Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Thu, 9 Sep 2021 18:39:18 +0530 Subject: [PATCH 17/28] Update test_convert_dtypes.py --- pandas/tests/series/methods/test_convert_dtypes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9e4754ac41a41..e7dfac85a7b5e 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -232,9 +232,8 @@ def test_convert_byte_string_dtype(self): byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) - df_convert = df.convert_dtypes() - result = pd.DataFrame(data={"A": df_convert["A"][0].decode("ascii")}, index=[0]) + result = pd.DataFrame(data={"A": df.convert_dtypes()["A"][0].decode("ascii")}, index=[0]) expected = pd.DataFrame(data={"A": "binary-string"}, index=[0]) tm.assert_frame_equal(result, expected) From 20f8ddaf38f4e0587c614aacfe898dcbcf72befe Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Thu, 9 Sep 2021 18:51:28 +0530 Subject: [PATCH 18/28] Removed pre-commit error --- pandas/tests/series/methods/test_convert_dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index e7dfac85a7b5e..9e4754ac41a41 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -232,8 +232,9 @@ def test_convert_byte_string_dtype(self): byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) + df_convert = df.convert_dtypes() - result = pd.DataFrame(data={"A": df.convert_dtypes()["A"][0].decode("ascii")}, index=[0]) + result = pd.DataFrame(data={"A": df_convert["A"][0].decode("ascii")}, index=[0]) expected = pd.DataFrame(data={"A": "binary-string"}, index=[0]) tm.assert_frame_equal(result, expected) From 1503730af89e726ab872c33125b9aa38e89d2120 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Thu, 9 Sep 2021 21:46:22 +0530 Subject: [PATCH 19/28] Update test_convert_dtypes.py --- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9e4754ac41a41..8e5284fa18b44 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -228,7 +228,7 @@ def test_convert_bool_dtype(self): tm.assert_frame_equal(df, df.convert_dtypes()) def test_convert_byte_string_dtype(self): - # GH43183 + # GH-43183 byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) From c081d92d2cdfca50f73c9f3869149db5f850d5a7 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Fri, 10 Sep 2021 00:01:18 +0530 Subject: [PATCH 20/28] Added astype --- pandas/tests/series/methods/test_convert_dtypes.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 8e5284fa18b44..dda46f16ca83f 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -232,9 +232,4 @@ def test_convert_byte_string_dtype(self): byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) - df_convert = df.convert_dtypes() - - result = pd.DataFrame(data={"A": df_convert["A"][0].decode("ascii")}, index=[0]) - expected = pd.DataFrame(data={"A": "binary-string"}, index=[0]) - - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, df.convert_dtypes().astype(object)) From 76547297f5b664763f5c6b9d3cbd48c565bd193f Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Fri, 10 Sep 2021 17:25:07 +0530 Subject: [PATCH 21/28] Compare bytes --- pandas/tests/series/methods/test_convert_dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index dda46f16ca83f..0f5b490802089 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -232,4 +232,6 @@ def test_convert_byte_string_dtype(self): byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) - tm.assert_frame_equal(df, df.convert_dtypes().astype(object)) + result = df.convert_dtypes() + expected = df.astype(bytes) + tm.assert_frame_equal(result, expected) From 0a2e0a9a74e5eb8aea3bf399f58cef220ae8f754 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sat, 11 Sep 2021 10:53:20 +0530 Subject: [PATCH 22/28] Remove build error --- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 0f5b490802089..12743ae413658 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -228,7 +228,7 @@ def test_convert_bool_dtype(self): tm.assert_frame_equal(df, df.convert_dtypes()) def test_convert_byte_string_dtype(self): - # GH-43183 + # GH43183 byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) From 766a30fa857c8a4e62b1f104123a7a57a2a37a5c Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Mon, 13 Sep 2021 23:03:19 +0530 Subject: [PATCH 23/28] Update 1.3.4.rst and removed 1.3.3.rst --- doc/source/whatsnew/v1.3.3.rst | 1 - doc/source/whatsnew/v1.3.4.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 9503caa2f4244..ecec6d975ccb7 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -16,7 +16,6 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) - Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) - Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 273686f0aaa8f..ecb273f67bb47 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) - .. --------------------------------------------------------------------------- From be03f3779c125bcc1a5516a7feb9d58cc75f763c Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Sep 2021 09:32:00 +0530 Subject: [PATCH 24/28] Remove build error --- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 12743ae413658..0f5b490802089 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -228,7 +228,7 @@ def test_convert_bool_dtype(self): tm.assert_frame_equal(df, df.convert_dtypes()) def test_convert_byte_string_dtype(self): - # GH43183 + # GH-43183 byte_str = b"binary-string" df = pd.DataFrame(data={"A": byte_str}, index=[0]) From 07e1de913eccec679f74bce24b17ae0496e58b93 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Fri, 17 Sep 2021 00:32:01 +0530 Subject: [PATCH 25/28] Changed df --- pandas/tests/series/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 0f5b490802089..1e88ddf3cd943 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -233,5 +233,5 @@ def test_convert_byte_string_dtype(self): df = pd.DataFrame(data={"A": byte_str}, index=[0]) result = df.convert_dtypes() - expected = df.astype(bytes) + expected = df tm.assert_frame_equal(result, expected) From bfb12422bf34a3e51ed78c019563fdf45cce7c96 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sat, 16 Oct 2021 23:41:06 +0530 Subject: [PATCH 26/28] Follows 1.2.5 behaviour --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 36d01c89866d1..dc5e8b51b5bd5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1423,7 +1423,7 @@ def convert_dtypes( if not convert_string: return input_array.dtype elif inferred_dtype == "bytes": - return pandas_dtype("bytes") + return pandas_dtype("object") else: return pandas_dtype("string") From 77ee435f836547b92645b5e0d0f37b736eeeddee Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sat, 16 Oct 2021 23:54:39 +0530 Subject: [PATCH 27/28] Removed Whitespace --- doc/source/whatsnew/v1.3.4.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 6d4261644fbbb..937162e154335 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -14,7 +14,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - - Fixed regression in :meth:`DataFrame.convert_dtypes` incorrectly converts byte strings to strings (:issue:`43183`) - Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`) - Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`) From e70f68b048eea50d80d306ed500e56b772540dc5 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sun, 17 Oct 2021 07:44:37 +0530 Subject: [PATCH 28/28] Removed elif --- pandas/core/dtypes/cast.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 257884faa1e29..408e58e23aaed 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1424,10 +1424,8 @@ def convert_dtypes( inferred_dtype = input_array.dtype if is_string_dtype(inferred_dtype): - if not convert_string: + if not convert_string or inferred_dtype == "bytes": return input_array.dtype - elif inferred_dtype == "bytes": - return pandas_dtype("object") else: return pandas_dtype("string")