From 439d17ae739f3c465867f8276067b428af730473 Mon Sep 17 00:00:00 2001
From: tp
Date: Thu, 17 Sep 2020 19:01:33 +0100
Subject: [PATCH 1/4] PERF: construct DataFrame with string array and dtype=str

---
 pandas/core/dtypes/cast.py            |  2 +-
 pandas/core/internals/construction.py | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 05759ffb43dde..747862348d754 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1618,7 +1618,7 @@ def construct_1d_ndarray_preserving_na(
     array(['1.0', '2.0', None], dtype=object)
     """
 
-    if dtype is not None and dtype.kind == "U":
+    if is_string_dtype(dtype):
         subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
     else:
         subarr = np.array(values, dtype=dtype, copy=copy)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 2d4163e0dee89..3f82035be0e67 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -13,6 +13,7 @@
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
+    construct_1d_ndarray_preserving_na,
     maybe_cast_to_datetime,
     maybe_convert_platform,
     maybe_infer_to_datetimelike,
@@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     # the dtypes will be coerced to a single dtype
     values = _prep_ndarray(values, copy=copy)
 
-    if dtype is not None:
-        if not is_dtype_equal(values.dtype, dtype):
-            try:
-                values = values.astype(dtype)
-            except Exception as orig:
-                # e.g. ValueError when trying to cast object dtype to float64
-                raise ValueError(
-                    f"failed to cast to '{dtype}' (Exception was: {orig})"
-                ) from orig
+    if not is_dtype_equal(values.dtype, dtype):
+        try:
+            values = construct_1d_ndarray_preserving_na(
+                values.ravel(), dtype=dtype, copy=False
+            ).reshape(values.shape)
+        except Exception as orig:
+            # e.g. ValueError when trying to cast object dtype to float64
+            raise ValueError(
+                f"failed to cast to '{dtype}' (Exception was: {orig})"
+            ) from orig
 
     # _prep_ndarray ensures that values.ndim == 2 at this point
     index, columns = _get_axes(
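A rough sketch of the construction path this patch targets: a 2-D NumPy array of
strings passed to DataFrame with dtype=str now goes through
construct_1d_ndarray_preserving_na on the ravelled values (and hence
lib.ensure_string_array) instead of a plain ndarray.astype. The array contents
and sizes below are illustrative assumptions, not the benchmark data.

    import numpy as np
    import pandas as pd

    # a 2-D object array of strings, similar in shape to the new ASV benchmark
    arr = np.array(["x" * 10] * 100_000, dtype=object).reshape(50_000, 2)

    # with this patch, the dtype=str cast runs through ensure_string_array on
    # the ravelled values rather than ndarray.astype on the object array
    df = pd.DataFrame(arr, dtype=str)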
From f1d66cb6163f655cb0a4a0aefee7cb6aa8cb1669 Mon Sep 17 00:00:00 2001
From: tp
Date: Thu, 17 Sep 2020 19:09:22 +0100
Subject: [PATCH 2/4] add GH number

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 1286577748afa..177485c6dfe7e 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -222,7 +222,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`)
+- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)

From 5d1dd81d7310534bb3c7a16e65503ec5d9798aa7 Mon Sep 17 00:00:00 2001
From: tp
Date: Thu, 17 Sep 2020 19:19:32 +0100
Subject: [PATCH 3/4] fix dtype issues

---
 pandas/core/dtypes/cast.py            | 2 +-
 pandas/core/internals/construction.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 747862348d754..05759ffb43dde 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1618,7 +1618,7 @@ def construct_1d_ndarray_preserving_na(
     array(['1.0', '2.0', None], dtype=object)
     """
 
-    if is_string_dtype(dtype):
+    if dtype is not None and dtype.kind == "U":
         subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
     else:
         subarr = np.array(values, dtype=dtype, copy=copy)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 3f82035be0e67..d19a0dd8f29e3 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -190,7 +190,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     # the dtypes will be coerced to a single dtype
     values = _prep_ndarray(values, copy=copy)
 
-    if not is_dtype_equal(values.dtype, dtype):
+    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
         try:
             values = construct_1d_ndarray_preserving_na(
                 values.ravel(), dtype=dtype, copy=False
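The revert to the narrower check in this patch is presumably because
is_string_dtype is looser than dtype.kind == "U": it also treats object dtype as
a string dtype, and init_ndarray can be called with dtype=None (hence the added
guard). A small illustration of the difference; my own example, not part of the
patch.

    import numpy as np
    from pandas.api.types import is_string_dtype

    # is_string_dtype also accepts object dtype ...
    print(is_string_dtype(np.dtype(object)))  # True
    # ... while the kind check only matches NumPy unicode dtypes
    print(np.dtype(object).kind == "U")       # False
    print(np.dtype(str).kind == "U")          # True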
From 256ad703a46462bf5fe2b54ac9cc01da69934142 Mon Sep 17 00:00:00 2001
From: tp
Date: Sat, 19 Sep 2020 08:14:36 +0100
Subject: [PATCH 4/4] add ASVs

---
 asv_bench/benchmarks/strings.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 2023858181baa..d8b35abb94b9d 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -13,13 +13,20 @@ class Construction:
     param_names = ["dtype"]
 
     def setup(self, dtype):
-        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
 
-    def time_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def time_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
 
-    def peakmem_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def peakmem_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
+
+    def time_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
+
+    def peakmem_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
 
 
 class Methods:
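The new benchmark methods can be smoke-tested by hand before running them under
ASV. A minimal sketch, assuming it is run from the asv_bench/benchmarks
directory so the module imports, and assuming "str" is one of the class's dtype
parameter values (the params list is not shown in this hunk).

    from strings import Construction

    bench = Construction()
    bench.setup("str")                     # "str" assumed as a dtype param value
    bench.time_series_construction("str")  # Series(self.series_arr, dtype="str")
    bench.time_frame_construction("str")   # DataFrame(self.frame_arr, dtype="str")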