From 6feadcfe3470dac2f58d50464cbb49c5015aded0 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 12:59:55 -0800 Subject: [PATCH 1/6] CLN: de-duplicate internals.construction --- pandas/core/internals/construction.py | 54 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3c5216b65a70b..ebe5d6e4aad26 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -505,6 +505,7 @@ def to_arrays( """ Return list of arrays, columns. """ + if isinstance(data, ABCDataFrame): if columns is not None: arrays = [ @@ -558,18 +559,13 @@ def _list_to_arrays( coerce_float: bool = False, dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: - if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) + # Note: we already check len(data) > 0 before getting hre + if isinstance(data[0], tuple): + content = lib.to_object_array_tuples(data) else: # list of lists - content = list(lib.to_object_array(data).T) - # gh-26429 do not raise user-facing AssertionError - try: - columns = _validate_or_indexify_columns(content, columns) - result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - except AssertionError as e: - raise ValueError(e) from e - return result, columns + content = lib.to_object_array(data) + return _finalize_columns_and_data(content, columns, dtype, coerce_float) def _list_of_series_to_arrays( @@ -599,15 +595,10 @@ def _list_of_series_to_arrays( values = extract_array(s, extract_numpy=True) aligned_values.append(algorithms.take_1d(values, indexer)) - values = np.vstack(aligned_values) + content = np.vstack(aligned_values) - if values.dtype == np.object_: - content = list(values.T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - return content, columns - else: - return values.T, columns + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_of_dict_to_arrays( @@ -646,9 +637,30 @@ def _list_of_dict_to_arrays( # classes data = [(type(d) is dict) and d or dict(d) for d in data] - content = list(lib.dicts_to_array(data, list(columns)).T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + content = lib.dicts_to_array(data, list(columns)) + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns + + +def _finalize_columns_and_data( + content: np.ndarray, + columns, + dtype: Optional[DtypeObj], + coerce_float: bool, +) -> Tuple[List[np.ndarray], Index]: + """ + Ensure we have valid columns, cast object dtypes if possible. + """ + content = list(content.T) + + try: + columns = _validate_or_indexify_columns(content, columns) + except AssertionError as err: + # GH#26429 do not raise user-facing AssertionError + raise ValueError(err) from err + + if len(content) and content[0].dtype == np.object_: + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) return content, columns From a57aa9b5933b73fba4755c2390aaca7f38043332 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 13:21:08 -0800 Subject: [PATCH 2/6] annotate --- pandas/core/internals/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ebe5d6e4aad26..f0719375f5656 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -644,10 +644,10 @@ def _list_of_dict_to_arrays( def _finalize_columns_and_data( content: np.ndarray, - columns, + columns: Optional[Union[Index, List]], dtype: Optional[DtypeObj], coerce_float: bool, -) -> Tuple[List[np.ndarray], Index]: +) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]: """ Ensure we have valid columns, cast object dtypes if possible. """ From 0ff9516254090af506856417ca00a038af2d5c91 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 13:23:22 -0800 Subject: [PATCH 3/6] fixup whitespace --- pandas/core/internals/construction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f0719375f5656..7129f71a5cf45 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -505,7 +505,6 @@ def to_arrays( """ Return list of arrays, columns. """ - if isinstance(data, ABCDataFrame): if columns is not None: arrays = [ From ba425a524345c61ee12bef747a2f8b691d2ffe3f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 19:33:42 -0800 Subject: [PATCH 4/6] REF: collect calls to _finalize_columns_and_data --- pandas/core/internals/construction.py | 41 ++++++++++----------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7129f71a5cf45..d25482872c594 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -524,39 +524,36 @@ def to_arrays( if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) - elif isinstance(data[0], ABCSeries): - return _list_of_series_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) + elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns - elif ( - isinstance(data, (np.ndarray, ABCSeries, Index)) - and data.dtype.names is not None - ): + elif isinstance(data, np.ndarray) and data.dtype.names is not None: + # e.g. recarray columns = list(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns + + if isinstance(data[0], (list, tuple)): + content, columns = _list_to_arrays(data, columns) + elif isinstance(data[0], abc.Mapping): + content, columns = _list_of_dict_to_arrays(data, columns) + elif isinstance(data[0], ABCSeries): + content, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + content, columns = _list_to_arrays(data, columns) + + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_to_arrays( data: List[Scalar], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: # Note: we already check len(data) > 0 before getting hre if isinstance(data[0], tuple): @@ -564,14 +561,12 @@ def _list_to_arrays( else: # list of lists content = lib.to_object_array(data) - return _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_of_series_to_arrays( data: List, columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series @@ -596,15 +591,12 @@ def _list_of_series_to_arrays( content = np.vstack(aligned_values) - content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) return content, columns def _list_of_dict_to_arrays( data: List[Dict], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -619,8 +611,6 @@ def _list_of_dict_to_arrays( data : iterable collection of records (OrderedDict, dict) columns: iterables or None - coerce_float : bool - dtype : np.dtype Returns ------- @@ -637,7 +627,6 @@ def _list_of_dict_to_arrays( data = [(type(d) is dict) and d or dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) - content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) return content, columns From 9a49c461a7b110f4c919c6f1fdf57a16c107f0f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Dec 2020 18:35:41 -0800 Subject: [PATCH 5/6] BUG: item_cache invalidation on DataFrame.insert (#38380) * BUG: item_cache invalidation on DataFrame.insert * Whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/internals/managers.py | 10 +++++++++- pandas/tests/frame/indexing/test_insert.py | 14 ++++++++++++++ pandas/tests/frame/test_block_internals.py | 15 +++++++++------ pandas/tests/io/sas/test_sas7bdat.py | 7 +++++-- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..d0afc24aaecac 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -127,7 +127,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - - diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0b3f1079cdb16..e939c43015aed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -18,6 +18,7 @@ from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape +from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -1222,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self._known_consolidated = False if len(self.blocks) > 100: - self._consolidate_inplace() + warnings.warn( + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider using pd.concat instead. To get a de-fragmented frame, " + "use `newframe = frame.copy()`", + PerformanceWarning, + stacklevel=5, + ) def reindex_axis( self, diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 622c93d1c2fdc..6e4deb5469777 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + from pandas import DataFrame, Index import pandas._testing as tm @@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self): [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) + + def test_insert_item_cache(self): + df = DataFrame(np.random.randn(4, 3)) + ser = df[0] + + with tm.assert_produces_warning(PerformanceWarning): + for n in range(100): + df[n + 3] = df[1] * n + + ser.values[0] = 99 + + assert df.iloc[0, 0] == df[0][0] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 5513262af8100..8954d8a0e7598 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( Categorical, @@ -329,12 +331,13 @@ def test_strange_column_corruption_issue(self): df[0] = np.nan wasCol = {} - for i, dt in enumerate(df.index): - for col in range(100, 200): - if col not in wasCol: - wasCol[col] = 1 - df[col] = np.nan - df[col][dt] = i + with tm.assert_produces_warning(PerformanceWarning): + for i, dt in enumerate(df.index): + for col in range(100, 200): + if col not in wasCol: + wasCol[col] = 1 + df[col] = np.nan + df[col][dt] = i myid = 100 diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index cca62c5af59a1..1ce1ba9d2caae 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.errors import EmptyDataError +from pandas.errors import EmptyDataError, PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -194,7 +194,10 @@ def test_compact_numerical_values(datapath): def test_many_columns(datapath): # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") - df = pd.read_sas(fname, encoding="latin-1") + with tm.assert_produces_warning(PerformanceWarning): + # Many DataFrame.insert calls + df = pd.read_sas(fname, encoding="latin-1") + fname = datapath("io", "sas", "data", "many_columns.csv") df0 = pd.read_csv(fname, encoding="latin-1") tm.assert_frame_equal(df, df0) From 7e84815001b52d9f13c808e9a3774dcf84072ef3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Dec 2020 20:54:36 -0800 Subject: [PATCH 6/6] CLN: remove coerce_float keyword from construction.internals --- pandas/core/frame.py | 6 +++++- pandas/core/internals/construction.py | 14 +++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de60cda382fba..520297cca1747 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1852,7 +1852,11 @@ def from_records( columns = ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float) + arrays, arr_columns = to_arrays(data, columns) + if coerce_float: + for i, arr in enumerate(arrays): + if arr.dtype == object: + arrays[i] = lib.maybe_convert_objects(arr, try_float=True) arr_columns = ensure_index(arr_columns) if columns is not None: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d25482872c594..b9d24ad8df8bb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -499,9 +499,7 @@ def dataclasses_to_dicts(data): # Conversion of Inputs to Arrays -def to_arrays( - data, columns, coerce_float: bool = False, dtype: Optional[DtypeObj] = None -): +def to_arrays(data, columns, dtype: Optional[DtypeObj] = None): """ Return list of arrays, columns. """ @@ -547,7 +545,7 @@ def to_arrays( data = [tuple(x) for x in data] content, columns = _list_to_arrays(data, columns) - content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + content, columns = _finalize_columns_and_data(content, columns, dtype) return content, columns @@ -634,7 +632,6 @@ def _finalize_columns_and_data( content: np.ndarray, columns: Optional[Union[Index, List]], dtype: Optional[DtypeObj], - coerce_float: bool, ) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]: """ Ensure we have valid columns, cast object dtypes if possible. @@ -648,7 +645,7 @@ def _finalize_columns_and_data( raise ValueError(err) from err if len(content) and content[0].dtype == np.object_: - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + content = _convert_object_array(content, dtype=dtype) return content, columns @@ -711,7 +708,7 @@ def _validate_or_indexify_columns( def _convert_object_array( - content: List[Scalar], coerce_float: bool = False, dtype: Optional[DtypeObj] = None + content: List[Scalar], dtype: Optional[DtypeObj] = None ) -> List[Scalar]: """ Internal function ot convert object array. @@ -719,7 +716,6 @@ def _convert_object_array( Parameters ---------- content: list of processed data records - coerce_float: bool, to coerce floats or not, default is False dtype: np.dtype, default is None Returns @@ -729,7 +725,7 @@ def _convert_object_array( # provide soft conversion of object dtypes def convert(arr): if dtype != np.dtype("O"): - arr = lib.maybe_convert_objects(arr, try_float=coerce_float) + arr = lib.maybe_convert_objects(arr) arr = maybe_cast_to_datetime(arr, dtype) return arr