From b50c58102b448993ddd7a2b8672374a55a91e3a8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Oct 2019 15:22:20 +0200 Subject: [PATCH 1/9] BUG: use EA.astype in ExtensionBlock.to_native_types --- pandas/core/internals/blocks.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b76cb5cbec626..aee3d657f1acb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -687,7 +687,6 @@ def _try_coerce_args(self, other): def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ - values = self.get_values() if slicer is not None: @@ -1783,6 +1782,18 @@ def get_values(self, dtype=None): def to_dense(self): return np.asarray(self.values) + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + """override to use ExtensionArray astype for the conversion""" + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + values = values.astype(str) + values[mask] = na_rep + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ Take values according to indexer and return them as a block. 
From 890fe820920c28f08d808dc70fd5fd570c5911e6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Oct 2019 16:02:05 +0200 Subject: [PATCH 2/9] fallback to conversion to numpy array (eg for sparse array) --- pandas/core/internals/blocks.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index aee3d657f1acb..7831f4d76e6e8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1784,15 +1784,24 @@ def to_dense(self): def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """override to use ExtensionArray astype for the conversion""" - values = self.values - if slicer is not None: - values = values[slicer] - mask = isna(values) - values = values.astype(str) - values[mask] = na_rep - - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) + try: + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + values = values.astype(str) + values[mask] = na_rep + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + except Exception: + # eg SparseArray does not support setitem, needs to be converted to ndarray + values = self.get_values() + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + values = values.astype(str) + values[mask] = na_rep + return values def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ From ded3d00f6d9ea35a59fb00a1b31d265f4254f084 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Oct 2019 16:03:53 +0200 Subject: [PATCH 3/9] use super implementation for fallback --- pandas/core/internals/blocks.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7831f4d76e6e8..f7bfc1d1bc73e 100644 --- a/pandas/core/internals/blocks.py +++ 
b/pandas/core/internals/blocks.py @@ -1795,13 +1795,7 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): return values.reshape(1, len(values)) except Exception: # eg SparseArray does not support setitem, needs to be converted to ndarray - values = self.get_values() - if slicer is not None: - values = values[:, slicer] - mask = isna(values) - values = values.astype(str) - values[mask] = na_rep - return values + return super().to_native_types(slicer, na_rep, quoting, **kwargs) def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ From ef65a96a288b08dcf4dc0edae423294d8edfaf3b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 9 Oct 2019 10:10:05 +0200 Subject: [PATCH 4/9] override in DatetimeTZBlock as well --- pandas/core/internals/blocks.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f7bfc1d1bc73e..025cfd65ba393 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1784,19 +1784,21 @@ def to_dense(self): def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """override to use ExtensionArray astype for the conversion""" + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + try: - values = self.values - if slicer is not None: - values = values[slicer] - mask = isna(values) values = values.astype(str) values[mask] = na_rep - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) except Exception: # eg SparseArray does not support setitem, needs to be converted to ndarray return super().to_native_types(slicer, na_rep, quoting, **kwargs) + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ Take values according to indexer and return them as a block. 
@@ -2352,6 +2354,22 @@ def to_dense(self): # expects that behavior. return np.asarray(self.values, dtype=_NS_DTYPE) + def to_native_types( + self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs + ): + """ + We need to pick DatetimeBlock's version, but the inheritance structure + would use ExtensionBlock's version + """ + return DatetimeBlock.to_native_types( + self, + slicer=slicer, + na_rep=na_rep, + date_format=date_format, + quoting=quoting, + **kwargs + ) + def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): From ba553968e697b620edf32b27ef0ca4a82412655d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Oct 2019 15:19:14 +0200 Subject: [PATCH 5/9] clean-up --- pandas/core/internals/blocks.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 025cfd65ba393..1495be1f26df5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2281,6 +2281,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_extension = True _can_hold_element = DatetimeBlock._can_hold_element + to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") @property @@ -2354,22 +2355,6 @@ def to_dense(self): # expects that behavior. 
return np.asarray(self.values, dtype=_NS_DTYPE) - def to_native_types( - self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs - ): - """ - We need to pick DatetimeBlock's version, but the inheritance structure - would use ExtensionBlock's version - """ - return DatetimeBlock.to_native_types( - self, - slicer=slicer, - na_rep=na_rep, - date_format=date_format, - quoting=quoting, - **kwargs - ) - def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): From 1663b3d9a9aa87cea977145727c726bc1e9df618 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Oct 2019 17:54:11 +0200 Subject: [PATCH 6/9] add minimal ListArray for test --- pandas/tests/extension/list/__init__.py | 3 + pandas/tests/extension/list/array.py | 131 +++++++++++++++++++++++ pandas/tests/extension/list/test_list.py | 30 ++++++ 3 files changed, 164 insertions(+) create mode 100644 pandas/tests/extension/list/__init__.py create mode 100644 pandas/tests/extension/list/array.py create mode 100644 pandas/tests/extension/list/test_list.py diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py new file mode 100644 index 0000000000000..108f1937d07d3 --- /dev/null +++ b/pandas/tests/extension/list/__init__.py @@ -0,0 +1,3 @@ +from .array import ListArray, ListDtype, make_data + +__all__ = ["ListArray", "ListDtype", "make_data"] diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py new file mode 100644 index 0000000000000..ed284f26c21e8 --- /dev/null +++ b/pandas/tests/extension/list/array.py @@ -0,0 +1,131 @@ +"""Test extension array for storing nested data in a pandas container. + +The ListArray stores an ndarray of lists. 
+""" +import numbers +import random +import string + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + + +class ListDtype(ExtensionDtype): + type = list + name = "list" + na_value = np.nan + + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype + + Returns + ------- + type + """ + return ListArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + + +class ListArray(ExtensionArray): + dtype = ListDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False): + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array as values") + for val in values: + if not isinstance(val, self.dtype.type) and not pd.isna(val): + raise TypeError("All values must be of type " + str(self.dtype.type)) + self.data = values + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + data = np.empty(len(scalars), dtype=object) + data[:] = scalars + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + return type(self)(self.data[item]) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.array( + [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool + ) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like lists into scalar slots of + # an ndarray. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." 
+ ) + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError: + raise IndexError(msg) + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + + return self._from_sequence(output) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype( + dtype + ): + # numpy has problems with astype(str) for nested elements + return np.array([str(x) for x in self.data], dtype=dtype) + return np.array(self.data, dtype=dtype, copy=copy) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + return cls(data) + + +def make_data(): + # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer + data = np.empty(100, dtype=object) + data[:] = [ + [random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))] + for _ in range(100) + ] + return data diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py new file mode 100644 index 0000000000000..674a7e6fb9ca6 --- /dev/null +++ b/pandas/tests/extension/list/test_list.py @@ -0,0 +1,30 @@ +import pytest + +import pandas as pd + +from .array import ListArray, ListDtype, make_data + + +@pytest.fixture +def dtype(): + return ListDtype() + + +@pytest.fixture +def data(): + """Length-100 ListArray for semantics test.""" + data = make_data() + + while len(data[0]) == len(data[1]): + data = make_data() + + return ListArray(data) + + +def test_to_csv(data): + # https://github.com/pandas-dev/pandas/issues/28840 + # array with list-likes fail when doing astype(str) on the numpy array + # which was done in to_native_types + s = pd.Series(data) + res = s.to_csv() + assert str(data[0]) in res From a5ba1296fd2a133f0db28446750243b79ce628d6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Oct 2019 14:19:21 +0200 Subject: [PATCH 7/9] docstring fixes --- pandas/tests/extension/list/array.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index ed284f26c21e8..0ca9fadb68829 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -1,4 +1,5 @@ -"""Test extension array for storing nested data in a pandas container. +""" +Test extension array for storing nested data in a pandas container. The ListArray stores an ndarray of lists. """ @@ -21,7 +22,8 @@ class ListDtype(ExtensionDtype): @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype. 
Returns ------- From 61eaf92d4938286716860fc92a7e35a48550ab48 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Oct 2019 14:21:29 +0200 Subject: [PATCH 8/9] fix test to avoid deprecation warning --- pandas/tests/extension/list/test_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py index 674a7e6fb9ca6..c5c4417155562 100644 --- a/pandas/tests/extension/list/test_list.py +++ b/pandas/tests/extension/list/test_list.py @@ -25,6 +25,6 @@ def test_to_csv(data): # https://github.com/pandas-dev/pandas/issues/28840 # array with list-likes fail when doing astype(str) on the numpy array # which was done in to_native_types - s = pd.Series(data) - res = s.to_csv() + df = pd.DataFrame({"a": data}) + res = df.to_csv() assert str(data[0]) in res From 10defebfa4f728ab3f81657d191b30740675a1c7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 14 Oct 2019 18:59:29 -0400 Subject: [PATCH 9/9] add whatsnew --- doc/source/whatsnew/v0.25.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 9789c9fce3541..fcb6fc8f347bd 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -64,7 +64,7 @@ I/O - Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) -- +- Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). - Plotting