Skip to content

API: allow nan-likes in StringArray constructor #41412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 41 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
3e1784d
API: allow nan-likes in StringArray constructor
lithomas1 May 10, 2021
96ff1da
Revert weird changes & Fix stuff
lithomas1 May 11, 2021
418e1d2
Remove failing test
lithomas1 May 11, 2021
25a6c4d
Changes from code review
lithomas1 May 19, 2021
47d68f7
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 19, 2021
8257dbd
typo
lithomas1 May 20, 2021
922436a
Update lib.pyi
lithomas1 May 21, 2021
2f28086
Update lib.pyx
lithomas1 May 29, 2021
3ee2198
Update lib.pyx
lithomas1 May 29, 2021
9426a52
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 30, 2021
3ee55f2
Updates
lithomas1 May 30, 2021
fe4981a
Update lib.pyx
lithomas1 May 30, 2021
a66948a
Update lib.pyx
lithomas1 May 30, 2021
e852719
Update lib.pyx
lithomas1 May 31, 2021
91b73bb
disallow invalid nans in stringarray constructor
lithomas1 Jun 4, 2021
42ec3e4
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
41f49d2
add to _from_sequence and fixes
lithomas1 Jun 4, 2021
62cc5be
address code review
lithomas1 Jun 4, 2021
29909f3
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
153b6b4
Fix failures
lithomas1 Jun 5, 2021
b27a839
maybe fix benchmarks?
lithomas1 Jun 5, 2021
ed5b953
Partially address code review
lithomas1 Jun 5, 2021
caa5705
Test coerce=False
lithomas1 Jun 6, 2021
2d75031
move benchmarks
lithomas1 Jun 7, 2021
52a00d1
accidental formatting changes
lithomas1 Jun 7, 2021
8dc0b66
Fix
lithomas1 Jun 8, 2021
1bacaed
Merge branch 'master' into stringarray-nan
lithomas1 Jun 8, 2021
66be087
missing import from conflict
lithomas1 Jun 8, 2021
7b058cd
Merge branch 'master' into stringarray-nan
lithomas1 Jun 9, 2021
1be1bdf
Merge branch 'pandas-dev:master' into stringarray-nan
lithomas1 Jun 22, 2021
3c57094
remove old whatsnew
lithomas1 Jul 21, 2021
03738a9
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 Jul 21, 2021
12351de
move whatsnew
lithomas1 Jul 21, 2021
889829a
Merge branch 'master' into stringarray-nan
lithomas1 Oct 4, 2021
358000f
typo
lithomas1 Oct 4, 2021
c649b1d
Merge branch 'master' into stringarray-nan
lithomas1 Oct 16, 2021
5e5aa9c
Merge branch 'master' into stringarray-nan
lithomas1 Nov 27, 2021
eb7d8f2
Merge branch 'master' into stringarray-nan
lithomas1 Dec 18, 2021
2426319
Merge branch 'master' into stringarray-nan
lithomas1 Dec 27, 2021
20817a7
address comments
lithomas1 Dec 27, 2021
33d8f9a
accept any float nan w/ util.is_nan
lithomas1 Dec 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Series,
)
from pandas.core.arrays import StringArray

from .pandas_vb_common import tm

Expand Down Expand Up @@ -285,3 +287,18 @@ class Iter(Dtypes):
def time_iter(self, dtype):
for i in self.s:
pass


class StringArrayConstruction:
def setup(self):
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])

def time_string_array_construction(self):
StringArray(self.series_arr)

def time_string_array_with_nan_construction(self):
StringArray(self.series_arr_nan)

def peakmem_stringarray_construction(self):
StringArray(self.series_arr)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ Other API changes
- Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`)
- Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`)
- :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`)
- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
- Changed the ``name`` attribute of several holidays in
``USFederalHolidayCalendar`` to match `official federal holiday
names <https://www.opm.gov/policy-data-oversight/pay-leave/federal-holidays/>`_
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def maybe_convert_numeric(
def ensure_string_array(
arr,
na_value: object = ...,
convert_na_value: bool = ...,
coerce: str = ...,
copy: bool = ...,
skipna: bool = ...,
) -> npt.NDArray[np.object_]: ...
Expand Down
98 changes: 85 additions & 13 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ from pandas._libs.missing cimport (
is_null_timedelta64,
isnaobj,
)
from pandas._libs.missing import checknull
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
Expand Down Expand Up @@ -670,12 +671,25 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
return result


ctypedef enum coerce_options:
all = 0
strict_null = 1
null = 2
non_null = 3
none = 4


def strict_check_null(x):
# Cython doesn't let me define this in ensure_string_array :(
return x is None or x is C_NA or util.is_nan(x)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
coerce="all",
bint copy=True,
bint skipna=True,
):
Expand All @@ -688,8 +702,16 @@ cpdef ndarray[object] ensure_string_array(
The values to be converted to str, if needed.
na_value : Any, default np.nan
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all'
Whether to coerce non-string elements to strings.
- 'all' will convert all non-string values.
- 'strict-null' will only convert pd.NA, np.nan, or None to na_value
raising when encountering non-strings and other null values.
- 'null' will convert nulls to na_value w/out converting other non-strings.
- 'non-null' will only convert non-null non-string elements to string.
- None will not convert anything.
If coerce is not 'all', a ValueError will be raised for values
that are not strings or na_value.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Expand All @@ -699,10 +721,47 @@ cpdef ndarray[object] ensure_string_array(
Returns
-------
np.ndarray[object]
An array with the input array's elements casted to str or nan-like.
An array of strings and na_value.

Raises
------
ValueError
If an element is encountered that is not a string or valid NA value
and element is not coerced.

Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> ensure_string_array(np.array([1,2,3, np.datetime64("nat")]), coerce="all")
array(['1', '2', '3', nan], dtype=object)
>>> ensure_string_array(np.array([pd.NA, "a", None]), coerce="strict-null")
array([nan, 'a', nan], dtype=object)
>>> ensure_string_array(np.array([pd.NaT, "1"]), coerce="null")
array([nan, '1'], dtype=object)
>>> ensure_string_array(np.array([1,2,3]), coerce="non-null")
array(['1', '2', '3'], dtype=object)
>>> ensure_string_array(np.array(["1", "2", "3"]), coerce=None)
array(['1', '2', '3'], dtype=object)
"""
cdef:
Py_ssize_t i = 0, n = len(arr)
set strict_na_values = {C_NA, np.nan, None}
coerce_options coerce_val

if coerce == "all":
coerce_val = all
elif coerce == "strict-null":
coerce_val = strict_null
elif coerce == "null":
coerce_val = null
elif coerce == "non-null":
coerce_val = non_null
elif coerce is None:
coerce_val = none
else:
raise ValueError("coerce argument must be one of "
f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}")

if hasattr(arr, "to_numpy"):

Expand All @@ -722,21 +781,34 @@ cpdef ndarray[object] ensure_string_array(
if copy and result is arr:
result = result.copy()

if coerce_val == strict_null:
# We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
# If they are present, they are treated like a regular Python object
# and will either cause an exception to be raised or be coerced.
check_null = strict_check_null
else:
check_null = checknull

for i in range(n):
val = arr[i]

if isinstance(val, str):
continue

if not checknull(val):
if not isinstance(val, np.floating):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
if not check_null(val):
if coerce_val == all or coerce_val == non_null:
if not isinstance(val, np.floating):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
else:
# f"{val}" is not always equivalent to str(val) for floats
result[i] = str(val)
else:
# f"{val}" is not always equivalent to str(val) for floats
result[i] = str(val)
raise ValueError(f"Element {val} is not a string or valid null."
"If you want it to be coerced to a string,"
"specify coerce='all'")
else:
if convert_na_value:
if coerce_val != non_null and coerce_val != none:
val = na_value
if skipna:
result[i] = val
Expand Down Expand Up @@ -1881,8 +1953,8 @@ cdef class StringValidator(Validator):
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA
# Override to exclude float('Nan') and complex NaN
return value is None or value is C_NA or value is np.nan


cpdef bint is_string_array(ndarray values, bint skipna=False):
Expand Down
38 changes: 32 additions & 6 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
where the elements are Python strings
or nan-likes(``None``, ``np.nan``, ``NA``).
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

.. versionchanged:: 1.4.0

StringArray now accepts nan-likes(``None``, ``np.nan``) for the
``values`` parameter in its constructor
in addition to strings and :attr:`pandas.NA`

copy : bool, default False
Whether to copy the array of data.

Expand Down Expand Up @@ -310,6 +317,8 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
Expand All @@ -318,16 +327,25 @@ def __init__(self, values, copy=False):

def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
try:
lib.ensure_string_array(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this going through ensure_string_array instead of e.g. is_string_array? For the latter, the ravel would be unnecessary.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is_string_array will not convert the other nans(None, np.nan, etc.) to the correct na_value of pd.NA. FWIW, switching to ensure_string_array will also align us with _from_sequence.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does any of this become simpler if done in conjunction with the edits in #45057

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to split this one up.

I'm thinking of maybe sticking to is_string_array and then doing another pass over the data to convert the not pd.NA nans, as a quick short term fix to unblock you. This will probably give a perf regression, but since is_string_array got sped up in #44495 on master(not 1.3.x), all I have to do is make the perf regression less than the perf improvement there, so that the regression is not user visible.

self._ndarray.ravel("K"),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure this is broken for 2D EAs even with the ravel.
@jbrockmendel Do you know how I can create a 2D StringArray to test this? Would the correct way to iterate over a 2D EA be using the PyArray_GETITEM and PyArray_ITER_DATA combo like the Validator in libs does? Then I would replace values by PyArray_SETITEMing on the array?
(Not familiar at all with Numpy C API, so going to need some help here)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how I can create a 2D StringArray to test this?

pd.array(["foo", "bar"] * 5).reshape(5, 2)

Would the correct way to iterate over a 2D EA be using the PyArray_GETITEM and PyArray_ITER_DATA combo like the Validator in libs does?

Not sure I understand the question. The 2D EA has a working __iter__ method, but you shouldn't be passing any EA to any of the cython functions.

Then I would replace values by PyArray_SETITEM ing on the array?

I'm still getting the hang of this (intending to make array_to_timedelta64 handle 2D directly to avoid ravels), would advise NOT trying to handle it in the cython code.

na_value=StringDtype.na_value,
coerce="strict-null",
copy=False,
)
except ValueError:
raise ValueError("StringArray requires a sequence of strings or pandas.NA")

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is coerce: bool enough here?

this is like errors='coerce' for coerce=True and errors='raise' for coerce=False, i guess 'ignore' would be meaningless.

but I still think the errors= keyword is better for flexiblity.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will take a look soon-ish. I'm wary of adding keywords here

):
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "python"
Expand All @@ -336,15 +354,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype
if coerce:
coerce = "non-null"
else:
coerce = None
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
result[na_values] = StringDtype.na_value

else:
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(
scalars, na_value=StringDtype.na_value, copy=copy
scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
)

# Manually creating new array avoids the validation step in the __init__, so is
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ def __init__(self, values):
)

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
def _from_sequence(
cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
):
from pandas.core.arrays.masked import BaseMaskedArray

_chk_pyarrow_available()
Expand All @@ -167,11 +169,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
if coerce:
coerce = "non-null"
else:
coerce = None
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
return cls(pa.array(result, mask=na_values, type=pa.string()))

# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
return cls(pa.array(result, type=pa.string(), from_pandas=True))

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ def _try_cast(

elif dtype.kind == "U":
# TODO: test cases with arr.dtype.kind in ["m", "M"]
return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
return lib.ensure_string_array(arr, coerce="non-null", copy=copy)

elif dtype.kind in ["m", "M"]:
return maybe_cast_to_datetime(arr, dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,7 +1122,7 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)

if issubclass(dtype.type, str):
return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")

elif is_datetime64_dtype(arr):
# Non-overlapping equality check (left operand type: "dtype[Any]", right
Expand Down
Loading