pandas-dev · lithomas1 · May 10, 2021 · May 11, 2021 · May 11, 2021 · May 19, 2021
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -3,10 +3,12 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Series,
 )
+from pandas.core.arrays import StringArray
 
 from .pandas_vb_common import tm
 
@@ -285,3 +287,18 @@ class Iter(Dtypes):
     def time_iter(self, dtype):
         for i in self.s:
             pass
+
+
+class StringArrayConstruction:
+    def setup(self):
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
+
+    def time_string_array_construction(self):
+        StringArray(self.series_arr)
+
+    def time_string_array_with_nan_construction(self):
+        StringArray(self.series_arr_nan)
+
+    def peakmem_stringarray_construction(self):
+        StringArray(self.series_arr)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -459,6 +459,7 @@ Other API changes
 - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`)
 - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`)
 - :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`)
+- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA` (:issue:`40839`)
 - Changed the ``name`` attribute of several holidays in
   ``USFederalHolidayCalendar`` to match `official federal holiday
   names <https://www.opm.gov/policy-data-oversight/pay-leave/federal-holidays/>`_

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -150,7 +150,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = ...,
-    convert_na_value: bool = ...,
+    coerce: str = ...,
     copy: bool = ...,
     skipna: bool = ...,
 ) -> npt.NDArray[np.object_]: ...

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -99,6 +99,7 @@ from pandas._libs.missing cimport (
     is_null_timedelta64,
     isnaobj,
 )
+from pandas._libs.missing import checknull
 from pandas._libs.tslibs.conversion cimport convert_to_tsobject
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -670,12 +671,25 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
     return result
 
 
+ctypedef enum coerce_options:
+    all = 0
+    strict_null = 1
+    null = 2
+    non_null = 3
+    none = 4
+
+
+def strict_check_null(x):
+    # Module-level: Cython does not allow defining this inside ensure_string_array
+    return x is None or x is C_NA or util.is_nan(x)
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
-        bint convert_na_value=True,
+        coerce="all",
         bint copy=True,
         bint skipna=True,
 ):
@@ -688,8 +702,16 @@ cpdef ndarray[object] ensure_string_array(
         The values to be converted to str, if needed.
     na_value : Any, default np.nan
         The value to use for na. For example, np.nan or pd.NA.
-    convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array.
+    coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all'
+        Whether to coerce non-string elements to strings.
+            - 'all' will convert all non-string values.
+            - 'strict-null' will only convert pd.NA, np.nan, or None to na_value
+              raising when encountering non-strings and other null values.
+            - 'null' will convert nulls to na_value w/out converting other non-strings.
+            - 'non-null' will only convert non-null non-string elements to string.
+            - None will not convert anything.
+        If coerce is 'strict-null', 'null' or None, a ValueError will be raised
+        for values that are neither strings nor recognized nulls.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -699,10 +721,47 @@ cpdef ndarray[object] ensure_string_array(
     Returns
     -------
     np.ndarray[object]
-        An array with the input array's elements casted to str or nan-like.
+        An array of strings and na_value.
+
+    Raises
+    ------
+    ValueError
+        If an element is encountered that is not a string or valid NA value
+        and element is not coerced.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> ensure_string_array(np.array([1,2,3, np.datetime64("nat")]), coerce="all")
+    array(['1', '2', '3', nan], dtype=object)
+    >>> ensure_string_array(np.array([pd.NA, "a", None]), coerce="strict-null")
+    array([nan, 'a', nan], dtype=object)
+    >>> ensure_string_array(np.array([pd.NaT, "1"]), coerce="null")
+    array([nan, '1'], dtype=object)
+    >>> ensure_string_array(np.array([1,2,3]), coerce="non-null")
+    array(['1', '2', '3'], dtype=object)
+    >>> ensure_string_array(np.array(["1", "2", "3"]), coerce=None)
+    array(['1', '2', '3'], dtype=object)
     """
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        set strict_na_values = {C_NA, np.nan, None}
+        coerce_options coerce_val
+
+    if coerce == "all":
+        coerce_val = all
+    elif coerce == "strict-null":
+        coerce_val = strict_null
+    elif coerce == "null":
+        coerce_val = null
+    elif coerce == "non-null":
+        coerce_val = non_null
+    elif coerce is None:
+        coerce_val = none
+    else:
+        raise ValueError("coerce argument must be one of "
+                         f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}")
 
     if hasattr(arr, "to_numpy"):
 
@@ -722,21 +781,34 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
+    if coerce_val == strict_null:
+        # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
+        # If they are present, they are treated like a regular Python object
+        # and will either cause an exception to be raised or be coerced.
+        check_null = strict_check_null
+    else:
+        check_null = checknull
+
     for i in range(n):
         val = arr[i]
 
         if isinstance(val, str):
             continue
 
-        if not checknull(val):
-            if not isinstance(val, np.floating):
-                # f"{val}" is faster than str(val)
-                result[i] = f"{val}"
+        if not check_null(val):
+            if coerce_val == all or coerce_val == non_null:
+                if not isinstance(val, np.floating):
+                    # f"{val}" is faster than str(val)
+                    result[i] = f"{val}"
+                else:
+                    # f"{val}" is not always equivalent to str(val) for floats
+                    result[i] = str(val)
             else:
-                # f"{val}" is not always equivalent to str(val) for floats
-                result[i] = str(val)
+                raise ValueError(f"Element {val} is not a string or valid null. "
+                                 "If you want it to be coerced to a string, "
+                                 "specify coerce='all'")
         else:
-            if convert_na_value:
+            if coerce_val != non_null and coerce_val != none:
                 val = na_value
             if skipna:
                 result[i] = val
@@ -1881,8 +1953,8 @@ cdef class StringValidator(Validator):
         return issubclass(self.dtype.type, np.str_)
 
     cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
+        # Identity checks deliberately exclude float('nan') and complex NaN objects
+        return value is None or value is C_NA or value is np.nan
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes (``None``, ``np.nan``, ``NA``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.4.0
+
+           StringArray now accepts nan-likes (``None``, ``np.nan``) for the
+           ``values`` parameter in its constructor
+           in addition to strings and :attr:`pandas.NA`.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -310,6 +317,8 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
@@ -318,16 +327,25 @@ def __init__(self, values, copy=False):
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            lib.ensure_string_array(
+                self._ndarray.ravel("K"),
+                na_value=StringDtype.na_value,
+                coerce="strict-null",
+                copy=False,
+            )
+        except ValueError:
+            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
-    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
+    def _from_sequence(
+        cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
+    ):
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
             assert isinstance(dtype, StringDtype) and dtype.storage == "python"
@@ -336,15 +354,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             result[na_values] = StringDtype.na_value
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+            if coerce:
+                coerce = "all"
+            else:
+                coerce = "strict-null"
             result = lib.ensure_string_array(
-                scalars, na_value=StringDtype.na_value, copy=copy
+                scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
             )
 
         # Manually creating new array avoids the validation step in the __init__, so is

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -153,7 +153,9 @@ def __init__(self, values):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
+    def _from_sequence(
+        cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
+    ):
         from pandas.core.arrays.masked import BaseMaskedArray
 
         _chk_pyarrow_available()
@@ -167,11 +169,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
+        if coerce:
+            coerce = "all"
+        else:
+            coerce = "strict-null"
+        result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
 
     @classmethod

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -754,7 +754,7 @@ def _try_cast(
 
     elif dtype.kind == "U":
         # TODO: test cases with arr.dtype.kind in ["m", "M"]
-        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
+        return lib.ensure_string_array(arr, coerce="non-null", copy=copy)
 
     elif dtype.kind in ["m", "M"]:
         return maybe_cast_to_datetime(arr, dtype)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1122,7 +1122,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
+        return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")
 
     elif is_datetime64_dtype(arr):
         # Non-overlapping equality check (left operand type: "dtype[Any]", right