NA scalar string #1

Merged

2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyx
@@ -1500,7 +1500,7 @@ cdef class Validator:
f'must define is_value_typed')

cdef bint is_valid_null(self, object value) except -1:
return value is None or util.is_nan(value)
return value is None or value is C_NA or util.is_nan(value)

I suppose this is to have inference and validation in StringArray working with NA?

One thing I have been thinking about is that pd.NA could play a somewhat different role than np.nan or None in construction / type inference. E.g. if someone does pd.Series([1, 2, pd.NA]), it would automatically become a nullable integer dtype instead of float (or object). Since pd.NA is new, we can do this without breaking backwards compatibility.
(Now, I am not sure whether that idea relates to the code here, and it can also be done later.)
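A quick sketch of the idea (hypothetical future behavior, not something this PR implements; the inference shown is the current default):

```python
import pandas as pd

# Current default inference: pd.NA in a list of ints falls back to object dtype
pd.Series([1, 2, pd.NA]).dtype             # dtype('O')

# The proposal above would infer a nullable integer dtype automatically;
# today that requires asking for it explicitly:
pd.Series([1, 2, pd.NA], dtype="Int64")    # Int64 series ending in <NA>
```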

> I suppose this is to have inference and validation in StringArray working with NA?

Correct.

> let pd.NA play a somewhat different role than np.nan or None in construction / type inference.

That would probably be handled in infer_dtype.
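For reference, a minimal sketch (not part of the diff) of how the Validator change above surfaces through infer_dtype, assuming the default skipna=True:

```python
import pandas as pd

# With this change, pd.NA counts as a missing value during inference,
# so a string sequence containing it is still inferred as "string"
pd.api.types.infer_dtype(["a", "b", pd.NA], skipna=True)   # "string"
```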


cdef bint is_array_typed(self) except -1:
return False
8 changes: 5 additions & 3 deletions pandas/_libs/testing.pyx
@@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b,
# classes can't be the same, to raise error
assert_class_equal(a, b, obj=obj)

if a == b:
# object comparison
return True
if isna(a) and isna(b):
# TODO: Should require same-dtype NA?
# nan / None comparison
return True

if a == b:
# object comparison
return True

if is_comparable_as_number(a) and is_comparable_as_number(b):
if array_equivalent(a, b, strict_nan=True):
# inf comparison
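The reordering matters because pd.NA does not behave like np.nan under ==. A minimal illustration (not part of the diff) of why the isna check has to run before the `a == b` fast path:

```python
import pandas as pd

pd.NA == pd.NA            # <NA> -- comparisons propagate NA instead of returning True
try:
    bool(pd.NA == pd.NA)  # what `if a == b:` would effectively attempt
except TypeError as err:
    print(err)            # "boolean value of NA is ambiguous"
```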
3 changes: 3 additions & 0 deletions pandas/core/arrays/numpy_.py
@@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):
return new_values

def take(self, indices, allow_fill=False, fill_value=None):
if fill_value is None:
# Primarily for subclasses
fill_value = self.dtype.na_value
result = take(
self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
)
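A small sketch (not in the diff) of what the default fill_value buys a subclass such as StringArray:

```python
import pandas as pd

arr = pd.array(["a", "b"], dtype="string")
# With allow_fill=True and no explicit fill_value, take() now falls back
# to the dtype's na_value, which is pd.NA for StringDtype
arr.take([0, -1], allow_fill=True)   # <StringArray> ["a", <NA>]
```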
37 changes: 16 additions & 21 deletions pandas/core/arrays/string_.py
@@ -1,9 +1,9 @@
import operator
from typing import TYPE_CHECKING, Type
from typing import Type

import numpy as np

from pandas._libs import lib
from pandas._libs import lib, missing as libmissing

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import pandas_dtype
@@ -17,9 +17,6 @@
from pandas.core.construction import extract_array
from pandas.core.missing import isna

if TYPE_CHECKING:
from pandas._typing import Scalar


@register_extension_dtype
class StringDtype(ExtensionDtype):
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
StringDtype
"""

@property
def na_value(self) -> "Scalar":
"""
StringDtype uses :attr:`numpy.nan` as the missing NA value.

.. warning::

`na_value` may change in a future release.
"""
return np.nan
#: StringDtype.na_value uses pandas.NA
na_value = libmissing.NA

@property
def type(self) -> Type:
@@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
if dtype:
assert dtype == "string"
result = super()._from_sequence(scalars, dtype=object, copy=copy)
# convert None to np.nan
# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
result[result.isna()] = np.nan
result[result.isna()] = StringDtype.na_value
return result

@classmethod
@@ -192,6 +181,12 @@ def __arrow_array__(self, type=None):
type = pa.string()
return pa.array(self._ndarray, type=type, from_pandas=True)

def _values_for_factorize(self):
arr = self._ndarray.copy()
mask = self.isna()
arr[mask] = -1
return arr, -1

You can't specify pd.NA here as the indicator (since that is already in the values)?
Or does the algo code not like that?

Algos didn't like it. Somewhere in there we do a value == na_value, which raises.

We may need / want to rewrite things to be mask-based.
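For context, a rough sketch (not part of the diff) of what the -1 sentinel looks like once values go through factorize:

```python
import pandas as pd

arr = pd.array(["a", "b", pd.NA, "a"], dtype="string")
codes, uniques = pd.factorize(arr)
# codes:   [ 0,  1, -1,  0]     -- -1 marks the missing position
# uniques: <StringArray> ["a", "b"]
```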


def __setitem__(self, key, value):
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
Expand All @@ -205,9 +200,9 @@ def __setitem__(self, key, value):

# validate new items
if scalar_value:
if scalar_value is None:
value = np.nan
elif not (isinstance(value, str) or np.isnan(value)):
if isna(value):
value = StringDtype.na_value
elif not isinstance(value, str):
raise ValueError(
"Cannot set non-string value '{}' into a StringArray.".format(value)
)
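A short usage sketch (assuming the validation above) of what __setitem__ now accepts and rejects:

```python
import pandas as pd

arr = pd.array(["a", "b"], dtype="string")
arr[0] = None   # any NA-like scalar is normalized to pd.NA
arr[0]          # <NA>
arr[1] = 1      # raises ValueError: Cannot set non-string value '1' into a StringArray.
```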
@@ -265,7 +260,7 @@ def method(self, other):
other = other[valid]

result = np.empty_like(self._ndarray, dtype="object")
result[mask] = np.nan
result[mask] = StringDtype.na_value
result[valid] = op(self._ndarray[valid], other)

if op.__name__ in {"add", "radd", "mul", "rmul"}:
5 changes: 5 additions & 0 deletions pandas/core/dtypes/missing.py
@@ -440,6 +440,9 @@ def array_equivalent(left, right, strict_nan=False):
if left_value is NaT and right_value is not NaT:
return False

elif left_value is libmissing.NA and right_value is not libmissing.NA:
return False

elif isinstance(left_value, float) and np.isnan(left_value):
if not isinstance(right_value, float) or not np.isnan(right_value):
return False
@@ -451,6 +454,8 @@
if "Cannot compare tz-naive" in str(err):
# tzawareness compat failure, see GH#28507
return False
elif "boolean value of NA is ambiguous" in str(err):
return False
raise
return True

7 changes: 7 additions & 0 deletions pandas/core/strings.py
@@ -123,6 +123,13 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
if na_mask:
mask = isna(arr)
convert = not np.all(mask)
if convert:
# XXX: This converts pd.NA to np.nan to match the output of
# object-dtype ops that return numeric, like str.count
# We probably want to return Int64Dtype instead.
# NA -> nan
arr[mask] = np.nan

try:
result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
except (TypeError, AttributeError) as e:
11 changes: 10 additions & 1 deletion pandas/tests/arrays/string_/test_string.py
@@ -12,7 +12,7 @@
def test_none_to_nan():
a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
assert a[1] is not None
assert np.isnan(a[1])
assert a[1] is pd.NA


def test_setitem_validates():
@@ -24,6 +24,15 @@ def test_setitem_validates():
a[:] = np.array([1, 2])


def test_setitem_with_scalar_string():
# is_float_dtype considers some strings, like 'd', to be floats
# which can cause issues.
arr = pd.array(["a", "c"], dtype="string")
arr[0] = "d"
expected = pd.array(["d", "c"], dtype="string")
tm.assert_extension_array_equal(arr, expected)


@pytest.mark.parametrize(
"input, method",
[
8 changes: 4 additions & 4 deletions pandas/tests/extension/test_string.py
@@ -25,7 +25,7 @@ def data():
@pytest.fixture
def data_missing():
"""Length 2 array with [NA, Valid]"""
return StringArray._from_sequence([np.nan, "A"])
return StringArray._from_sequence([pd.NA, "A"])


@pytest.fixture
@@ -35,17 +35,17 @@ def data_for_sorting():

@pytest.fixture
def data_missing_for_sorting():
return StringArray._from_sequence(["B", np.nan, "A"])
return StringArray._from_sequence(["B", pd.NA, "A"])


@pytest.fixture
def na_value():
return np.nan
return pd.NA


@pytest.fixture
def data_for_grouping():
return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"])


class TestDtype(base.BaseDtypeTests):
8 changes: 8 additions & 0 deletions pandas/tests/test_strings.py
@@ -3507,3 +3507,11 @@ def test_string_array(any_string_method):
assert all(result[columns].dtypes == "string")
result[columns] = result[columns].astype(object)
tm.assert_equal(result, expected)


@pytest.mark.xfail(reason="not implmented yet")
def test_string_dtype_numeric():
s = Series(["a", "aa", None], dtype="string")
result = s.str.count("a")
expected = Series([1, 2, None], dtype="Int64")
tm.assert_series_equal(result, expected)
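For reference, a hedged sketch of the behavior this xfail is written against: at the time of this change, numeric-returning string ops on the new dtype still fall back to float64 with NaN (see the _map change in pandas/core/strings.py above):

```python
import pandas as pd

s = pd.Series(["a", "aa", None], dtype="string")
s.str.count("a")
# 0    1.0
# 1    2.0
# 2    NaN
# dtype: float64   <- the test above wants Int64 with <NA> instead
```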