-
Notifications
You must be signed in to change notification settings - Fork 0
Na scalar string #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
266603f
bf12ba8
e83a8a9
004e42f
1660769
c357f8c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
import operator | ||
from typing import TYPE_CHECKING, Type | ||
from typing import Type | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._libs import lib, missing as libmissing | ||
|
||
from pandas.core.dtypes.base import ExtensionDtype | ||
from pandas.core.dtypes.common import pandas_dtype | ||
|
@@ -17,9 +17,6 @@ | |
from pandas.core.construction import extract_array | ||
from pandas.core.missing import isna | ||
|
||
if TYPE_CHECKING: | ||
from pandas._typing import Scalar | ||
|
||
|
||
@register_extension_dtype | ||
class StringDtype(ExtensionDtype): | ||
|
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype): | |
StringDtype | ||
""" | ||
|
||
@property | ||
def na_value(self) -> "Scalar": | ||
""" | ||
StringDtype uses :attr:`numpy.nan` as the missing NA value. | ||
|
||
.. warning:: | ||
|
||
`na_value` may change in a future release. | ||
""" | ||
return np.nan | ||
#: StringDtype.na_value uses pandas.NA | ||
na_value = libmissing.NA | ||
|
||
@property | ||
def type(self) -> Type: | ||
|
@@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): | |
if dtype: | ||
assert dtype == "string" | ||
result = super()._from_sequence(scalars, dtype=object, copy=copy) | ||
# convert None to np.nan | ||
# Standardize all missing-like values to NA | ||
# TODO: it would be nice to do this in _validate / lib.is_string_array | ||
# We are already doing a scan over the values there. | ||
result[result.isna()] = np.nan | ||
result[result.isna()] = StringDtype.na_value | ||
return result | ||
|
||
@classmethod | ||
|
@@ -192,6 +181,12 @@ def __arrow_array__(self, type=None): | |
type = pa.string() | ||
return pa.array(self._ndarray, type=type, from_pandas=True) | ||
|
||
def _values_for_factorize(self): | ||
arr = self._ndarray.copy() | ||
mask = self.isna() | ||
arr[mask] = -1 | ||
return arr, -1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can't specify pd.NA here as the indicator? (since that is already in the values) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Algos didn't like it. Somewhere in there we do a […] We may need / want to rewrite things to be mask-based. |
||
|
||
def __setitem__(self, key, value): | ||
value = extract_array(value, extract_numpy=True) | ||
if isinstance(value, type(self)): | ||
|
@@ -205,9 +200,9 @@ def __setitem__(self, key, value): | |
|
||
# validate new items | ||
if scalar_value: | ||
if scalar_value is None: | ||
value = np.nan | ||
elif not (isinstance(value, str) or np.isnan(value)): | ||
if isna(value): | ||
value = StringDtype.na_value | ||
elif not isinstance(value, str): | ||
raise ValueError( | ||
"Cannot set non-string value '{}' into a StringArray.".format(value) | ||
) | ||
|
@@ -265,7 +260,7 @@ def method(self, other): | |
other = other[valid] | ||
|
||
result = np.empty_like(self._ndarray, dtype="object") | ||
result[mask] = np.nan | ||
result[mask] = StringDtype.na_value | ||
result[valid] = op(self._ndarray[valid], other) | ||
|
||
if op.__name__ in {"add", "radd", "mul", "rmul"}: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose this is to get the inference and the validation in StringArray working with NA?
One thing I have been thinking about is that it could be an option to let `pd.NA` actually play a somewhat different role than np.nan or None in construction / type inference, e.g. so that if someone does `pd.Series([1, 2, pd.NA])` it automatically becomes a nullable integer dtype instead of float (or object). Since pd.NA is new, we can actually do this without breaking backwards compatibility. (Now, I'm not sure if that idea relates to the code here, and it can also be done later.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct.
That may be in `infer_dtype`.